diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..4c01dc2
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,82 @@
+# Code of Conduct
+
+## Overview
+
+This document defines the code of conduct followed and enforced for the cuQuantum Python project.
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at
+[cuquantum-python@nvidia.com](mailto:cuquantum-python@nvidia.com). All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an
+incident. Further details of specific enforcement policies may be posted
+separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..f1a8d15
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,12 @@
+# Contributing
+
+Thank you for your interest in contributing to cuQuantum Python! Contributions fall into one of two categories:
+
+1. You want to report a bug, feature request, or documentation issue
+   - File an [issue](https://github.com/NVIDIA/cuQuantum/issues/new)
+     describing what you encountered or what you want to see changed.
+   - The NVIDIA team will evaluate the issues and triage them, scheduling
+     them for a release. If you believe the issue needs priority attention,
+     comment on the issue to notify the team.
+2. You want to implement a feature or bug-fix
+   - At this time we do not accept code contributions.
diff --git a/LICENSE b/LICENSE
index c33765e..33a89f0 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,5 +1,6 @@
-SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: BSD-3-Clause
+BSD-3-Clause
+
+Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
diff --git a/README.md b/README.md
index 09ca78d..b14d72b 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,11 @@
 # Welcome to the cuQuantum repository!
 
-This public repository contains two sets of files:
+This public repository contains two sets of files related to the [NVIDIA cuQuantum SDK](https://developer.nvidia.com/cuquantum-sdk):
 
-- `samples`: All C/C++ sample codes for the [NVIDIA cuQuantum SDK](https://developer.nvidia.com/cuquantum-sdk).
-- `python`: The open-sourced cuQuantum Python project (**coming soon**).
+- `samples`: All C/C++ sample codes for the cuQuantum SDK.
+- `python`: The open-sourced cuQuantum Python project.
+
+Other components of the cuQuantum SDK can be accessed by following the instructions given in the documentation.
 
 ## Installation
diff --git a/python/CODE_OF_CONDUCT.md b/python/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..4c01dc2
--- /dev/null
+++ b/python/CODE_OF_CONDUCT.md
@@ -0,0 +1,82 @@
+# Code of Conduct
+
+## Overview
+
+This document defines the code of conduct followed and enforced for the cuQuantum Python project.
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at
+[cuquantum-python@nvidia.com](mailto:cuquantum-python@nvidia.com). All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an
+incident. Further details of specific enforcement policies may be posted
+separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/python/CONTRIBUTING.md b/python/CONTRIBUTING.md
new file mode 100644
index 0000000..f1a8d15
--- /dev/null
+++ b/python/CONTRIBUTING.md
@@ -0,0 +1,12 @@
+# Contributing
+
+Thank you for your interest in contributing to cuQuantum Python! Contributions fall into one of two categories:
+
+1. You want to report a bug, feature request, or documentation issue
+   - File an [issue](https://github.com/NVIDIA/cuQuantum/issues/new)
+     describing what you encountered or what you want to see changed.
+   - The NVIDIA team will evaluate the issues and triage them, scheduling
+     them for a release. If you believe the issue needs priority attention,
+     comment on the issue to notify the team.
+2. You want to implement a feature or bug-fix
+   - At this time we do not accept code contributions.
diff --git a/python/LICENSE b/python/LICENSE
new file mode 100644
index 0000000..33a89f0
--- /dev/null
+++ b/python/LICENSE
@@ -0,0 +1,28 @@
+BSD-3-Clause
+
+Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..a8a3bcd
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,78 @@
+# cuQuantum Python
+
+## Documentation
+
+Please visit the [NVIDIA cuQuantum SDK documentation](https://docs.nvidia.com/cuda/cuquantum/).
+
+## Building
+
+### Requirements
+
+Build-time dependencies of the cuQuantum Python package and some versions that
+are known to work are as follows:
+
+* CUDA Toolkit 11.4+
+* cuQuantum 0.1.0
+* cuTENSOR 1.4.0+
+* Cython - e.g. 0.29.21
+
+### Install cuQuantum Python from conda-forge
+
+If you already have a Conda environment set up, the easiest way is to install cuQuantum Python from the conda-forge channel:
+```
+conda install -c conda-forge cuquantum-python
+```
+The Conda solver will resolve all dependencies for you.
+
+### Install cuQuantum Python from source
+
+To compile and install cuQuantum Python from source, please follow the steps below:
+
+1. Set `CUDA_PATH` to point to your CUDA installation
+2. Set `CUQUANTUM_ROOT` to point to your cuQuantum installation
+3. Set `CUTENSOR_ROOT` to point to your cuTENSOR installation
+4. Make sure CUDA, cuQuantum, and cuTENSOR are visible in your `LD_LIBRARY_PATH`
+5. Run `pip install -v .`
+
+Notes:
+- For the `pip install` step, adding the `-e` flag after `-v` allows installing the package in-place (i.e., in "editable mode" for testing/developing).
+- If `CUSTATEVEC_ROOT` and `CUTENSORNET_ROOT` are set (for the cuStateVec and the cuTensorNet libraries, respectively), they override `CUQUANTUM_ROOT`.
+- For local development, set `CUQUANTUM_IGNORE_SOLVER=1` to ignore the dependency on the `cuquantum` wheel.
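+
+As a quick check that a source build works (a minimal sketch; it assumes the
+installation above succeeded and a supported GPU is visible), the low-level
+library handle APIs can be exercised directly:
+```
+from cuquantum import custatevec as cusv
+
+handle = cusv.create()     # initialize the cuStateVec library
+print(cusv.get_version())  # the cuStateVec version, as an integer
+cusv.destroy(handle)       # clean up the library handle
+```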
+
+
+## Running
+
+### Requirements
+
+Runtime dependencies of the cuQuantum Python package include:
+
+* An NVIDIA GPU with compute capability 7.0+
+* Driver: Linux (450.80.02+)
+* CUDA Toolkit 11.4+
+* cuQuantum 0.1.0
+* cuTENSOR 1.4.0+
+* NumPy v1.17+
+* CuPy v9.5.0+
+* PyTorch v1.10+ (optional)
+
+If you install everything from conda-forge, the dependencies are taken care of for you (except for the driver).
+
+If you build cuQuantum Python from source, please make sure the paths to the cuQuantum and cuTENSOR libraries are added
+to your `LD_LIBRARY_PATH` environment variable.
+
+Known issues:
+- If a system has multiple copies of cuTENSOR, one of which is installed in a default system path, the Python runtime could pick it up even though cuQuantum Python is linked to another copy installed elsewhere, potentially causing a version-mismatch error. The proper fix is to remove cuTENSOR from the system paths to ensure the visibility of the proper copy. **DO NOT ATTEMPT** to use `LD_PRELOAD` to overwrite it, as it could cause hard-to-debug behaviors!
+- In certain environments, if PyTorch is installed, `import cuquantum` could fail (with a segmentation fault). This is currently under investigation, and a temporary workaround is to import `torch` before importing `cuquantum`.
+
+### Samples
+
+Samples demonstrating the usage of both low-level and high-level Python APIs are
+available in the `samples` directory. The low-level API samples are 1:1 translations of the corresponding
+samples written in C. The high-level API samples demonstrate Pythonic usage of the cuTensorNet
+library.
+
+
+## Testing
+
+If pytest is installed, running `pytest tests` in the Python source root directory will
+run all tests.
diff --git a/python/cuquantum/__init__.pxd b/python/cuquantum/__init__.pxd
new file mode 100644
index 0000000..e69de29
diff --git a/python/cuquantum/__init__.py b/python/cuquantum/__init__.py
new file mode 100644
index 0000000..fefd8fc
--- /dev/null
+++ b/python/cuquantum/__init__.py
@@ -0,0 +1,29 @@
+from cuquantum import custatevec
+from cuquantum import cutensornet
+from cuquantum.cutensornet import (
+    contract, contract_path, einsum, einsum_path, Network,
+    NetworkOptions, OptimizerInfo, OptimizerOptions, PathFinderOptions, ReconfigOptions, SlicerOptions)
+from cuquantum.utils import ComputeType, cudaDataType, libraryPropertyType
+
+
+# We patch all enum values so that they have the correct docstrings
+for enum in (
+        custatevec.Pauli,
+        custatevec.MatrixLayout,
+        custatevec.MatrixType,
+        custatevec.Collapse,
+        custatevec.SamplerOutput,
+        cutensornet.ContractionOptimizerInfoAttribute,
+        cutensornet.ContractionOptimizerConfigAttribute,
+        cutensornet.ContractionAutotunePreferenceAttribute,
+        ):
+    cutensornet._internal.enum_utils.add_enum_class_doc(enum, chomp="_ATTRIBUTE|_PREFERENCE_ATTRIBUTE")
+# these have yet another convention...
+for v in cutensornet.GraphAlgorithm:
+    v.__doc__ = f"See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM_{v.name}`."
+cutensornet.MemoryModel.SLICER_HEURISTIC.__doc__ = \
+    "See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_HEURISTIC`."
+cutensornet.MemoryModel.SLICER_CUTENSOR.__doc__ = \
+    "See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_CUTENSOR`."
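+
+# Remove the loop temporaries (`enum`, `v`) and the `utils` submodule name so
+# that they do not leak into the public `cuquantum` namespace.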
+del enum, utils, v
diff --git a/python/cuquantum/custatevec/__init__.py b/python/cuquantum/custatevec/__init__.py
new file mode 100644
index 0000000..4b56cc4
--- /dev/null
+++ b/python/cuquantum/custatevec/__init__.py
@@ -0,0 +1 @@
+from cuquantum.custatevec.custatevec import *
diff --git a/python/cuquantum/custatevec/custatevec.pxd b/python/cuquantum/custatevec/custatevec.pxd
new file mode 100644
index 0000000..884b917
--- /dev/null
+++ b/python/cuquantum/custatevec/custatevec.pxd
@@ -0,0 +1,57 @@
+# TODO: Ultimately, everything should be auto-generated using
+# the scripts from the CUDA Python team
+
+from libc.stdint cimport intptr_t, int32_t, uint32_t, int64_t
+
+
+# The C types are prefixed with an underscore because we are not
+# yet protected by the module namespaces as done in CUDA Python.
+# Once we switch over the names would be prettier (in the Cython
+# layer).
+
+cdef extern from '<custatevec.h>' nogil:
+    # cuStateVec types
+    ctypedef void* _Handle 'custatevecHandle_t'
+    ctypedef int64_t _Index 'custatevecIndex_t'
+    ctypedef int _Status 'custatevecStatus_t'
+    ctypedef struct _SamplerDescriptor 'custatevecSamplerDescriptor_t':
+        pass
+    ctypedef struct _AccessorDescriptor 'custatevecAccessorDescriptor':
+        pass
+    ctypedef enum _ComputeType 'custatevecComputeType_t':
+        pass
+    # ctypedef void(*custatevecLoggerCallback_t)(
+    #     int32_t logLevel,
+    #     const char* functionName,
+    #     const char* message)
+    # ctypedef custatevecLoggerCallback_t LoggerCallback
+
+    # cuStateVec enums
+    ctypedef enum _Pauli 'custatevecPauli_t':
+        CUSTATEVEC_PAULI_I
+        CUSTATEVEC_PAULI_X
+        CUSTATEVEC_PAULI_Y
+        CUSTATEVEC_PAULI_Z
+
+    ctypedef enum _MatrixLayout 'custatevecMatrixLayout_t':
+        CUSTATEVEC_MATRIX_LAYOUT_COL
+        CUSTATEVEC_MATRIX_LAYOUT_ROW
+
+    ctypedef enum _MatrixType 'custatevecMatrixType_t':
+        CUSTATEVEC_MATRIX_TYPE_GENERAL
+        CUSTATEVEC_MATRIX_TYPE_UNITARY
+        CUSTATEVEC_MATRIX_TYPE_HERMITIAN
+
+    ctypedef enum _CollapseOp 'custatevecCollapseOp_t':
+        CUSTATEVEC_COLLAPSE_NONE
+        CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO
+
+    ctypedef enum _SamplerOutput 'custatevecSamplerOutput_t':
+        CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER
+        CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER
+
+    # cuStateVec consts
+    int CUSTATEVEC_VER_MAJOR
+    int CUSTATEVEC_VER_MINOR
+    int CUSTATEVEC_VER_PATCH
+    int CUSTATEVEC_VERSION
diff --git a/python/cuquantum/custatevec/custatevec.pyx b/python/cuquantum/custatevec/custatevec.pyx
new file mode 100644
index 0000000..526529a
--- /dev/null
+++ b/python/cuquantum/custatevec/custatevec.pyx
@@ -0,0 +1,1546 @@
+# distutils: language = c++
+
+cimport cython
+from libc.stdio cimport FILE
+from libcpp.vector cimport vector
+cimport cpython
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
+
+from cuquantum.utils cimport is_nested_sequence
+
+from enum import IntEnum
+
+import numpy as _numpy
+
+
+cdef extern from * nogil:
+    # from CUDA
+    ctypedef int Stream 'cudaStream_t'
+    ctypedef enum DataType 'cudaDataType_t':
+        pass
+    ctypedef enum LibPropType 'libraryPropertyType':
+        pass
+
+    # cuStateVec functions
+    int custatevecCreate(_Handle*)
+    int custatevecDestroy(_Handle)
+    const char* custatevecGetErrorName(_Status)
+    const char* custatevecGetErrorString(_Status)
+    int custatevecGetDefaultWorkspaceSize(_Handle, size_t*)
+    int custatevecSetWorkspace(_Handle, void*, size_t)
+    int custatevecGetProperty(LibPropType, int32_t*)
+    size_t custatevecGetVersion()
+    int custatevecSetStream(_Handle, Stream)
+    int custatevecGetStream(_Handle, Stream*)
+    # int custatevecLoggerSetCallback(LoggerCallback)
+    # int custatevecLoggerSetFile(FILE*)
+    # int custatevecLoggerOpenFile(const char*)
+    # int custatevecLoggerSetLevel(int32_t)
+    # int custatevecLoggerSetMask(int32_t)
+    # int custatevecLoggerForceDisable()
+    int custatevecAbs2SumOnZBasis(
+        _Handle, const void*, DataType, const uint32_t, double*, double*,
+        const int32_t*, const uint32_t)
+    int custatevecAbs2SumArray(
+        _Handle, const void*, DataType, const uint32_t, double*, const int32_t*,
+        const uint32_t, const int32_t*, const int32_t*, const uint32_t)
+    int custatevecCollapseOnZBasis(
+        _Handle, void*, DataType, const uint32_t, const int32_t, const int32_t*,
+        const uint32_t, double)
+    int custatevecCollapseByBitString(
+        _Handle, void*, DataType, const uint32_t, const int32_t*, const int32_t*,
+        const uint32_t, double)
+    int custatevecMeasureOnZBasis(
+        _Handle, void*, DataType, const uint32_t, int32_t*, const int32_t*,
+        const uint32_t, const double, _CollapseOp)
+    int custatevecBatchMeasure(
+        _Handle, void*, DataType, const uint32_t, int32_t*, const int32_t*,
+        const uint32_t, const double, _CollapseOp)
+    int custatevecApplyExp(
+        _Handle, void*, DataType, const uint32_t, double, const _Pauli*,
+        const int32_t*, const uint32_t, const int32_t*, const int32_t*,
+        const uint32_t)
+    int custatevecApplyMatrix_bufferSize(
+        _Handle, DataType, const uint32_t, const void*, DataType,
+        _MatrixLayout, const int32_t, const uint32_t, const uint32_t,
+        _ComputeType, size_t*)
+    int custatevecApplyMatrix(
+        _Handle, void*, DataType, const uint32_t, const void*,
+        DataType, _MatrixLayout, const int32_t, const int32_t*,
+        const uint32_t, const int32_t*, const uint32_t, const int32_t*,
+        _ComputeType, void*, size_t)
+    int custatevecExpectation_bufferSize(
+        _Handle, DataType, const uint32_t, const void*, DataType, _MatrixLayout,
+        const uint32_t, _ComputeType, size_t*)
+    int custatevecExpectation(
+        _Handle, const void*, DataType, const uint32_t, void*, DataType, double*,
+        const void*, DataType, _MatrixLayout, const int32_t*,
+        const uint32_t, _ComputeType, void*, size_t)
+    int custatevecSampler_create(
+        _Handle, const void*, DataType, const uint32_t, _SamplerDescriptor*,
+        uint32_t, size_t*)
+    int custatevecSampler_preprocess(
+        _Handle, _SamplerDescriptor*, void*, const size_t)
+    int custatevecSampler_sample(
+        _Handle, _SamplerDescriptor*, _Index*, const int32_t*, const uint32_t,
+        const double*, const uint32_t, _SamplerOutput)
+    int custatevecApplyGeneralizedPermutationMatrix_bufferSize(
+        _Handle, DataType, const uint32_t, const _Index*, void*, DataType,
+        const int32_t*, const uint32_t, const uint32_t, size_t*)
+    int custatevecApplyGeneralizedPermutationMatrix(
+        _Handle, void*, DataType, const uint32_t, _Index*, const void*,
+        DataType, const int32_t, const int32_t*, const uint32_t,
+        const int32_t*, const int32_t*, const uint32_t, void*, size_t)
+    int custatevecExpectationsOnPauliBasis(
+        _Handle, void*, DataType, const uint32_t, double*, const _Pauli**,
+        const int32_t**, const uint32_t*, const uint32_t)
+    int custatevecAccessor_create(
+        _Handle, void*, DataType, const uint32_t,
+        _AccessorDescriptor*, const int32_t*, const uint32_t, const int32_t*,
+        const int32_t*, const uint32_t, size_t*)
+    int custatevecAccessor_createReadOnly(
+        _Handle, const void*, DataType, const uint32_t,
+        _AccessorDescriptor*, const int32_t*, const uint32_t, const int32_t*,
+        const int32_t*, const uint32_t, size_t*)
+    int custatevecAccessor_setExtraWorkspace(
+        _Handle, _AccessorDescriptor*, void*, size_t)
+    int custatevecAccessor_get(
+        _Handle, _AccessorDescriptor*, void*, const _Index, const _Index)
+    int custatevecAccessor_set(
+        _Handle, _AccessorDescriptor*, const void*, const _Index, const _Index)
+
+
+class cuStateVecError(RuntimeError):
+
+    def __init__(self, status):
+        self.status = status
+        cdef str err_name = custatevecGetErrorName(status).decode()
+        cdef str err_desc = custatevecGetErrorString(status).decode()
+        cdef str err = f"{err_name} ({err_desc})"
+        super().__init__(err)
+
+    def __reduce__(self):
+        return (type(self), (self.status,))
+
+
+@cython.profile(False)
+cdef inline check_status(int status):
+    if status != 0:
+        raise cuStateVecError(status)
+
+
+cpdef intptr_t create() except*:
+    """Initialize the cuStateVec library and create a handle.
+
+    Returns:
+        intptr_t: The opaque library handle (as Python `int`).
+
+    .. note:: The returned handle should be tied to the current device.
+
+    .. seealso:: `custatevecCreate`
+    """
+    cdef _Handle handle
+    cdef int status
+    with nogil:
+        status = custatevecCreate(&handle)
+    check_status(status)
+    return <intptr_t>handle
+
+
+cpdef destroy(intptr_t handle):
+    """Destroy the cuStateVec library handle.
+
+    Args:
+        handle (intptr_t): The library handle.
+
+    .. seealso:: `custatevecDestroy`
+    """
+    with nogil:
+        status = custatevecDestroy(<_Handle>handle)
+    check_status(status)
+
+
+cpdef size_t get_default_workspace_size(intptr_t handle) except*:
+    """Get the default workspace size defined by cuStateVec.
+
+    Args:
+        handle (intptr_t): The library handle.
+
+    Returns:
+        size_t: The workspace size (in bytes).
+
+    .. seealso:: `custatevecGetDefaultWorkspaceSize`
+    """
+    cdef size_t workspaceSizeInBytes
+    with nogil:
+        status = custatevecGetDefaultWorkspaceSize(
+            <_Handle>handle, &workspaceSizeInBytes)
+    check_status(status)
+    return workspaceSizeInBytes
+
+
+cpdef set_workspace(intptr_t handle, intptr_t workspace, size_t workspace_size):
+    """Set the workspace to be used by cuStateVec.
+
+    Args:
+        handle (intptr_t): The library handle.
+        workspace (intptr_t): The pointer address (as Python `int`) to the
+            workspace (on device).
+        workspace_size (size_t): The workspace size (in bytes).
+
+    .. seealso:: `custatevecSetWorkspace`
+    """
+    with nogil:
+        status = custatevecSetWorkspace(
+            <_Handle>handle, <void*>workspace, workspace_size)
+    check_status(status)
+
+
+cpdef int get_property(int lib_prop_type) except-1:
+    """Get the version information of cuStateVec.
+
+    Args:
+        lib_prop_type (cuquantum.libraryPropertyType): The property type.
+
+    Returns:
+        int: The corresponding value of the requested property.
+
+    .. seealso:: `custatevecGetProperty`
+    """
+    cdef int32_t value
+    status = custatevecGetProperty(<LibPropType>lib_prop_type, &value)
+    check_status(status)
+    return value
+
+
+cpdef size_t get_version() except*:
+    """Get the version of cuStateVec.
+
+    Returns:
+        size_t: The library version.
+
+    .. seealso:: `custatevecGetVersion`
+    """
+    cdef size_t version = custatevecGetVersion()
+    return version
+
+
+cpdef set_stream(intptr_t handle, intptr_t stream):
+    """Set the stream to be used by cuStateVec.
+
+    Args:
+        handle (intptr_t): The library handle.
+        stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python
+            `int`).
+
+    .. seealso:: `custatevecSetStream`
+    """
+    with nogil:
+        status = custatevecSetStream(
+            <_Handle>handle, <Stream>stream)
+    check_status(status)
+
+
+cpdef intptr_t get_stream(intptr_t handle):
+    """Get the stream used by cuStateVec.
+
+    Args:
+        handle (intptr_t): The library handle.
+
+    Returns:
+        intptr_t:
+            The CUDA stream handle (``cudaStream_t`` as Python `int`).
+
+    .. seealso:: `custatevecGetStream`
+    """
+    cdef intptr_t stream
+    with nogil:
+        status = custatevecGetStream(
+            <_Handle>handle, <Stream*>(&stream))
+    check_status(status)
+    return stream
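+
+# A minimal usage sketch for the handle/stream APIs above (hypothetical; it
+# assumes CuPy is available to supply a CUDA stream):
+#
+#     import cupy as cp
+#     handle = create()
+#     stream = cp.cuda.Stream()
+#     set_stream(handle, stream.ptr)
+#     assert get_stream(handle) == stream.ptr
+#     destroy(handle)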
+
+
+# TODO(leofang): add logger callback APIs
+
+
+cpdef tuple abs2sum_on_z_basis(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        bint get_parity0, bint get_parity1,
+        basis_bits, uint32_t n_basis_bits):
+    """Calculates the sum of squared absolute values on a given Z product basis.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        get_parity0 (bool): Whether to compute the sum of squared absolute values
+            for parity 0.
+        get_parity1 (bool): Whether to compute the sum of squared absolute values
+            for parity 1.
+        basis_bits: A host array of Z-basis index bits. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of index bits
+
+        n_basis_bits (uint32_t): The number of basis bits.
+
+    Returns:
+        tuple:
+            A 2-tuple of the calculated sums for parity 0 and 1, respectively.
+            If the corresponding bool is set to `False`, `None` is returned.
+
+    .. seealso:: `custatevecAbs2SumOnZBasis`
+    """
+    if not get_parity0 and not get_parity1:
+        raise ValueError("no target to compute")
+    cdef double abs2sum0, abs2sum1
+    cdef double* abs2sum0_ptr
+    cdef double* abs2sum1_ptr
+    abs2sum0_ptr = &abs2sum0 if get_parity0 else NULL
+    abs2sum1_ptr = &abs2sum1 if get_parity1 else NULL
+
+    # basis_bits can be a pointer address, or a Python sequence
+    cdef vector[int32_t] basisBitsData
+    cdef int32_t* basisBitsPtr
+    if cpython.PySequence_Check(basis_bits):
+        basisBitsData = basis_bits
+        basisBitsPtr = basisBitsData.data()
+    else:  # a pointer address
+        basisBitsPtr = <int32_t*><intptr_t>basis_bits
+
+    with nogil:
+        status = custatevecAbs2SumOnZBasis(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            abs2sum0_ptr, abs2sum1_ptr,
+            basisBitsPtr, n_basis_bits)
+    check_status(status)
+    if get_parity0 and get_parity1:
+        return (abs2sum0, abs2sum1)
+    elif get_parity0:
+        return (abs2sum0, None)
+    elif get_parity1:
+        return (None, abs2sum1)
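+
+# Hypothetical usage sketch (assumes a 3-qubit state vector allocated with
+# CuPy in single-precision complex):
+#
+#     import cupy as cp
+#     from cuquantum import cudaDataType
+#     sv = cp.zeros(2**3, dtype=cp.complex64); sv[0] = 1
+#     handle = create()
+#     sum0, sum1 = abs2sum_on_z_basis(
+#         handle, sv.data.ptr, cudaDataType.CUDA_C_32F, 3, True, True, [0], 1)
+#     destroy(handle)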
+
+
+cpdef abs2sum_array(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        intptr_t abs2sum,
+        bit_ordering, uint32_t bit_ordering_len,
+        mask_bit_string, mask_ordering, uint32_t mask_len):
+    """Calculates the sum of squared absolute values for a given set of index
+    bits.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        abs2sum (intptr_t): The pointer address (as Python `int`) to the array
+            (on either host or device) that would hold the sums.
+        bit_ordering: A host array of index bit ordering. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of index bit ordering
+
+        bit_ordering_len (uint32_t): The length of ``bit_ordering``.
+        mask_bit_string: A host array for a bit string to specify mask. It can
+            be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of mask values
+
+        mask_ordering: A host array of mask ordering. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of mask ordering
+
+        mask_len (uint32_t): The length of ``mask_ordering``.
+
+    .. seealso:: `custatevecAbs2SumArray`
+    """
+    # bit_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] bitOrderingData
+    cdef int32_t* bitOrderingPtr
+    if cpython.PySequence_Check(bit_ordering):
+        bitOrderingData = bit_ordering
+        bitOrderingPtr = bitOrderingData.data()
+    else:  # a pointer address
+        bitOrderingPtr = <int32_t*><intptr_t>bit_ordering
+
+    # mask_bit_string can be a pointer address, or a Python sequence
+    cdef vector[int32_t] maskBitStringData
+    cdef int32_t* maskBitStringPtr
+    if cpython.PySequence_Check(mask_bit_string):
+        maskBitStringData = mask_bit_string
+        maskBitStringPtr = maskBitStringData.data()
+    else:  # a pointer address
+        maskBitStringPtr = <int32_t*><intptr_t>mask_bit_string
+
+    # mask_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] maskOrderingData
+    cdef int32_t* maskOrderingPtr
+    if cpython.PySequence_Check(mask_ordering):
+        maskOrderingData = mask_ordering
+        maskOrderingPtr = maskOrderingData.data()
+    else:  # a pointer address
+        maskOrderingPtr = <int32_t*><intptr_t>mask_ordering
+
+    with nogil:
+        status = custatevecAbs2SumArray(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            <double*>abs2sum, bitOrderingPtr, bit_ordering_len,
+            maskBitStringPtr, maskOrderingPtr, mask_len)
+    check_status(status)
+
+
+cpdef collapse_on_z_basis(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        int32_t parity, basis_bits, uint32_t n_basis_bits, double norm):
+    """Collapse the statevector on the given Z product basis.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        parity (int32_t): The parity, 0 or 1.
+        basis_bits: A host array of Z-basis index bits. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of index bits
+
+        n_basis_bits (uint32_t): The number of basis bits.
+        norm (double): The normalization factor for the statevector after
+            collapse.
+
+    .. seealso:: `custatevecCollapseOnZBasis`
+    """
+    # basis_bits can be a pointer address, or a Python sequence
+    cdef vector[int32_t] basisBitsData
+    cdef int32_t* basisBitsPtr
+    if cpython.PySequence_Check(basis_bits):
+        basisBitsData = basis_bits
+        basisBitsPtr = basisBitsData.data()
+    else:  # a pointer address
+        basisBitsPtr = <int32_t*><intptr_t>basis_bits
+
+    with nogil:
+        status = custatevecCollapseOnZBasis(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            parity, basisBitsPtr, n_basis_bits, norm)
+    check_status(status)
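+
+# Hypothetical usage sketch: collapse qubit 0 onto parity 0 (assumes `handle`
+# and a device state vector `sv` as in the sketches above; norm=1.0 keeps the
+# surviving amplitudes unscaled):
+#
+#     collapse_on_z_basis(handle, sv.data.ptr, cudaDataType.CUDA_C_32F, 3,
+#                         0, [0], 1, 1.0)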
+
+
+cpdef collapse_by_bitstring(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        bit_string, bit_ordering, uint32_t bit_string_len, double norm):
+    """Collapse the statevector to the state specified by the given bit string.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        bit_string: A host array of a bit string. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of bits
+
+        bit_ordering: A host array of bit string ordering. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of bit ordering
+
+        bit_string_len (uint32_t): The length of ``bit_string``.
+        norm (double): The normalization factor for the statevector after
+            collapse.
+
+    .. seealso:: `custatevecCollapseByBitString`
+    """
+    # bit_string can be a pointer address, or a Python sequence
+    cdef vector[int32_t] bitStringData
+    cdef int32_t* bitStringPtr
+    if cpython.PySequence_Check(bit_string):
+        bitStringData = bit_string
+        bitStringPtr = bitStringData.data()
+    else:  # a pointer address
+        bitStringPtr = <int32_t*><intptr_t>bit_string
+
+    # bit_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] bitOrderingData
+    cdef int32_t* bitOrderingPtr
+    if cpython.PySequence_Check(bit_ordering):
+        bitOrderingData = bit_ordering
+        bitOrderingPtr = bitOrderingData.data()
+    else:  # a pointer address
+        bitOrderingPtr = <int32_t*><intptr_t>bit_ordering
+
+    with nogil:
+        status = custatevecCollapseByBitString(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            bitStringPtr, bitOrderingPtr,
+            bit_string_len, norm)
+    check_status(status)
+
+
+cpdef int measure_on_z_basis(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        basis_bits, const uint32_t n_basis_bits, double rand_num,
+        int collapse) except -1:
+    """Performs measurement on the given Z-product basis.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        basis_bits: A host array of Z-basis index bits. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of index bits
+
+        n_basis_bits (uint32_t): The number of basis bits.
+        rand_num (double): A random number in [0, 1).
+        collapse (Collapse): Indicates the collapse operation.
+
+    Returns:
+        int: The parity measurement outcome.
+
+    .. seealso:: `custatevecMeasureOnZBasis`
+    """
+    # basis_bits can be a pointer address, or a Python sequence
+    cdef vector[int32_t] basisBitsData
+    cdef int32_t* basisBitsPtr
+    if cpython.PySequence_Check(basis_bits):
+        basisBitsData = basis_bits
+        basisBitsPtr = basisBitsData.data()
+    else:  # a pointer address
+        basisBitsPtr = <int32_t*><intptr_t>basis_bits
+
+    cdef int32_t parity
+    with nogil:
+        status = custatevecMeasureOnZBasis(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            &parity, basisBitsPtr, n_basis_bits, rand_num,
+            <_CollapseOp>collapse)
+    check_status(status)
+    return parity
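+
+# Hypothetical usage sketch: measure the joint parity of qubits 0 and 1 and
+# collapse the state (assumes `handle`/`sv` as above; the `Collapse` enum
+# member name mirrors CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO):
+#
+#     import numpy as np
+#     parity = measure_on_z_basis(
+#         handle, sv.data.ptr, cudaDataType.CUDA_C_32F, 3,
+#         [0, 1], 2, np.random.random(), Collapse.NORMALIZE_AND_ZERO)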
+
+
+cpdef batch_measure(
+        intptr_t handle, intptr_t sv, int sv_data_type,
+        uint32_t n_index_bits, intptr_t bit_string, bit_ordering,
+        const uint32_t bit_string_len, double rand_num, int collapse):
+    """Performs measurement of an arbitrary number of single qubits.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        bit_string (intptr_t): The pointer address (as Python `int`) to a host
+            array of measured bit string.
+        bit_ordering: A host array of bit string ordering. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of bit ordering
+
+        bit_string_len (uint32_t): The length of ``bit_string``.
+        rand_num (double): A random number in [0, 1).
+        collapse (Collapse): Indicates the collapse operation.
+
+    .. seealso:: `custatevecBatchMeasure`
+    """
+    # bit_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] bitOrderingData
+    cdef int32_t* bitOrderingPtr
+    if cpython.PySequence_Check(bit_ordering):
+        bitOrderingData = bit_ordering
+        bitOrderingPtr = bitOrderingData.data()
+    else:  # a pointer address
+        bitOrderingPtr = <int32_t*><intptr_t>bit_ordering
+
+    with nogil:
+        status = custatevecBatchMeasure(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            <int32_t*>bit_string, bitOrderingPtr, bit_string_len,
+            rand_num, <_CollapseOp>collapse)
+    check_status(status)
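+
+# Hypothetical usage sketch: measure qubits 2, 1, 0 into a host array without
+# collapsing the state (assumes `handle`/`sv` as above):
+#
+#     import numpy as np
+#     bits = np.empty(3, dtype=np.int32)
+#     batch_measure(handle, sv.data.ptr, cudaDataType.CUDA_C_32F, 3,
+#                   bits.ctypes.data, [2, 1, 0], 3,
+#                   np.random.random(), Collapse.NONE)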
+
+
+cpdef apply_exp(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        double theta, paulis,
+        targets, uint32_t n_targets,
+        controls, control_bit_values, uint32_t n_controls):
+    """Apply the exponential of a multi-qubit Pauli operator.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        theta (double): The rotation angle.
+        paulis: A host array of :data:`Pauli` operators. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of :data:`Pauli`
+
+        targets: A host array of target bits. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of target bits
+
+        n_targets (uint32_t): The length of ``targets``.
+        controls: A host array of control bits. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of control bits
+
+        control_bit_values: A host array of control bit values. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of control bit values
+
+        n_controls (uint32_t): The length of ``controls``.
+
+    .. seealso:: `custatevecApplyExp`
+    """
+    # paulis can be a pointer address, or a Python sequence
+    cdef vector[_Pauli] paulisData
+    cdef _Pauli* paulisPtr
+    if cpython.PySequence_Check(paulis):
+        paulisData = paulis
+        paulisPtr = paulisData.data()
+    else:  # a pointer address
+        paulisPtr = <_Pauli*><intptr_t>paulis
+
+    # targets can be a pointer address, or a Python sequence
+    cdef vector[int32_t] targetsData
+    cdef int32_t* targetsPtr
+    if cpython.PySequence_Check(targets):
+        targetsData = targets
+        targetsPtr = targetsData.data()
+    else:  # a pointer address
+        targetsPtr = <int32_t*><intptr_t>targets
+
+    # controls can be a pointer address, or a Python sequence
+    cdef vector[int32_t] controlsData
+    cdef int32_t* controlsPtr
+    if cpython.PySequence_Check(controls):
+        controlsData = controls
+        controlsPtr = controlsData.data()
+    else:  # a pointer address
+        controlsPtr = <int32_t*><intptr_t>controls
+
+    # control_bit_values can be a pointer address, or a Python sequence
+    cdef vector[int32_t] controlBitValuesData
+    cdef int32_t* controlBitValuesPtr
+    if cpython.PySequence_Check(control_bit_values):
+        controlBitValuesData = control_bit_values
+        controlBitValuesPtr = controlBitValuesData.data()
+    else:  # a pointer address
+        controlBitValuesPtr = <int32_t*><intptr_t>control_bit_values
+
+    with nogil:
+        status = custatevecApplyExp(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            theta, paulisPtr,
+            targetsPtr, n_targets,
+            controlsPtr, controlBitValuesPtr, n_controls)
+    check_status(status)
+
+
+cpdef size_t apply_matrix_buffer_size(
+        intptr_t handle, int sv_data_type, uint32_t n_index_bits, intptr_t matrix,
+        int matrix_data_type, int layout, int32_t adjoint, uint32_t n_targets,
+        uint32_t n_controls, int compute_type) except*:
+    """Computes the required workspace size for :func:`apply_matrix`.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        matrix (intptr_t): The pointer address (as Python `int`) to a matrix
+            (on either host or device).
+        matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
+        layout (MatrixLayout): The memory layout of the matrix.
+        adjoint (int32_t): Whether the adjoint of the matrix should be applied.
+        n_targets (uint32_t): The length of ``targets``.
+        n_controls (uint32_t): The length of ``controls``.
+        compute_type (cuquantum.ComputeType): The compute type of matrix
+            multiplication.
+
+    Returns:
+        size_t: The required workspace size (in bytes).
+
+    .. seealso:: `custatevecApplyMatrix_bufferSize`
+    """
+    cdef size_t extraWorkspaceSizeInBytes
+    with nogil:
+        status = custatevecApplyMatrix_bufferSize(
+            <_Handle>handle, <DataType>sv_data_type, n_index_bits, <void*>matrix,
+            <DataType>matrix_data_type, <_MatrixLayout>layout, adjoint, n_targets,
+            n_controls, <_ComputeType>compute_type, &extraWorkspaceSizeInBytes)
+    check_status(status)
+    return extraWorkspaceSizeInBytes
+
+
+cpdef apply_matrix(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        intptr_t matrix, int matrix_data_type, int layout, int32_t adjoint,
+        targets, uint32_t n_targets,
+        controls, uint32_t n_controls, control_bit_values,
+        int compute_type, intptr_t workspace, size_t workspace_size):
+    """Apply the specified gate matrix.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        matrix (intptr_t): The pointer address (as Python `int`) to a matrix
+            (on either host or device).
+        matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
+        layout (MatrixLayout): The memory layout of the matrix.
+        adjoint (int32_t): Whether the adjoint of the matrix should be applied.
+        targets: A host array of target bits. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of target bits
+
+        n_targets (uint32_t): The length of ``targets``.
+        controls: A host array of control bits. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of control bits
+
+        n_controls (uint32_t): The length of ``controls``.
+        control_bit_values: A host array of control bit values. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of control bit values
+
+        compute_type (cuquantum.ComputeType): The compute type of matrix
+            multiplication.
+        workspace (intptr_t): The pointer address (as Python `int`) to the
+            workspace (on device).
+        workspace_size (size_t): The workspace size (in bytes).
+
+    .. seealso:: `custatevecApplyMatrix`
+    """
+    # targets can be a pointer address, or a Python sequence
+    cdef vector[int32_t] targetsData
+    cdef int32_t* targetsPtr
+    if cpython.PySequence_Check(targets):
+        targetsData = targets
+        targetsPtr = targetsData.data()
+    else:  # a pointer address
+        targetsPtr = <int32_t*><intptr_t>targets
+
+    # controls can be a pointer address, or a Python sequence
+    cdef vector[int32_t] controlsData
+    cdef int32_t* controlsPtr
+    if cpython.PySequence_Check(controls):
+        controlsData = controls
+        controlsPtr = controlsData.data()
+    else:  # a pointer address
+        controlsPtr = <int32_t*><intptr_t>controls
+
+    # control_bit_values can be a pointer address, or a Python sequence
+    cdef vector[int32_t] controlBitValuesData
+    cdef int32_t* controlBitValuesPtr
+    if cpython.PySequence_Check(control_bit_values):
+        controlBitValuesData = control_bit_values
+        controlBitValuesPtr = controlBitValuesData.data()
+    else:  # a pointer address
+        controlBitValuesPtr = <int32_t*><intptr_t>control_bit_values
+
+    with nogil:
+        status = custatevecApplyMatrix(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            <void*>matrix, <DataType>matrix_data_type,
+            <_MatrixLayout>layout, adjoint,
+            targetsPtr, n_targets,
+            controlsPtr, n_controls,
+            controlBitValuesPtr, <_ComputeType>compute_type,
+            <void*>workspace, workspace_size)
+    check_status(status)
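+
+# Hypothetical usage sketch: apply a Hadamard gate to qubit 0 of a 3-qubit
+# state (assumes `handle`/`sv` as above; the workspace query/allocate pattern
+# mirrors the C workflow, and the enum member names are assumptions):
+#
+#     import numpy as np
+#     from cuquantum import ComputeType, cudaDataType
+#     h = np.asarray([[1, 1], [1, -1]], dtype=np.complex64) / np.sqrt(2)
+#     ws_size = apply_matrix_buffer_size(
+#         handle, cudaDataType.CUDA_C_32F, 3, h.ctypes.data,
+#         cudaDataType.CUDA_C_32F, MatrixLayout.ROW, 0, 1, 0,
+#         ComputeType.COMPUTE_32F)
+#     ws = cp.cuda.alloc(ws_size) if ws_size > 0 else None
+#     apply_matrix(handle, sv.data.ptr, cudaDataType.CUDA_C_32F, 3,
+#                  h.ctypes.data, cudaDataType.CUDA_C_32F, MatrixLayout.ROW,
+#                  0, [0], 1, 0, 0, 0, ComputeType.COMPUTE_32F,
+#                  ws.ptr if ws else 0, ws_size)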
+
+
+cpdef size_t expectation_buffer_size(
+        intptr_t handle, int sv_data_type, uint32_t n_index_bits, intptr_t matrix,
+        int matrix_data_type, int layout, uint32_t n_basis_bits, int compute_type) except*:
+    """Computes the required workspace size for :func:`expectation`.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        matrix (intptr_t): The pointer address (as Python `int`) to a matrix
+            (on either host or device).
+        matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
+        layout (MatrixLayout): The memory layout of the matrix.
+        n_basis_bits (uint32_t): The length of ``basis_bits``.
+        compute_type (cuquantum.ComputeType): The compute type of matrix
+            multiplication.
+
+    Returns:
+        size_t: The required workspace size (in bytes).
+
+    .. seealso:: `custatevecExpectation_bufferSize`
+    """
+    cdef size_t extraWorkspaceSizeInBytes
+    with nogil:
+        status = custatevecExpectation_bufferSize(
+            <_Handle>handle, <DataType>sv_data_type, n_index_bits, <void*>matrix,
+            <DataType>matrix_data_type, <_MatrixLayout>layout, n_basis_bits,
+            <_ComputeType>compute_type, &extraWorkspaceSizeInBytes)
+    check_status(status)
+    return extraWorkspaceSizeInBytes
+
+
+cpdef expectation(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        intptr_t expect, int expect_data_type,
+        intptr_t matrix, int matrix_data_type, int layout,
+        basis_bits, uint32_t n_basis_bits,
+        int compute_type, intptr_t workspace, size_t workspace_size):
+    """Compute the expectation value of the given matrix with respect to the
+    statevector.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        expect (intptr_t): The pointer address (as Python `int`) for storing the
+            expectation value (on host).
+        expect_data_type (cuquantum.cudaDataType): The data type of ``expect``.
+        matrix (intptr_t): The pointer address (as Python `int`) to a matrix
+            (on either host or device).
+        matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
+        layout (MatrixLayout): The memory layout of the matrix.
+        basis_bits: A host array of basis index bits. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of basis bits
+
+        n_basis_bits (uint32_t): The length of ``basis_bits``.
+        compute_type (cuquantum.ComputeType): The compute type of matrix
+            multiplication.
+        workspace (intptr_t): The pointer address (as Python `int`) to the
+            workspace (on device).
+        workspace_size (size_t): The workspace size (in bytes).
+
+    .. seealso:: `custatevecExpectation`
+    """
+    # basis_bits can be a pointer address, or a Python sequence
+    cdef vector[int32_t] basisBitsData
+    cdef int32_t* basisBitsPtr
+    if cpython.PySequence_Check(basis_bits):
+        basisBitsData = basis_bits
+        basisBitsPtr = basisBitsData.data()
+    else:  # a pointer address
+        basisBitsPtr = <int32_t*><intptr_t>basis_bits
+
+    # Note: residualNorm is not supported in beta 1
+    # TODO(leofang): check for beta 2
+    cdef double residualNorm
+    with nogil:
+        status = custatevecExpectation(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            <void*>expect, <DataType>expect_data_type, &residualNorm,
+            <void*>matrix, <DataType>matrix_data_type,
+            <_MatrixLayout>layout,
+            basisBitsPtr, n_basis_bits,
+            <_ComputeType>compute_type,
+            <void*>workspace, workspace_size)
+    check_status(status)
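+
+# Hypothetical usage sketch: expectation value of Pauli-Z on qubit 0, returned
+# as a float64 on host (assumes `handle`/`sv` as above and that no extra
+# workspace is needed for this small case):
+#
+#     z = np.asarray([[1, 0], [0, -1]], dtype=np.complex64)
+#     expect = np.empty(1, dtype=np.float64)
+#     expectation(handle, sv.data.ptr, cudaDataType.CUDA_C_32F, 3,
+#                 expect.ctypes.data, cudaDataType.CUDA_R_64F,
+#                 z.ctypes.data, cudaDataType.CUDA_C_32F, MatrixLayout.ROW,
+#                 [0], 1, ComputeType.COMPUTE_32F, 0, 0)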
+
+
+cpdef tuple sampler_create(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        uint32_t n_max_shots):
+    """Create a sampler descriptor.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        n_max_shots (uint32_t): The maximal number of shots that will be
+            performed using this sampler.
+
+    Returns:
+        tuple:
+            A 2-tuple. The first element is the pointer address (as Python
+            `int`) to the sampler descriptor, and the second element is the
+            required workspace size (in bytes).
+
+    .. note:: Unlike its C counterpart, the returned sampler descriptor must
+        be explicitly cleaned up using :func:`sampler_destroy` when the work
+        is done.
+
+    .. seealso:: `custatevecSampler_create`
+    """
+    cdef _SamplerDescriptor* sampler = <_SamplerDescriptor*>(
+        PyMem_Malloc(sizeof(_SamplerDescriptor)))
+    cdef size_t extraWorkspaceSizeInBytes
+    with nogil:
+        status = custatevecSampler_create(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            sampler, n_max_shots, &extraWorkspaceSizeInBytes)
+    check_status(status)
+    return (<intptr_t>sampler, extraWorkspaceSizeInBytes)
+
+
+# TODO(leofang): fix this when the beta 2 (?) APIs are up
+cpdef sampler_destroy(intptr_t sampler):
+    """Destroy the sampler descriptor.
+
+    Args:
+        sampler (intptr_t): The pointer address (as Python `int`) to the
+            sampler descriptor.
+
+    .. note:: This function has no C counterpart in the current release.
+
+    .. seealso:: :func:`sampler_create`
+    """
+    # This API is unique in Python as we can't pass around structs
+    # allocated on stack
+    PyMem_Free(<void*>sampler)
+
+
+cpdef sampler_preprocess(
+        intptr_t handle, intptr_t sampler, intptr_t workspace,
+        size_t workspace_size):
+    """Preprocess the statevector to prepare for sampling.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sampler (intptr_t): The pointer address (as Python `int`) to the
+            sampler descriptor.
+        workspace (intptr_t): The pointer address (as Python `int`) to the
+            workspace (on device).
+        workspace_size (size_t): The workspace size (in bytes).
+
+    .. seealso:: `custatevecSampler_preprocess`
+    """
+    with nogil:
+        status = custatevecSampler_preprocess(
+            <_Handle>handle, <_SamplerDescriptor*>sampler,
+            <void*>workspace, workspace_size)
+    check_status(status)
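+
+# Hypothetical usage sketch of the sampler workflow (create -> preprocess ->
+# sample -> destroy; assumes `handle`/`sv` as above and that the enum member
+# name mirrors CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER):
+#
+#     import numpy as np
+#     n_shots = 5
+#     sampler, ws_size = sampler_create(
+#         handle, sv.data.ptr, cudaDataType.CUDA_C_32F, 3, n_shots)
+#     ws = cp.cuda.alloc(ws_size) if ws_size > 0 else None
+#     sampler_preprocess(handle, sampler, ws.ptr if ws else 0, ws_size)
+#     bit_strings = np.empty(n_shots, dtype=np.int64)
+#     sampler_sample(handle, sampler, bit_strings.ctypes.data, [2, 1, 0], 3,
+#                    np.random.random(n_shots), n_shots,
+#                    SamplerOutput.RANDNUM_ORDER)
+#     sampler_destroy(sampler)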
+
+
+cpdef sampler_sample(
+        intptr_t handle, intptr_t sampler, intptr_t bit_strings,
+        bit_ordering, uint32_t bit_string_len, rand_nums,
+        uint32_t n_shots, int order):
+    """Sample bit strings from the statevector.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sampler (intptr_t): The pointer address (as Python `int`) to the
+            sampler descriptor.
+        bit_strings (intptr_t): The pointer address (as Python `int`) for
+            storing the sampled bit strings (on host).
+        bit_ordering: A host array of bit string ordering. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of bit ordering
+
+        bit_string_len (uint32_t): The number of bits in ``bit_ordering``.
+        rand_nums: A host array of random numbers in [0, 1). It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of random numbers
+
+        n_shots (uint32_t): The number of shots.
+        order (SamplerOutput): The order of sampled bit strings.
+
+    .. seealso:: `custatevecSampler_sample`
+    """
+    # bit_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] bitOrderingData
+    cdef int32_t* bitOrderingPtr
+    if cpython.PySequence_Check(bit_ordering):
+        bitOrderingData = bit_ordering
+        bitOrderingPtr = bitOrderingData.data()
+    else:  # a pointer address
+        bitOrderingPtr = <int32_t*><intptr_t>bit_ordering
+
+    # rand_nums can be a pointer address, or a Python sequence
+    cdef vector[double] randNumsData
+    cdef double* randNumsPtr
+    if cpython.PySequence_Check(rand_nums):
+        randNumsData = rand_nums
+        randNumsPtr = randNumsData.data()
+    else:  # a pointer address
+        randNumsPtr = <double*><intptr_t>rand_nums
+
+    with nogil:
+        status = custatevecSampler_sample(
+            <_Handle>handle, <_SamplerDescriptor*>sampler, <_Index*>bit_strings,
+            bitOrderingPtr, bit_string_len, randNumsPtr, n_shots,
+            <_SamplerOutput>order)
+    check_status(status)
+
+
+cpdef size_t apply_generalized_permutation_matrix_buffer_size(
+        intptr_t handle, int sv_data_type, uint32_t n_index_bits,
+        permutation, intptr_t diagonals, int diagonals_data_type,
+        basis_bits, uint32_t n_basis_bits, uint32_t mask_len) except*:
+    """Computes the required workspace size for :func:`apply_generalized_permutation_matrix`.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        permutation: A host or device array for the permutation table. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of permutation elements
+
+        diagonals (intptr_t): The pointer address (as Python `int`) to a matrix
+            (on either host or device).
+        diagonals_data_type (cuquantum.cudaDataType): The data type of the matrix.
+        basis_bits: A host array of permutation matrix basis bits. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of basis bits
+
+        n_basis_bits (uint32_t): The length of ``basis_bits``.
+        mask_len (uint32_t): The length of ``mask_ordering``.
+
+    Returns:
+        size_t: The required workspace size (in bytes).
+
+    .. seealso:: `custatevecApplyGeneralizedPermutationMatrix_bufferSize`
+    """
+    cdef size_t extraWorkspaceSize
+
+    # permutation can be a pointer address (on host or device), or a Python
+    # sequence (on host)
+    cdef vector[_Index] permutationData
+    cdef _Index* permutationPtr
+    if cpython.PySequence_Check(permutation):
+        permutationData = permutation
+        permutationPtr = permutationData.data()
+    else:  # a pointer address
+        permutationPtr = <_Index*><intptr_t>permutation
+
+    # basis_bits can be a pointer address, or a Python sequence
+    cdef vector[int32_t] basisBitsData
+    cdef int32_t* basisBitsPtr
+    if cpython.PySequence_Check(basis_bits):
+        basisBitsData = basis_bits
+        basisBitsPtr = basisBitsData.data()
+    else:  # a pointer address
+        basisBitsPtr = <int32_t*><intptr_t>basis_bits
+
+    with nogil:
+        status = custatevecApplyGeneralizedPermutationMatrix_bufferSize(
+            <_Handle>handle, <DataType>sv_data_type, n_index_bits,
+            permutationPtr, <void*>diagonals, <DataType>diagonals_data_type,
+            basisBitsPtr, n_basis_bits, mask_len, &extraWorkspaceSize)
+    check_status(status)
+    return extraWorkspaceSize
+ + Args: + handle (intptr_t): The library handle. + sv (intptr_t): The pointer address (as Python `int`) to the statevector + (on device). + sv_data_type (cuquantum.cudaDataType): The data type of the statevector. + n_index_bits (uint32_t): The number of index bits. + permutation: A host or device array for the permutation table. It can be + + - an `int` as the pointer address to the array + - a Python sequence of permutation elements + + diagonals (intptr_t): The pointer address (as Python `int`) to a matrix + (on either host or device). + diagonals_data_type (cuquantum.cudaDataType): The data type of the matrix. + adjoint (int32_t): Whether the adjoint of the matrix would be applied. + basis_bits: A host array of permutation matrix basis bits. It can be + + - an `int` as the pointer address to the array + - a Python sequence of basis bits + + n_basis_bits (uint32_t): The length of ``basis_bits``. + mask_bit_string: A host array for a bit string to specify mask. It can + be + + - an `int` as the pointer address to the array + - a Python sequence of index bit ordering + + mask_ordering: A host array of mask ordering. It can be + + - an `int` as the pointer address to the array + - a Python sequence of index bit ordering + + mask_len (uint32_t): The length of ``mask_ordering``. + workspace (intptr_t): The pointer address (as Python `int`) to the + workspace (on device). + workspace_size (size_t): The workspace size (in bytes). + + .. seealso:: `custatevecApplyGeneralizedPermutationMatrix` + """ + # permutation can be a pointer address (on host or device), or a Python + # sequence (on host) + cdef vector[_Index] permutationData + cdef _Index* permutationPtr + if cpython.PySequence_Check(permutation): + permutationData = permutation + permutationPtr = permutationData.data() + else: # a pointer address + permutationPtr = <_Index*>permutation + + # basis_bits can be a pointer address, or a Python sequence + cdef vector[int32_t] basisBitsData + cdef int32_t* basisBitsPtr + if cpython.PySequence_Check(basis_bits): + basisBitsData = basis_bits + basisBitsPtr = basisBitsData.data() + else: # a pointer address + basisBitsPtr = basis_bits + + # mask_bit_string can be a pointer address, or a Python sequence + cdef vector[int32_t] maskBitStringData + cdef int32_t* maskBitStringPtr + if cpython.PySequence_Check(mask_bit_string): + maskBitStringData = mask_bit_string + maskBitStringPtr = maskBitStringData.data() + else: # a pointer address + maskBitStringPtr = mask_bit_string + + # mask_ordering can be a pointer address, or a Python sequence + cdef vector[int32_t] maskOrderingData + cdef int32_t* maskOrderingPtr + if cpython.PySequence_Check(mask_ordering): + maskOrderingData = mask_ordering + maskOrderingPtr = maskOrderingData.data() + else: # a pointer address + maskOrderingPtr = mask_ordering + + with nogil: + status = custatevecApplyGeneralizedPermutationMatrix( + <_Handle>handle, sv, sv_data_type, n_index_bits, + permutationPtr, diagonals, diagonals_data_type, + adjoint, basisBitsPtr, n_basis_bits, + maskBitStringPtr, maskOrderingPtr, mask_len, + workspace, workspace_size) + check_status(status) + + +cpdef expectations_on_pauli_basis( + intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits, + intptr_t expectations, pauli_ops, + basis_bits, n_basis_bits, uint32_t n_pauli_op_arrays): + """Compute expectation values for multiple multi-qubit Pauli strings. + + Args: + handle (intptr_t): The library handle. 
+
+
+cpdef expectations_on_pauli_basis(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        intptr_t expectations, pauli_ops,
+        basis_bits, n_basis_bits, uint32_t n_pauli_op_arrays):
+    """Compute expectation values for multiple multi-qubit Pauli strings.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        expectations (intptr_t): The pointer address (as Python `int`) to store
+            the corresponding expectation values on host. The returned values
+            are stored in double (float64).
+        pauli_ops: A host array of :data:`Pauli` operators. It can be
+
+            - an `int` as the pointer address to the nested sequence
+            - a Python sequence of `int`, each of which is a pointer address
+              to the corresponding Pauli string
+            - a nested Python sequence of :data:`Pauli`
+
+        basis_bits: A host array of basis index bits. It can be
+
+            - an `int` as the pointer address to the nested sequence
+            - a Python sequence of `int`, each of which is a pointer address
+              to the corresponding basis bits
+            - a nested Python sequence of basis bits
+
+        n_basis_bits: A host array of the length of each array in
+            ``basis_bits``. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of `int`
+
+        n_pauli_op_arrays (uint32_t): The number of Pauli operator arrays.
+
+    .. seealso:: `custatevecExpectationsOnPauliBasis`
+    """
+    # pauli_ops can be:
+    # - a plain pointer address
+    # - a Python sequence (of pointer addresses)
+    # - a nested Python sequence (of _Pauli)
+    # Note: it cannot be a mix of sequences and ints.
+    cdef vector[intptr_t] pauliOpsCData
+    cdef _Pauli** pauliOpsPtr
+    if is_nested_sequence(pauli_ops):
+        # flatten the 2D sequence
+        pauliOpsPyData = []
+        for i in pauli_ops:
+            # too bad a Python list can't hold C++ vectors, so we use NumPy
+            # arrays as the container here to keep data alive
+            data = _numpy.asarray(i, dtype=_numpy.int32)
+            assert data.ndim == 1
+            pauliOpsPyData.append(data)
+            pauliOpsCData.push_back(data.ctypes.data)
+        pauliOpsPtr = <_Pauli**>(pauliOpsCData.data())
+    elif cpython.PySequence_Check(pauli_ops):
+        # handle 1D sequence
+        pauliOpsCData = pauli_ops
+        pauliOpsPtr = <_Pauli**>(pauliOpsCData.data())
+    else:
+        # a pointer address, take it as is
+        pauliOpsPtr = <_Pauli**><intptr_t>pauli_ops
+
+    # basis_bits can be:
+    # - a plain pointer address
+    # - a Python sequence (of pointer addresses)
+    # - a nested Python sequence (of int32_t)
+    # Note: it cannot be a mix of sequences and ints.
+    cdef vector[intptr_t] basisBitsCData
+    cdef int32_t** basisBitsPtr
+    if is_nested_sequence(basis_bits):
+        # flatten the 2D sequence
+        basisBitsPyData = []
+        for i in basis_bits:
+            # too bad a Python list can't hold C++ vectors, so we use NumPy
+            # arrays as the container here to keep data alive
+            data = _numpy.asarray(i, dtype=_numpy.int32)
+            assert data.ndim == 1
+            basisBitsPyData.append(data)
+            basisBitsCData.push_back(data.ctypes.data)
+        basisBitsPtr = <int32_t**>(basisBitsCData.data())
+    elif cpython.PySequence_Check(basis_bits):
+        # handle 1D sequence
+        basisBitsCData = basis_bits
+        basisBitsPtr = <int32_t**>(basisBitsCData.data())
+    else:
+        # a pointer address, take it as is
+        basisBitsPtr = <int32_t**><intptr_t>basis_bits
+
+    # n_basis_bits can be a pointer address, or a Python sequence
+    cdef vector[uint32_t] nBasisBitsData
+    cdef uint32_t* nBasisBitsPtr
+    if cpython.PySequence_Check(n_basis_bits):
+        nBasisBitsData = n_basis_bits
+        nBasisBitsPtr = nBasisBitsData.data()
+    else:  # a pointer address
+        nBasisBitsPtr = <uint32_t*><intptr_t>n_basis_bits
+
+    with nogil:
+        status = custatevecExpectationsOnPauliBasis(
+            <_Handle>handle, <void*>sv, sv_data_type, n_index_bits,
+            <double*>expectations, pauliOpsPtr,
+            basisBitsPtr, nBasisBitsPtr, n_pauli_op_arrays)
+    check_status(status)
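+
+# Illustrative sketch (hypothetical names): computing <Z_0> and <X_1 Y_2> on a
+# 3-qubit statevector via nested Python sequences; ``expectations`` receives
+# one float64 value per Pauli string.
+#
+#     values = _numpy.empty(2, dtype=_numpy.float64)
+#     expectations_on_pauli_basis(
+#         handle, sv, sv_data_type, 3, values.ctypes.data,
+#         [[Pauli.Z], [Pauli.X, Pauli.Y]],   # pauli_ops
+#         [[0], [1, 2]],                     # basis_bits
+#         [1, 2], 2)                         # n_basis_bits, n_pauli_op_arrays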
+
+
+cpdef (intptr_t, size_t) accessor_create(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        bit_ordering, uint32_t bit_ordering_len,
+        mask_bit_string, mask_ordering, uint32_t mask_len):
+    """Create an accessor to copy elements between the statevector and
+    external buffers.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        bit_ordering: A host array of basis bits for the external buffer. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of basis bits
+
+        bit_ordering_len (uint32_t): The length of ``bit_ordering``.
+        mask_bit_string: A host array for specifying mask values. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of mask values
+
+        mask_ordering: A host array of mask ordering. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of index bit ordering
+
+        mask_len (uint32_t): The length of ``mask_ordering``.
+
+    Returns:
+        tuple:
+            A 2-tuple. The first element is the accessor descriptor (as Python
+            `int`), and the second element is the required workspace size (in
+            bytes).
+
+    .. note:: Unlike its C counterpart, the returned accessor descriptor must
+        be explicitly cleaned up using :func:`accessor_destroy` when the work
+        is done.
+
+    .. seealso:: `custatevecAccessor_create`
+    """
+    cdef _AccessorDescriptor* accessor = <_AccessorDescriptor*>(
+        PyMem_Malloc(sizeof(_AccessorDescriptor)))
+    cdef size_t workspace_size
+
+    # bit_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] bitOrderingData
+    cdef int32_t* bitOrderingPtr
+    if cpython.PySequence_Check(bit_ordering):
+        bitOrderingData = bit_ordering
+        bitOrderingPtr = bitOrderingData.data()
+    else:  # a pointer address
+        bitOrderingPtr = <int32_t*><intptr_t>bit_ordering
+
+    # mask_bit_string can be a pointer address, or a Python sequence
+    cdef vector[int32_t] maskBitStringData
+    cdef int32_t* maskBitStringPtr
+    if cpython.PySequence_Check(mask_bit_string):
+        maskBitStringData = mask_bit_string
+        maskBitStringPtr = maskBitStringData.data()
+    else:  # a pointer address
+        maskBitStringPtr = <int32_t*><intptr_t>mask_bit_string
+
+    # mask_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] maskOrderingData
+    cdef int32_t* maskOrderingPtr
+    if cpython.PySequence_Check(mask_ordering):
+        maskOrderingData = mask_ordering
+        maskOrderingPtr = maskOrderingData.data()
+    else:  # a pointer address
+        maskOrderingPtr = <int32_t*><intptr_t>mask_ordering
+
+    with nogil:
+        status = custatevecAccessor_create(
+            <_Handle>handle, <void*>sv, sv_data_type, n_index_bits,
+            accessor, bitOrderingPtr, bit_ordering_len,
+            maskBitStringPtr, maskOrderingPtr, mask_len, &workspace_size)
+    check_status(status)
+    return (<intptr_t>accessor, workspace_size)
+
+
+cpdef (intptr_t, size_t) accessor_create_readonly(
+        intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits,
+        bit_ordering, uint32_t bit_ordering_len,
+        mask_bit_string, mask_ordering, uint32_t mask_len):
+    """Create an accessor to copy elements from the statevector to external buffers.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+            (on device). The statevector is read-only.
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        bit_ordering: A host array of basis bits for the external buffer. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of basis bits
+
+        bit_ordering_len (uint32_t): The length of ``bit_ordering``.
+        mask_bit_string: A host array for specifying mask values. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of mask values
+
+        mask_ordering: A host array of mask ordering. It can be
+
+            - an `int` as the pointer address to the array
+            - a Python sequence of index bit ordering
+
+        mask_len (uint32_t): The length of ``mask_ordering``.
+
+    Returns:
+        tuple:
+            A 2-tuple. The first element is the accessor descriptor (as Python
+            `int`), and the second element is the required workspace size (in
+            bytes).
+
+    .. note:: Unlike its C counterpart, the returned accessor descriptor must
+        be explicitly cleaned up using :func:`accessor_destroy` when the work
+        is done.
+
+    .. seealso:: `custatevecAccessor_createReadOnly`
+    """
+    cdef _AccessorDescriptor* accessor = <_AccessorDescriptor*>(
+        PyMem_Malloc(sizeof(_AccessorDescriptor)))
+    cdef size_t workspace_size
+
+    # bit_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] bitOrderingData
+    cdef int32_t* bitOrderingPtr
+    if cpython.PySequence_Check(bit_ordering):
+        bitOrderingData = bit_ordering
+        bitOrderingPtr = bitOrderingData.data()
+    else:  # a pointer address
+        bitOrderingPtr = <int32_t*><intptr_t>bit_ordering
+
+    # mask_bit_string can be a pointer address, or a Python sequence
+    cdef vector[int32_t] maskBitStringData
+    cdef int32_t* maskBitStringPtr
+    if cpython.PySequence_Check(mask_bit_string):
+        maskBitStringData = mask_bit_string
+        maskBitStringPtr = maskBitStringData.data()
+    else:  # a pointer address
+        maskBitStringPtr = <int32_t*><intptr_t>mask_bit_string
+
+    # mask_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] maskOrderingData
+    cdef int32_t* maskOrderingPtr
+    if cpython.PySequence_Check(mask_ordering):
+        maskOrderingData = mask_ordering
+        maskOrderingPtr = maskOrderingData.data()
+    else:  # a pointer address
+        maskOrderingPtr = <int32_t*><intptr_t>mask_ordering
+
+    with nogil:
+        status = custatevecAccessor_createReadOnly(
+            <_Handle>handle, <void*>sv, sv_data_type, n_index_bits,
+            accessor, bitOrderingPtr, bit_ordering_len,
+            maskBitStringPtr, maskOrderingPtr, mask_len, &workspace_size)
+    check_status(status)
+    return (<intptr_t>accessor, workspace_size)
+
+
+cpdef accessor_destroy(intptr_t accessor):
+    """Destroy the accessor descriptor.
+
+    Args:
+        accessor (intptr_t): The accessor descriptor.
+
+    .. note:: This function has no C counterpart in the current release.
+
+    .. seealso:: :func:`accessor_create`
+    """
+    # This API is unique in Python as we can't pass around structs
+    # allocated on stack
+    PyMem_Free(<void*>accessor)
+
+
+cpdef accessor_set_extra_workspace(
+        intptr_t handle, intptr_t accessor,
+        intptr_t workspace, size_t workspace_size):
+    """Set the external workspace to the accessor.
+
+    Args:
+        handle (intptr_t): The library handle.
+        accessor (intptr_t): The accessor descriptor.
+        workspace (intptr_t): The pointer address to the workspace (on device).
+        workspace_size (size_t): The size of workspace (in bytes).
+
+    .. seealso:: `custatevecAccessor_setExtraWorkspace`
+    """
+    with nogil:
+        status = custatevecAccessor_setExtraWorkspace(
+            <_Handle>handle, <_AccessorDescriptor*>accessor,
+            <void*>workspace, workspace_size)
+    check_status(status)
+
+
+cpdef accessor_get(
+        intptr_t handle, intptr_t accessor, intptr_t buf,
+        _Index begin, _Index end):
+    """Copy elements from the statevector to an external buffer.
+
+    Args:
+        handle (intptr_t): The library handle.
+        accessor (intptr_t): The accessor descriptor.
+        buf (intptr_t): The external buffer to store the copied elements.
+        begin (int): The beginning index.
+        end (int): The end index.
+
+    .. seealso:: `custatevecAccessor_get`
+    """
+    with nogil:
+        status = custatevecAccessor_get(
+            <_Handle>handle, <_AccessorDescriptor*>accessor, <void*>buf,
+            begin, end)
+    check_status(status)
+
+
+cpdef accessor_set(
+        intptr_t handle, intptr_t accessor, intptr_t buf,
+        _Index begin, _Index end):
+    """Copy elements from an external buffer to the statevector.
+
+    Args:
+        handle (intptr_t): The library handle.
+        accessor (intptr_t): The accessor descriptor.
+        buf (intptr_t): The external buffer to copy elements from.
+        begin (int): The beginning index.
+        end (int): The end index.
+
+    .. seealso:: `custatevecAccessor_set`
+    """
+    with nogil:
+        status = custatevecAccessor_set(
+            <_Handle>handle, <_AccessorDescriptor*>accessor, <void*>buf,
+            begin, end)
+    check_status(status)
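+
+# Illustrative accessor life cycle (hypothetical names; masks are disabled
+# here by passing null pointer addresses and a zero mask length):
+#
+#     accessor, size = accessor_create(
+#         handle, sv, sv_data_type, n_index_bits,
+#         bit_ordering, len(bit_ordering), 0, 0, 0)
+#     if size > 0:
+#         accessor_set_extra_workspace(handle, accessor, workspace_ptr, size)
+#     accessor_get(handle, accessor, buf_ptr, 0, n_elements)
+#     accessor_destroy(accessor)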
+
+
+class Pauli(IntEnum):
+    """See `custatevecPauli_t`."""
+    I = CUSTATEVEC_PAULI_I
+    X = CUSTATEVEC_PAULI_X
+    Y = CUSTATEVEC_PAULI_Y
+    Z = CUSTATEVEC_PAULI_Z
+
+class MatrixLayout(IntEnum):
+    """See `custatevecMatrixLayout_t`."""
+    COL = CUSTATEVEC_MATRIX_LAYOUT_COL
+    ROW = CUSTATEVEC_MATRIX_LAYOUT_ROW
+
+# unused in beta 1
+class MatrixType(IntEnum):
+    """See `custatevecMatrixType_t`."""
+    GENERAL = CUSTATEVEC_MATRIX_TYPE_GENERAL
+    UNITARY = CUSTATEVEC_MATRIX_TYPE_UNITARY
+    HERMITIAN = CUSTATEVEC_MATRIX_TYPE_HERMITIAN
+
+class Collapse(IntEnum):
+    """See `custatevecCollapseOp_t`."""
+    NONE = CUSTATEVEC_COLLAPSE_NONE
+    NORMALIZE_AND_ZERO = CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO
+
+class SamplerOutput(IntEnum):
+    """See `custatevecSamplerOutput_t`."""
+    RANDNUM_ORDER = CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER
+    ASCENDING_ORDER = CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER
+
+
+del IntEnum
+
+
+# expose them to Python
+MAJOR_VER = CUSTATEVEC_VER_MAJOR
+MINOR_VER = CUSTATEVEC_VER_MINOR
+PATCH_VER = CUSTATEVEC_VER_PATCH
+VERSION = CUSTATEVEC_VERSION
diff --git a/python/cuquantum/cutensornet/__init__.py b/python/cuquantum/cutensornet/__init__.py
new file mode 100644
index 0000000..c5eecc9
--- /dev/null
+++ b/python/cuquantum/cutensornet/__init__.py
@@ -0,0 +1,3 @@
+from cuquantum.cutensornet.cutensornet import *
+from cuquantum.cutensornet.tensor_network import *
+from cuquantum.cutensornet.configuration import *
diff --git a/python/cuquantum/cutensornet/_internal/__init__.py b/python/cuquantum/cutensornet/_internal/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/cuquantum/cutensornet/_internal/cupy_ifc.py b/python/cuquantum/cutensornet/_internal/cupy_ifc.py
new file mode 100644
index 0000000..9c566fb
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/cupy_ifc.py
@@ -0,0 +1,93 @@
+"""
+Interface to seamlessly use Cupy ndarray objects.
+"""
+
+__all__ = ['CupyTensor']
+
+import cupy
+import numpy
+
+from .tensor_ifc import Tensor
+
+
+class CupyTensor(Tensor):
+    """
+    Tensor wrapper for cupy ndarrays.
+    """
+    name = 'cupy'
+    module = cupy
+    name_to_dtype = Tensor.create_name_dtype_map(conversion_function=lambda name: cupy.dtype(name), exception_type=TypeError)
+
+    def __init__(self, tensor):
+        super().__init__(tensor)
+
+    @property
+    def data_ptr(self):
+        return self.tensor.data.ptr
+
+    @property
+    def device(self):
+        return 'cuda'
+
+    @property
+    def device_id(self):
+        return self.tensor.device.id
+
+    @property
+    def dtype(self):
+        """Name of the data type"""
+        return self.tensor.dtype.name
+
+    @property
+    def shape(self):
+        return tuple(self.tensor.shape)
+
+    @property
+    def strides(self):
+        return tuple(stride_in_bytes // self.tensor.itemsize for stride_in_bytes in self.tensor.strides)
+
+    def numpy(self):
+        return self.tensor.get()
+
+    @classmethod
+    def empty(cls, shape, **context):
+        """
+        Create an empty tensor of the specified shape and data type.
+        """
+        name = context.get('dtype', 'float32')
+        dtype = CupyTensor.name_to_dtype[name]
+        device = context.get('device', None)
+        with cupy.cuda.Device(device=device):
+            tensor = cupy.empty(shape, dtype=dtype)
+
+        return tensor
+
+    def to(self, device='cpu'):
+        """
+        Create a copy of the tensor on the specified device (integer or
+        'cpu'). Copy to Numpy ndarray if CPU, otherwise return Cupy type.
+ """ + if device == 'cpu': + return self.numpy() + + if not isinstance(device, int): + raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device}'.") + + with cupy.cuda.Device(device): + tensor_device = cupy.asarray(self.tensor) + + return tensor_device + + def copy_(self, src): + """ + Inplace copy of src (copy the data from src into self). + """ + + cupy.copyto(self.tensor, src) + + def istensor(self): + """ + Check if the object is ndarray-like. + """ + return isinstance(self.tensor, cupy.ndarray) + diff --git a/python/cuquantum/cutensornet/_internal/einsum_parser.py b/python/cuquantum/cutensornet/_internal/einsum_parser.py new file mode 100644 index 0000000..2780510 --- /dev/null +++ b/python/cuquantum/cutensornet/_internal/einsum_parser.py @@ -0,0 +1,234 @@ +""" +A collection of functions for parsing Einsum expressions. +""" + +import numpy as np + +from .tensor_wrapper import wrap_operands + + +def parse_einsum_str(expr): + """ + Parse einsum expression. Note that no validity checks are performed. + + Return operand as well as output indices if explicit mode or None for implicit mode. + """ + inputs, output = expr.split('->') if "->" in expr else (expr, None) + + ellipses = '...' in inputs + if ellipses: + raise ValueError("Ellipsis broadcasting is not supported.") + + inputs = tuple(tuple(_input) for _input in inputs.split(",")) + + return inputs, output + + +def parse_einsum_interleaved(operand_sublists): + """ + Parse einsum expression in interleaved format. Note that no validity checks are performed. + + Return operands as well as output indices if explicit mode or None for implicit mode. + """ + inputs = list() + operands = list() + + N = len(operand_sublists) // 2 + for i in range(N): + operands.append(operand_sublists[2*i]) + inputs.append(operand_sublists[2*i + 1]) + + N = len(operand_sublists) + output = operand_sublists[N-1] if N % 2 == 1 else None + + ellipses = [Ellipsis in _input for _input in inputs] + if any(ellipses): + raise ValueError("Ellipsis broadcasting is not supported.") + + return operands, inputs, output + + +def map_modes(user_inputs, user_output): + """ + Map modes in user-defined inputs and output to ordinals. Create the forward as well as inverse maps. + + Return mapped inputs and output along with the forward and reverse maps. + """ + + ordinal = 0 + mode_map_user_to_ord = dict() + for modes in user_inputs: + for mode in modes: + if mode not in mode_map_user_to_ord: + mode_map_user_to_ord[mode] = ordinal + ordinal += 1 + + mode_map_ord_to_user = { v : k for k, v in mode_map_user_to_ord.items() } + + inputs = tuple(tuple(mode_map_user_to_ord[m] for m in modes) for modes in user_inputs) + + output = None + if user_output is not None: + extra = set(user_output) - set(mode_map_user_to_ord.keys()) + if extra: + output_modes = "'{}'".format(user_output) if isinstance(user_output, str) else user_output + message = f"""Extra modes in output. +The specified output modes {output_modes} contain the extra modes: {extra}""" + raise ValueError(message) + output = tuple(mode_map_user_to_ord[m] for m in user_output) + + return inputs, output, mode_map_user_to_ord, mode_map_ord_to_user + + +def check_einsum_with_operands(user_inputs, operands, interleaved): + """ + Check that the number of modes in each Einsum term is consistent with the shape of the corresponding operand. 
+ operands == wrapped + user_inputs = *before* mapping + """ + + checks = [len(i) == len(o.shape) for i, o in zip(user_inputs, operands)] + if not all(checks): + morpher = (lambda s : tuple(s)) if interleaved else lambda s : "'" + ''.join(s) + "'" + mismatch = [f"{location}: {morpher(user_inputs[location])} <=> {operands[location].shape}" + for location, predicate in enumerate(checks) if predicate is False] + mismatch = np.array2string(np.array(mismatch, dtype='object'), separator=', ', formatter={'object': lambda s: s}) + message = f"""Term-operand shape mismatch. +The number of modes in each term of the expression must match the shape of the corresponding operand. +The mismatch in the number of modes as a sequence of "operand position: modes in term <=> operand shape" is: \n{mismatch}""" + raise ValueError(message) + + +def create_size_dict(inputs, operands): + """ + Create size dictionary capturing the extent of each mode. + inputs = based on renumbered modes. + """ + + size_dict = dict() + for i, _input in enumerate(inputs): + for m, mode in enumerate(_input): + shape = operands[i].shape + if mode in size_dict: + if size_dict[mode] == 1: # Handle broadcasting + size_dict[mode] = shape[m] + elif size_dict[mode] != shape[m]: + message = f"""Extent mismatch. +The extent ({shape[m]}) of mode {m} for operand {i} does not match the extent ({size_dict[mode]}) of the same mode found +in previous operand(s).""" + raise ValueError(message) + else: + size_dict[mode] = shape[m] + + return size_dict + + +def calculate_mode_frequency(inputs): + """ + Calculate the number of times a mode appears in the operand list. + """ + from collections import defaultdict + mode_frequency = defaultdict(int) + + for index, modes in enumerate(inputs): + for mode in modes: + mode_frequency[mode] += 1 + + return mode_frequency + + +def check_classical_einsum(mode_frequency, output, mode_map_user_to_ord, mode_map_ord_to_user): + """ + Check if classical Einsum. Also infer output indices (all the modes that appear exactly once). + """ + + single_modes = set() + double_modes = set() + rest = set() + for mode, frequency in mode_frequency.items(): + if frequency == 1: + single_modes.add(mode) + elif frequency == 2: + double_modes.add(mode) + else: + rest.add(mode) + + if rest: + rest = tuple(mode_map_ord_to_user[r] for r in rest) + message = f"""No generalized Einsum support. +These modes appear more than twice: {rest}""" + raise ValueError(message) + + if output is None: + # Implicit mode: lexical sort based on user mode labels. + output = sorted(mode_map_ord_to_user[m] for m in single_modes) + output = tuple(mode_map_user_to_ord[m] for m in output) + return output + + output_set = set(output) + + missing = set(mode_map_ord_to_user[m] for m in single_modes - output_set) + if missing: + message = f"""No generalized Einsum support. +These single modes must appear in the output: {missing}""" + raise ValueError(message) + + common = set(mode_map_ord_to_user[c] for c in output_set & double_modes) + if common: + message = f"""No generalized Einsum support. +These double modes must not appear in the output: {common}""" + raise ValueError(message) + + return output + + +def parse_einsum(*operands): + """ + Classical Einsum definition: modes that appear twice are summed over and those that appear once must appear in the output. + Recognizes both string and interleaved formats. Any hashable type is accepted in interleaved format for mode specification, + and unicode strings are accepted. 
If the output is not provided (implicit form or missing output sublist), it will be
+    inferred from the expression.
+
+    Returns wrapped operands, mapped inputs and output, size dictionary based on internal mode numbers, and the forward as
+    well as the reverse mode maps.
+    """
+
+    interleaved = False
+    if isinstance(operands[0], str):
+        inputs, output = parse_einsum_str(operands[0])
+        operands = operands[1:]
+    else:
+        interleaved = True
+        operands, inputs, output = parse_einsum_interleaved(operands)
+
+    num_operand, num_input = len(operands), len(inputs)
+    if num_operand != num_input:
+        message = f"""Operand-term mismatch.
+The number of operands ({num_operand}) must match the number of inputs ({num_input}) specified in the Einsum expression."""
+        raise ValueError(message)
+
+    if num_operand < 2:
+        message = "The network must consist of at least two tensors."
+        raise ValueError(message)
+
+    # First wrap operands.
+    operands = wrap_operands(operands)
+
+    # Basic check to ensure that the number of modes is consistent with the operand shape.
+    check_einsum_with_operands(inputs, operands, interleaved)
+
+    # Map data to ordinals for cutensornet.
+    inputs, output, mode_map_user_to_ord, mode_map_ord_to_user = map_modes(inputs, output)
+
+    # Create mode-extent map based on internal mode numbers.
+    size_dict = create_size_dict(inputs, operands)
+
+    # Calculate the mode frequencies, which are needed to infer the output modes if they are not specified.
+    mode_frequency = calculate_mode_frequency(inputs)
+
+    # Finally, check if the expression is a classical Einsum. Calculate output indices in implicit mode (output=None).
+    output = check_classical_einsum(mode_frequency, output, mode_map_user_to_ord, mode_map_ord_to_user)
+
+    return operands, inputs, output, size_dict, mode_map_user_to_ord, mode_map_ord_to_user
+
+
diff --git a/python/cuquantum/cutensornet/_internal/enum_utils.py b/python/cuquantum/cutensornet/_internal/enum_utils.py
new file mode 100644
index 0000000..99bf6a0
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/enum_utils.py
@@ -0,0 +1,99 @@
+"""
+Factories for creating options dataclasses, as well as utilities to add docstrings to them based on enum classes.
+"""
+import dataclasses
+from enum import IntEnum
+import re
+from typing import Any, Callable, ClassVar, Dict, Optional
+
+import numpy
+
+def create_options_class_from_enum(options_class_name: str, enum_class: IntEnum, get_attr_dtype: Callable, description: str, filter_re: str = r'(?P<option_name>.*)'):
+    """
+    Create an options dataclass from a Python enum class. Names can be filtered if desired.
+
+    Args:
+        options_class_name: Name of the dataclass that will be created.
+        enum_class: The IntEnum class that contains the options for the dataclass.
+        get_attr_dtype: A callable that takes an enum value as its argument and returns the dtype of the
+            corresponding cuTensorNet attribute.
+        description: A description of the options, used in the docstring of the created class.
+        filter_re: A regular expression that defines the named group 'option_name'.
+    """
+    if r'(?P<option_name>' not in filter_re:
+        message = """Incorrect re.
+The re for the filter must contain the named group 'option_name'."""
+        raise ValueError(message)
+
+    # Helper vars for creating attribute docstring.
+    doc = f"""A data class for capturing the {description} options.
+
+    Attributes:
+"""
+    indent = ' '*8
+    prefix = determine_enum_prefix(enum_class, '_ATTRIBUTE')
+
+    filter_re = re.compile(filter_re)
+    option_to_enum = dict()
+    option_to_dtype = dict()
+    for e in enum_class:
+        m = filter_re.match(e.name)
+        if not m:
+            continue
+        option_name = m.group('option_name').lower()
+        option_to_enum[option_name] = e
+        option_to_dtype[option_name] = get_attr_dtype(e)
+
+        # Add docstring for this attribute.
+        doc += indent + option_name + ':' + f" See `{prefix + '_' + m.group(0)}`.\n"
+
+    fields = list()
+    for option_name, dtype in option_to_dtype.items():
+        if numpy.issubdtype(dtype, numpy.integer):
+            field = option_name, Optional[int], dataclasses.field(default=None)
+        else:
+            field = option_name, Optional[Any], dataclasses.field(default=None)
+        fields.append(field)
+
+    # Add class attributes.
+
+    field = 'option_to_enum', ClassVar[Dict], dataclasses.field(default=option_to_enum)
+    fields.append(field)
+
+    field = 'option_to_dtype', ClassVar[Dict], dataclasses.field(default=option_to_dtype)
+    fields.append(field)
+
+    # Create the options class.
+    options_class = dataclasses.make_dataclass(options_class_name, fields)
+    options_class.__doc__ = doc
+
+    return options_class
+
+
+def camel_to_snake(name, upper=True):
+    """
+    Convert string from camel case to snake style.
+    """
+    name = re.sub("^([A-Z])|([A-Z])", lambda m: m.group(1).lower() if m.group(1) else "_" + m.group(2).lower(), name)
+    if upper:
+        name = name.upper()
+    return name
+
+
+def determine_enum_prefix(enum_class, chomp):
+    """
+    Determine the prefix of the C enum names that correspond to the given Python enum class.
+    """
+    prefix = enum_class.__module__.split('.')[-1].upper()
+    prefix += '_' + camel_to_snake(enum_class.__name__)
+    prefix = re.sub(chomp, '', prefix)
+    return prefix
diff --git a/python/cuquantum/cutensornet/_internal/mem_limit.py b/python/cuquantum/cutensornet/_internal/mem_limit.py
new file mode 100644
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/mem_limit.py
+"""
+Regular expressions and documentation for parsing user-provided memory limits.
+"""
+
+__all__ = ['MEM_LIMIT_RE_PCT', 'MEM_LIMIT_RE_VAL', 'MEM_LIMIT_DOC']
+
+import re
+
+MEM_LIMIT_RE_PCT = re.compile(r"(?P<value>[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)\s*%\s*$")
+MEM_LIMIT_RE_VAL = re.compile(r"(?P<value>[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)\s*(?P<units>[kmg])?(?P<binary>(?<=[kmg])i)?b\s*$", re.IGNORECASE)
+MEM_LIMIT_DOC = """The memory limit must be specified in one of the following forms:
+  (1) A number (int or float). If the number is between 0 and 1, the memory limit is interpreted as a fraction of the
+      total device memory.
+      Examples: 0.75, 50E6, 50000000, ...
+  (2) A string containing a value followed by B, kB, MB, or GB for powers of 1000.
+      Examples: "0.05 GB", "50 MB", "50000000 B" ...
+  (3) A string containing a value followed by kiB, MiB, or GiB for powers of 1024.
+      Examples: "0.05 GiB", "51.2 MiB", "53687091 B" ...
+  (4) A string with value in the range (0, 100] followed by a %% symbol.
+      Examples: "26%%", "82%%", ...
+
+  Whitespace between values and units is optional.
+
+The provided memory limit is "%s".
+"""
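+
+# Illustrative matches for the patterns above (not part of the module API):
+#
+#     MEM_LIMIT_RE_PCT.match("80 %").group('value')     # -> '80'
+#     MEM_LIMIT_RE_VAL.match("1.5 GiB").group('units')  # -> 'G'
+#     MEM_LIMIT_RE_VAL.match("50 MB").group('binary')   # -> None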
diff --git a/python/cuquantum/cutensornet/_internal/numpy_ifc.py b/python/cuquantum/cutensornet/_internal/numpy_ifc.py
new file mode 100644
index 0000000..3286c24
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/numpy_ifc.py
@@ -0,0 +1,82 @@
+"""
+Interface to seamlessly use Numpy ndarray objects.
+"""
+
+__all__ = ['NumpyTensor']
+
+import cupy
+import numpy
+
+from .tensor_ifc import Tensor
+
+class NumpyTensor(Tensor):
+    """
+    Tensor wrapper for numpy ndarrays.
+    """
+    name = 'numpy'
+    module = numpy
+    name_to_dtype = Tensor.create_name_dtype_map(conversion_function=lambda name: numpy.dtype(name), exception_type=TypeError)
+
+    def __init__(self, tensor):
+        super().__init__(tensor)
+
+    @property
+    def data_ptr(self):
+        return self.tensor.ctypes.data
+
+    @property
+    def device(self):
+        return 'cpu'
+
+    @property
+    def device_id(self):
+        return None
+
+    @property
+    def dtype(self):
+        """Name of the data type"""
+        return self.tensor.dtype.name
+
+    @property
+    def shape(self):
+        return tuple(self.tensor.shape)
+
+    @property
+    def strides(self):
+        return tuple(stride_in_bytes // self.tensor.itemsize for stride_in_bytes in self.tensor.strides)
+
+    def numpy(self):
+        return self.tensor
+
+    @classmethod
+    def empty(cls, shape, **context):
+        """
+        Create an empty tensor of the specified shape and data type.
+        """
+        name = context.get('dtype', 'float32')
+        dtype = NumpyTensor.name_to_dtype[name]
+        return numpy.empty(shape, dtype=dtype)
+
+    def to(self, device='cpu'):
+        """
+        Create a copy of the tensor on the specified device (integer or
+        'cpu'). Copy to a Cupy ndarray on the specified device if it
+        is not CPU. Otherwise, return self.
+        """
+        if device == 'cpu':
+            return self
+
+        if not isinstance(device, int):
+            raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device}'.")
+
+        with cupy.cuda.Device(device):
+            tensor_device = cupy.asarray(self.tensor)
+
+        return tensor_device
+
+    def istensor(self):
+        """
+        Check if the object is ndarray-like.
+        """
+        return isinstance(self.tensor, numpy.ndarray)
+
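+
+# Minimal illustration of the wrapper (assumes NumPy is importable):
+#
+#     t = NumpyTensor(numpy.zeros((2, 3), dtype=numpy.complex64))
+#     t.dtype     # -> 'complex64'
+#     t.shape     # -> (2, 3)
+#     t.strides   # -> (3, 1), i.e., strides in element counts, not bytes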
diff --git a/python/cuquantum/cutensornet/_internal/optimizer_ifc.py b/python/cuquantum/cutensornet/_internal/optimizer_ifc.py
new file mode 100644
index 0000000..a344090
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/optimizer_ifc.py
@@ -0,0 +1,221 @@
+"""
+Interface class to encapsulate low-level calls to get or set optimizer information.
+"""
+
+__all__ = ['OptimizerInfoInterface']
+
+from collections.abc import Sequence
+import operator
+
+import numpy as np
+
+from cuquantum import cutensornet as cutn
+
+
+def _parse_and_map_sliced_modes(sliced_modes, mode_map_user_to_ord, size_dict, dtype_mode=np.int32, dtype_extent=np.int64):
+    """
+    Parse user-provided sliced modes and create individual, contiguous sliced-modes and sliced-extents arrays.
+    """
+
+    num_sliced_modes = len(sliced_modes)
+    if num_sliced_modes == 0:
+        return num_sliced_modes, np.zeros((num_sliced_modes,), dtype=dtype_mode), np.zeros((num_sliced_modes,), dtype=dtype_extent)
+
+    # The sliced modes have already passed basic checks when creating the OptimizerOptions dataclass.
+
+    pairs = not isinstance(sliced_modes[0], str) and isinstance(sliced_modes[0], Sequence)
+    if pairs:
+        sliced_modes, sliced_extents = zip(*sliced_modes)
+    else:
+        sliced_extents = np.ones((num_sliced_modes,), dtype=dtype_extent)
+
+    sliced_modes = np.asarray([mode_map_user_to_ord[m] for m in sliced_modes], dtype=dtype_mode)
+    remainder = tuple(size_dict[m] % e for m, e in zip(sliced_modes, sliced_extents))
+    if any(remainder):
+        raise ValueError("The sliced extents must evenly divide the original extents of the corresponding mode.")
+
+    return num_sliced_modes, sliced_modes, np.asanyarray(sliced_extents, dtype=dtype_extent)
+
+
+InfoEnum = cutn.ContractionOptimizerInfoAttribute
+
+class OptimizerInfoInterface(object):
+    """
+    Expose the cutensornet optimizer-info attributes of a network as Python properties.
+    """
+    def __init__(self, network):
+        """
+        network = the network object whose optimizer info is queried or set
+        """
+        self.network = network
+
+        get_dtype = cutn.contraction_optimizer_info_get_attribute_dtype
+
+        self._flop_count = np.zeros((1,), dtype=get_dtype(InfoEnum.FLOP_COUNT))
+        self._largest_tensor = np.zeros((1,), dtype=get_dtype(InfoEnum.LARGEST_TENSOR))
+        self._num_slices = np.zeros((1,), dtype=get_dtype(InfoEnum.NUM_SLICES))
+        self._num_sliced_modes = np.zeros((1,), dtype=get_dtype(InfoEnum.NUM_SLICED_MODES))
+        self._slicing_overhead = np.zeros((1,), dtype=get_dtype(InfoEnum.SLICING_OVERHEAD))
+
+        self.num_contraction = len(self.network.operands) - 1
+        self._path = np.zeros((2*self.num_contraction, ), dtype=np.int32)
+
+    @staticmethod
+    def _get_scalar_attribute(network, name, attribute):
+        """
+        name      = cutensornet enum for the attribute
+        attribute = numpy ndarray object into which the value is stored by cutensornet
+        """
+        assert network.optimizer_info_ptr is not None, "Internal error"
+        cutn.contraction_optimizer_info_get_attribute(network.handle, network.optimizer_info_ptr, name, attribute.ctypes.data, attribute.dtype.itemsize)
+
+    @staticmethod
+    def _set_scalar_attribute(network, name, attribute, value):
+        """
+        name      = cutensornet enum for the attribute
+        attribute = numpy ndarray object in which the value is stored
+        value     = the value to set the attribute to
+        """
+        assert network.optimizer_info_ptr is not None, "Internal error"
+        attribute[0] = value
+        cutn.contraction_optimizer_info_set_attribute(network.handle, network.optimizer_info_ptr, name, attribute.ctypes.data, attribute.dtype.itemsize)
+
+    @property
+    def num_slices(self):
+        """
+        The number of slices in the network.
+        """
+        OptimizerInfoInterface._get_scalar_attribute(self.network, InfoEnum.NUM_SLICES, self._num_slices)
+
+        return int(self._num_slices)
+
+    @num_slices.setter
+    def num_slices(self, number):
+        """
+        Set the number of slices in the network.
+        """
+        OptimizerInfoInterface._set_scalar_attribute(self.network, InfoEnum.NUM_SLICES, self._num_slices, number)
+
+    @property
+    def flop_count(self):
+        """
+        The cost of contracting the network.
+        """
+        OptimizerInfoInterface._get_scalar_attribute(self.network, InfoEnum.FLOP_COUNT, self._flop_count)
+
+        return float(self._flop_count)
+
+    @property
+    def largest_intermediate(self):
+        """
+        The size of the largest intermediate.
+        """
+        OptimizerInfoInterface._get_scalar_attribute(self.network, InfoEnum.LARGEST_TENSOR, self._largest_tensor)
+
+        return float(self._largest_tensor)
+
+
+    @property
+    def slicing_overhead(self):
+        """
+        The slicing overhead.
+        """
+        OptimizerInfoInterface._get_scalar_attribute(self.network, InfoEnum.SLICING_OVERHEAD, self._slicing_overhead)
+
+        return float(self._slicing_overhead)
+
+    @property
+    def path(self):
+        """
+        Return the contraction path in linear format.
+ """ + + network = self.network + + path_wrapper = cutn.ContractionPath(self.num_contraction, self._path.ctypes.data) + size = path_wrapper.get_size() + cutn.contraction_optimizer_info_get_attribute(network.handle, network.optimizer_info_ptr, InfoEnum.PATH, path_wrapper.get_path(), size) + + path = tuple(zip(*[iter(self._path)]*2)) + + return path + + @path.setter + def path(self, path): + """ + Set the path. + """ + from functools import reduce + + get_dtype = cutn.contraction_optimizer_info_get_attribute_dtype + + network = self.network + + num_contraction = len(path) + if num_contraction != len(network.operands) - 1: + raise ValueError(f"The length of the contraction path ({num_contraction}) must be one less than the number of operands ({len(network.operands)}).") + + path = reduce(operator.concat, path) + self._path = np.array(path, dtype=np.int32) + path_wrapper = cutn.ContractionPath(num_contraction, self._path.ctypes.data) + size = path_wrapper.get_size() + cutn.contraction_optimizer_info_set_attribute(network.handle, network.optimizer_info_ptr, InfoEnum.PATH, path_wrapper.get_path(), size) + + @property + def num_sliced_modes(self): + """ + The number of sliced modes in the network. + """ + OptimizerInfoInterface._get_scalar_attribute(self.network, InfoEnum.NUM_SLICED_MODES, self._num_sliced_modes) + + return int(self._num_sliced_modes) + + @num_sliced_modes.setter + def num_sliced_modes(self, number): + """ + Set the number of sliced_modes in the network. + """ + OptimizerInfoInterface._set_scalar_attribute(self.network, InfoEnum.NUM_SLICED_MODES, self._num_sliced_modes, number) + + @property + def sliced_mode_extent(self): + """ + Return the sliced modes as a sequence of (sliced mode, sliced extent) pairs. + """ + + get_dtype = cutn.contraction_optimizer_info_get_attribute_dtype + + network = self.network + + num_sliced_modes = self.num_sliced_modes + + sliced_modes = np.zeros((num_sliced_modes,), dtype=get_dtype(InfoEnum.SLICED_MODE)) + size = num_sliced_modes * sliced_modes.dtype.itemsize + cutn.contraction_optimizer_info_get_attribute(network.handle, network.optimizer_info_ptr, InfoEnum.SLICED_MODE, sliced_modes.ctypes.data, size) + sliced_modes = tuple(network.mode_map_ord_to_user[m] for m in sliced_modes) # Convert to user mode labels + + sliced_extents = np.zeros((num_sliced_modes,), dtype=get_dtype(InfoEnum.SLICED_EXTENT)) + size = num_sliced_modes * sliced_extents.dtype.itemsize + cutn.contraction_optimizer_info_get_attribute(network.handle, network.optimizer_info_ptr, InfoEnum.SLICED_EXTENT, sliced_extents.ctypes.data, size) + + return tuple(zip(sliced_modes, sliced_extents)) + + @sliced_mode_extent.setter + def sliced_mode_extent(self, sliced_modes): + """ + Set the sliced modes (and possibly sliced extent). 
+
+        sliced_modes = sequence of sliced modes, or sequence of (sliced mode, sliced extent) pairs
+        """
+
+        network = self.network
+
+        num_sliced_modes, sliced_modes, sliced_extents = _parse_and_map_sliced_modes(sliced_modes, network.mode_map_user_to_ord, network.size_dict)
+
+        # Set the number of sliced modes first
+        self.num_sliced_modes = num_sliced_modes
+
+        size = num_sliced_modes * sliced_modes.dtype.itemsize
+        cutn.contraction_optimizer_info_set_attribute(network.handle, network.optimizer_info_ptr, InfoEnum.SLICED_MODE, sliced_modes.ctypes.data, size)
+
+        size = num_sliced_modes * sliced_extents.dtype.itemsize
+        cutn.contraction_optimizer_info_set_attribute(network.handle, network.optimizer_info_ptr, InfoEnum.SLICED_EXTENT, sliced_extents.ctypes.data, size)
+
diff --git a/python/cuquantum/cutensornet/_internal/tensor_ifc.py b/python/cuquantum/cutensornet/_internal/tensor_ifc.py
new file mode 100644
index 0000000..e268437
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/tensor_ifc.py
@@ -0,0 +1,78 @@
+"""
+Interface to seamlessly use tensors (or ndarray-like objects) from different libraries.
+"""
+
+from abc import ABC, abstractmethod
+
+from . import typemaps
+
+class Tensor(ABC):
+    """
+    A simple wrapper type for tensors to make the API package-agnostic.
+    """
+
+    def __init__(self, tensor):
+        self.tensor = tensor
+
+    @property
+    @abstractmethod
+    def data_ptr(self):
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def device(self):
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def device_id(self):
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def dtype(self):
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def empty(cls, shape, **context):
+        raise NotImplementedError
+
+    @abstractmethod
+    def numpy(self):
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def shape(self):
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def strides(self):
+        raise NotImplementedError
+
+    @abstractmethod
+    def to(self, device='cpu'):
+        raise NotImplementedError
+
+    @staticmethod
+    def create_name_dtype_map(conversion_function, exception_type):
+        """
+        Create a map between CUDA data type names and the corresponding package dtypes for supported data types.
+        """
+        names = typemaps.NAME_TO_DATA_TYPE.keys()
+        name_to_dtype = dict()
+        for name in names:
+            try:
+                name_to_dtype[name] = conversion_function(name)
+            except exception_type:
+                pass
+        return name_to_dtype
+
diff --git a/python/cuquantum/cutensornet/_internal/tensor_wrapper.py b/python/cuquantum/cutensornet/_internal/tensor_wrapper.py
new file mode 100644
index 0000000..257ef70
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/tensor_wrapper.py
@@ -0,0 +1,108 @@
+"""
+Entry point to using tensors from different libraries seamlessly.
+"""
+
+__all__ = ['infer_tensor_package', 'wrap_operand', 'wrap_operands', 'to', 'copy_']
+
+import numpy as np
+
+from .cupy_ifc import CupyTensor
+from .numpy_ifc import NumpyTensor
+
+
+_TENSOR_TYPES = {
+    'cupy': CupyTensor,
+    'numpy': NumpyTensor
+}
+
+# Optional modules
+try:
+    import torch
+    from .torch_ifc import TorchTensor
+    _TENSOR_TYPES['torch'] = TorchTensor
+except ImportError as e:
+    pass
+
+_SUPPORTED_PACKAGES = tuple(_TENSOR_TYPES.keys())
+
+def infer_tensor_package(tensor):
+    """
+    Infer the package that defines this tensor.
+    """
+    if issubclass(tensor.__class__, np.ndarray):
+        return 'numpy'
+    module = tensor.__class__.__module__
+    return module.split('.')[0]
+
+
+def wrap_operand(native_operand):
+    """
+    Wrap one "native" operand so that the package-agnostic API can be used.
+    """
+    wrapped_operand = _TENSOR_TYPES[infer_tensor_package(native_operand)](native_operand)
+    return wrapped_operand
+
+
+def check_valid_package(native_operands):
+    """
+    Check if the operands belong to one of the supported packages.
+    """
+    operands_pkg = [infer_tensor_package(o) for o in native_operands]
+    checks = [p in _SUPPORTED_PACKAGES for p in operands_pkg]
+    if not all(checks):
+        unknown = [f"{location}: {operands_pkg[location]}" for location, predicate in enumerate(checks) if predicate is False]
+        unknown = np.array2string(np.array(unknown, dtype='object'), separator=', ', formatter={'object': lambda s: s})
+        message = f"""The operands should be ndarray-like objects from one of {_SUPPORTED_PACKAGES} packages.
+The unsupported operands as a sequence of "zero-based operand ordinal: package" is: \n{unknown}"""
+        raise ValueError(message)
+
+    return operands_pkg
+
+def check_valid_operand_type(wrapped_operands):
+    """
+    Check if the wrapped operands are ndarray-like.
+    """
+    istensor = [o.istensor() for o in wrapped_operands]
+    if not all(istensor):
+        unknown = [f"{location}: {type(wrapped_operands[location].tensor)}"
+                       for location, predicate in enumerate(istensor) if predicate is False]
+        unknown = np.array2string(np.array(unknown, dtype='object'), separator=', ', formatter={'object': lambda s: s})
+        message = f"""The operands should be ndarray-like objects from one of {_SUPPORTED_PACKAGES} packages.
+The unsupported operands as a sequence of "zero-based operand ordinal: type" is: \n{unknown}"""
+        raise ValueError(message)
+
+
+def wrap_operands(native_operands):
+    """
+    Wrap the "native" operands so that the package-agnostic API can be used.
+    """
+
+    operands_pkg = check_valid_package(native_operands)
+
+    wrapped_operands = tuple(_TENSOR_TYPES[operands_pkg[i]](o) for i, o in enumerate(native_operands))
+
+    check_valid_operand_type(wrapped_operands)
+
+    return wrapped_operands
+
+
+def to(operands, device):
+    """
+    Copy the wrapped operands to the specified device ('cpu' or int) and return the
+    wrapped operands on the device.
+    """
+    operands = tuple(o.to(device) for o in operands)
+
+    return wrap_operands(operands)
+
+
+def copy_(src, dest):
+    """
+    Copy the data in the wrapped operands in src to the corresponding wrapped operands in dest.
+    """
+    for s, d in zip(src, dest):
+        if s.device_id is None:
+            s = wrap_operand(s.to(d.device_id))
+        d.copy_(s.tensor)
+
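+
+# Illustrative round trip (hypothetical usage; requires a CUDA device):
+#
+#     wrapped = wrap_operands([np.ones((2, 2)), np.zeros((2, 2))])
+#     on_gpu = to(wrapped, 0)     # wrapped CupyTensor operands on device 0
+#     copy_(wrapped, on_gpu)      # refresh the device copies from the host data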
diff --git a/python/cuquantum/cutensornet/_internal/torch_ifc.py b/python/cuquantum/cutensornet/_internal/torch_ifc.py
new file mode 100644
index 0000000..2a28ef7
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/torch_ifc.py
@@ -0,0 +1,88 @@
+"""
+Interface to seamlessly use Torch tensor objects.
+"""
+
+__all__ = ['TorchTensor']
+
+import torch
+
+from . import typemaps
+from .tensor_ifc import Tensor
+
+
+class TorchTensor(Tensor):
+    """
+    Tensor wrapper for Torch Tensors.
+    """
+    name = 'torch'
+    module = torch
+    name_to_dtype = Tensor.create_name_dtype_map(conversion_function=lambda name: getattr(torch, name), exception_type=AttributeError)
+
+    def __init__(self, tensor):
+        super().__init__(tensor)
+
+    @property
+    def data_ptr(self):
+        return self.tensor.data_ptr()
+
+    @property
+    def device(self):
+        return str(self.tensor.device).split(':')[0]
+
+    @property
+    def device_id(self):
+        return self.tensor.device.index
+
+    @property
+    def dtype(self):
+        """Name of the data type"""
+        return str(self.tensor.dtype).split('.')[-1]
+
+    @property
+    def shape(self):
+        return tuple(self.tensor.shape)
+
+    @property
+    def strides(self):
+        return self.tensor.stride()
+
+    def numpy(self):
+        return self.tensor.cpu().numpy()
+
+    @classmethod
+    def empty(cls, shape, **context):
+        """
+        Create an empty tensor of the specified shape and data type on the specified device (None, 'cpu', or device id).
+        """
+        name = context.get('dtype', 'float32')
+        dtype = TorchTensor.name_to_dtype[name]
+        device = context.get('device', None)
+        tensor = torch.empty(shape, dtype=dtype, device=device)
+
+        return tensor
+
+    def to(self, device='cpu'):
+        """
+        Create a copy of the tensor on the specified device (integer or
+        'cpu') and return it as a Torch tensor on that device.
+        """
+        if not(device == 'cpu' or isinstance(device, int)):
+            raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device}'.")
+
+        tensor_device = self.tensor.to(device=device)
+
+        return tensor_device
+
+    def copy_(self, src):
+        """
+        Inplace copy of src (copy the data from src into self).
+        """
+
+        self.tensor.copy_(src)
+
+    def istensor(self):
+        """
+        Check if the object is ndarray-like.
+        """
+        return isinstance(self.tensor, torch.Tensor)
+
diff --git a/python/cuquantum/cutensornet/_internal/typemaps.py b/python/cuquantum/cutensornet/_internal/typemaps.py
new file mode 100644
index 0000000..9c2443d
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/typemaps.py
@@ -0,0 +1,74 @@
+"""
+Functions to link type names with CUDA data and compute types.
+"""
+
+__all__ = ['NAME_TO_DATA_TYPE', 'NAME_TO_COMPUTE_TYPE']
+
+import re
+
+# hack to break circular import
+from cuquantum.utils import ComputeType, cudaDataType
+
+
+def create_cuda_data_type_map(cuda_data_type_enum_class):
+    """
+    Map the data type name to the corresponding CUDA data type.
+    """
+    cuda_data_type_pattern = re.compile("CUDA_(?P<cr>C|R)_(?P<width>\d+)(?P<type>F|I|U|BF)")
+
+    type_code_map = { 'i' : 'int', 'u' : 'uint', 'f' : 'float', 'bf' : 'bfloat' }
+
+    cuda_data_type_map = dict()
+    for d in cuda_data_type_enum_class:
+        m = cuda_data_type_pattern.match(d.name)
+
+        is_complex = m.group('cr').lower() == 'c'
+        type_code = type_code_map[m.group('type').lower()]
+
+        if is_complex and type_code != 'float':
+            continue
+
+        width = int(m.group('width'))
+        if is_complex:
+            width *= 2
+            type_code = 'complex'
+
+        name = type_code + str(width)
+        cuda_data_type_map[name] = d
+
+    return cuda_data_type_map
+
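+
+# For instance (illustrative; the exact entries depend on the cudaDataType
+# enum contents):
+#
+#     create_cuda_data_type_map(cudaDataType)['float32']     # -> CUDA_R_32F
+#     create_cuda_data_type_map(cudaDataType)['complex128']  # -> CUDA_C_64F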
+
+def create_cuda_compute_type_map(cuda_compute_type_enum_class):
+    """
+    Map the data type name to the corresponding CUDA compute type.
+    """
+    cuda_compute_type_pattern = re.compile("COMPUTE_(?:(?P<width>\d+)(?P<type>F|I|U|BF)|(?P<tf32>TF32))")
+
+    type_code_map = { 'i' : 'int', 'u' : 'uint', 'f' : 'float', 'bf' : 'bfloat' }
+
+    cuda_compute_type_map = dict()
+    for c in cuda_compute_type_enum_class:
+        if c.name == 'COMPUTE_DEFAULT':
+            continue
+
+        m = cuda_compute_type_pattern.match(c.name)
+
+        if not m:
+            raise ValueError("Internal error - unexpected enum entry")
+
+        if m.group('tf32'):
+            continue
+
+        name = type_code_map[m.group('type').lower()] + m.group('width')
+        cuda_compute_type_map[name] = c
+
+    # Treat complex types as a special case.
+    cuda_compute_type_map['complex64'] = cuda_compute_type_enum_class.COMPUTE_32F
+    cuda_compute_type_map['complex128'] = cuda_compute_type_enum_class.COMPUTE_64F
+
+    return cuda_compute_type_map
+
+
+NAME_TO_DATA_TYPE = create_cuda_data_type_map(cudaDataType)
+NAME_TO_COMPUTE_TYPE = create_cuda_compute_type_map(ComputeType)
diff --git a/python/cuquantum/cutensornet/_internal/utils.py b/python/cuquantum/cutensornet/_internal/utils.py
new file mode 100644
index 0000000..1c741ac
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/utils.py
@@ -0,0 +1,320 @@
+"""
+A collection of (internal use) helper functions.
+"""
+
+import functools
+from typing import Callable, Dict, Optional
+
+import cupy as cp
+import numpy as np
+
+from . import tensor_wrapper
+from . import mem_limit
+
+def infer_object_package(obj):
+    """
+    Infer the package that defines this object.
+    """
+    module = obj.__class__.__module__
+    return module.split('.')[0]
+
+
+def check_or_create_options(cls, options, options_description):
+    """
+    Create the specified options dataclass from a dictionary of options or None.
+    """
+
+    if options is None:
+        options = cls()
+    elif isinstance(options, Dict):
+        options = cls(**options)
+
+    if not isinstance(options, cls):
+        raise TypeError(f"The {options_description} must be provided as an object "
+                        f"of type {cls.__name__} or as a dict with valid {options_description}. "
+                        f"The provided object is '{options}'.")
+
+    return options
+
+
+def get_or_create_stream(device, stream):
+    """
+    Create a stream object from a stream pointer or extract the stream pointer from a stream object.
+    Return the stream object as well as the stream pointer.
+    """
+
+    if stream is None:
+        with device:
+            stream = cp.cuda.get_current_stream()
+        stream_ptr = stream.ptr
+        return stream, stream_ptr
+
+    if isinstance(stream, int):
+        stream_ptr = stream
+        stream = cp.cuda.ExternalStream(stream_ptr)
+
+        return stream, stream_ptr
+
+    module = infer_object_package(stream)
+
+    if module not in ['cupy', 'torch']:
+        raise TypeError("The CUDA stream must be specified as a CuPy or Torch stream object. "
+                        "Alternatively, the stream pointer can be directly provided as an int.")
+
+    if module == 'cupy':
+        stream_ptr = stream.ptr
+
+    if module == 'torch':
+        stream_ptr = stream.cuda_stream
+        stream = cp.cuda.ExternalStream(stream_ptr)
+
+    return stream, stream_ptr
+
+
+def get_memory_limit(memory_limit, device):
+    """
+    Parse the user-provided memory limit and return the memory limit in bytes.
+    """
+    import re
+
+    _, total_memory = device.mem_info
+    if isinstance(memory_limit, (int, float)):
+        if memory_limit <= 0:
+            raise ValueError("The specified memory limit must be greater than 0.")
+        if memory_limit < 1:
+            memory_limit *= total_memory
+        return int(memory_limit)
+
+    m = mem_limit.MEM_LIMIT_RE_PCT.match(memory_limit)
+    if m:
+        factor = float(m.group(1))
+        if factor <= 0 or factor > 100:
+            raise ValueError("The memory limit percentage must be in the range (0, 100].")
+        return int(factor * total_memory / 100.)
+
+    m = mem_limit.MEM_LIMIT_RE_VAL.match(memory_limit)
+    if not m:
+        raise ValueError(mem_limit.MEM_LIMIT_DOC % memory_limit)
+
+    base = 1000
+    if m.group('binary'):
+        base = 1024
+
+    powers = { '' : 0, 'k' : 1, 'm' : 2, 'g' : 3 }
+    unit = m.group('units').lower() if m.group('units') else ''
+    multiplier = base ** powers[unit]
+
+    value = float(m.group('value'))
+    memory_limit = int(value * multiplier)
+
+    return memory_limit
+
+
+def get_operands_data(operands):
+    """
+    Get the raw data pointer of the input operands and their alignment for cutensornet.
+    """
+    op_data = tuple(o.data_ptr for o in operands)
+    alignments = tuple(get_maximal_alignment(p) for p in op_data)
+    return op_data, alignments
+
+
+def create_empty_tensor(cls, extents, dtype, device):
+    """
+    Create a wrapped tensor of the same type as (the wrapped) cls on the specified device having the
+    specified extents and dtype.
+    """
+    tensor = cls.empty(extents, dtype=dtype, device=device)
+    tensor = tensor_wrapper.wrap_operand(tensor)
+    return tensor
+
+
+def create_output_tensor(cls, output, size_dict, device_id, data_type):
+    """
+    Create output tensor and associated data (modes, extents, strides, alignment)
+    """
+    modes = tuple(m for m in output)
+    extents = tuple(size_dict[m] for m in output)
+
+    output = create_empty_tensor(cls, extents, data_type, device_id)
+
+    strides = output.strides
+    alignment = get_maximal_alignment(output.data_ptr)
+
+    return output, modes, extents, strides, alignment
+
+
+def get_network_device_id(operands):
+    """
+    Return the id (ordinal) of the device the tensor network is on, or None if it is on the CPU.
+    """
+    device_id = operands[0].device_id
+    if not all(operand.device_id == device_id for operand in operands):
+        devices = set(operand.device_id for operand in operands)
+        raise ValueError(f"All tensors in the network must be on the same device. Devices = {devices}.")
+
+    return device_id
+
+
+def get_operands_dtype(operands):
+    """
+    Return the data type name of the tensors.
+    """
+    dtype = operands[0].dtype
+    if not all(operand.dtype == dtype for operand in operands):
+        dtypes = set(operand.dtype for operand in operands)
+        raise ValueError(f"All tensors in the network must have the same data type. Data types found = {dtypes}.")
+    return dtype
+
+
+def get_maximal_alignment(address):
+    """
+    Calculate the maximal alignment of the provided memory location.
+    """
+    alignment = 1
+    while address % (alignment * 2) == 0 and alignment < 256:
+        alignment *= 2
+
+    return alignment
+
+
+def check_operands_match(orig_operands, new_operands, attribute, description):
+    """
+    Check if the specified attribute matches between the corresponding new and old operands, and raise an exception if it
+    doesn't.
+ """ + checks = [getattr(o, attribute) == getattr(n, attribute) for o, n in zip(orig_operands, new_operands)] + + if not all(checks): + mismatch = [f"{location}: {getattr(orig_operands[location], attribute)} => {getattr(new_operands[location], attribute)}" + for location, predicate in enumerate(checks) if predicate is False] + mismatch = np.array2string(np.array(mismatch, dtype='object'), separator=', ', formatter={'object': lambda s: s}) + message = f"""The {description} of each new operand must match the {description} of the corresponding original operand. +The mismatch in {description} as a sequence of "operand position: original {description} => new {description}" is: \n{mismatch}""" + raise ValueError(message) + + +def check_alignments_match(orig_alignments, new_alignments): + """ + Check if alignment matches between the corresponding new and old operands, and raise an exception if it doesn't. + """ + checks = [o == n for o, n in zip(orig_alignments, new_alignments)] + + if not all(checks): + mismatch = [f"{location}: {orig_alignments[location]} => {new_alignments[location]}" + for location, predicate in enumerate(checks) if predicate is False] + mismatch = np.array2string(np.array(mismatch, dtype='object'), separator=', ', formatter={'object': lambda s: s}) + message = f"""The data alignment of each new operand must match the data alignment of the corresponding original operand. +The mismatch in data alignment as a sequence of "operand position: original alignment => new alignment" is: \n{mismatch}""" + raise ValueError(message) + + +def convert_memory_with_units(memory): + """ + Convert the provided memory value into a form suitable for printing. + """ + base = 1024 + + if memory < base: + value, unit = memory, 'B' + elif memory < base**2: + value, unit = memory/base, 'KiB' + elif memory < base**3: + value, unit = memory/base**2, 'MiB' + else: + value, unit = memory/base**3, 'GiB' + + return value, unit + + +def check_autotune_params(iterations): + """ + Check if the autotune parameters are of the correct type and within range. + """ + + if not isinstance(iterations, int): + raise ValueError("Integer expected.") + if iterations < 0: + raise ValueError("Integer >= 0 expected.") + + message = f"Autotuning parameters: iterations = {iterations}." + + return message + + +# Decorator definitions + +def atomic(handler: Callable[[Optional[object]], None], method: bool = False) -> Callable: + """ + A decorator that provides "succeed or roll-back" semantics. A typical use for this is to release partial resources if an + exception occurs. + + Args: + handler: A function to call when an exception occurs. The handler takes a single argument, which is the exception + object, and returns a boolean stating whether the same exception should be reraised. We assume that this function + does not raise an exception. + method: Specify if the wrapped function as well as the exception handler are methods bound to the same object + (method = True) or they are free functions (method = False). + + Returns: + Callable: A decorator that creates the wrapping. + """ + def outer(wrapped_function): + """ + A decorator that actually wraps the function for exception handling. + """ + @functools.wraps(wrapped_function) + def inner(*args, **kwargs): + """ + Call the wrapped function and return the result. If an exception occurs, then call the exception handler and + reraise the exception. 
+ """ + try: + result = wrapped_function(*args, **kwargs) + except BaseException as e: + if method: + flag = handler(args[0], e) + else: + flag = handler(e) + + if flag: + raise e + + return result + + return inner + + return outer + + +def precondition(checker: Callable[..., None], what: str = "") -> Callable: + """ + A decorator that adds checks to ensure any preconditions are met. + + Args: + checker: The function to call to check whether the preconditions are met. It has the same signature as the wrapped + function with the addition of the keyword argument `what`. + what: A string that is passed in to `checker` to provide context information. + + Returns: + Callable: A decorator that creates the wrapping. + """ + def outer(wrapped_function): + """ + A decorator that actually wraps the function for checking preconditions. + """ + @functools.wraps(wrapped_function) + def inner(*args, **kwargs): + """ + Check preconditions and if they are met, call the wrapped function. + """ + checker(*args, **kwargs, what=what) + result = wrapped_function(*args, **kwargs) + + return result + + return inner + + return outer + + diff --git a/python/cuquantum/cutensornet/configuration.py b/python/cuquantum/cutensornet/configuration.py new file mode 100644 index 0000000..d3945a0 --- /dev/null +++ b/python/cuquantum/cutensornet/configuration.py @@ -0,0 +1,161 @@ +""" +A collection of types for defining options to cutensornet. +""" + +__all__ = ['NetworkOptions', 'OptimizerInfo', 'OptimizerOptions', 'PathFinderOptions', 'ReconfigOptions', 'SlicerOptions'] + +import collections +from dataclasses import dataclass +from logging import Logger +from typing import Dict, Hashable, Iterable, Mapping, Optional, Tuple, Type, Union + +import cupy as cp + +from cuquantum import cutensornet as cutn +from ._internal import enum_utils +from ._internal.mem_limit import MEM_LIMIT_RE_PCT, MEM_LIMIT_RE_VAL, MEM_LIMIT_DOC + + +@dataclass +class NetworkOptions(object): + """A data class for providing options to the :class:`cuquantum.Network` object. + + Attributes: + compute_type (cuquantum.ComputeType): CUDA compute type. A suitable compute type will be selected if not specified. + device_id: CUDA device ordinal (used if the tensor network resides on the CPU). Device 0 will be used if not specified. + handle: cuTensorNet library handle. A handle will be created if one is not provided. + logger (logging.Logger): Python Logger object. The root logger will be used if a logger object is not provided. + memory_limit: Maximum memory available to cuTensorNet. It can be specified as a value (with optional suffix like + K[iB], M[iB], G[iB]) or as a percentage. The default is 80%. + """ + compute_type : Optional[int] = None + device_id : Optional[int] = None + handle : Optional[int] = None + logger : Optional[Type[Logger]] = None + memory_limit : Optional[Union[int, str]] = r'80%' + + def __post_init__(self): + # Defer creating handle as well as computing the memory limit till we know the device the network is on. + + if self.device_id is None: + self.device_id = 0 + + if not isinstance(self.memory_limit, (int, float)): + m1 = MEM_LIMIT_RE_PCT.match(self.memory_limit) + if m1: + factor = float(m1.group('value')) + if factor <= 0 or factor > 100: + raise ValueError("The memory limit percentage must be in the range (0, 100].") + m2 = MEM_LIMIT_RE_VAL.match(self.memory_limit) + if not (m1 or m2): + raise ValueError(MEM_LIMIT_DOC % self.memory_limit) + + +# Generate the options dataclasses from ContractionOptimizerConfigAttributes. 
+
+_create_options = enum_utils.create_options_class_from_enum
+_opt_conf_enum = cutn.ContractionOptimizerConfigAttribute
+_get_dtype = cutn.contraction_optimizer_config_get_attribute_dtype
+
+PathFinderOptions = _create_options('PathFinderOptions', _opt_conf_enum, _get_dtype, "path finder", 'GRAPH_(?P<option_name>.*)')
+
+SlicerOptions = _create_options('SlicerOptions', _opt_conf_enum, _get_dtype, 'slicer', 'SLICER_(?P<option_name>.*)')
+
+ReconfigOptions = _create_options('ReconfigOptions', _opt_conf_enum, _get_dtype, 'reconfiguration', 'RECONFIG_(?P<option_name>.*)')
+
+del _create_options, _opt_conf_enum, _get_dtype
+
+PathType = Iterable[Tuple[int, int]]
+ModeSequenceType = Iterable[Hashable]
+ModeExtentSequenceType = Iterable[Tuple[Hashable, int]]
+KeywordArgType = Dict
+
+
+@dataclass
+class OptimizerOptions(object):
+    """A data class for providing options to the cuTensorNet optimizer.
+
+    Attributes:
+        samples: Number of samples for hyperoptimization. See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES`.
+        path: Options for the path finder (:class:`~cuquantum.PathFinderOptions` object or dict containing the ``(parameter, value)``
+            items for ``PathFinderOptions``). Alternatively, the path can be provided as a sequence of pairs in the
+            :func:`numpy.einsum_path` format.
+        slicing: Options for the slicer (:class:`~cuquantum.SlicerOptions` object or dict containing the ``(parameter, value)`` items for
+            ``SlicerOptions``). Alternatively, a sequence of sliced modes or a sequence of ``(sliced mode, sliced extent)`` pairs
+            can be directly provided.
+        reconfiguration: Options for the reconfiguration algorithm as a :class:`~cuquantum.ReconfigOptions` object or dict containing the
+            ``(parameter, value)`` items for ``ReconfigOptions``.
+        seed: Optional seed for the random number generator. See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED`.
+    """
+    samples : Optional[int] = None
+    path : Optional[Union[Type[PathFinderOptions], PathType]] = None
+    slicing : Optional[Union[Type[SlicerOptions], ModeSequenceType, ModeExtentSequenceType]] = None
+    reconfiguration : Optional[Type[ReconfigOptions]] = None
+    seed : Optional[int] = None
+
+    def _check_option(self, option, option_class, checker=None):
+        if isinstance(option, option_class):
+            return option
+
+        if option is None:
+            option = option_class()
+        elif isinstance(option, KeywordArgType):
+            option = option_class(**option)
+        elif checker is not None:
+            checker()
+
+        return option
+
+    def _check_specified_path(self):
+        if not isinstance(self.path, collections.abc.Sequence):
+            raise TypeError("The path must be a sequence of pairs in the numpy.einsum_path format.")
+
+        for pair in self.path:
+            if not isinstance(pair, collections.abc.Sequence) or len(pair) != 2:
+                raise TypeError("The path must be a sequence of pairs in the numpy.einsum_path format.")
+
+    def _check_specified_slices(self):
+        if not isinstance(self.slicing, collections.abc.Sequence):
+            raise TypeError("Slicing must be specified as a sequence of modes or as a sequence of (mode, extent) pairs.")
+
+        pair = False
+        for slc in self.slicing:
+            if isinstance(slc, collections.abc.Sequence) and not isinstance(slc, str):
+                pair = True
+                break
+
+        for slc in self.slicing:
+            if pair and (isinstance(slc, str) or not isinstance(slc, collections.abc.Sequence) or len(slc) != 2):
+                raise TypeError("Slicing must be specified as a sequence of modes or as a sequence of (mode, extent) pairs.")
+
+    def _check_int(self, attribute, name):
+        message = f"Invalid value ({attribute}) for '{name}'. Expect a non-negative integer or None."
+        if not isinstance(attribute, (type(None), int)):
+            raise ValueError(message)
+        if isinstance(attribute, int) and attribute < 0:
+            raise ValueError(message)
+
+    def __post_init__(self):
+        self._check_int(self.samples, "samples")
+        self.path = self._check_option(self.path, PathFinderOptions, self._check_specified_path)
+        self.slicing = self._check_option(self.slicing, SlicerOptions, self._check_specified_slices)
+        self.reconfiguration = self._check_option(self.reconfiguration, ReconfigOptions, None)
+        self._check_int(self.seed, "seed")
+
+
+@dataclass
+class OptimizerInfo(object):
+    """A data class for capturing optimizer information.
+
+    Attributes:
+        largest_intermediate: The number of elements in the largest intermediate tensor. See `CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_LARGEST_TENSOR`.
+        opt_cost: The FLOP count of the optimized contraction path per slice. See `CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT`.
+        path: The contraction path as a sequence of pairs in the :func:`numpy.einsum_path` format.
+        slices: A sequence of ``(sliced mode, sliced extent)`` pairs.
+    """
+    largest_intermediate : float
+    opt_cost : float
+    path : PathType
+    slices : ModeExtentSequenceType
+
+
diff --git a/python/cuquantum/cutensornet/cutensornet.pxd b/python/cuquantum/cutensornet/cutensornet.pxd
new file mode 100644
index 0000000..a847c88
--- /dev/null
+++ b/python/cuquantum/cutensornet/cutensornet.pxd
@@ -0,0 +1,74 @@
+# TODO: Ultimately, everything should be auto-generated using
+# the scripts from the CUDA Python team
+
+# The C types are prefixed with an underscore because we are not
+# yet protected by the module namespaces as done in CUDA Python.
+# Once we switch over the names would be prettier (in the Cython
+# layer).
+
+cdef extern from '<cutensornet.h>' nogil:
+    # cuTensorNet types
+    ctypedef void* _Handle 'cutensornetHandle_t'
+    ctypedef int _Status 'cutensornetStatus_t'
+    ctypedef void* _NetworkDescriptor 'cutensornetNetworkDescriptor_t'
+    ctypedef void* _ContractionPlan 'cutensornetContractionPlan_t'
+    ctypedef void* _ContractionOptimizerConfig 'cutensornetContractionOptimizerConfig_t'
+    ctypedef void* _ContractionOptimizerInfo 'cutensornetContractionOptimizerInfo_t'
+    ctypedef void* _ContractionAutotunePreference 'cutensornetContractionAutotunePreference_t'
+    ctypedef enum _ComputeType 'cutensornetComputeType_t':
+        pass
+
+    # cuTensorNet structs
+    ctypedef struct _NodePair 'cutensornetNodePair_t':
+        int first
+        int second
+    ctypedef struct _ContractionPath 'cutensornetContractionPath_t':
+        int numContractions
+        _NodePair *data
+
+    # cuTensorNet enums
+    ctypedef enum _GraphAlgo 'cutensornetGraphAlgo_t':
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM_RB
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM_KWAY
+
+    ctypedef enum _MemoryModel 'cutensornetMemoryModel_t':
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_HEURISTIC
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_CUTENSOR
+
+    ctypedef enum _ContractionOptimizerConfigAttribute 'cutensornetContractionOptimizerConfigAttributes_t':
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_PARTITIONS
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_CUTOFF_SIZE
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_IMBALANCE_FACTOR
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_ITERATIONS
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_CUTS
+        CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_RECONFIG_NUM_ITERATIONS
CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_RECONFIG_NUM_LEAVES + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_DISABLE_SLICING + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_FACTOR + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MIN_SLICES + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_SLICE_FACTOR + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SIMPLIFICATION_DISABLE_DR + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED + + ctypedef enum _ContractionOptimizerInfoAttribute 'cutensornetContractionOptimizerInfoAttributes_t': + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICED_MODES + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_SLICED_MODE + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_SLICED_EXTENT + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PHASE1_FLOP_COUNT + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_LARGEST_TENSOR + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_SLICING_OVERHEAD + + ctypedef enum _ContractionAutotunePreferenceAttribute 'cutensornetContractionAutotunePreferenceAttributes_t': + CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS + + # cuTensorNet consts + int CUTENSORNET_MAJOR + int CUTENSORNET_MINOR + int CUTENSORNET_PATCH + int CUTENSORNET_VERSION diff --git a/python/cuquantum/cutensornet/cutensornet.pyx b/python/cuquantum/cutensornet/cutensornet.pyx new file mode 100644 index 0000000..e94868e --- /dev/null +++ b/python/cuquantum/cutensornet/cutensornet.pyx @@ -0,0 +1,1084 @@ +# distutils: language = c++ + +cimport cpython +cimport cython +from cpython.mem cimport PyMem_Malloc, PyMem_Free +from libc.stdint cimport intptr_t, int32_t, uint32_t, int64_t, uint64_t, uintptr_t +from libcpp.vector cimport vector + +from cuquantum.utils cimport is_nested_sequence + +from enum import IntEnum +import warnings + +import numpy as _numpy + + +cdef extern from * nogil: + # from CUDA + ctypedef int Stream 'cudaStream_t' + ctypedef enum DataType 'cudaDataType_t': + pass + + # cuTensorNet functions + # library + int cutensornetCreate(_Handle*) + int cutensornetDestroy(_Handle) + size_t cutensornetGetVersion() + size_t cutensornetGetCudartVersion() + const char* cutensornetGetErrorString(_Status) + + # network descriptor + int cutensornetCreateNetworkDescriptor( + _Handle, int32_t, const int32_t[], const int64_t* const[], + const int64_t* const[], const int32_t* const[], const uint32_t[], + int32_t, const int64_t[], const int64_t[], const int32_t[], + uint32_t, DataType, _ComputeType, _NetworkDescriptor*) + int cutensornetDestroyNetworkDescriptor(_NetworkDescriptor) + + # optimizer info + int cutensornetCreateContractionOptimizerInfo( + const _Handle, const _NetworkDescriptor, + _ContractionOptimizerInfo*) + int cutensornetDestroyContractionOptimizerInfo( + _ContractionOptimizerInfo) + int cutensornetContractionOptimizerInfoGetAttribute( + const _Handle, const _ContractionOptimizerInfo, + _ContractionOptimizerInfoAttribute, void*, size_t) + int cutensornetContractionOptimizerInfoSetAttribute( + const _Handle, _ContractionOptimizerInfo, + _ContractionOptimizerInfoAttribute, const void*, size_t) + + # optimizer config + int cutensornetCreateContractionOptimizerConfig( + const _Handle, _ContractionOptimizerConfig*) + int cutensornetDestroyContractionOptimizerConfig( + _ContractionOptimizerConfig) + int 
cutensornetContractionOptimizerConfigGetAttribute( + const _Handle, _ContractionOptimizerConfig, + _ContractionOptimizerConfigAttribute, void*, size_t) + int cutensornetContractionOptimizerConfigSetAttribute( + const _Handle, _ContractionOptimizerConfig, + _ContractionOptimizerConfigAttribute, const void*, size_t) + + # contraction + int cutensornetContractionGetWorkspaceSize( + const _Handle, const _NetworkDescriptor, + const _ContractionOptimizerInfo, + uint64_t* workspaceSize) + int cutensornetContractionOptimize( + const _Handle, const _NetworkDescriptor, + const _ContractionOptimizerConfig, + uint64_t, _ContractionOptimizerInfo) + int cutensornetCreateContractionPlan( + const _Handle, const _NetworkDescriptor, + const _ContractionOptimizerInfo, + const uint64_t, _ContractionPlan) + int cutensornetDestroyContractionPlan(_ContractionPlan) + int cutensornetContractionAutotune( + const _Handle, _ContractionPlan, const void* const[], + void*, void*, uint64_t, _ContractionAutotunePreference, Stream) + int cutensornetCreateContractionAutotunePreference( + const _Handle, _ContractionAutotunePreference*) + int cutensornetDestroyContractionAutotunePreference( + _ContractionAutotunePreference) + int cutensornetContractionAutotunePreferenceGetAttribute( + const _Handle, _ContractionAutotunePreference, + _ContractionAutotunePreferenceAttribute, void*, size_t) + int cutensornetContractionAutotunePreferenceSetAttribute( + const _Handle, _ContractionAutotunePreference, + _ContractionAutotunePreferenceAttribute, const void*, size_t) + int cutensornetContraction( + const _Handle, const _ContractionPlan, const void* const[], + void*, void*, uint64_t, int64_t, Stream) + + +class cuTensorNetError(RuntimeError): + def __init__(self, status): + self.status = status + cdef str err = cutensornetGetErrorString(status).decode() + super().__init__(err) + + def __reduce__(self): + return (type(self), (self.status,)) + + +cdef inline check_status(int status): + if status != 0: + raise cuTensorNetError(status) + + +cpdef intptr_t create() except*: + """Create a cuTensorNet handle. + + Returns: + intptr_t: the opaque library handle (as Python `int`). + + .. seealso:: `cutensornetCreate` + """ + cdef _Handle handle + cdef int status + with nogil: + status = cutensornetCreate(&handle) + check_status(status) + return handle + + +cpdef destroy(intptr_t handle): + """Destroy a cuTensorNet handle. + + .. seealso:: `cutensornetDestroy` + """ + with nogil: + status = cutensornetDestroy(<_Handle>handle) + check_status(status) + + +cpdef size_t get_version() except*: + """Query the version of the cuTensorNet library. + + Returns: + size_t: the library version. + + .. seealso:: `cutensornetGetVersion` + """ + cdef size_t ver = cutensornetGetVersion() + return ver + + +cpdef size_t get_cudart_version() except*: + """Query the version of the CUDA runtime. + + Returns: + size_t: the CUDA runtime version (ex: 11040 for CUDA 11.4). + + .. seealso:: `cutensornetGetCudartVersion` + """ + cdef size_t ver = cutensornetGetCudartVersion() + return ver + + +cpdef intptr_t create_network_descriptor( + intptr_t handle, + int32_t n_inputs, n_modes_in, extents_in, + strides_in, modes_in, alignments_in, + int32_t n_modes_out, extents_out, + strides_out, modes_out, uint32_t alignment_out, + int data_type, int compute_type) except*: + """Create a tensor network descriptor. + + Args: + handle (intptr_t): The library handle. + n_inputs (int): The number of input tensors. + n_modes_in: A host array of the number of modes for each input tensor. 
+ It can be + + - an `int` as the pointer address to the array + - a Python sequence of `int` + + extents_in: A host array of extents for each input tensor. It can be + + - an `int` as the pointer address to the nested sequence + - a Python sequence of `int`, each of which is a pointer address + to the corresponding tensor's extents + - a nested Python sequence of `int` + + strides_in: A host array of strides for each input tensor. It can be + + - an `int` as the pointer address to the nested sequence + - a Python sequence of `int`, each of which is a pointer address + to the corresponding tensor's strides + - a nested Python sequence of `int` + + modes_in: A host array of modes for each input tensor. It can be + + - an `int` as the pointer address to the nested sequence + - a Python sequence of `int`, each of which is a pointer address + to the corresponding tensor's modes + - a nested Python sequence of `int` + + alignments_in: A host array of alignments for each input tensor. It can + be + + - an `int` as the pointer address to the array + - a Python sequence of `int` + + n_modes_out (int32_t): The number of modes of the output tensor. + extents_out: The extents of the output tensor (on host). It can be + + - an `int` as the pointer address to the array + - a Python sequence of `int` + + strides_out: The strides of the output tensor (on host). It can be + + - an `int` as the pointer address to the array + - a Python sequence of `int` + + modes_out: The modes of the output tensor (on host). It can be + + - an `int` as the pointer address to the array + - a Python sequence of `int` + + alignment_out (uint32_t): The alignment for the output tensor. + data_type (cuquantum.cudaDataType): The data type of the input and + output tensors. + compute_type (cuquantum.ComputeType): The compute type of the tensor + contraction. + + Returns: + intptr_t: An opaque descriptor handle (as Python `int`). + + .. note:: + If ``strides_in`` (``strides_out``) is set to 0 (`NULL`), it means + the input tensors (output tensor) are in the Fortran layout (F-contiguous). + + .. seealso:: `cutensornetCreateNetworkDescriptor` + """ + # n_modes_in can be a pointer address, or a Python sequence + cdef vector[int32_t] numModesInData + cdef int32_t* numModesInPtr + if cpython.PySequence_Check(n_modes_in): + numModesInData = n_modes_in + numModesInPtr = numModesInData.data() + else: # a pointer address + numModesInPtr = n_modes_in + + # extents_in can be: + # - a plain pointer address + # - a Python sequence (of pointer addresses) + # - a nested Python sequence (of int64_t) + # Note: it cannot be a mix of sequences and ints. + cdef vector[intptr_t] extentsInCData + cdef int64_t** extentsInPtr + if is_nested_sequence(extents_in): + # flatten the 2D sequence + extentsInPyData = [] + for i in extents_in: + # too bad a Python list can't hold C++ vectors, so we use NumPy + # arrays as the container here to keep data alive + data = _numpy.asarray(i, dtype=_numpy.int64) + assert data.ndim == 1 + extentsInPyData.append(data) + extentsInCData.push_back(data.ctypes.data) + extentsInPtr = (extentsInCData.data()) + elif cpython.PySequence_Check(extents_in): + # handle 1D sequence + extentsInCData = extents_in + extentsInPtr = (extentsInCData.data()) + else: + # a pointer address, take it as is + extentsInPtr = extents_in + + # strides_in can be: + # - a plain pointer address + # - a Python sequence (of pointer addresses) + # - a nested Python sequence (of int64_t) + # Note: it cannot be a mix of sequences and ints. 
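+    # As an illustration (a sketch, not executed here; ptr0/ptr1 are hypothetical
+    # pointer addresses, and strides are counted in number of elements): for two
+    # F-contiguous tensors of shape (2, 3), all of the following would be accepted:
+    #   strides_in = ((1, 2), (1, 2))   # nested sequence of per-tensor strides
+    #   strides_in = (ptr0, ptr1)       # pointer addresses of two stride arrays
+    #   strides_in = 0                  # NULL: tensors assumed F-contiguous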
+ cdef vector[intptr_t] stridesInCData + cdef int64_t** stridesInPtr + if is_nested_sequence(strides_in): + # flatten the 2D sequence + stridesInPyData = [] + for i in strides_in: + # too bad a Python list can't hold C++ vectors, so we use NumPy + # arrays as the container here to keep data alive + data = _numpy.asarray(i, dtype=_numpy.int64) + assert data.ndim == 1 + stridesInPyData.append(data) + stridesInCData.push_back(data.ctypes.data) + stridesInPtr = (stridesInCData.data()) + elif cpython.PySequence_Check(strides_in): + # handle 1D sequence + stridesInCData = strides_in + stridesInPtr = (stridesInCData.data()) + else: + # a pointer address, take it as is + stridesInPtr = strides_in + + # modes_in can be: + # - a plain pointer address + # - a Python sequence (of pointer addresses) + # - a nested Python sequence (of int32_t) + # Note: it cannot be a mix of sequences and ints. + cdef vector[intptr_t] modesInCData + cdef int32_t** modesInPtr + if is_nested_sequence(modes_in): + # flatten the 2D sequence + modesInPyData = [] + for i in modes_in: + # too bad a Python list can't hold C++ vectors, so we use NumPy + # arrays as the container here to keep data alive + data = _numpy.asarray(i, dtype=_numpy.int32) + assert data.ndim == 1 + modesInPyData.append(data) + modesInCData.push_back(data.ctypes.data) + modesInPtr = (modesInCData.data()) + elif cpython.PySequence_Check(modes_in): + # handle 1D sequence + modesInCData = modes_in + modesInPtr = (modesInCData.data()) + else: + # a pointer address, take it as is + modesInPtr = modes_in + + # alignments_in can be a pointer address, or a Python sequence + cdef vector[uint32_t] alignmentsInData + cdef uint32_t* alignmentsInPtr + if cpython.PySequence_Check(alignments_in): + alignmentsInData = alignments_in + alignmentsInPtr = alignmentsInData.data() + else: # a pointer address + alignmentsInPtr = alignments_in + + # extents_out can be a pointer address, or a Python sequence + cdef vector[int64_t] extentsOutData + cdef int64_t* extentsOutPtr + if cpython.PySequence_Check(extents_out): + extentsOutData = extents_out + extentsOutPtr = extentsOutData.data() + else: # a pointer address + extentsOutPtr = extents_out + + # strides_out can be a pointer address, or a Python sequence + cdef vector[int64_t] stridesOutData + cdef int64_t* stridesOutPtr + if cpython.PySequence_Check(strides_out): + stridesOutData = strides_out + stridesOutPtr = stridesOutData.data() + else: # a pointer address + stridesOutPtr = strides_out + + # modes_out can be a pointer address, or a Python sequence + cdef vector[int32_t] modesOutData + cdef int32_t* modesOutPtr + if cpython.PySequence_Check(modes_out): + modesOutData = modes_out + modesOutPtr = modesOutData.data() + else: # a pointer address + modesOutPtr = modes_out + + cdef _NetworkDescriptor tn_desc + with nogil: + status = cutensornetCreateNetworkDescriptor(<_Handle>handle, + n_inputs, numModesInPtr, extentsInPtr, stridesInPtr, modesInPtr, alignmentsInPtr, + n_modes_out, extentsOutPtr, stridesOutPtr, modesOutPtr, alignment_out, + data_type, <_ComputeType>compute_type, &tn_desc) + check_status(status) + return tn_desc + + +cpdef destroy_network_descriptor(intptr_t tn_desc): + """Destroy a tensor network descriptor. + + Args: + tn_desc (intptr_t): The tensor network descriptor. + + .. 
seealso:: `cutensornetDestroyNetworkDescriptor` + """ + with nogil: + status = cutensornetDestroyNetworkDescriptor(<_NetworkDescriptor>tn_desc) + check_status(status) + + +cpdef intptr_t create_contraction_optimizer_info( + intptr_t handle, intptr_t tn_desc) except*: + """Create a contraction optimizer info object. + + Args: + handle (intptr_t): The library handle. + tn_desc (intptr_t): the tensor network descriptor. + + Returns: + intptr_t: An opaque optimizer info handle (as Python `int`). + + .. seealso:: `cutensornetCreateContractionOptimizerInfo` + """ + cdef _ContractionOptimizerInfo info + with nogil: + status = cutensornetCreateContractionOptimizerInfo( + <_Handle>handle, <_NetworkDescriptor>tn_desc, &info) + check_status(status) + return info + + +cpdef destroy_contraction_optimizer_info(intptr_t info): + """Destroy a contraction optimizer info object. + + Args: + info (intptr_t): The optimizer info handle. + + .. seealso:: `cutensornetDestroyContractionOptimizerInfo` + """ + with nogil: + status = cutensornetDestroyContractionOptimizerInfo( + <_ContractionOptimizerInfo>info) + check_status(status) + + +######################### Python specific utility ######################### + +cdef dict contract_opti_info_sizes = { + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES: _numpy.int64, + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICED_MODES: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_SLICED_MODE: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_SLICED_EXTENT: _numpy.int64, + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH: ContractionPath, + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PHASE1_FLOP_COUNT: _numpy.float64, + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT: _numpy.float64, + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_LARGEST_TENSOR: _numpy.float64, + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_SLICING_OVERHEAD: _numpy.float64, +} + +cpdef contraction_optimizer_info_get_attribute_dtype(int attr): + """Get the Python data type of the corresponding optimizer info attribute. + + Args: + attr (ContractionOptimizerInfoAttribute): The attribute to query. + + Returns: + The data type of the queried attribute. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`contraction_optimizer_info_get_attribute` + and :func:`contraction_optimizer_info_set_attribute`. + + .. note:: Unlike other enum values, for :data:`ContractionOptimizerInfoAttribute.PATH` + the following usage pattern is expected: + + .. code-block:: python + + val = ContractionOptimizerInfoAttribute.PATH + dtype = contraction_optimizer_info_get_attribute_dtype(val) + + # setter + path = np.asarray([(1, 3), (1, 2), (0, 1)], dtype=np.int32) + path_obj = dtype(path.size//2, path.ctypes.data) + contraction_optimizer_info_set_attribute( + handle, info, val, path_obj.get_data(), path_obj.get_size()) + + # getter + # num_contractions is the number of input tensors minus one + path = np.empty(2*num_contractions, dtype=np.int32) + path_obj = dtype(num_contractions, path.ctypes.data) + contraction_optimizer_info_get_attribute( + handle, info, val, path_obj.get_data(), path_obj.get_size()) + # now path is filled + print(path) + + See also the documentation of :class:`ContractionPath`. This design is subject + to change in a future release. 
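+
+    For the other (scalar) attributes the usage is simpler; a sketch for
+    querying the number of slices might look like:
+
+    .. code-block:: python
+
+        val = ContractionOptimizerInfoAttribute.NUM_SLICES
+        dtype = contraction_optimizer_info_get_attribute_dtype(val)
+
+        num_slices = np.zeros((1,), dtype=dtype)
+        contraction_optimizer_info_get_attribute(
+            handle, info, val, num_slices.ctypes.data, num_slices.dtype.itemsize)
+        # now num_slices[0] holds the result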
+ """ + return contract_opti_info_sizes[attr] + +########################################################################### + + +cpdef contraction_optimizer_info_get_attribute( + intptr_t handle, intptr_t info, int attr, + intptr_t buf, size_t size): + """Get the optimizer info attribute. + + Args: + handle (intptr_t): The library handle. + info (intptr_t): The optimizer info handle. + attr (ContractionOptimizerInfoAttribute): The attribute to query. + buf (intptr_t): The pointer address (as Python `int`) for storing + the returned attribute value. + size (size_t): The size of ``buf`` (in bytes). + + .. note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`contraction_optimizer_info_get_attribute_dtype`. + + .. note:: For getting the :data:`ContractionOptimizerInfoAttribute.PATH` attribute + please see :func:`contraction_optimizer_info_get_attribute_dtype`. + + .. seealso:: `cutensornetContractionOptimizerInfoGetAttribute` + """ + with nogil: + status = cutensornetContractionOptimizerInfoGetAttribute( + <_Handle>handle, <_ContractionOptimizerInfo>info, + <_ContractionOptimizerInfoAttribute>attr, + buf, size) + check_status(status) + + +cpdef contraction_optimizer_info_set_attribute( + intptr_t handle, intptr_t info, int attr, + intptr_t buf, size_t size): + """Set the optimizer info attribute. + + Args: + handle (intptr_t): The library handle. + info (intptr_t): The optimizer info handle. + attr (ContractionOptimizerInfoAttribute): The attribute to set. + buf (intptr_t): The pointer address (as Python `int`) to the attribute data. + size (size_t): The size of ``buf`` (in bytes). + + .. note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`contraction_optimizer_info_get_attribute_dtype`. + + .. note:: For setting the :data:`ContractionOptimizerInfoAttribute.PATH` attribute + please see :func:`contraction_optimizer_info_get_attribute_dtype`. + + .. seealso:: `cutensornetContractionOptimizerInfoSetAttribute` + """ + with nogil: + status = cutensornetContractionOptimizerInfoSetAttribute( + <_Handle>handle, <_ContractionOptimizerInfo>info, + <_ContractionOptimizerInfoAttribute>attr, + buf, size) + check_status(status) + + +cpdef intptr_t create_contraction_optimizer_config( + intptr_t handle) except*: + """Create a contraction optimizer config object. + + Args: + handle (intptr_t): The library handle. + + Returns: + intptr_t: An opaque optimizer config handle (as Python `int`). + + .. seealso:: `cutensornetCreateContractionOptimizerConfig` + """ + cdef _ContractionOptimizerConfig config + with nogil: + status = cutensornetCreateContractionOptimizerConfig( + <_Handle>handle, &config) + check_status(status) + return config + + +cpdef destroy_contraction_optimizer_config(intptr_t config): + """Destroy a contraction optimizer config object. + + Args: + config (intptr_t): The optimizer config handle. + + .. 
seealso:: `cutensornetDestroyContractionOptimizerConfig` + """ + with nogil: + status = cutensornetDestroyContractionOptimizerConfig( + <_ContractionOptimizerConfig>config) + check_status(status) + + +######################### Python specific utility ######################### + +cdef dict contract_opti_cfg_sizes = { + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_PARTITIONS: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_CUTOFF_SIZE: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM: _numpy.int32, # = sizeof(enum value) + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_IMBALANCE_FACTOR: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_ITERATIONS: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_CUTS: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_RECONFIG_NUM_ITERATIONS: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_RECONFIG_NUM_LEAVES: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_DISABLE_SLICING: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL: _numpy.int32, # = sizeof(enum value) + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_FACTOR: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MIN_SLICES: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_SLICE_FACTOR: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SIMPLIFICATION_DISABLE_DR: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED: _numpy.int32, +} + +cpdef contraction_optimizer_config_get_attribute_dtype(int attr): + """Get the Python data type of the corresponding optimizer config attribute. + + Args: + attr (ContractionOptimizerConfigAttribute): The attribute to query. + + Returns: + The data type of the queried attribute. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`contraction_optimizer_config_get_attribute` + and :func:`contraction_optimizer_config_set_attribute`. + """ + dtype = contract_opti_cfg_sizes[attr] + if attr == CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM: + if _numpy.dtype(dtype).itemsize != sizeof(_GraphAlgo): + warnings.warn("binary size may be incompatible") + elif attr == CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL: + if _numpy.dtype(dtype).itemsize != sizeof(_MemoryModel): + warnings.warn("binary size may be incompatible") + return dtype + +########################################################################### + + +cpdef contraction_optimizer_config_get_attribute( + intptr_t handle, intptr_t config, int attr, + intptr_t buf, size_t size): + """Get the optimizer config attribute. + + Args: + handle (intptr_t): The library handle. + config (intptr_t): The optimizer config handle. + attr (ContractionOptimizerConfigAttribute): The attribute to set. + buf (intptr_t): The pointer address (as Python `int`) for storing + the returned attribute value. + size (size_t): The size of ``buf`` (in bytes). + + .. note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`contraction_optimizer_config_get_attribute_dtype`. + + .. 
seealso:: `cutensornetContractionOptimizerConfigGetAttribute` + """ + with nogil: + status = cutensornetContractionOptimizerConfigGetAttribute( + <_Handle>handle, <_ContractionOptimizerConfig>config, + <_ContractionOptimizerConfigAttribute>attr, + buf, size) + check_status(status) + + +cpdef contraction_optimizer_config_set_attribute( + intptr_t handle, intptr_t config, int attr, + intptr_t buf, size_t size): + """Set the optimizer config attribute. + + Args: + handle (intptr_t): The library handle. + config (intptr_t): The optimizer config handle. + attr (ContractionOptimizerConfigAttribute): The attribute to set. + buf (intptr_t): The pointer address (as Python `int`) to the attribute data. + size (size_t): The size of ``buf`` (in bytes). + + .. note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`contraction_optimizer_config_get_attribute_dtype`. + + .. seealso:: `cutensornetContractionOptimizerConfigSetAttribute` + """ + with nogil: + status = cutensornetContractionOptimizerConfigSetAttribute( + <_Handle>handle, <_ContractionOptimizerConfig>config, + <_ContractionOptimizerConfigAttribute>attr, + buf, size) + check_status(status) + + +cpdef uint64_t contraction_get_workspace_size( + intptr_t handle, intptr_t tn_desc, intptr_t info) except*: + """Compute the required workspace size for contracting the input tensor + network. + + Args: + handle (intptr_t): The library handle. + tn_desc (intptr_t): the tensor network descriptor. + info (intptr_t): The optimizer info handle. + + Returns: + uint64_t: The workspace size (in bytes). + + .. note:: This function should be called either after a contraction path + is manually set, or after :func:`contraction_optimize` is called. + + .. seealso:: `cutensornetContractionGetWorkspaceSize` + """ + # TODO(leofang): note in the docstring that the API name deviates + # from its C counterpart in beta 2 + cdef uint64_t workspaceSize + with nogil: + status = cutensornetContractionGetWorkspaceSize( + <_Handle>handle, <_NetworkDescriptor>tn_desc, + <_ContractionOptimizerInfo>info, &workspaceSize) + check_status(status) + return workspaceSize + + +cpdef contraction_optimize( + intptr_t handle, intptr_t tn_desc, intptr_t config, + uint64_t size_limit, intptr_t info): + """Optimize the contraction path, slicing, etc, for the given tensor network. + + Args: + handle (intptr_t): The library handle. + tn_desc (intptr_t): the tensor network descriptor. + config (intptr_t): The optimizer config handle. + size_limit (uint64_t): Maximal device memory that is available to the + user. + info (intptr_t): The optimizer info handle. + + .. note:: The ``size_limit`` argument here should not be confused with the + workspace size returned by :func:`contraction_get_workspace_size`. The + former is an upper bound for the available memory, whereas the latter + is the needed size to perform the actual contraction. + + .. seealso:: `cutensornetContractionOptimize` + """ + with nogil: + status = cutensornetContractionOptimize( + <_Handle>handle, <_NetworkDescriptor>tn_desc, + <_ContractionOptimizerConfig>config, + size_limit, <_ContractionOptimizerInfo>info) + check_status(status) + + +cpdef intptr_t create_contraction_plan( + intptr_t handle, intptr_t tn_desc, intptr_t info, + uint64_t workspace_size) except*: + """Create a contraction plan for the given tensor network and the + associated path. + + When this function is called, the optimizer info object should already + contain a contraction path. 
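+
+    A sketch of a typical call sequence might be (assuming ``handle``, ``tn_desc``,
+    ``config``, and ``info`` were created with the corresponding ``create_*``
+    functions in this module, and ``size_limit`` is the maximal device memory
+    available, in bytes):
+
+    .. code-block:: python
+
+        # populate info with a contraction path (and any slicing)
+        contraction_optimize(handle, tn_desc, config, size_limit, info)
+        # size the workspace for the chosen path
+        workspace_size = contraction_get_workspace_size(handle, tn_desc, info)
+        plan = create_contraction_plan(handle, tn_desc, info, workspace_size)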
+ + Args: + handle (intptr_t): The library handle. + tn_desc (intptr_t): the tensor network descriptor. + info (intptr_t): The optimizer info handle. + workspace_size (uint64_t): The workspace size (in bytes). + + Returns: + intptr_t: An opaque contraction plan handle (as Python `int`). + + .. seealso:: `cutensornetCreateContractionPlan` + """ + cdef _ContractionPlan plan + with nogil: + status = cutensornetCreateContractionPlan( + <_Handle>handle, <_NetworkDescriptor>tn_desc, + <_ContractionOptimizerInfo>info, + workspace_size, &plan) + check_status(status) + return plan + + +cpdef destroy_contraction_plan(intptr_t plan): + """Destroy a contraction plan. + + Args: + plan (intptr_t): The contraction plan handle. + + .. seealso:: `cutensornetDestroyContractionPlan` + """ + with nogil: + status = cutensornetDestroyContractionPlan(<_ContractionPlan>plan) + check_status(status) + + +cpdef contraction_autotune( + intptr_t handle, intptr_t plan, + raw_data_in, intptr_t raw_data_out, intptr_t workspace, + uint64_t workspace_size, intptr_t pref, intptr_t stream): + """Autotune the contraction plan to find the best kernels for each pairwise + tensor contraction. + + The input tensors should form a tensor network that is prescribed by the + tensor network descriptor that was used to create the contraction plan. + + Args: + handle (intptr_t): The library handle. + plan (intptr_t): The contraction plan handle. + raw_data_in: A host array of pointer addresses (as Python `int`) for + each input tensor (on device). It can be + + - an `int` as the pointer address to the array + - a Python sequence of `int` + + raw_data_out (intptr_t): The pointer address (as Python `int`) to the + output tensor (on device). + workspace (intptr_t): The pointer address (as Python `int`) to the + workspace (on device). + workspace_size (uint64_t): The workspace size (in bytes). + pref (intptr_t): The autotune preference handle. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + `int`). + + .. seealso:: `cutensornetContractionAutotune` + """ + # raw_data_in can be a pointer address, or a Python sequence + cdef vector[intptr_t] rawDataInData + cdef void** rawDataInPtr + if cpython.PySequence_Check(raw_data_in): + rawDataInData = raw_data_in + rawDataInPtr = (rawDataInData.data()) + else: # a pointer address + rawDataInPtr = raw_data_in + + with nogil: + status = cutensornetContractionAutotune( + <_Handle>handle, <_ContractionPlan>plan, + rawDataInPtr, raw_data_out, workspace, + workspace_size, <_ContractionAutotunePreference>pref, + stream) + check_status(status) + + +cpdef intptr_t create_contraction_autotune_preference(intptr_t handle): + """Create a handle to hold all autotune parameters. + + Args: + handle (intptr_t): The library handle. + + Returns: + intptr_t: An opaque autotune preference handle. + + .. seealso:: `cutensornetCreateContractionAutotunePreference` + """ + cdef _ContractionAutotunePreference pref + with nogil: + status = cutensornetCreateContractionAutotunePreference( + <_Handle>handle, &pref) + check_status(status) + return pref + + +cpdef intptr_t destroy_contraction_autotune_preference(intptr_t pref): + """Destroy the autotue preference handle. + + Args: + pref (intptr_t): The opaque autotune preference handle. + + .. 
seealso:: `cutensornetDestroyContractionAutotunePreference`
+    """
+    with nogil:
+        status = cutensornetDestroyContractionAutotunePreference(
+            <_ContractionAutotunePreference>pref)
+    check_status(status)
+
+
+######################### Python specific utility #########################
+
+cdef dict contract_autotune_pref_sizes = {
+    CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS: _numpy.int32,
+}
+
+cpdef contraction_autotune_preference_get_attribute_dtype(int attr):
+    """Get the Python data type of the corresponding autotune preference
+    attribute.
+
+    Args:
+        attr (ContractionAutotunePreferenceAttribute): The attribute to query.
+
+    Returns:
+        The data type of the queried attribute.
+
+    .. note:: This API has no C counterpart and is a convenient helper for
+        allocating memory for :func:`contraction_autotune_preference_get_attribute`
+        and :func:`contraction_autotune_preference_set_attribute`.
+    """
+    return contract_autotune_pref_sizes[attr]
+
+###########################################################################
+
+
+cpdef contraction_autotune_preference_get_attribute(
+        intptr_t handle, intptr_t autotune_preference, int attr,
+        intptr_t buf, size_t size):
+    """Get the autotune preference attribute.
+
+    Args:
+        handle (intptr_t): The library handle.
+        autotune_preference (intptr_t): The autotune preference handle.
+        attr (ContractionAutotunePreferenceAttribute): The attribute to query.
+        buf (intptr_t): The pointer address (as Python `int`) for storing
+            the returned attribute value.
+        size (size_t): The size of ``buf`` (in bytes).
+
+    .. note:: To compute ``size``, use the itemsize of the corresponding data
+        type, which can be queried using :func:`contraction_autotune_preference_get_attribute_dtype`.
+
+    .. seealso:: `cutensornetContractionAutotunePreferenceGetAttribute`
+    """
+    with nogil:
+        status = cutensornetContractionAutotunePreferenceGetAttribute(
+            <_Handle>handle,
+            <_ContractionAutotunePreference>autotune_preference,
+            <_ContractionAutotunePreferenceAttribute>attr,
+            <void*>buf, size)
+    check_status(status)
+
+
+cpdef contraction_autotune_preference_set_attribute(
+        intptr_t handle, intptr_t autotune_preference, int attr,
+        intptr_t buf, size_t size):
+    """Set the autotune preference attribute.
+
+    Args:
+        handle (intptr_t): The library handle.
+        autotune_preference (intptr_t): The autotune preference handle.
+        attr (ContractionAutotunePreferenceAttribute): The attribute to set.
+        buf (intptr_t): The pointer address (as Python `int`) to the attribute data.
+        size (size_t): The size of ``buf`` (in bytes).
+
+    .. note:: To compute ``size``, use the itemsize of the corresponding data
+        type, which can be queried using :func:`contraction_autotune_preference_get_attribute_dtype`.
+
+    .. seealso:: `cutensornetContractionAutotunePreferenceSetAttribute`
+    """
+    with nogil:
+        status = cutensornetContractionAutotunePreferenceSetAttribute(
+            <_Handle>handle,
+            <_ContractionAutotunePreference>autotune_preference,
+            <_ContractionAutotunePreferenceAttribute>attr,
+            <void*>buf, size)
+    check_status(status)
+
+
+cpdef contraction(
+        intptr_t handle, intptr_t plan,
+        raw_data_in, intptr_t raw_data_out, intptr_t workspace,
+        uint64_t workspace_size, int64_t slice_id, intptr_t stream):
+    """Perform the contraction of the input tensors.
+
+    The input tensors should form a tensor network that is prescribed by the
+    tensor network descriptor that was used to create the contraction plan.
+
+    Args:
+        handle (intptr_t): The library handle.
+        plan (intptr_t): The contraction plan handle.
+        raw_data_in: A host array of pointer addresses (as Python `int`) for
+            each input tensor (on device).
It can be + + - an `int` as the pointer address to the array + - a Python sequence of `int` + + raw_data_out (intptr_t): The pointer address (as Python `int`) to the + output tensor (on device). + workspace (intptr_t): The pointer address (as Python `int`) to the + workspace (on device). + workspace_size (uint64_t): The workspace size (in bytes). + slice_id (int64_t): The slice ID. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + `int`). + + .. note:: The number of slices can be queried by :func:`contraction_optimizer_info_get_attribute`. + + .. seealso:: `cutensornetContraction` + """ + # raw_data_in can be a pointer address, or a Python sequence + cdef vector[intptr_t] rawDataInData + cdef void** rawDataInPtr + if cpython.PySequence_Check(raw_data_in): + rawDataInData = raw_data_in + rawDataInPtr = (rawDataInData.data()) + else: # a pointer address + rawDataInPtr = raw_data_in + + with nogil: + status = cutensornetContraction( + <_Handle>handle, <_ContractionPlan>plan, + rawDataInPtr, raw_data_out, workspace, + workspace_size, slice_id, stream) + check_status(status) + + +cdef class ContractionPath: + """A proxy object to hold a `cutensornetContractionPath_t` struct. + + Users provide the number of contractions and a pointer address to the actual + contraction path, and this object creates an `cutensornetContractionPath_t` + instance and fills in the provided information. + + Example: + + .. code-block:: python + + # the pairwise contraction order is stored as C int + path = np.asarray([(1, 3), (1, 2), (0, 1)], dtype=np.int32) + path_obj = ContractionPath(path.size//2, path.ctypes.data) + + # get the pointer address to the underlying `cutensornetContractionPath_t` + my_func(..., path_obj.get_data(), ...) + + # path must outlive path_obj! + del path_obj + del path + + Args: + num_contractions (int): The number of contractions in the provided path. + data (uintptr_t): The pointer address (as Python `int`) to the provided path. + + .. note:: + Users are responsible for managing the lifetime of the underlying path data + (i.e. the validity of the ``data`` pointer). + + .. warning:: + The design of how `cutensornetContractionPath_t` is handled in Python is + experimental and subject to change in a future release. + """ + cdef _ContractionPath* path + + def __cinit__(self, int num_contractions, uintptr_t data): + self.path = <_ContractionPath*>PyMem_Malloc(sizeof(_ContractionPath)) + + def __dealloc__(self): + PyMem_Free(self.path) + + def __init__(self, int num_contractions, uintptr_t data): + """ + __init__(self, int num_contractions, uintptr_t data) + """ + self.path.numContractions = num_contractions + self.path.data = <_NodePair*>data + + def get_path(self): + """Get the pointer address to the underlying `cutensornetContractionPath_t` struct. + + Returns: + uintptr_t: The pointer address. + """ + return self.path + + def get_size(self): + """Get the size of the `cutensornetContractionPath_t` struct. + + Returns: + size_t: ``sizeof(cutensornetContractionPath_t)``. 
+ """ + return sizeof(_ContractionPath) + + +class GraphAlgorithm(IntEnum): + """See `cutensornetGraphAlgo_t`.""" + RB = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM_RB + KWAY = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM_KWAY + +class MemoryModel(IntEnum): + """See `cutensornetMemoryModel_t`.""" + SLICER_HEURISTIC = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_HEURISTIC + SLICER_CUTENSOR = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_CUTENSOR + +class ContractionOptimizerConfigAttribute(IntEnum): + """See `cutensornetContractionOptimizerConfigAttributes_t`.""" + GRAPH_NUM_PARTITIONS = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_PARTITIONS + GRAPH_CUTOFF_SIZE = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_CUTOFF_SIZE + GRAPH_ALGORITHM = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM + GRAPH_IMBALANCE_FACTOR = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_IMBALANCE_FACTOR + GRAPH_NUM_ITERATIONS = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_ITERATIONS + GRAPH_NUM_CUTS = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_CUTS + RECONFIG_NUM_ITERATIONS = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_RECONFIG_NUM_ITERATIONS + RECONFIG_NUM_LEAVES = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_RECONFIG_NUM_LEAVES + SLICER_DISABLE_SLICING = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_DISABLE_SLICING + SLICER_MEMORY_MODEL = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL + SLICER_MEMORY_FACTOR = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_FACTOR + SLICER_MIN_SLICES = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MIN_SLICES + SLICER_SLICE_FACTOR = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_SLICE_FACTOR + HYPER_NUM_SAMPLES = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES + SIMPLIFICATION_DISABLE_DR = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SIMPLIFICATION_DISABLE_DR + SEED = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED + +class ContractionOptimizerInfoAttribute(IntEnum): + """See `cutensornetContractionOptimizerInfoAttributes_t`.""" + NUM_SLICES = CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES + NUM_SLICED_MODES = CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICED_MODES + SLICED_MODE = CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_SLICED_MODE + SLICED_EXTENT = CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_SLICED_EXTENT + PATH = CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH + PHASE1_FLOP_COUNT = CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PHASE1_FLOP_COUNT + FLOP_COUNT = CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT + LARGEST_TENSOR = CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_LARGEST_TENSOR + SLICING_OVERHEAD = CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_SLICING_OVERHEAD + +class ContractionAutotunePreferenceAttribute(IntEnum): + """See `cutensornetContractionAutotunePreferenceAttributes_t`.""" + MAX_ITERATIONS = CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS + +del IntEnum + + +# expose them to Python +MAJOR_VER = CUTENSORNET_MAJOR +MINOR_VER = CUTENSORNET_MINOR +PATCH_VER = CUTENSORNET_PATCH +VERSION = CUTENSORNET_VERSION diff --git a/python/cuquantum/cutensornet/tensor_network.py b/python/cuquantum/cutensornet/tensor_network.py new file mode 100644 index 0000000..0033a49 --- /dev/null +++ b/python/cuquantum/cutensornet/tensor_network.py @@ -0,0 +1,1007 @@ +""" +Tensor network contraction with the standard einsum interface using cutensornet. 
+""" + +__all__ = ['contract', 'contract_path', 'einsum', 'einsum_path', 'Network'] + +import collections +import dataclasses +import functools +import logging +import os +import sys + +import cupy as cp +import numpy as np + +from cuquantum import cutensornet as cutn +from . import configuration +from ._internal import einsum_parser +from ._internal import optimizer_ifc +from ._internal import tensor_wrapper +from ._internal import typemaps +from ._internal import utils + + +class InvalidNetworkState(Exception): + pass + + +class Network: + + """ + Network(subscripts, *operands, options=None) + + Create a tensor network object specified as an einsum expression. + + The Einstein summation convention provides an elegant way of representing many tensor network operations. + This object allows the user to invest + considerable effort into computing the best contraction path as well as autotuning the contraction upfront + for repeated contractions over the same network *topology* (different input tensors, or "operands", with the same Einstein + summation expression). Also see :meth:`~Network.contract_path` and :meth:`autotune`. + + For the Einstein summation expression, both the explicit and implicit forms are supported. + + In the implicit form, the output indices are inferred from the summation expression and *reordered lexicographically*. + An example is the expression ``'ij,jh'``, for which the output indices are ``'hi'``. (This corresponds to a matrix + multiplication followed by a transpose.) + + In the explicit form, output indices can be directly stated following the identifier ``'->'`` in the summation expression. + An example is the expression ``'ij,jh->ih'`` (which corresponds to a matrix multiplication). + + To specify an Einstein summation expression, both the subscript format (as shown above) and the ""interleaved" format + are supported. + + The interleaved format is an alternative way for specifying the operands and their modes as + ``Network(op0, modes0, op1, modes1, ..., [modes_out])``, where ``opN`` + is the N-th operand and ``modesN`` is a sequence of hashable object (strings, integers, etc) representing the N-th operand's + modes. + + Ellipsis broadcasting is currently *not supported*. + + Additional information on various operations on the network can be obtained by passing in a :class:`logging.Logger` object + to :class:`NetworkOptions` or by setting the appropriate options in the root logger object, which is used by default:: + + import logging + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S') + + Args: + subscripts : The modes (subscripts) for summation as a comma-separated list of characters. Unicode characters are + allowed in the expression thereby expanding the size of the tensor network that can be specified using the + Einstein summation convention. + operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, + :class:`cupy.ndarray`, and :class:`torch.Tensor`. + options : Specify options for the tensor network as a :class:`~cuquantum.NetworkOptions` object. Alternatively, a `dict` + containing the parameters for the ``NetworkOptions`` constructor can also be provided. If not specified, + the value will be set to the default-constructed ``NetworkOptions`` object. 
+ + See Also: + :meth:`~Network.contract_path`, :meth:`autotune`, :meth:`~Network.contract`, :meth:`reset_operands` + + Note: + In this release, only the *classical* Einstein summation is supported -- an index (mode) must appear exactly once or + twice. An index that appears twice represents an inner product on that dimension. If an index appears once, + it must appear in the output. + + Examples: + + >>> from cuquantum import Network + >>> import numpy as np + + Define the parameters of the tensor network: + + >>> expr = 'ehl,gj,edhg,bif,d,c,k,iklj,cf,a->ba' + >>> shapes = [(8, 2, 5), (5, 7), (8, 8, 2, 5), (8, 6, 3), (8,), (6,), (5,), (6, 5, 5, 7), (6, 3), (3,)] + + Create the input tensors using NumPy: + + >>> operands = [np.random.rand(*shape) for shape in shapes] + + Create a :class:`Network` object: + + >>> n = Network(expr, *operands) + + Find the best contraction order: + + >>> path, info = n.contract_path({'samples': 500}) + + Autotune the network: + + >>> n.autotune(iterations=5) + + Perform the contraction. The result is of the same type and on the same device as the operands: + + >>> r1 = n.contract() + + Reset operands to new values: + + >>> operands = [i*operand for i, operand in enumerate(operands, start=1)] + >>> n.reset_operands(*operands) + + Get the result of the new contraction: + + >>> r2 = n.contract() + >>> from math import factorial + >>> np.allclose(r2, factorial(len(operands))*r1) + True + + Finally, free network resources. If this call isn't made, it may hinder further operations (especially if the + network is large) as it causes **memory leak**. (*To avoid having to explicitly make this call, it is recommended + to use the* :class:`Network` *object as a context manager*.) + + >>> n.free() + + If the operands are on the GPU, they can also be updated using in-place operations. In this case, the call + to :meth:`reset_operands` can be skipped -- subsequent :meth:`~Network.contract` calls will use the same + operands (with updated contents). The following example illustrates this using CuPy operands and also demonstrates + the usage of a :class:`Network` context (so as to skip calling :meth:`free`): + + >>> import cupy as cp + >>> expr = 'ehl,gj,edhg,bif,d,c,k,iklj,cf,a->ba' + >>> shapes = [(8, 2, 5), (5, 7), (8, 8, 2, 5), (8, 6, 3), (8,), (6,), (5,), (6, 5, 5, 7), (6, 3), (3,)] + >>> operands = [cp.random.rand(*shape) for shape in shapes] + >>> + >>> with Network(expr, *operands) as n: + ... path, info = n.contract_path({'samples': 500}) + ... n.autotune(iterations=5) + ... + ... # Perform the contraction + ... r1 = n.contract() + ... + ... # Update the operands in place + ... for i, operand in enumerate(operands, start=1): + ... operand *= i + ... + ... # Perform the contraction with the updated operand values + ... r2 = n.contract() + ... + ... # The resources used by the network are automatically released when the context ends. + >>> + >>> from math import factorial + >>> cp.allclose(r2, factorial(len(operands))*r1) + array(True) + + PyTorch CPU and GPU tensors can be passed as input operands in the same fashion. + + See :func:`contract` for more examples on specifying the Einstein summation expression as well + as specifying options for the tensor network and the optimizer. 
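+
+    As a sketch of the interleaved format mentioned above, a matrix multiplication
+    (the explicit-form expression ``'ij,jh->ih'``) on operands ``a`` and ``b`` could
+    equivalently be specified as::
+
+        n = Network(a, ['i', 'j'], b, ['j', 'h'], ['i', 'h'])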
+ """ + + def __init__(self, *operands, options=None): + """ + __init__(subscripts, *operands, options=None) + """ + + options = utils.check_or_create_options(configuration.NetworkOptions, options, "network options") + self.options = options + + # Logger + self.logger = options.logger if options.logger is not None else logging.getLogger() + self.logger.info("Beginning network creation...") + + # Parse Einsum expression. + self.operands, self.inputs, self.output, self.size_dict, self.mode_map_user_to_ord, self.mode_map_ord_to_user = einsum_parser.parse_einsum(*operands) + + # Copy operands to device if needed. + self.network_location = 'cuda' + self.device_id = utils.get_network_device_id(self.operands) + if self.device_id is None: + self.network_location = 'cpu' + self.device_id = options.device_id + self.operands = tensor_wrapper.to(self.operands, self.device_id) + + # The output class is that of the first wrapped device operand. + self.output_class = self.operands[0].__class__ + + # Ensure all the operands are on the same device + self.device = cp.cuda.Device(self.device_id) + + # Set memory limit + self.memory_limit = utils.get_memory_limit(self.options.memory_limit, self.device) + self.logger.info(f"The memory limit is {self.memory_limit} bytes.") + + # Define data types + self.data_type = utils.get_operands_dtype(self.operands) + self.compute_type = options.compute_type if options.compute_type is not None else typemaps.NAME_TO_COMPUTE_TYPE[self.data_type] + + # Prepare data for cutensornet + num_inputs = len(self.inputs) + num_modes_out = len(self.output) + + extents_in = tuple(o.shape for o in self.operands) + strides_in = tuple(o.strides for o in self.operands) + self.operands_data, alignments_in = utils.get_operands_data(self.operands) + modes_in = tuple(tuple(m for m in _input) for _input in self.inputs) + num_modes_in = tuple(len(m) for m in modes_in) + + self.contraction, modes_out, extents_out, strides_out, alignment_out = utils.create_output_tensor( + self.output_class, self.output, self.size_dict, self.device_id, self.data_type) + + # Create/set handle + if options.handle is not None: + self.own_handle = False + self.handle = options.handle + else: + self.own_handle = True + with self.device: + self.handle = cutn.create() + + # Network definition + self.network = cutn.create_network_descriptor(self.handle, num_inputs, + num_modes_in, extents_in, strides_in, modes_in, alignments_in, # inputs + num_modes_out, extents_out, strides_out, modes_out, alignment_out, # output + typemaps.NAME_TO_DATA_TYPE[self.data_type], self.compute_type) + + # Keep output extents for creating new tensors, if needed. 
+ self.extents_out = extents_out + + self.optimizer_config_ptr, self.optimizer_info_ptr = None, None + self.workspace, self.workspace_size = None, None + self.plan = None + self.autotune_pref_ptr = None + self.optimized = False + self.autotuned = False + + self.valid_state = True + + self.logger.info("The network has been created.") + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.free() + + def _check_valid_network(self, *args, **kwargs): + """ + """ + if not self.valid_state: + raise InvalidNetworkState("The network cannot be used after resources are free'd") + + def _check_optimized(self, *args, **kwargs): + """ + """ + what = kwargs['what'] + if not self.optimized: + raise RuntimeError(f"{what} cannot be performed before contract_path() has been called.") + + def _free_plan_resources(self, exception=None): + """ + Free resources allocated in network contraction planning. + """ + + if self.plan is not None: + cutn.destroy_contraction_plan(self.plan) + self.plan = None + + return True + + def _free_workspace_resources(self, exception=None): + """ + Free resources related to network workspace. + """ + + if self.workspace is not None: + with self.device: + cp.cuda.runtime.free(self.workspace) + self.workspace = None + + self.workspace_size = None + + return True + + def _free_path_resources(self, exception=None): + """ + Free resources allocated in path computation. + """ + + if self.optimizer_config_ptr is not None: + cutn.destroy_contraction_optimizer_config(self.optimizer_config_ptr) + self.optimizer_config_ptr = None + + if self.optimizer_info_ptr is not None: + cutn.destroy_contraction_optimizer_info(self.optimizer_info_ptr) + self.optimizer_info_ptr = None + + self._free_workspace_resources() + + self._free_plan_resources() + + return True + + @utils.precondition(_check_valid_network) + @utils.precondition(_check_optimized, "Workspace allocation") + @utils.atomic(_free_workspace_resources, method=True) + def _allocate_workspace(self): + """ + Allocate workspace for cutensornet. + """ + + workspace_size = cutn.contraction_get_workspace_size(self.handle, self.network, self.optimizer_info_ptr) + self.workspace_size = workspace_size + + self.logger.debug("Allocating workspace for contraction...") + + with self.device: + if self.workspace: + cp.cuda.runtime.free(self.workspace) + + self.workspace = cp.cuda.runtime.malloc(workspace_size) + + value, unit = utils.convert_memory_with_units(workspace_size) + value = f"{value}" if unit == "B" else f"{value:0.2f}" + self.logger.debug(f"Finished allocating workspace of size {value} {unit} for contraction.") + + @utils.precondition(_check_valid_network) + @utils.precondition(_check_optimized, "Planning") + @utils.atomic(_free_plan_resources, method=True) + def _create_plan(self): + """ + Create network plan. + """ + + self.logger.debug("Creating contraction plan...") + + if self.plan: + cutn.destroy_contraction_plan(self.plan) + + self.plan = cutn.create_contraction_plan(self.handle, self.network, self.optimizer_info_ptr, self.workspace_size) + + self.logger.debug("Finished creating contraction plan.") + + def _set_opt_config_options(self, options): + """ + Set ContractionOptimizerConfig options if the value is not None. + + Args: + options: A PathFinderOptions, SlicerOptions, or ReconfigOptions object. 
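+
+        These options classes are generated from the ``ContractionOptimizerConfigAttribute``
+        enum by ``enum_utils.create_options_class_from_enum`` (see ``configuration.py``),
+        so each instance carries an ``option_to_enum`` mapping from field names to enum
+        values, which is relied upon below.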
+ """ + for field in dataclasses.fields(options): + name, value = field.name, getattr(options, field.name) + if value is None: + continue + + enum = options.option_to_enum[name] + self._set_opt_config_option(name, enum, value) + + def _set_opt_config_option(self, name, enum, value): + """ + Set a single ContractionOptimizerConfig option if the value is not None. + + Args: + name: The name of the attribute. + enum: A ContractionOptimizerConfigAttribute to set. + value: The value to which the attribute is set to. + """ + if value is None: + return + + dtype = cutn.contraction_optimizer_config_get_attribute_dtype(enum) + value = np.array((value,), dtype=dtype) + cutn.contraction_optimizer_config_set_attribute(self.handle, self.optimizer_config_ptr, enum, value.ctypes.data, value.dtype.itemsize) + self.logger.info(f"The optimizer config attribute '{name}' has been set to {value[0]}.") + + @utils.precondition(_check_valid_network) + def _set_optimizer_options(self, optimize): + """ + """ + # Loop over the options and set if not None. + + assert isinstance(optimize.path, configuration.PathFinderOptions), "Internal error." + + # PathFinder options + self._set_opt_config_options(optimize.path) + + # Slicer options + if isinstance(optimize.slicing, configuration.SlicerOptions): + self._set_opt_config_options(optimize.slicing) + + # Reconfiguration options + self._set_opt_config_options(optimize.reconfiguration) + + # The "global" options + ConfEnum = cutn.ContractionOptimizerConfigAttribute + + enum = ConfEnum.HYPER_NUM_SAMPLES + self._set_opt_config_option('samples', enum, optimize.samples) + + enum = ConfEnum.SEED + self._set_opt_config_option('seed', enum, optimize.seed) + + @utils.precondition(_check_valid_network) + @utils.atomic(_free_path_resources, method=True) + def contract_path(self, optimize=None): + """Compute the best contraction path together with any slicing that is needed to ensure that the contraction can be + performed within the specified memory limit. + + Args: + optimize : This parameter specifies options for path optimization as an :class:`OptimizerOptions` object. Alternatively, a + dictionary containing the parameters for the ``OptimizerOptions`` constructor can also be provided. If not + specified, the value will be set to the default-constructed ``OptimizerOptions`` object. + + Returns: + tuple: A 2-tuple (``path``, ``opt_info``): + + - ``path`` : A sequence of pairs of operand indices representing the best contraction order in the + :func:`numpy.einsum_path` format. + - ``opt_info`` : An object of type :class:`OptimizerInfo` containing information about the best contraction order. + + Notes: + + - If the path is provided, the user has to set the sliced modes too if slicing is desired. + - If the path or sliced modes are provided, the metrics in :class:`OptimizerInfo` may not be correct. + """ + + optimize = utils.check_or_create_options(configuration.OptimizerOptions, optimize, "path optimizer options") + + if self.optimizer_config_ptr is None: + self.optimizer_config_ptr = cutn.create_contraction_optimizer_config(self.handle) + if self.optimizer_info_ptr is None: + self.optimizer_info_ptr = cutn.create_contraction_optimizer_info(self.handle, self.network) + + opt_info_ifc = optimizer_ifc.OptimizerInfoInterface(self) + + # Compute path (or set provided path) + if isinstance(optimize.path, configuration.PathFinderOptions): + # Set optimizer options. + self._set_optimizer_options(optimize) + # Find "optimal" path. 
+ self.logger.info("Finding optimal path as well as sliced modes...") + cutn.contraction_optimize(self.handle, self.network, self.optimizer_config_ptr, self.memory_limit, self.optimizer_info_ptr) + self.logger.info("Finished finding optimal path as well as sliced modes.") + else: + self.logger.info("Setting user-provided path...") + opt_info_ifc.path = optimize.path + self.logger.info("Finished setting user-provided path.") + + # Set slicing if provided + if not isinstance(optimize.slicing, configuration.SlicerOptions): + self.logger.info("Setting user-provided sliced modes...") + opt_info_ifc.sliced_mode_extent = optimize.slicing + self.logger.info("Finished setting user-provided sliced modes.") + + self.num_slices = opt_info_ifc.num_slices + assert self.num_slices > 0 + + # Create OptimizerInfo object here + largest_intermediate = opt_info_ifc.largest_intermediate + opt_cost = opt_info_ifc.flop_count + path = opt_info_ifc.path + slices = opt_info_ifc.sliced_mode_extent + + opt_info = configuration.OptimizerInfo(largest_intermediate, opt_cost, path, slices) + + self.optimized = True + + # Allocate workspace + self._allocate_workspace() + + # Create plan + self._create_plan() + + return opt_info.path, opt_info + + def _set_autotune_options(self, options): + """ + Set ContractionAutotunePreference options if the value is not None. + + Args: + options: dict of name : (enum, value) AutotunePreference parameters. + """ + for name in options: + enum, value = options[name] + if value is None: + continue + + self._set_autotune_option(name, enum, value) + + def _set_autotune_option(self, name, enum, value): + """ + Set a single ContractionAutotunePreference option if the value is not None. + + Args: + name: The name of the attribute. + enum: A ContractionAutotunePreferenceAttribute to set. + value: The value to which the attribute is set to. + """ + if value is None: + return + + dtype = cutn.contraction_autotune_preference_get_attribute_dtype(enum) + value = np.array((value,), dtype=dtype) + cutn.contraction_autotune_preference_set_attribute(self.handle, self.autotune_pref_ptr, enum, value.ctypes.data, value.dtype.itemsize) + self.logger.info(f"The autotune preference '{name}' has been set to {value[0]}.") + + @utils.precondition(_check_valid_network) + @utils.precondition(_check_optimized, "Autotuning") + def autotune(self, *, iterations=3, stream=None): + """Autotune the network to reduce the contraction cost. + + This is an optional step that is recommended if the :class:`Network` object is used to perform multiple contractions. + + Args: + iterations: The number of iterations for autotuning. See `CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS`. + stream: Provide the CUDA stream to use for the autotuning operation. Acceptable inputs include ``cudaStream_t`` + (as Python `int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, + the current stream will be used. + """ + + message = utils.check_autotune_params(iterations) + self.logger.info(message) + if self.autotune_pref_ptr is None: + self.autotune_pref_ptr = cutn.create_contraction_autotune_preference(self.handle) + + AutoEnum = cutn.ContractionAutotunePreferenceAttribute + options = {'iterations': (AutoEnum.MAX_ITERATIONS, iterations)} + self._set_autotune_options(options) + + # Check if we still hold an output tensor; if not, create a new one. 
+        if self.contraction is None:
+            self.contraction = utils.create_empty_tensor(self.output_class, self.extents_out, self.data_type, self.device_id)
+
+        self.logger.info("Starting autotuning...")
+        stream, stream_ptr = utils.get_or_create_stream(self.device, stream)
+        with self.device:
+            start = stream.record()
+            cutn.contraction_autotune(self.handle, self.plan, self.operands_data, self.contraction.data_ptr,
+                                      self.workspace, self.workspace_size, self.autotune_pref_ptr, stream_ptr)
+            end = stream.record()
+            end.synchronize()
+            elapsed = cp.cuda.get_elapsed_time(start, end)
+
+        self.autotuned = True
+        self.logger.info(f"The autotuning took {elapsed:.3f} ms to complete.")
+
+    @utils.precondition(_check_valid_network)
+    def reset_operands(self, *operands):
+        """Reset the operands held by this :class:`Network` instance.
+
+        This method is not needed when the operands reside on the GPU and in-place operations are used to update
+        the operand values.
+
+        This method performs various checks on the new operands to make sure that:
+
+            - The shapes, strides, and data types match those of the old ones.
+            - If the input tensors are on the GPU, the device and alignments match.
+
+        Args:
+            operands: See :class:`Network`'s documentation.
+        """
+
+        if len(operands) != len(self.operands):
+            message = f"Mismatch in the number of operands ({len(operands)} provided, need {len(self.operands)})."
+            raise ValueError(message)
+
+        self.logger.info("Resetting operands...")
+        # First wrap the operands.
+        operands = tensor_wrapper.wrap_operands(operands)
+
+        utils.check_operands_match(self.operands, operands, 'dtype', "data type")
+        utils.check_operands_match(self.operands, operands, 'shape', 'shape')
+        utils.check_operands_match(self.operands, operands, 'strides', 'strides')
+
+        device_id = utils.get_network_device_id(operands)
+        if device_id is None:
+            # Copy to the existing device pointers because the new operands are on the CPU.
+            tensor_wrapper.copy_(operands, self.operands)
+        else:
+            if self.device_id != device_id:
+                raise ValueError(f"The new operands must be on the same device ({self.device_id}) as the original "
+                                 f"operands, but they are on device {device_id}.")
+
+            _, orig_alignments = utils.get_operands_data(self.operands)
+            new_operands_data, new_alignments = utils.get_operands_data(operands)
+            utils.check_alignments_match(orig_alignments, new_alignments)
+
+            # Finally, replace the original data pointers by the new ones.
+            self.operands_data = new_operands_data
+        self.logger.info("The operands have been reset.")
+
+    @utils.precondition(_check_valid_network)
+    @utils.precondition(_check_optimized, "Contraction")
+    def contract(self, *, stream=None):
+        """Contract the network and return the result.
+
+        Args:
+            stream: Provide the CUDA stream to use for the contraction operation. Acceptable inputs include ``cudaStream_t``
+                (as Python `int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided,
+                the current stream will be used.
+
+        Returns:
+            The output tensor, which is of the same type and on the same device as the operands.
+        """
+
+        self.logger.info("Starting network contraction...")
+        stream, stream_ptr = utils.get_or_create_stream(self.device, stream)
+
+        # Check if we still hold an output tensor; if not, create a new one.
+        if self.contraction is None:
+            self.contraction = utils.create_empty_tensor(self.output_class, self.extents_out, self.data_type, self.device_id)
+
+        with self.device:
+            start = stream.record()
+            for s in range(self.num_slices):
+                cutn.contraction(self.handle, self.plan, self.operands_data, self.contraction.data_ptr, self.workspace,
+                                 self.workspace_size, s, stream_ptr)
+            end = stream.record()
+            end.synchronize()
+            elapsed = cp.cuda.get_elapsed_time(start, end)
+
+        self.logger.info(f"The contraction took {elapsed:.3f} ms to complete.")
+
+        if self.network_location == 'cpu':
+            out = self.contraction.to('cpu')
+        else:
+            out = self.contraction.tensor
+        self.contraction = None    # we cannot overwrite what we have already handed to users
+        return out
+
+    def free(self):
+        """Free network resources.
+
+        It is recommended that the :class:`Network` object be used within a context manager, but if that is not possible
+        then this method must be called explicitly to ensure that the network resources are properly cleaned up.
+        """
+
+        if not self.valid_state:
+            return
+
+        try:
+            self._free_path_resources()
+
+            if self.autotune_pref_ptr is not None:
+                cutn.destroy_contraction_autotune_preference(self.autotune_pref_ptr)
+                self.autotune_pref_ptr = None
+
+            if self.network is not None:
+                cutn.destroy_network_descriptor(self.network)
+                self.network = None
+
+            if self.handle is not None and self.own_handle:
+                cutn.destroy(self.handle)
+                self.handle = None
+                self.own_handle = False
+        except BaseException as e:
+            self.logger.critical("Internal error: only part of the network resources have been released.")
+            self.logger.critical(e)
+            raise e
+        finally:
+            self.valid_state = False
+
+        self.logger.info("The network resources have been released.")
+
+
+def contract(*operands, options=None, optimize=None, stream=None, return_info=False):
+    """
+    contract(subscripts, *operands, options=None, optimize=None, stream=None, return_info=False)
+
+    Evaluate the Einstein summation convention on the operands.
+
+    Explicit as well as implicit form is supported for the Einstein summation expression. In addition to the subscript format,
+    the "interleaved" format is also supported as a means of specifying the operands and their modes. See :class:`Network` for more
+    detail on the types of operands as well as for examples.
+
+    Args:
+        subscripts : The modes (subscripts) for summation as a comma-separated list of characters. Unicode characters are
+            allowed in the expression, thereby expanding the size of the tensor network that can be specified using the
+            Einstein summation convention.
+        operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`,
+            :class:`cupy.ndarray`, and :class:`torch.Tensor`.
+        options : Specify options for the tensor network as a :class:`~cuquantum.NetworkOptions` object. Alternatively, a `dict`
+            containing the parameters for the ``NetworkOptions`` constructor can also be provided. If not specified,
+            the value will be set to the default-constructed ``NetworkOptions`` object.
+        optimize : This parameter specifies options for path optimization as an :class:`OptimizerOptions` object. Alternatively, a
+            dictionary containing the parameters for the ``OptimizerOptions`` constructor can also be provided. If not
+            specified, the value will be set to the default-constructed ``OptimizerOptions`` object.
+        stream: Provide the CUDA stream to use for the contraction operation.
Acceptable inputs include ``cudaStream_t``
+            (as Python `int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided,
+            the current stream will be used.
+        return_info : If `True`, information about the best contraction order will also be returned.
+
+    Returns:
+        If ``return_info`` is `False`, the output tensor (an ndarray-like object of the same type and on the same device
+        as the operands) containing the result of the contraction; otherwise, a 2-tuple consisting of the output tensor
+        and a (``path``, ``opt_info``) pair, where ``opt_info`` is an :class:`OptimizerInfo` object that contains
+        information about the best contraction order etc.
+
+    .. note::
+        Users are encouraged to maintain the library handle themselves so as to reduce the context initialization time:
+
+        .. code-block:: python
+
+            from cuquantum import cutensornet, NetworkOptions, contract
+
+            handle = cutensornet.create()
+            network_opts = NetworkOptions(handle=handle, ...)
+            out = contract(..., options=network_opts, ...)
+            # ... the same handle can be reused for further calls ...
+            # when it's done, remember to destroy the handle
+            cutensornet.destroy(handle)
+
+    Examples:
+
+        Use NumPy operands:
+
+        >>> from cuquantum import contract
+        >>> import numpy as np
+        >>> a = np.ones((3,2))
+        >>> b = np.ones((2,3))
+
+        Perform matrix multiplication in the explicit form. The result ``r`` is a NumPy ndarray (with the computation
+        performed on the GPU):
+
+        >>> r = contract('ij,jk->ik', a, b)
+
+        Implicit form:
+
+        >>> r = contract('ij,jk', a, b)
+
+        Interleaved format using characters for modes:
+
+        >>> r = contract(a, ['i', 'j'], b, ['j', 'k'], ['i', 'k'])
+
+        Interleaved format using string labels for modes, using implicit form:
+
+        >>> r = contract(a, ['first', 'second'], b, ['second', 'third'])
+
+        Interleaved format using integer modes, using explicit form:
+
+        >>> r = contract(a, [1, 2], b, [2, 3], [1, 3])
+
+        Obtain information ``i`` on the best contraction path along with the result ``r``:
+
+        >>> r, i = contract('ij,jk', a, b, return_info=True)
+
+        Provide options for the tensor network:
+
+        >>> from cuquantum import NetworkOptions
+        >>> n = NetworkOptions(device_id=1)
+        >>> r = contract('ij,jk->ik', a, b, options=n)
+
+        Alternatively, the options can be provided as a dict instead of a :class:`NetworkOptions` object:
+
+        >>> r = contract('ij,jk->ik', a, b, options={'device_id': 1})
+
+        Specify options for the optimizer:
+
+        >>> from cuquantum import OptimizerOptions, PathFinderOptions
+        >>> p = PathFinderOptions(imbalance_factor=230, cutoff_size=8)
+        >>> o = OptimizerOptions(path=p, seed=123)
+        >>> r = contract('ij,jk,kl', a, b, a, optimize=o)
+
+        Alternatively, the options above can be provided as a dict:
+
+        >>> r = contract('ij,jk,kl', a, b, a, optimize={'path': {'imbalance_factor': 230, 'cutoff_size': 8}, 'seed': 123})
+
+        Specify the path directly:
+
+        >>> o = OptimizerOptions(path=[(0,2), (0,1)])
+        >>> r = contract('ij,jk,kl', a, b, a, optimize=o)
+
+        Use CuPy operands. The result ``r`` is a CuPy ndarray on the same device as the operands, and ``dev`` is any valid
+        device ID on your system that you wish to use to store the tensors and compute the contraction:
+
+        >>> import cupy
+        >>> dev = 0
+        >>> with cupy.cuda.Device(dev):
+        ...     a = cupy.ones((3,2))
+        ...     b = cupy.ones((2,3))
+        >>> r = contract('ij,jk', a, b)
+
+        Use PyTorch operands.
The result ``r`` is a PyTorch tensor on the same device (``dev``) as the operands: + + >>> import torch + >>> dev = 0 + >>> a = torch.ones((3,2), device=f'cuda:{dev}') + >>> b = torch.ones((2,3), device=f'cuda:{dev}') + >>> r = contract('ij,jk', a, b) + """ + + options = utils.check_or_create_options(configuration.NetworkOptions, options, "network options") + + optimize = utils.check_or_create_options(configuration.OptimizerOptions, optimize, "path optimizer options") + + # Create network + with Network(*operands, options=options) as network: + + # Compute path + opt_info = network.contract_path(optimize=optimize) + + # Skip autotuning since the network is contracted only once. + + # Contraction + output = network.contract(stream=stream) + + if return_info: + return output, opt_info + + return output + + +def contract_path(*operands, options=None, optimize=None): + """ + contract_path(subscripts, *operands, options=None, optimize=None) + + Evaluate the "best" contraction order by allowing the creation of intermediate tensors. + + Explicit as well as implicit form is supported for the Einstein summation expression. In addition to the subscript format, + the "interleaved" format is also supported as a means of specifying the operands and their modes. See :class:`Network` for more + detail on the types of operands as well as for examples. + + Args: + subscripts : The modes (subscripts) for summation as a comma-separated list of characters. Unicode characters are + allowed in the expression thereby expanding the size of the tensor network that can be specified using the + Einstein summation convention. + operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, + :class:`cupy.ndarray`, and :class:`torch.Tensor`. + options : Specify options for the tensor network as a :class:`~cuquantum.NetworkOptions` object. Alternatively, a `dict` + containing the parameters for the ``NetworkOptions`` constructor can also be provided. If not specified, + the value will be set to the default-constructed ``NetworkOptions`` object. + optimize : This parameter specifies options for path optimization as an :class:`OptimizerOptions` object. Alternatively, a + dictionary containing the parameters for the ``OptimizerOptions`` constructor can also be provided. If not + specified, the value will be set to the default-constructed ``OptimizerOptions`` object. + + Returns: + tuple: A 2-tuple (``path``, ``opt_info``): + + - ``path`` : A sequence of pairs of operand indices representing the best contraction order in the + :func:`numpy.einsum_path` format. + - ``opt_info`` : An object of type :class:`OptimizerInfo` containing information about the best contraction order. + + .. note:: + It is encouraged for users to maintain the library handle themselves so as to reduce the context initialization time: + + .. code-block:: python + + from cuquantum import cutensornet, NetworkOptions, contract_path + + handle = cutensornet.create() + network_opts = NetworkOptions(handle=handle, ...) + path, info = contract_path(..., options=network_opts, ...) + # ... the same handle can be reused for further calls ... 
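+            # e.g., path2, info2 = contract_path(..., options=network_opts, ...)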
+ # when it's done, remember to destroy the handle + cutensornet.destroy(handle) + + """ + + options = utils.check_or_create_options(configuration.NetworkOptions, options, "network options") + + optimize = utils.check_or_create_options(configuration.OptimizerOptions, optimize, "path optimizer options") + + # Create network + with Network(*operands, options=options) as network: + + # Compute path + path, opt_info = network.contract_path(optimize=optimize) + + return path, opt_info + + +def _check_einsum_options(out, dtype, order, casting, optimize): + """ + Check whether the options provided to the einsum function interface are supported. + """ + if out is not None: + message = f"value '{out}' for parameter 'out'." + raise NotImplementedError(message) + + if dtype is not None: + message = f"value '{dtype}' for parameter 'dtype'." + raise NotImplementedError(message) + + if order != 'K': + message = f"value '{order}' for parameter 'order'." + raise NotImplementedError(message) + + if casting.lower() != 'safe': + message = f"value '{casting}' for parameter 'casting'." + raise NotImplementedError(message) + + if optimize not in (True, False) and not isinstance(optimize, collections.abc.Sequence): + message = f"""value '{optimize}' for parameter 'optimize'. +Only True or False values are allowed. Alternatively an explicit contraction list from einsum_path +can be provided.""" + raise NotImplementedError(message) + + +def einsum(*operands, out=None, dtype=None, order='K', casting='safe', optimize=True): + """ + einsum(subscripts, *operands, out=None, dtype=None, order='K', casting='safe', optimize=True) + + A drop-in replacement of :func:`numpy.einsum` for computing the specified tensor contraction using cuTensorNet. + + Not all NumPy options are supported or even used. The :func:`contract` function provides an extensive set of options + specific to cuTensorNet and is recommended over this function. + + Explicit as well as implicit form is supported for the Einstein summation expression. In addition to the subscript format, + the "interleaved" format is also supported as a means of specifying the operands and their modes. See :class:`Network` for more + detail on the types of operands as well as for examples. + + Args: + subscripts : The modes (subscripts) for summation as a comma-separated list of characters. Unicode characters are + allowed in the expression thereby expanding the size of the tensor network that can be specified using the + Einstein summation convention. + operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, + :class:`cupy.ndarray`, and :class:`torch.Tensor`. + out : Not supported in this release. + dtype : Not supported in this release. + order : Not supported in this release. + casting : Not supported in this release. + optimize : This parameter specifies options for path optimization. The only values accepted by this interface are `True`, + `False`, or the contraction path specified in the :func:`numpy.einsum_path` format. + + Returns: + output: + A tensor (ndarray-like object) of the same type and on the same device as the operands containing the result of + the contraction. + """ + + _check_einsum_options(out, dtype, order, casting, optimize) + + # Create network + with Network(*operands) as network: + + if optimize is True: + # Compute path + network.contract_path() + else: + if optimize is False: + # Use canonical path. + path = [(0, 1)] * (network.num_inputs - 1) + else: + # Use specified path. 
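+                # (An explicit path follows the numpy.einsum_path convention of
+                #  operand-index pairs, e.g. [(0, 1), (0, 1)] for three operands.)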
+ path = optimize + + # Set path (path validation is done when setting OptimizerOptions). + optimize = configuration.OptimizerOptions(path=path) + network.contract_path(optimize=optimize) + + # Skip autotuning since the network is contracted only once. + + # Contraction + output = network.contract() + + return output + + +def einsum_path(*operands, optimize=True): + """ + einsum_path(subscripts, *operands, optimize=True) + + A drop-in replacement of :func:`numpy.einsum_path` for evaluating the "best" contraction order using cuTensorNet. + + Only a subset of the NumPy options is supported using this interface. The :func:`contract_path` function provides an + extensive set of options specific to cuTensorNet and is recommended over this function. + + Explicit as well as implicit form is supported for the Einstein summation expression. In addition to the subscript format, + the "interleaved" format is also supported as a means of specifying the operands and their modes. See `Network` for more + detail on the types of operands as well as for examples. + + Args: + subscripts : The modes (subscripts) for summation as a comma-separated list of characters. Unicode characters are + allowed in the expression thereby expanding the size of the tensor network that can be specified using the + Einstein summation convention. + operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, + :class:`cupy.ndarray`, and :class:`torch.Tensor`. + optimize : This parameter specifies options for path optimization. The only value allowed with this interface is `True`. + + Returns: + tuple: A 2-tuple (``path``, ``opt_info``): + + - ``path`` : A sequence of pairs of operand indices representing the best contraction order in the + :func:`numpy.einsum_path` format. + - ``opt_info`` : An object of type :class:`OptimizerInfo` containing information about the best contraction order. + """ + + if optimize is not True: + message = f"""Invalid value for parameter 'optimize'. +The only allowed value for 'optimize' is True.""" + raise NotImplementedError(message) + + # Create network + with Network(*operands) as network: + + # Compute path + path, opt_info = network.contract_path() + + return path, opt_info diff --git a/python/cuquantum/utils.pxd b/python/cuquantum/utils.pxd new file mode 100644 index 0000000..e5a4b7b --- /dev/null +++ b/python/cuquantum/utils.pxd @@ -0,0 +1,12 @@ +cimport cpython + + +cdef inline bint is_nested_sequence(data): + if not cpython.PySequence_Check(data): + return False + else: + for i in data: + if not cpython.PySequence_Check(i): + return False + else: + return True diff --git a/python/cuquantum/utils.py b/python/cuquantum/utils.py new file mode 100644 index 0000000..655f925 --- /dev/null +++ b/python/cuquantum/utils.py @@ -0,0 +1,58 @@ +from enum import IntEnum + + +# The (subset of) compute types below are shared by cuStateVec and cuTensorNet +class ComputeType(IntEnum): + """An enumeration of CUDA compute types.""" + COMPUTE_DEFAULT = 0 + COMPUTE_16F = 1 << 0 + COMPUTE_32F = 1 << 2 + COMPUTE_64F = 1 << 4 + COMPUTE_8U = 1 << 6 + COMPUTE_8I = 1 << 8 + COMPUTE_32U = 1 << 7 + COMPUTE_32I = 1 << 9 + COMPUTE_16BF = 1 << 10 + COMPUTE_TF32 = 1 << 12 + + +# TODO: use those exposed by CUDA Python instead, but before removing these +# duplicates, check if they are fixed to inherit IntEnum instead of Enum. 
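+# Note: the numeric values below mirror the cudaDataType_t enumerators in the
+# CUDA headers and must be kept in sync with them.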
+class cudaDataType(IntEnum): + """An enumeration of `cudaDataType_t`.""" + CUDA_R_16F = 2 + CUDA_C_16F = 6 + CUDA_R_16BF = 14 + CUDA_C_16BF = 15 + CUDA_R_32F = 0 + CUDA_C_32F = 4 + CUDA_R_64F = 1 + CUDA_C_64F = 5 + CUDA_R_4I = 16 + CUDA_C_4I = 17 + CUDA_R_4U = 18 + CUDA_C_4U = 19 + CUDA_R_8I = 3 + CUDA_C_8I = 7 + CUDA_R_8U = 8 + CUDA_C_8U = 9 + CUDA_R_16I = 20 + CUDA_C_16I = 21 + CUDA_R_16U = 22 + CUDA_C_16U = 23 + CUDA_R_32I = 10 + CUDA_C_32I = 11 + CUDA_R_32U = 12 + CUDA_C_32U = 13 + CUDA_R_64I = 24 + CUDA_C_64I = 25 + CUDA_R_64U = 26 + CUDA_C_64U = 27 + +class libraryPropertyType(IntEnum): + """An enumeration of library version information.""" + MAJOR_VERSION = 0 + MINOR_VERSION = 1 + PATCH_LEVEL = 2 + +del IntEnum diff --git a/python/samples/accessor_get.py b/python/samples/accessor_get.py new file mode 100644 index 0000000..229f508 --- /dev/null +++ b/python/samples/accessor_get.py @@ -0,0 +1,57 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = (1 << nIndexBits) + +bitOrdering = (2, 1) +maskBitString = (1,) +maskOrdering = (0,) +assert len(maskBitString) == len(maskOrdering) +maskLen = len(maskBitString) + +bufferSize = 3 +accessBegin = 1 +accessEnd = 4 + +d_sv = cp.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) +h_buf = np.empty(bufferSize, dtype=np.complex64) +h_buf_res = np.asarray([0.3+0.3j, 0.1+0.2j, 0.4+0.5j], dtype=np.complex64) + +#################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# create accessor and check the size of external workspace +accessor, workspace_size = cusv.accessor_create_readonly( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, bitOrdering, len(bitOrdering), + maskBitString, maskOrdering, maskLen) + +if workspace_size > 0: + workspace = cp.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr +else: + workspace_ptr = 0 + +# set external workspace +cusv.accessor_set_extra_workspace( + handle, accessor, workspace_ptr, workspace_size) + +# get state vector components +cusv.accessor_get( + handle, accessor, h_buf.ctypes.data, accessBegin, accessEnd) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(h_buf, h_buf_res): + raise ValueError("results mismatch") +else: + print("test passed") diff --git a/python/samples/accessor_set.py b/python/samples/accessor_set.py new file mode 100644 index 0000000..95ba1ff --- /dev/null +++ b/python/samples/accessor_set.py @@ -0,0 +1,51 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = (1 << nIndexBits) + +bitOrdering = (1, 2, 0) +maskLen = 0 + +d_sv = cp.zeros(nSvSize, dtype=np.complex64) +d_sv_res = cp.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) +h_buf = np.asarray([0.0+0.0j, 0.1+0.1j, 0.2+0.2j, 0.3+0.4j, + 0.0+0.1j, 0.1+0.2j, 0.3+0.3j, 0.4+0.5j], dtype=np.complex64) + +#################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# create accessor and check the size of external workspace +accessor, workspace_size = cusv.accessor_create( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, bitOrdering, len(bitOrdering), + 0, 0, maskLen) + +if workspace_size > 0: + workspace 
= cp.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr +else: + workspace_ptr = 0 + +# set external workspace +cusv.accessor_set_extra_workspace( + handle, accessor, workspace_ptr, workspace_size) + +# set state vector components +cusv.accessor_set( + handle, accessor, h_buf.ctypes.data, 0, nSvSize) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(d_sv, d_sv_res): + raise ValueError("results mismatch") +else: + print("test passed") diff --git a/python/samples/batch_measure.py b/python/samples/batch_measure.py new file mode 100644 index 0000000..3f47075 --- /dev/null +++ b/python/samples/batch_measure.py @@ -0,0 +1,45 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = (1 << nIndexBits) +bitStringLen = 3 +bitOrdering = np.asarray([2, 1, 0], dtype=np.int32) + +# In real appliction, random number in range [0, 1) will be used. +randnum = 0.5 + +h_sv = np.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) +d_sv = cp.asarray(h_sv) + +expected_sv = np.asarray([0.0+0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j, + 0.0+0.0j, 0.0+0.0j, 0.6+0.8j, 0.0+0.0j], dtype=np.complex64) +expected_bitString = np.asarray([1, 1, 0], dtype=np.int32) + +################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# allocate host memory to hold the result +bitString = np.empty((bitStringLen,), dtype=np.int32) + +# batch measurement +cusv.batch_measure( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, bitString.ctypes.data, + bitOrdering.ctypes.data, bitStringLen, randnum, cusv.Collapse.NORMALIZE_AND_ZERO) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(expected_sv, d_sv): + raise ValueError("results mismatch") +if not np.allclose(expected_bitString, bitString): + raise ValueError("results mismatch") +print("test passed") diff --git a/python/samples/coarse/example1.py b/python/samples/coarse/example1.py new file mode 100644 index 0000000..dc18dd0 --- /dev/null +++ b/python/samples/coarse/example1.py @@ -0,0 +1,16 @@ +""" +Example using NumPy ndarrays with explicit Einstein summation. + +The contraction result is also a NumPy ndarray. +""" +import numpy as np + +from cuquantum import contract + + +a = np.ones((3,2)) +b = np.ones((2,3)) + +r = contract("ij,jk->ik", a, b) +print(r) + diff --git a/python/samples/coarse/example10.py b/python/samples/coarse/example10.py new file mode 100644 index 0000000..3e762df --- /dev/null +++ b/python/samples/coarse/example10.py @@ -0,0 +1,20 @@ +""" +Example using PyTorch tensors. + +The contraction result is also a PyTorch tensor on the same device. +""" +import torch + +from cuquantum import contract, OptimizerOptions + + +# dev can be any valid device ID on your system, here let's +# pick the first device +dev = 0 +a = torch.ones((3,2), device=f'cuda:{dev}') +b = torch.ones((2,3), device=f'cuda:{dev}') + +r = contract("ij,jk", a, b) +print(f"result type = {type(r)}") +print(f"result device = {r.device}") +print(r) diff --git a/python/samples/coarse/example11.py b/python/samples/coarse/example11.py new file mode 100644 index 0000000..0687bb5 --- /dev/null +++ b/python/samples/coarse/example11.py @@ -0,0 +1,19 @@ +""" +Example using NumPy ndarrays. Return contraction path and optimizer information. + +The contraction result is also a NumPy ndarray. 
+""" +import numpy as np + +from cuquantum import contract + + +a = np.ones((3,2)) +b = np.ones((2,8)) +c = np.ones((8,3)) + +r, (p, i) = contract("ij,jk,kl->il", a, b, c, return_info=True) +print(f"path = {p}") +print(f"optimizer information = {i}") +print(r) + diff --git a/python/samples/coarse/example2.py b/python/samples/coarse/example2.py new file mode 100644 index 0000000..5efbf5b --- /dev/null +++ b/python/samples/coarse/example2.py @@ -0,0 +1,16 @@ +""" +Example using NumPy ndarrays with implicit Einstein summation. + +The contraction result is also a NumPy ndarray. +""" +import numpy as np + +from cuquantum import contract + + +a = np.ones((3,2)) +b = np.ones((2,3)) + +r = contract("ij,jk", a, b) +print(r) + diff --git a/python/samples/coarse/example3.py b/python/samples/coarse/example3.py new file mode 100644 index 0000000..b4cfa8f --- /dev/null +++ b/python/samples/coarse/example3.py @@ -0,0 +1,16 @@ +""" +Example using NumPy ndarrays with explicit Einstein summation (Unicode characters). + +The contraction result is also a NumPy ndarray. +""" +import numpy as np + +from cuquantum import contract + + +a = np.ones((3,2)) +b = np.ones((2,3)) + +r = contract("αβ,βγ->αγ", a, b) +print(r) + diff --git a/python/samples/coarse/example4.py b/python/samples/coarse/example4.py new file mode 100644 index 0000000..3f69c5e --- /dev/null +++ b/python/samples/coarse/example4.py @@ -0,0 +1,16 @@ +""" +Example using NumPy ndarrays with interleaved format (explicit form for output indices). + +The contraction result is also a NumPy ndarray. +""" +import numpy as np + +from cuquantum import contract + + +a = np.ones((3,2)) +b = np.ones((2,3)) + +r = contract(a, ['first', 'second'], b, ['second', 'third'], ['first', 'third']) +print(r) + diff --git a/python/samples/coarse/example5.py b/python/samples/coarse/example5.py new file mode 100644 index 0000000..1b85b7e --- /dev/null +++ b/python/samples/coarse/example5.py @@ -0,0 +1,20 @@ +""" +Example using NumPy ndarrays. Specify network options. + +The contraction result is also a NumPy ndarray. +""" +import numpy as np + +from cuquantum import contract, NetworkOptions + + +a = np.ones((3,2)) +b = np.ones((2,3)) + +o = NetworkOptions(memory_limit="10kb") # As a value with units. +o = NetworkOptions(memory_limit=12345) # As a number of bytes (int or float). +o = NetworkOptions(memory_limit="10%") # As a percentage of device memory. + +r = contract("ij,jk", a, b, options=o) +print(r) + diff --git a/python/samples/coarse/example6.py b/python/samples/coarse/example6.py new file mode 100644 index 0000000..cd84178 --- /dev/null +++ b/python/samples/coarse/example6.py @@ -0,0 +1,18 @@ +""" +Example using NumPy ndarrays. Specify CUDA stream for the computation. + +The contraction result is also a NumPy ndarray. +""" +import cupy as cp +import numpy as np + +from cuquantum import contract, NetworkOptions + + +a = np.ones((3,2)) +b = np.ones((2,3)) + +s = cp.cuda.Stream() +r = contract("αβ,βγ->αγ", a, b, stream=s) +print(r) + diff --git a/python/samples/coarse/example7.py b/python/samples/coarse/example7.py new file mode 100644 index 0000000..f6ef22d --- /dev/null +++ b/python/samples/coarse/example7.py @@ -0,0 +1,19 @@ +""" +Example using NumPy ndarrays. Specify logging options. + +The contraction result is also a NumPy ndarray. 
+""" +import logging + +import numpy as np + +from cuquantum import contract, NetworkOptions + + +a = np.ones((3,2)) +b = np.ones((2,3)) + +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S') +r = contract("ij,jk", a, b) +print(r) + diff --git a/python/samples/coarse/example8.py b/python/samples/coarse/example8.py new file mode 100644 index 0000000..0e6044b --- /dev/null +++ b/python/samples/coarse/example8.py @@ -0,0 +1,18 @@ +""" +Example using NumPy ndarrays. Provide contraction path. + +The contraction result is also a NumPy ndarray. +""" +import numpy as np + +from cuquantum import contract, OptimizerOptions + + +a = np.ones((3,2)) +b = np.ones((2,3)) +c = np.ones((3,3)) + +o = OptimizerOptions(path=[(0,2), (0,1)]) +r = contract("ij,jk,kl->il", a, b, c, optimize=o) +print(r) + diff --git a/python/samples/coarse/example9.py b/python/samples/coarse/example9.py new file mode 100644 index 0000000..dab79ab --- /dev/null +++ b/python/samples/coarse/example9.py @@ -0,0 +1,21 @@ +""" +Example using CuPy ndarrays. + +The contraction result is also a CuPy ndarray on the same device. +""" +import cupy as cp + +from cuquantum import contract, OptimizerOptions + + +# dev can be any valid device ID on your system, here let's +# pick the first device +dev = 0 +with cp.cuda.Device(dev): + a = cp.ones((3,2)) + b = cp.ones((2,3)) + +r = contract("ij,jk", a, b) +print(f"result type = {type(r)}") +print(f"result device = {r.device}") +print(r) diff --git a/python/samples/diagonal_matrix.py b/python/samples/diagonal_matrix.py new file mode 100644 index 0000000..46132ce --- /dev/null +++ b/python/samples/diagonal_matrix.py @@ -0,0 +1,51 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = (1 << nIndexBits) +nBasisBits = 1 +maskLen = 0 +adjoint = 0 + +basisBits = [2] + +d_sv = cp.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) +d_sv_res = cp.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2-0.2j, 0.3-0.3j, 0.4-0.3j, 0.5-0.4j], dtype=np.complex64) +diagonals = np.asarray([1.0+0.0j, 0.0-1.0j], dtype=np.complex64) + +#################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# check the size of external workspace +workspaceSize = cusv.apply_generalized_permutation_matrix_buffer_size( + handle, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, 0, diagonals.ctypes.data, cuquantum.cudaDataType.CUDA_C_32F, + basisBits, nBasisBits, maskLen) +if workspaceSize > 0: + workspace = cp.cuda.memory.alloc(workspaceSize) + workspace_ptr = workspace.ptr +else: + workspace_ptr = 0 + +# apply matrix +cusv.apply_generalized_permutation_matrix( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, + 0, diagonals.ctypes.data, cuquantum.cudaDataType.CUDA_C_32F, adjoint, + basisBits, nBasisBits, 0, 0, maskLen, + workspace_ptr, workspaceSize) + +# destroy handle +cusv.destroy(handle) + +# check result +if not np.allclose(d_sv, d_sv_res): + raise ValueError("results mismatch") +else: + print("test passed") diff --git a/python/samples/expectation.py b/python/samples/expectation.py new file mode 100644 index 0000000..1d51939 --- /dev/null +++ b/python/samples/expectation.py @@ -0,0 +1,61 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = 
(1 << nIndexBits) +nBasisBits = 1 + +basisBits = np.asarray([1], dtype=np.int32) + +h_sv = np.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) +d_sv = cp.asarray(h_sv) + +# the gate matrix can live on either host (np) or device (cp) +matrix = cp.asarray([1.0+0.0j, 2.0+1.0j, 2.0-1.0j, 3.0+0.0j], dtype=np.complex64) +if isinstance(matrix, cp.ndarray): + matrix_ptr = matrix.data.ptr +elif isinstance(matrix, np.ndarray): + matrix_ptr = matrix.ctypes.data +else: + raise ValueError + +# expectation values must stay on host +expect = np.empty((2,), dtype=np.float64) +expect_expected = np.asarray([4.1, 0.0], dtype=np.float64) + +#################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# check the size of external workspace +workspaceSize = cusv.expectation_buffer_size( + handle, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, matrix_ptr, cuquantum.cudaDataType.CUDA_C_32F, + cusv.MatrixLayout.ROW, nBasisBits, cuquantum.ComputeType.COMPUTE_32F) +if workspaceSize > 0: + workspace = cp.cuda.memory.alloc(workspaceSize) + workspace_ptr = workspace.ptr +else: + workspace_ptr = 0 + +# apply gate +cusv.expectation( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, + expect.ctypes.data, cuquantum.cudaDataType.CUDA_C_64F, + matrix_ptr, cuquantum.cudaDataType.CUDA_C_32F, cusv.MatrixLayout.ROW, + basisBits.ctypes.data, nBasisBits, + cuquantum.ComputeType.COMPUTE_32F, workspace_ptr, workspaceSize) + +# destroy handle +cusv.destroy(handle) + +# check result +if not np.allclose(expect, expect_expected, atol=1E-6): + raise ValueError("results mismatch") +else: + print("test passed") diff --git a/python/samples/expectation_pauli.py b/python/samples/expectation_pauli.py new file mode 100644 index 0000000..bdd4c1a --- /dev/null +++ b/python/samples/expectation_pauli.py @@ -0,0 +1,37 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = (1 << nIndexBits) +paulis = [[cusv.Pauli.I], [cusv.Pauli.X, cusv.Pauli.Y]] +basisBits = [[1], [1, 2]] +nBasisBits = [len(arr) for arr in basisBits] + +exp_values = np.empty(len(paulis), dtype=np.float64) +expected = np.asarray([1.0, -0.14], dtype=np.float64) + +d_sv = cp.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) + +#################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# apply Pauli operator +cusv.expectations_on_pauli_basis( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, exp_values.ctypes.data, + paulis, basisBits, nBasisBits, len(paulis)) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(expected, exp_values): + raise ValueError("results mismatch") +else: + print("test passed") diff --git a/python/samples/exponential_pauli.py b/python/samples/exponential_pauli.py new file mode 100644 index 0000000..47bbf3b --- /dev/null +++ b/python/samples/exponential_pauli.py @@ -0,0 +1,41 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = (1 << nIndexBits) +nTargets = 1 +nControls = 1 + +targets = np.asarray([2], dtype=np.int32) +controls = np.asarray([1], dtype=np.int32) +controlBitValues = np.asarray([1], dtype=np.int32) +paulis = 
np.asarray([cusv.Pauli.Z], dtype=np.int32) + +h_sv = np.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) +expected = np.asarray([0.0+0.0j, 0.0+0.1j,-0.1+0.1j,-0.2+0.1j, + 0.2+0.2j, 0.3+0.3j, 0.4-0.3j, 0.5-0.4j], dtype=np.complex64) +d_sv = cp.asarray(h_sv) + +#################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# apply Pauli operator +cusv.apply_exp( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, np.pi/2, paulis.ctypes.data, + targets.ctypes.data, nTargets, controls.ctypes.data, controlBitValues.ctypes.data, nControls) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(expected, d_sv): + raise ValueError("results mismatch") +else: + print("test passed") diff --git a/python/samples/fine/example1.py b/python/samples/fine/example1.py new file mode 100644 index 0000000..93c43b3 --- /dev/null +++ b/python/samples/fine/example1.py @@ -0,0 +1,59 @@ +""" +Example using operations on the Network object with torch tensors. This can be used to +amortize the cost of finding the best contraction path and autotuning the network across +multiple contractions. + +The contraction result is also a torch tensor on the same device as the operands. +""" +import torch + +from cuquantum import Network + + +# The parameters of the tensor network. +expr = 'ehl,gj,edhg,bif,d,c,k,iklj,cf,a->ba' +shapes = [(8, 2, 5), (5, 7), (8, 8, 2, 5), (8, 6, 3), (8,), (6,), (5,), (6, 5, 5, 7), (6, 3), (3,)] + +device = 'cuda' +# Create torch tensors. +operands = [torch.rand(*shape, dtype=torch.float64, device=device) for shape in shapes] + +# Create the network. +with Network(expr, *operands) as n: + + # Find the contraction path. + path, info = n.contract_path({'samples': 500}) + + # Autotune the network. + n.autotune(iterations=5) + + # Perform the contraction. + r1 = n.contract() + print("Contract the network (r1):") + print(r1) + + # Create new operands. + operands = [i*operand for i, operand in enumerate(operands, start=1)] + + # Reset the network operands. + n.reset_operands(*operands) + + # Perform the contraction with the new operands. + print("Reset the operands and perform the contraction (r2):") + r2 = n.contract() + print(r2) + + from math import factorial + print(f"Is r2 the expected result?: {torch.allclose(r2, factorial(len(operands))*r1)}") + + # The operands can also be updated using in-place operations if they are on the GPU. + for i, operand in enumerate(operands, start=1): + operand /= i + + #The operands don't have to be reset for in-place operations. Perform the contraction. + print("Reset the operands in-place and perform the contraction (r3):") + r3 = n.contract() + print(r3) + print(f"Is r3 the expected result?: {torch.allclose(r3, r1)}") + +# The context manages the network resources, so n.free() doesn't have to be called. 
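+#
+# As a sketch (using the same expr/operands as above), the equivalent
+# explicit-cleanup pattern without a context manager would be:
+#
+#   n = Network(expr, *operands)
+#   try:
+#       path, info = n.contract_path({'samples': 500})
+#       r = n.contract()
+#   finally:
+#       n.free()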
diff --git a/python/samples/gate_application.py b/python/samples/gate_application.py new file mode 100644 index 0000000..08e65b9 --- /dev/null +++ b/python/samples/gate_application.py @@ -0,0 +1,61 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = (1 << nIndexBits) +nTargets = 1 +nControls = 2 +adjoint = 0 + +targets = np.asarray([2], dtype=np.int32) +controls = np.asarray([0, 1], dtype=np.int32) + +h_sv = np.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) +expected = np.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.4+0.5j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.1+0.2j], dtype=np.complex64) + +# the gate matrix can live on either host (np) or device (cp) +matrix = cp.asarray([0.0+0.0j, 1.0+0.0j, 1.0+0.0j, 0.0+0.0j], dtype=np.complex64) +if isinstance(matrix, cp.ndarray): + matrix_ptr = matrix.data.ptr +elif isinstance(matrix, np.ndarray): + matrix_ptr = matrix.ctypes.data +else: + raise ValueError + +d_sv = cp.asarray(h_sv) + +#################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() +workspaceSize = cusv.apply_matrix_buffer_size( + handle, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, matrix_ptr, cuquantum.cudaDataType.CUDA_C_32F, + cusv.MatrixLayout.ROW, adjoint, nTargets, nControls, cuquantum.ComputeType.COMPUTE_32F) + +# check the size of external workspace +if workspaceSize > 0: + workspace = cp.cuda.memory.alloc(workspaceSize) + workspace_ptr = workspace.ptr +else: + workspace_ptr = 0 + +# apply gate +cusv.apply_matrix( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, matrix_ptr, cuquantum.cudaDataType.CUDA_C_32F, + cusv.MatrixLayout.ROW, adjoint, targets.ctypes.data, nTargets, controls.ctypes.data, nControls, + 0, cuquantum.ComputeType.COMPUTE_32F, workspace_ptr, workspaceSize) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(expected, d_sv): + raise ValueError("results mismatch") +else: + print("test passed") diff --git a/python/samples/measure_zbasis.py b/python/samples/measure_zbasis.py new file mode 100644 index 0000000..e3a970d --- /dev/null +++ b/python/samples/measure_zbasis.py @@ -0,0 +1,42 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = (1 << nIndexBits) +nBasisBits = 3 +basisBits = np.asarray([0, 1, 2], dtype=np.int32) + +# In real appliction, random number in range [0, 1) will be used. 
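+# (e.g., randnum = np.random.random())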
+randnum = 0.2 + +h_sv = np.asarray([0.0+0.0j, 0.0+0.1j, 0.3+0.4j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.1+0.1j, 0.4+0.5j], dtype=np.complex64) +d_sv = cp.asarray(h_sv) + +expected_sv = np.asarray([0.0+0.0j, 0.0+0.0j, 0.0+0.0j, 0.2+0.4j, + 0.0+0.0j, 0.6+0.6j, 0.2+0.2j, 0.0+0.0j], dtype=np.complex64) +expected_parity = 0 + +################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# measurement on z basis +parity = cusv.measure_on_z_basis( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, + basisBits.ctypes.data, nBasisBits, randnum, cusv.Collapse.NORMALIZE_AND_ZERO) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(expected_sv, d_sv): + raise ValueError("results mismatch") +if expected_parity != parity: + raise ValueError("results mismatch") +print("test passed") diff --git a/python/samples/permutation_matrix.py b/python/samples/permutation_matrix.py new file mode 100644 index 0000000..0486fb1 --- /dev/null +++ b/python/samples/permutation_matrix.py @@ -0,0 +1,54 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = (1 << nIndexBits) +nBasisBits = 2 +maskLen = 1 +adjoint = 0 + +basisBits = [0, 1] +maskOrdering = [2] +maskBitString = [1] +permutation = np.asarray([0, 2, 1, 3], dtype=np.int64) + +d_sv = cp.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) +d_sv_res = cp.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, -0.4+0.3j, -0.3+0.3j, 0.4+0.5j], dtype=np.complex64) +diagonals = np.asarray([1.0+0.0j, 0.0+1.0j, 0.0+1.0j, 1.0+0.0j], dtype=np.complex64) + +#################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# check the size of external workspace +workspaceSize = cusv.apply_generalized_permutation_matrix_buffer_size( + handle, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, permutation.ctypes.data, diagonals.ctypes.data, + cuquantum.cudaDataType.CUDA_C_32F, basisBits, nBasisBits, maskLen) +if workspaceSize > 0: + workspace = cp.cuda.memory.alloc(workspaceSize) + workspace_ptr = workspace.ptr +else: + workspace_ptr = 0 + +# apply matrix +cusv.apply_generalized_permutation_matrix( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, + permutation.ctypes.data, diagonals.ctypes.data, cuquantum.cudaDataType.CUDA_C_32F, adjoint, + basisBits, nBasisBits, maskBitString, maskOrdering, maskLen, + workspace_ptr, workspaceSize) + +# destroy handle +cusv.destroy(handle) + +# check result +if not np.allclose(d_sv, d_sv_res): + raise ValueError("results mismatch") +else: + print("test passed") diff --git a/python/samples/sampler.py b/python/samples/sampler.py new file mode 100644 index 0000000..851121e --- /dev/null +++ b/python/samples/sampler.py @@ -0,0 +1,56 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import custatevec as cusv + + +nIndexBits = 3 +nSvSize = (1 << nIndexBits) +nMaxShots = 5 +nShots = 5 + +bitStringLen = 2; +bitOrdering = np.asarray([0, 1], dtype=np.int32) + +bitStrings = np.empty((nShots,), dtype=np.int64) +bitStrings_expected = np.asarray([0b00, 0b01, 0b10, 0b11, 0b11], dtype=np.int64) + +h_sv = np.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) + +d_sv = cp.asarray(h_sv) + +# In real 
appliction, random numbers in range [0, 1) will be used. +randnums = np.asarray([0.1, 0.8, 0.4, 0.6, 0.2], dtype=np.float64) + +######################################################################## + +# cuStateVec handle initialization +handle = cusv.create() + +# create sampler and check the size of external workspace +sampler, extraWorkspaceSizeInBytes = cusv.sampler_create( + handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, nMaxShots) + +# allocate external workspace +extraWorkspace = cp.cuda.alloc(extraWorkspaceSizeInBytes) + +# sample preprocess +cusv.sampler_preprocess( + handle, sampler, extraWorkspace.ptr, extraWorkspaceSizeInBytes) + +# sample bit strings +cusv.sampler_sample( + handle, sampler, bitStrings.ctypes.data, bitOrdering.ctypes.data, bitStringLen, + randnums.ctypes.data, nShots, cusv.SamplerOutput.ASCENDING_ORDER) + +# destroy sampler (only required in Python) +cusv.sampler_destroy(sampler) + +# destroy handle +cusv.destroy(handle) + +if not np.allclose(bitStrings, bitStrings_expected): + raise ValueError("results mismatch") +print("test passed") diff --git a/python/samples/tensornet_example.py b/python/samples/tensornet_example.py new file mode 100644 index 0000000..df928dc --- /dev/null +++ b/python/samples/tensornet_example.py @@ -0,0 +1,215 @@ +import numpy as np +import cupy as cp + +import cuquantum +from cuquantum import cutensornet as cutn + + +########################################################## +# Computing: D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y} +########################################################## + +print("Include headers and define data types") + +data_type = cuquantum.cudaDataType.CUDA_R_32F +compute_type = cuquantum.ComputeType.COMPUTE_32F +numInputs = 3 + +# Create an array of modes +modesA = [ord(c) for c in ('m','h','k','n')] +modesB = [ord(c) for c in ('u','k','h')] +modesC = [ord(c) for c in ('x','u','y')] +modesD = [ord(c) for c in ('m','x','n','y')] + +# Create an array of extents (shapes) for each tensor +extentA = (96, 64, 64, 96) +extentB = (96, 64, 64) +extentC = (64, 96, 64) +extentD = (96, 64, 96, 64) + +print("Define network, modes, and extents") + +############################ +# Allocate & initialize data +############################ + +A_d = cp.random.random((np.prod(extentA),), dtype=np.float32) +B_d = cp.random.random((np.prod(extentB),), dtype=np.float32) +C_d = cp.random.random((np.prod(extentC),), dtype=np.float32) +D_d = cp.empty((np.prod(extentD),), dtype=np.float32) +rawDataIn_d = (A_d.data.ptr, B_d.data.ptr, C_d.data.ptr) + +A = cp.asnumpy(A_d) +B = cp.asnumpy(B_d) +C = cp.asnumpy(C_d) +D = np.empty(D_d.shape, dtype=np.float32) + +#################### +# Allocate workspace +#################### + +dev = cp.cuda.Device() # get current device +freeMem, totalMem = dev.mem_info +worksize = int(freeMem * 0.5) +work = cp.cuda.alloc(worksize) + +print("Allocate memory for data and workspace, and initialize data.") + +############# +# cuTensorNet +############# + +stream = cp.cuda.Stream() +handle = cutn.create() + +nmodeA = len(modesA) +nmodeB = len(modesB) +nmodeC = len(modesC) +nmodeD = len(modesD) + +############################### +# Create Contraction Descriptor +############################### + +# These also work, but require a bit more keystrokes +#modesA = np.asarray(modesA, dtype=np.int32) +#modesB = np.asarray(modesB, dtype=np.int32) +#modesC = np.asarray(modesC, dtype=np.int32) +#modesIn = (modesA.ctypes.data, modesB.ctypes.data, modesC.ctypes.data) +#extentA = np.asarray(extentA, 
dtype=np.int64) +#extentB = np.asarray(extentB, dtype=np.int64) +#extentC = np.asarray(extentC, dtype=np.int64) +#extentsIn = (extentA.ctypes.data, extentB.ctypes.data, extentC.ctypes.data) + +modesIn = (modesA, modesB, modesC) +extentsIn = (extentA, extentB, extentC) +numModesIn = (nmodeA, nmodeB, nmodeC) + +# strides are optional; if no stride (0) is provided, then cuTensorNet assumes a generalized column-major data layout +stridesIn = (0, 0, 0) + +# compute the alignments +# we hard-code them here because CuPy arrays are at least 256B aligned +alignmentsIn = (256, 256, 256) +alignmentOut = 256 + +# setup tensor network +descNet = cutn.create_network_descriptor(handle, + numInputs, numModesIn, extentsIn, stridesIn, modesIn, alignmentsIn, # inputs + nmodeD, extentD, 0, modesD, alignmentOut, # output + data_type, compute_type) + +print("Initialize the cuTensorNet library and create a network descriptor.") + +############################################## +# Find "optimal" contraction order and slicing +############################################## + +optimizerConfig = cutn.create_contraction_optimizer_config(handle) + +# Set the value of the partitioner imbalance factor to 30 (if desired) +imbalance_dtype = cutn.contraction_optimizer_config_get_attribute_dtype( + cutn.ContractionOptimizerConfigAttribute.GRAPH_IMBALANCE_FACTOR) +imbalance_factor = np.asarray((30,), dtype=imbalance_dtype) +cutn.contraction_optimizer_config_set_attribute( + handle, optimizerConfig, cutn.ContractionOptimizerConfigAttribute.GRAPH_IMBALANCE_FACTOR, + imbalance_factor.ctypes.data, imbalance_factor.dtype.itemsize) + +optimizerInfo = cutn.create_contraction_optimizer_info(handle, descNet) + +cutn.contraction_optimize( + handle, descNet, optimizerConfig, worksize, optimizerInfo) + +numSlices_dtype = cutn.contraction_optimizer_info_get_attribute_dtype( + cutn.ContractionOptimizerInfoAttribute.NUM_SLICES) +numSlices = np.zeros((1,), dtype=numSlices_dtype) +cutn.contraction_optimizer_info_get_attribute( + handle, optimizerInfo, cutn.ContractionOptimizerInfoAttribute.NUM_SLICES, + numSlices.ctypes.data, numSlices.dtype.itemsize) +numSlices = int(numSlices) + +assert numSlices > 0 + +print("Find an optimized contraction path with cuTensorNet optimizer.") + +########################################################### +# Initialize all pair-wise contraction plans (for cuTENSOR) +########################################################### + +plan = cutn.create_contraction_plan( + handle, descNet, optimizerInfo, worksize) + +################################################################################### +# Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel +################################################################################### + +pref = cutn.create_contraction_autotune_preference(handle) + +# may be 0 +n_iter_dtype = cutn.contraction_autotune_preference_get_attribute_dtype( + cutn.ContractionAutotunePreferenceAttribute.MAX_ITERATIONS) +numAutotuningIterations = np.asarray([5], dtype=n_iter_dtype) +cutn.contraction_autotune_preference_set_attribute( + handle, pref, + cutn.ContractionAutotunePreferenceAttribute.MAX_ITERATIONS, + numAutotuningIterations.ctypes.data, numAutotuningIterations.dtype.itemsize) + +# modify the plan again to find the best pair-wise contractions +cutn.contraction_autotune( + handle, plan, rawDataIn_d, D_d.data.ptr, + work.ptr, worksize, pref, stream.ptr) + +cutn.destroy_contraction_autotune_preference(pref) + +print("Create a contraction plan for 
cuTENSOR and optionally auto-tune it.") + +##### +# Run +##### + +minTimeCUTENSOR = 1e100 +numRuns = 3 # to get stable perf results +e1 = cp.cuda.Event() +e2 = cp.cuda.Event() + +for i in range(numRuns): + # restore output + D_d.data.copy_from(D.ctypes.data, D.size * D.dtype.itemsize) + + # Contract over all slices. + # A user may choose to parallelize this loop across multiple devices. + for sliceId in range(numSlices): + e1.record() + cutn.contraction( + handle, plan, rawDataIn_d, D_d.data.ptr, + work.ptr, worksize, sliceId, stream.ptr) + e2.record() + + # Synchronize and measure timing + e2.synchronize() + time = cp.cuda.get_elapsed_time(e1, e2) / 1000 # ms -> s + minTimeCUTENSOR = minTimeCUTENSOR if minTimeCUTENSOR < time else time + +print("Contract the network; each slice uses the same contraction plan.") + +####################################################### + +flops_dtype = cutn.contraction_optimizer_info_get_attribute_dtype( + cutn.ContractionOptimizerInfoAttribute.FLOP_COUNT) +flops = np.zeros((1,), dtype=flops_dtype) +cutn.contraction_optimizer_info_get_attribute( + handle, optimizerInfo, cutn.ContractionOptimizerInfoAttribute.FLOP_COUNT, + flops.ctypes.data, flops.dtype.itemsize) +flops = float(flops) + +print(f"numSlices: {numSlices}") +print(f"{minTimeCUTENSOR * 1000} ms / slice") +print(f"{flops/1e9/minTimeCUTENSOR} GFLOPS/s") + +cutn.destroy_contraction_plan(plan) +cutn.destroy_contraction_optimizer_info(optimizerInfo) +cutn.destroy_contraction_optimizer_config(optimizerConfig) +cutn.destroy_network_descriptor(descNet) +cutn.destroy(handle) + +print("Free resources and exit.") diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 0000000..3b067d9 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,183 @@ +import os +import site +import subprocess +import sys + +from setuptools import setup, Extension, find_packages +from Cython.Build import cythonize + + +# search order: +# 1. installed "cuquantum" package +# 2. env var +for path in site.getsitepackages(): + path = os.path.join(path, 'cuquantum') + if os.path.isdir(path): + cuquantum_root = path + break +else: + cuquantum_root = os.environ.get('CUQUANTUM_ROOT') + + +# We allow setting CUSTATEVEC_ROOT and CUTENSORNET_ROOT separately for ease +# of development, but users are encouraged to either install cuquantum from PyPI +# or conda, or set CUQUANTUM_ROOT to the existing installation. +try: + custatevec_root = os.environ['CUSTATEVEC_ROOT'] +except KeyError as e: + if cuquantum_root is None: + raise RuntimeError('cuStateVec is not found, please install "cuquantum" ' + 'or set $CUQUANTUM_ROOT') from e + else: + custatevec_root = cuquantum_root +try: + cutensornet_root = os.environ['CUTENSORNET_ROOT'] +except KeyError as e: + if cuquantum_root is None: + raise RuntimeError('cuTensorNet is not found, please install "cuquantum" ' + 'or set $CUQUANTUM_ROOT') from e + else: + cutensornet_root = cuquantum_root + + +# search order: +# 1. installed "cutensor" package +# 2. env var +for path in site.getsitepackages(): + path = os.path.join(path, 'cutensor') + if os.path.isdir(path): + cutensor_root = path + break +else: + cutensor_root = os.environ.get('CUTENSOR_ROOT') +if cutensor_root is None: + raise RuntimeError('cuTENSOR is not found, please install "cutensor" ' + 'or set $CUTENSOR_ROOT') + + +# We can't assume users have the CUDA Toolkit (CTK) installed via pip, so we really need this... +# TODO(leofang): try /usr/local/cuda?
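+# (An illustrative sketch for the TODO above, kept commented out: one could fall back to the conventional install prefix when $CUDA_PATH is unset; the fallback location is an assumption about the local setup, not something this build does today.) +# cuda_path = os.environ.get('CUDA_PATH', '/usr/local/cuda') +# if not os.path.isdir(os.path.join(cuda_path, 'include')): +#     raise RuntimeError('CUDA is not found, please set $CUDA_PATH')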
+try: + cuda_path = os.environ['CUDA_PATH'] +except KeyError as e: + raise RuntimeError('CUDA is not found, please set $CUDA_PATH') from e + + +setup_requires = [ + 'Cython>=0.29.22,<3', + ] +install_requires = [ + 'numpy', + # 'cupy', # <-- can't be listed here as on PyPI this is the name for source build, not for wheel + # 'torch', # <-- PyTorch is optional; also, it does not live on PyPI... + ] +ignore_cuquantum_dep = bool(os.environ.get('CUQUANTUM_IGNORE_SOLVER', False)) +if not ignore_cuquantum_dep: + setup_requires.append('cuquantum==0.0.1.*') + setup_requires.append('cutensor>=1.4.*') + install_requires.append('cuquantum==0.0.1.*') + install_requires.append('cutensor>=1.4.*') + + +def check_cuda_version(): + try: + # We cannot do a dlopen and call cudaRuntimeGetVersion, because it + # requires GPUs. We also do not want to rely on the compiler utility + # provided in distutils (deprecated) or setuptools, as this is a very + # simple string parsing task. + cuda_h = os.path.join(cuda_path, 'include', 'cuda.h') + with open(cuda_h, 'r') as f: + cuda_h = f.read().split('\n') + for line in cuda_h: + if "#define CUDA_VERSION" in line: + ver = int(line.split()[-1]) + break + else: + raise RuntimeError("cannot parse CUDA_VERSION") + except: + raise + else: + # 11020 -> "11.2" + return str(ver // 1000) + '.' + str((ver % 100) // 10) + + +cuda_ver = check_cuda_version() +if cuda_ver in ('10.2', '11.0'): + cutensor_ver = cuda_ver +elif '11.0' < cuda_ver < '12.0': + cutensor_ver = '11' +else: + raise RuntimeError(f"Unsupported CUDA version: {cuda_ver}") + + +print() +print("****************************************************************") +print("CUDA version:", cuda_ver) +print("CUDA path:", cuda_path) +print("cuStateVec path:", custatevec_root) +print("cuTensorNet path:", cutensornet_root) +print("****************************************************************\n") + + +custatevec = Extension( + "cuquantum.custatevec.custatevec", + sources=["cuquantum/custatevec/custatevec.pyx"], + include_dirs=[os.path.join(cuda_path, 'include'), + os.path.join(custatevec_root, 'include')], + library_dirs=[os.path.join(custatevec_root, 'lib64')], + libraries=['custatevec'], +) + + +cutensornet = Extension( + "cuquantum.cutensornet.cutensornet", + sources=["cuquantum/cutensornet/cutensornet.pyx"], + include_dirs=[os.path.join(cuda_path, 'include'), + os.path.join(cutensornet_root, 'include')], + library_dirs=[os.path.join(cutensornet_root, 'lib64'), + os.path.join(cutensor_root, 'lib', cutensor_ver)], + libraries=['cutensornet', 'cutensor'], +) + + +setup( + name="cuquantum-python", + version='0.1.0.0', # the last digit is dedicated to cuQuantum Python + description="Python APIs for cuQuantum", + url="https://github.com/NVIDIA/cuQuantum", + author="NVIDIA Corporation", + author_email="cuquantum-python@nvidia.com", + license="BSD-3-Clause", + license_files = ('LICENSE',), + classifiers=[ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: BSD License", + "Operating System :: POSIX :: Linux", + "Topic :: Education", + "Topic :: Scientific/Engineering", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: Implementation :: CPython", + "Environment :: GPU :: NVIDIA CUDA", + "Environment :: GPU :: NVIDIA CUDA :: 11.2", + "Environment :: GPU :: NVIDIA CUDA :: 11.3", + "Environment :: GPU :: NVIDIA CUDA :: 11.4", + "Environment :: GPU :: NVIDIA 
CUDA :: 11.5", + ], + ext_modules=cythonize([ + custatevec, + cutensornet, + ], verbose=True, language_level=3, + compiler_directives={'embedsignature': True}), + packages=find_packages(include=['cuquantum', 'cuquantum.*']), + package_data={"": ["*.pxd", "*.pyx", "*.py"],}, + zip_safe=False, + setup_requires=setup_requires, + install_requires=install_requires, + tests_require=install_requires + [ + # pytest < 6.2 is slow in collecting tests + 'pytest>=6.2', + ] +) diff --git a/python/tests/cuquantum_tests/__init__.py b/python/tests/cuquantum_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/tests/cuquantum_tests/custatevec_tests/__init__.py b/python/tests/cuquantum_tests/custatevec_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py b/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py new file mode 100644 index 0000000..a86c4ea --- /dev/null +++ b/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py @@ -0,0 +1,700 @@ +import cupy +from cupy import testing +import numpy +import pytest + +import cuquantum +from cuquantum import ComputeType, cudaDataType +from cuquantum import custatevec + + +################################################################### +# +# As of beta 2, the test suite for Python bindings is kept minimal. +# The sole goal is to ensure the Python arguments are properly +# passed to the C level. We do not ensure coverage nor correctness. +# This decision will be revisited in the future. +# +################################################################### + +dtype_to_data_type = { + numpy.dtype(numpy.complex64): cudaDataType.CUDA_C_32F, + numpy.dtype(numpy.complex128): cudaDataType.CUDA_C_64F, +} + + +dtype_to_compute_type = { + numpy.dtype(numpy.complex64): ComputeType.COMPUTE_32F, + numpy.dtype(numpy.complex128): ComputeType.COMPUTE_64F, +} + + +@pytest.fixture() +def handle(): + h = custatevec.create() + yield h + custatevec.destroy(h) + + +@testing.parameterize(*testing.product({ + 'n_qubits': (3,), + 'dtype': (numpy.complex64, numpy.complex128), +})) +class TestSV: + # Base class for all statevector tests + + def get_sv(self): + arr = cupy.zeros((2**self.n_qubits,), dtype=self.dtype) + arr[0] = 1 # initialize in |000...00> + return arr + + def _return_data(self, data, name, dtype, return_value): + if return_value == 'int': + if len(data) == 0: + # empty, give it a NULL + return 0, 0 + else: + # return int as void* + data = numpy.asarray(data, dtype=dtype) + setattr(self, name, data) # keep data alive + return data.ctypes.data, data.size + elif return_value == 'seq': + # data itself is already a flat sequence + return data, len(data) + else: + assert False + + +class TestLibHelper: + + def test_get_version(self): + ver = custatevec.get_version() + assert ver == (custatevec.MAJOR_VER * 1000 + + custatevec.MINOR_VER * 100 + + custatevec.PATCH_VER) + assert ver == custatevec.VERSION + + def test_get_property(self): + assert custatevec.MAJOR_VER == custatevec.get_property( + cuquantum.libraryPropertyType.MAJOR_VERSION) + assert custatevec.MINOR_VER == custatevec.get_property( + cuquantum.libraryPropertyType.MINOR_VERSION) + assert custatevec.PATCH_VER == custatevec.get_property( + cuquantum.libraryPropertyType.PATCH_LEVEL) + + +class TestHandle: + + def test_handle_create_destroy(self, handle): + # simple rount-trip test + pass + + def test_workspace(self, handle): + default_workspace_size = 
custatevec.get_default_workspace_size(handle) + # this is about 18MB as of cuQuantum beta 1 + assert default_workspace_size > 0 + # cuStateVec does not like a smaller workspace... + size = 24*1024**2 + assert size > default_workspace_size + memptr = cupy.cuda.alloc(size) + custatevec.set_workspace(handle, memptr.ptr, size) # should not fail + + def test_stream(self, handle): + # default is on the null stream + assert 0 == custatevec.get_stream(handle) + + # simple set/get round-trip + stream = cupy.cuda.Stream() + custatevec.set_stream(handle, stream.ptr) + assert stream.ptr == custatevec.get_stream(handle) + + +class TestAbs2Sum(TestSV): + + @pytest.mark.parametrize( + 'input_form', ( + {'basis_bits': (numpy.int32, 'int'),}, + {'basis_bits': (numpy.int32, 'seq'),}, + ) + ) + def test_abs2sum_on_z_basis(self, handle, input_form): + sv = self.get_sv() + basis_bits = list(range(self.n_qubits)) + basis_bits, basis_bits_len = self._return_data( + basis_bits, 'basis_bits', *input_form['basis_bits']) + data_type = dtype_to_data_type[sv.dtype] + + # case 1: both are computed + sum0, sum1 = custatevec.abs2sum_on_z_basis( + handle, sv.data.ptr, data_type, self.n_qubits, + True, True, basis_bits, basis_bits_len) + assert numpy.allclose(sum0+sum1, 1) + assert (sum0 is not None) and (sum1 is not None) + + # case 2: only sum0 is computed + sum0, sum1 = custatevec.abs2sum_on_z_basis( + handle, sv.data.ptr, data_type, self.n_qubits, + True, False, basis_bits, basis_bits_len) + assert numpy.allclose(sum0, 1) + assert (sum0 is not None) and (sum1 is None) + + # case 3: only sum1 is computed + sum0, sum1 = custatevec.abs2sum_on_z_basis( + handle, sv.data.ptr, data_type, self.n_qubits, + False, True, basis_bits, basis_bits_len) + assert numpy.allclose(sum1, 0) + assert (sum0 is None) and (sum1 is not None) + + # case 4: none is computed + with pytest.raises(ValueError): + sum0, sum1 = custatevec.abs2sum_on_z_basis( + handle, sv.data.ptr, data_type, self.n_qubits, + False, False, basis_bits, basis_bits_len) + + @pytest.mark.parametrize( + 'input_form', ( + {'bit_ordering': (numpy.int32, 'int'),}, + {'bit_ordering': (numpy.int32, 'seq'),}, + ) + ) + @pytest.mark.parametrize( + 'xp', (numpy, cupy) + ) + def test_abs2sum_array_no_mask(self, handle, xp, input_form): + # change sv from |000> to 1/\sqrt{2} (|001> + |100>) + sv = self.get_sv() + sv[0] = 0 + sv[1] = 1./numpy.sqrt(2) + sv[4] = 1./numpy.sqrt(2) + + data_type = dtype_to_data_type[sv.dtype] + bit_ordering = list(range(self.n_qubits)) + bit_ordering, bit_ordering_len = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + # test abs2sum on both host and device + abs2sum = xp.empty((2**bit_ordering_len,), dtype=xp.float64) + abs2sum_ptr = abs2sum.data.ptr if xp is cupy else abs2sum.ctypes.data + custatevec.abs2sum_array( + handle, sv.data.ptr, data_type, self.n_qubits, abs2sum_ptr, + bit_ordering, bit_ordering_len, 0, 0, 0) + assert xp.allclose(abs2sum.sum(), 1) + assert xp.allclose(abs2sum[1], 0.5) + assert xp.allclose(abs2sum[4], 0.5) + + # TODO(leofang): add more tests for abs2sum_array, such as nontrivial masks + + +class TestCollapse(TestSV): + + @pytest.mark.parametrize( + 'input_form', ( + {'basis_bits': (numpy.int32, 'int'),}, + {'basis_bits': (numpy.int32, 'seq'),}, + ) + ) + @pytest.mark.parametrize( + 'parity', (0, 1) + ) + def test_collapse_on_z_basis(self, handle, parity, input_form): + sv = self.get_sv() + basis_bits = list(range(self.n_qubits)) + basis_bits, basis_bits_len = self._return_data( + basis_bits, 
'basis_bits', *input_form['basis_bits']) + data_type = dtype_to_data_type[sv.dtype] + + custatevec.collapse_on_z_basis( + handle, sv.data.ptr, data_type, self.n_qubits, + parity, basis_bits, basis_bits_len, 1) + + if parity == 0: + assert cupy.allclose(sv.sum(), 1) + elif parity == 1: + assert cupy.allclose(sv.sum(), 0) + + @pytest.mark.parametrize( + 'input_form', ( + {'bit_ordering': (numpy.int32, 'int'), 'bitstring': (numpy.int32, 'int')}, + {'bit_ordering': (numpy.int32, 'seq'), 'bitstring': (numpy.int32, 'seq')}, + ) + ) + def test_collapse_by_bitstring(self, handle, input_form): + # change sv to 1/\sqrt{2} (|000> + |111>) + sv = self.get_sv() + sv[0] = numpy.sqrt(0.5) + sv[-1] = numpy.sqrt(0.5) + + # collapse to |111> + bitstring = [1] * self.n_qubits + bitstring, bitstring_len = self._return_data( + bitstring, 'bitstring', *input_form['bitstring']) + + bit_ordering = list(range(self.n_qubits)) + bit_ordering, _ = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + data_type = dtype_to_data_type[sv.dtype] + + norm = 0.5 + # the sv after collapse is normalized as sv -> sv / \sqrt{norm} + custatevec.collapse_by_bitstring( + handle, sv.data.ptr, data_type, self.n_qubits, + bitstring, bit_ordering, bitstring_len, + norm) + assert cupy.allclose(sv.sum(), 1) + assert cupy.allclose(sv[-1], 1) + + +@pytest.mark.parametrize( + 'rand', + # the choices here ensure we get either parity + (0, numpy.nextafter(1, 0)) +) +@pytest.mark.parametrize( + 'collapse', + (custatevec.Collapse.NORMALIZE_AND_ZERO, custatevec.Collapse.NONE) +) +class TestMeasure(TestSV): + + @pytest.mark.parametrize( + 'input_form', ( + {'basis_bits': (numpy.int32, 'int'),}, + {'basis_bits': (numpy.int32, 'seq'),}, + ) + ) + def test_measure_on_z_basis(self, handle, rand, collapse, input_form): + # change the sv to 1/\sqrt{2} (|000> + |010>) to allow 50-50 chance + # of getting either parity + sv = self.get_sv() + sv[0] = numpy.sqrt(0.5) + sv[2] = numpy.sqrt(0.5) + + basis_bits = list(range(self.n_qubits)) + basis_bits, basis_bits_len = self._return_data( + basis_bits, 'basis_bits', *input_form['basis_bits']) + data_type = dtype_to_data_type[sv.dtype] + orig_sv = sv.copy() + + parity = custatevec.measure_on_z_basis( + handle, sv.data.ptr, data_type, self.n_qubits, + basis_bits, basis_bits_len, rand, collapse) + + if collapse == custatevec.Collapse.NORMALIZE_AND_ZERO: + if parity == 0: + # collapse to |000> + assert cupy.allclose(sv[0], 1) + elif parity == 1: + # collapse to |010> + assert cupy.allclose(sv[2], 1) + # sv is collapsed + assert not (sv == orig_sv).all() + else: + # sv is intact + assert (sv == orig_sv).all() + + @pytest.mark.parametrize( + 'input_form', ( + {'bit_ordering': (numpy.int32, 'int'),}, + {'bit_ordering': (numpy.int32, 'seq'),}, + ) + ) + def test_batch_measure(self, handle, rand, collapse, input_form): + # change sv to 1/\sqrt{2} (|000> + |111>) + sv = self.get_sv() + sv[0] = numpy.sqrt(0.5) + sv[-1] = numpy.sqrt(0.5) + orig_sv = sv.copy() + + data_type = dtype_to_data_type[sv.dtype] + bitstring = numpy.empty(self.n_qubits, dtype=numpy.int32) + bit_ordering = list(range(self.n_qubits)) + bit_ordering, _ = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + + custatevec.batch_measure( + handle, sv.data.ptr, data_type, self.n_qubits, + bitstring.ctypes.data, bit_ordering, bitstring.size, + rand, collapse) + + if collapse == custatevec.Collapse.NORMALIZE_AND_ZERO: + if bitstring.sum() == 0: + # collapse to |000> + assert cupy.allclose(sv[0], 1) +
elif bitstring.sum() == 3: + # collapse to |111> + assert cupy.allclose(sv[-1], 1) + else: + assert False, f"unexpected bitstring: {bitstring}" + # sv is collapsed + assert not (sv == orig_sv).all() + else: + assert bitstring.sum() in (0, 3) + # sv is intact + assert (sv == orig_sv).all() + + +class TestApply(TestSV): + + @pytest.mark.parametrize( + 'input_form', ( + {'targets': (numpy.int32, 'int'), 'controls': (numpy.int32, 'int'), + # sizeof(enum) == sizeof(int) + 'paulis': (numpy.int32, 'int'),}, + {'targets': (numpy.int32, 'seq'), 'controls': (numpy.int32, 'seq'), + 'paulis': (numpy.int32, 'seq'),}, + ) + ) + def test_apply_exp(self, handle, input_form): + # change sv to |100> + sv = self.get_sv() + sv[0] = 0 + sv[4] = 1 + + data_type = dtype_to_data_type[sv.dtype] + targets = [0, 1] + targets, targets_len = self._return_data( + targets, 'targets', *input_form['targets']) + controls = [2] + controls, controls_len = self._return_data( + controls, 'controls', *input_form['controls']) + control_values = 0 # set all control bits to 1 + paulis = [custatevec.Pauli.X, custatevec.Pauli.X] + paulis, _ = self._return_data( + paulis, 'paulis', *input_form['paulis']) + + custatevec.apply_exp( + handle, sv.data.ptr, data_type, self.n_qubits, + 0.5*numpy.pi, paulis, + targets, targets_len, + controls, control_values, controls_len) + sv *= -1j + + # result is |111> + assert cupy.allclose(sv[-1], 1) + + @pytest.mark.parametrize( + 'input_form', ( + {'targets': (numpy.int32, 'int'), 'controls': (numpy.int32, 'int'), + # sizeof(enum) == sizeof(int) + 'paulis': (numpy.int32, 'int'),}, + {'targets': (numpy.int32, 'seq'), 'controls': (numpy.int32, 'seq'), + 'paulis': (numpy.int32, 'seq'),}, + ) + ) + @pytest.mark.parametrize( + 'xp', (numpy, cupy) + ) + def test_apply_matrix(self, handle, xp, input_form): + sv = self.get_sv() + data_type = dtype_to_data_type[sv.dtype] + compute_type = dtype_to_compute_type[sv.dtype] + targets = [0, 1, 2] + targets, targets_len = self._return_data( + targets, 'targets', *input_form['targets']) + controls = [] + controls, controls_len = self._return_data( + controls, 'controls', *input_form['controls']) + + # matrix can live on host or device + matrix = xp.zeros((2**self.n_qubits, 2**self.n_qubits), dtype=sv.dtype) + matrix[-1][0] = 1 + matrix_ptr = matrix.ctypes.data if xp is numpy else matrix.data.ptr + + workspace_size = custatevec.apply_matrix_buffer_size( + handle, data_type, self.n_qubits, + matrix_ptr, data_type, custatevec.MatrixLayout.ROW, 0, + targets_len, controls_len, compute_type) + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + + custatevec.apply_matrix( + handle, sv.data.ptr, data_type, self.n_qubits, + matrix_ptr, data_type, custatevec.MatrixLayout.ROW, 0, + targets, targets_len, + controls, controls_len, 0, + compute_type, workspace_ptr, workspace_size) + + assert sv[-1] == 1 # output state is |111> + + + @pytest.mark.parametrize( + 'input_form', ( + {'permutation': (numpy.int64, 'int'), 'basis_bits': (numpy.int32, 'int'), + 'mask_bitstring': (numpy.int32, 'int'), 'mask_ordering': (numpy.int32, 'int')}, + {'permutation': (numpy.int64, 'seq'), 'basis_bits': (numpy.int32, 'seq'), + 'mask_bitstring': (numpy.int32, 'seq'), 'mask_ordering': (numpy.int32, 'seq')}, + ) + ) + @pytest.mark.parametrize( + 'xp', (numpy, cupy) + ) + def test_apply_generalized_permutation_matrix(self, handle, xp, input_form): + sv = self.get_sv() + sv[:] = 1 # invalid sv just to make math checking easier + 
data_type = dtype_to_data_type[sv.dtype] + compute_type = dtype_to_compute_type[sv.dtype] + + # TODO(leofang): test permutation on either host or device + permutation = list(numpy.random.permutation(2**self.n_qubits)) + permutation_data = permutation + permutation, permutation_len = self._return_data( + permutation, 'permutation', *input_form['permutation']) + + # diagonal can live on host or device + diagonal = 10 * xp.ones((2**self.n_qubits, ), dtype=sv.dtype) + diagonal_ptr = diagonal.ctypes.data if xp is numpy else diagonal.data.ptr + + basis_bits = list(range(self.n_qubits)) + basis_bits, basis_bits_len = self._return_data( + basis_bits, 'basis_bits', *input_form['basis_bits']) + + # TODO(leofang): test masks + mask_bitstring = 0 + mask_ordering = 0 + mask_len = 0 + + workspace_size = custatevec.apply_generalized_permutation_matrix_buffer_size( + handle, data_type, self.n_qubits, + permutation, diagonal_ptr, data_type, + basis_bits, basis_bits_len, mask_len) + + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + + custatevec.apply_generalized_permutation_matrix( + handle, sv.data.ptr, data_type, self.n_qubits, + permutation, diagonal_ptr, data_type, 0, + basis_bits, basis_bits_len, + mask_bitstring, mask_ordering, mask_len, + workspace_ptr, workspace_size) + + assert cupy.allclose(sv, diagonal[xp.asarray(permutation_data)]) + + +class TestExpect(TestSV): + + @pytest.mark.parametrize( + 'input_form', ( + {'basis_bits': (numpy.int32, 'int'),}, + {'basis_bits': (numpy.int32, 'seq'),}, + ) + ) + @pytest.mark.parametrize( + 'expect_dtype', (numpy.float64, numpy.complex128) + ) + @pytest.mark.parametrize( + 'xp', (numpy, cupy) + ) + def test_expectation(self, handle, xp, expect_dtype, input_form): + # create a uniform sv + sv = self.get_sv() + sv[:] = numpy.sqrt(1/(2**self.n_qubits)) + + data_type = dtype_to_data_type[sv.dtype] + compute_type = dtype_to_compute_type[sv.dtype] + basis_bits = list(range(self.n_qubits)) + basis_bits, basis_bits_len = self._return_data( + basis_bits, 'basis_bits', *input_form['basis_bits']) + + # matrix can live on host or device + matrix = xp.ones((2**self.n_qubits, 2**self.n_qubits), dtype=sv.dtype) + matrix_ptr = matrix.ctypes.data if xp is numpy else matrix.data.ptr + + workspace_size = custatevec.expectation_buffer_size( + handle, data_type, self.n_qubits, + matrix_ptr, data_type, custatevec.MatrixLayout.ROW, + basis_bits_len, compute_type) + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + + expect = numpy.empty((1,), dtype=expect_dtype) + # TODO(leofang): check if this is relaxed in beta 2 + expect_data_type = ( + cudaDataType.CUDA_R_64F if expect_dtype == numpy.float64 + else cudaDataType.CUDA_C_64F) + + custatevec.expectation( + handle, sv.data.ptr, data_type, self.n_qubits, + expect.ctypes.data, expect_data_type, + matrix_ptr, data_type, custatevec.MatrixLayout.ROW, + basis_bits, basis_bits_len, + compute_type, workspace_ptr, workspace_size) + + assert xp.allclose(expect, 2**self.n_qubits) + + +class TestSampler(TestSV): + + @pytest.mark.parametrize( + 'input_form', ( + {'bit_ordering': (numpy.int32, 'int'),}, + {'bit_ordering': (numpy.int32, 'seq'),}, + ) + ) + def test_sampling(self, handle, input_form): + # create a uniform sv + sv = self.get_sv() + sv[:] = numpy.sqrt(1/(2**self.n_qubits)) + + data_type = dtype_to_data_type[sv.dtype] + compute_type = dtype_to_compute_type[sv.dtype] + shots = 4096 
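+        # (Added note: the sampler draws one bitstring per shot; each entry of rand_nums below, uniform in [0, 1), is mapped through the cumulative distribution of the squared amplitudes built by sampler_preprocess, so len(rand_nums) == shots == len(bitstrings).)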
+ + bitstrings = numpy.empty((shots,), dtype=numpy.int64) + rand_nums = numpy.random.random((shots,)).astype(numpy.float64) + # measure all qubits + bit_ordering = list(range(self.n_qubits)) + bit_ordering, _ = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + + sampler, workspace_size = custatevec.sampler_create( + handle, sv.data.ptr, data_type, self.n_qubits, shots) + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + + try: + custatevec.sampler_preprocess( + handle, sampler, workspace_ptr, workspace_size) + custatevec.sampler_sample( + handle, sampler, bitstrings.ctypes.data, + bit_ordering, self.n_qubits, + rand_nums.ctypes.data, shots, + custatevec.SamplerOutput.RANDNUM_ORDER) + finally: + # This is a Python-only API. Need finally to ensure it's freed. + custatevec.sampler_destroy(sampler) + + keys, counts = numpy.unique(bitstrings, return_counts=True) + # keys are the returned bitstrings 000, 001, ..., 111 + # the sv has all components, and unique() returns a sorted array, + # so the following should hold: + assert (keys == numpy.arange(2**self.n_qubits)).all() + + # TODO: test counts, which should follow a uniform distribution + + +# TODO(leofang): test mask_bitstring & mask_ordering +@pytest.mark.parametrize( + 'input_form', ( + {'bit_ordering': (numpy.int32, 'int'), 'mask_bitstring': (numpy.int32, 'int'), 'mask_ordering': (numpy.int32, 'int')}, + {'bit_ordering': (numpy.int32, 'seq'), 'mask_bitstring': (numpy.int32, 'seq'), 'mask_ordering': (numpy.int32, 'seq')}, + ) +) +@pytest.mark.parametrize( + 'readonly', (True, False) +) +class TestAccessor(TestSV): + + def test_accessor_get(self, handle, input_form, readonly): + # create a monotonically increasing sv + sv = self.get_sv() + data = cupy.arange(2**self.n_qubits, dtype=sv.dtype) + data /= cupy.sqrt((data**2).sum()) # normalize to a valid state vector + sv[:] = data + + data_type = dtype_to_data_type[sv.dtype] + compute_type = dtype_to_compute_type[sv.dtype] + + # measure all qubits + bit_ordering = list(range(self.n_qubits)) + bit_ordering, bit_ordering_len = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + # TODO(leofang): test them + mask_bitstring = 0 + mask_ordering = 0 + mask_len = 0 + + if readonly: + accessor_create = custatevec.accessor_create_readonly + else: + accessor_create = custatevec.accessor_create + + accessor, workspace_size = accessor_create( + handle, sv.data.ptr, data_type, self.n_qubits, + bit_ordering, bit_ordering_len, + mask_bitstring, mask_ordering, mask_len) + + try: + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + custatevec.accessor_set_extra_workspace( + handle, accessor, workspace.ptr, workspace_size) + + buf_len = 2**2 + buf = cupy.empty(buf_len, dtype=sv.dtype) + + # copy buf_len elements, ending one before the last element + custatevec.accessor_get( + handle, accessor, buf.data.ptr, sv.size-1-buf_len, sv.size-1) + finally: + # This is a Python-only API. Need finally to ensure it's freed.
+ custatevec.accessor_destroy(accessor) + + assert (sv[sv.size-1-buf_len: sv.size-1] == buf).all() + + def test_accessor_set(self, handle, input_form, readonly): + # create a monotonically increasing sv + sv = self.get_sv() + data = cupy.arange(2**self.n_qubits, dtype=sv.dtype) + data /= cupy.sqrt((data**2).sum()) # normalize to a valid state vector + sv[:] = data + + data_type = dtype_to_data_type[sv.dtype] + compute_type = dtype_to_compute_type[sv.dtype] + + # measure all qubits + bit_ordering = list(range(self.n_qubits)) + bit_ordering, bit_ordering_len = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + # TODO(leofang): test them + mask_bitstring = 0 + mask_ordering = 0 + mask_len = 0 + + if readonly: + accessor_create = custatevec.accessor_create_readonly + else: + accessor_create = custatevec.accessor_create + + accessor, workspace_size = accessor_create( + handle, sv.data.ptr, data_type, self.n_qubits, + bit_ordering, bit_ordering_len, + mask_bitstring, mask_ordering, mask_len) + + try: + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + custatevec.accessor_set_extra_workspace( + handle, accessor, workspace.ptr, workspace_size) + + buf_len = 2**2 + buf = cupy.zeros(buf_len, dtype=sv.dtype) + + if readonly: + # writing into a read-only accessor should fail + with pytest.raises(custatevec.cuStateVecError) as e_info: + custatevec.accessor_set( + handle, accessor, buf.data.ptr, sv.size-1-buf_len, sv.size-1) + else: + # overwrite buf_len elements, ending one before the last element + custatevec.accessor_set( + handle, accessor, buf.data.ptr, sv.size-1-buf_len, sv.size-1) + finally: + # This is a Python-only API. Need finally to ensure it's freed. + custatevec.accessor_destroy(accessor) + + if readonly: + # sv unchanged + assert (sv[sv.size-1-buf_len: sv.size-1] == data[sv.size-1-buf_len: sv.size-1]).all() + else: + assert (sv[sv.size-1-buf_len: sv.size-1] == 0).all() diff --git a/python/tests/cuquantum_tests/cutensornet_tests/__init__.py b/python/tests/cuquantum_tests/cutensornet_tests/__init__.py new file mode 100644 index 0000000..9bacbfc --- /dev/null +++ b/python/tests/cuquantum_tests/cutensornet_tests/__init__.py @@ -0,0 +1,10 @@ +import cupy as cp + + +# This is future-proofing: in the future, when CuPy enables cuQuantum Python +# as an optional backend, we don't want to create a circular dependency +# that ultimately tests against ourselves. Here we enable CUB as the only +# optional backend and exclude cuTENSOR/cuQuantum Python/etc., using CuPy's +# private API (for development/testing).
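+# (Side note, added for illustration: CuPy also exposes matching private getters, e.g. cp._core.get_routine_accelerators(), which tests could use to assert that only 'cub' is active; like the setters below, these are private APIs and may change between CuPy releases.)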
+cp._core.set_reduction_accelerators(['cub']) +cp._core.set_routine_accelerators(['cub']) diff --git a/python/tests/cuquantum_tests/cutensornet_tests/data.py b/python/tests/cuquantum_tests/cutensornet_tests/data.py new file mode 100644 index 0000000..517f3ef --- /dev/null +++ b/python/tests/cuquantum_tests/cutensornet_tests/data.py @@ -0,0 +1,87 @@ +import itertools + + +# TODO: investigate test parallelism across cartesian product + +sources = [ + "numpy", + "cupy", + "torch" +] + +devices = [ + "cpu", + "cuda" +] + +dtype_names = [ + "float16", + "float32", + "float64", + "complex64", + "complex128" +] + +sources_devices_dtype_names = list( + itertools.product( + sources, + devices, + dtype_names + ) +) + +array_orders = ["C", "F"] + +einsum_expressions = [ + ("ea,fb,abcd,gc,hd->efgh", + (1, 1, 0, 1, 1), + [(10, 10, 10, 10), (10, 10)]), + + ("ea,fb,abcd,gc,hd", + (1, 1, 0, 1, 1), + [(10, 10, 10, 10), (10, 10)]), + + ("ij,jk,kl->il", + (0, 1, 2), + [(2, 2), (2, 5), (5, 2)]), + + ("ij,jk,kl", + (0, 1, 2), + [(2, 2), (2, 5), (5, 2)]), + + ("ij,jk,ki", + (0, 1, 2), + [(2, 2), (2, 5), (5, 2)]) +] + +compute_types = [None] +device_ids = [None] +handles = [None] +loggers = [None] +memory_limits = [ + int(1e6), + "1 MiB", + "80%" +] + +opt_cmodes = [None, "dict", "object"] + +network_options = [dict(zip( + ("compute_type", "device_id", "handle", "logger", "memory_limit"), + network_option_pack)) + for network_option_pack in itertools.product(compute_types, device_ids, handles, loggers, memory_limits) +] + +samples = [None] +path = [None] +slicing = [None] +reconfiguration = [None] +seed = [None] + +optimizer_options = [dict(zip( + ("samples", "path", "slicing", "reconfiguration", "seed"), + optimizer_options_pack)) + for optimizer_options_pack in itertools.product(samples, path, slicing, reconfiguration, seed) +] + +iterations = [0, 7] # 0 iterations is equivalent to no autotuning diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py b/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py new file mode 100644 index 0000000..1226c8e --- /dev/null +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py @@ -0,0 +1,100 @@ +import pytest +import itertools +from copy import deepcopy + +from cuquantum import contract, einsum + +from .testutils import * + + +class ContractProxyFixture(ProxyFixtureBase): + def __init__(self, network_options_pack): + super().__init__(network_options_pack) + def _test_contract( + self, + options_constructor_mode, + optimize_constructor_mode, + skip_sync, + use_numpy_einsum_path + ): + for stream_name in stream_names: + optimize = deepcopy(self.optimize) + + if use_numpy_einsum_path: + optimize["path"] = self.numpy_einsum_path[0][1:] + + self.cutensornet_einsum = einsum( + self.einsum_expr, + *self.data_operands + ) + + self.cutensornet_interleaved_einsum = einsum( + *self.interleaved_inputs + ) + + self.cutensornet_contract = contract( + self.einsum_expr, + *self.data_operands, + options=network_options_dispatcher(self.options, mode=options_constructor_mode), + optimize=optimizer_options_dispatcher(optimize, mode=optimize_constructor_mode), + stream=streams[stream_name] + ) + + self.cutensornet_interleaved_contract = contract( + *self.interleaved_inputs, + options=network_options_dispatcher(self.options, mode=options_constructor_mode), + optimize=optimizer_options_dispatcher(optimize, mode=optimize_constructor_mode), + stream=streams[stream_name] + ) + + stream_name_sync_dispatcher(stream_name, skip=skip_sync) + 
allclose(self.source, self.dtype_name, self.cutensornet_interleaved_einsum, self.cutensornet_einsum) + allclose(self.source, self.dtype_name, self.cutensornet_interleaved_contract, self.cutensornet_contract) + allclose(self.source, self.dtype_name, self.cutensornet_einsum, self.einsum) + allclose(self.source, self.dtype_name, self.cutensornet_contract, self.einsum) + + tensor_class_equal(self.tensor_class, self.data_operands, self.cutensornet_einsum) + tensor_class_equal(self.tensor_class, self.data_operands, self.cutensornet_interleaved_einsum) + tensor_class_equal(self.tensor_class, self.data_operands, self.cutensornet_contract) + tensor_class_equal(self.tensor_class, self.data_operands, self.cutensornet_interleaved_contract) + + dtypes_equal(self.dtype, self.data_operands, self.cutensornet_einsum) + dtypes_equal(self.dtype, self.data_operands, self.cutensornet_interleaved_einsum) + dtypes_equal(self.dtype, self.data_operands, self.cutensornet_contract) + dtypes_equal(self.dtype, self.data_operands, self.cutensornet_interleaved_contract) + + def test_contract(self, skip_sync, use_numpy_einsum_path): + self._test_contract( + self.options_cmode, + self.optimize_cmode, + skip_sync, + use_numpy_einsum_path + ) + + def run_tests(self): + self.test_contract(False, False) + self.test_contract(False, True) + self.test_contract(True, True) + self.test_contract(True, False) + +@pytest.fixture +def ContractFixture(request): + return ContractProxyFixture(request.param) + +class TestContract: + @pytest.mark.parametrize( + "ContractFixture", + itertools.product( + sources_devices_dtype_names, + array_orders, + einsum_expressions, + network_options, + optimizer_options, + opt_cmodes, # cmodes for network options + opt_cmodes, # cmodes for optimizer options + [None] # ignore iterations, autotune is not used + ), + indirect=["ContractFixture"] + ) + def test_contract(self, ContractFixture): + ContractFixture.run_tests() diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_contract_path.py b/python/tests/cuquantum_tests/cutensornet_tests/test_contract_path.py new file mode 100644 index 0000000..763c511 --- /dev/null +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_contract_path.py @@ -0,0 +1,58 @@ +import pytest +import itertools + +from cuquantum import contract_path, einsum_path + +from .testutils import * + + +class ContractPathProxyFixture(ProxyFixtureBase): + def __init__(self, network_options_pack): + super().__init__(network_options_pack) + def test_contract_path(self): + cutensornet_contract_path_einsum = contract_path( + self.einsum_expr, + *self.data_operands, + optimize=optimizer_options_dispatcher( + self.optimize, + mode=self.optimize_cmode + ) + ) + cutensornet_einsum_path = einsum_path( + self.einsum_expr, + *self.data_operands + ) + cutensornet_contract_path_interleaved = contract_path( + *self.interleaved_inputs, + optimize=optimizer_options_dispatcher( + self.optimize, + mode=self.optimize_cmode + ) + ) + cutensornet_einsum_path_interleaved = einsum_path( + *self.interleaved_inputs + ) + def run_tests(self): + self.test_contract_path() + +@pytest.fixture +def ContractPathFixture(request): + return ContractPathProxyFixture(request.param) + +class TestContractPath: + @pytest.mark.parametrize( + "ContractPathFixture", + itertools.product( + sources_devices_dtype_names, + array_orders, + einsum_expressions, + [None], # only consider a single network options configuration; others tested elsewhere + [None], # only consider a single optimizer options configuration; others tested
elsewhere + [None], # ignore network options constructor modes; the options constructor mode is not used + [None], # only consider a single optimizer options constructor mode; others tested elsewhere + [None] # ignore iterations, autotune is not used + ), + indirect=["ContractPathFixture"] + ) + def test_contract_path(self, ContractPathFixture): + ContractPathFixture.run_tests() diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py b/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py new file mode 100644 index 0000000..1c536f4 --- /dev/null +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py @@ -0,0 +1,448 @@ +import contextlib +from collections import abc +import functools + +import cupy +from cupy import testing +import numpy +import pytest + +import cuquantum +from cuquantum import ComputeType, cudaDataType +from cuquantum import cutensornet + + +################################################################### +# +# As of beta 2, the test suite for Python bindings is kept minimal. +# The sole goal is to ensure the Python arguments are properly +# passed to the C level. We do not ensure coverage or correctness. +# This decision will be revisited in the future. +# +################################################################### + +dtype_to_data_type = { + numpy.float16: cudaDataType.CUDA_R_16F, + numpy.float32: cudaDataType.CUDA_R_32F, + numpy.float64: cudaDataType.CUDA_R_64F, + numpy.complex64: cudaDataType.CUDA_C_32F, + numpy.complex128: cudaDataType.CUDA_C_64F, +} + + +dtype_to_compute_type = { + numpy.float16: ComputeType.COMPUTE_16F, + numpy.float32: ComputeType.COMPUTE_32F, + numpy.float64: ComputeType.COMPUTE_64F, + numpy.complex64: ComputeType.COMPUTE_32F, + numpy.complex128: ComputeType.COMPUTE_64F, +} + + +def manage_resource(name): + def decorator(impl): + @functools.wraps(impl) + def test_func(self, *args, **kwargs): + try: + if name == 'handle': + h = cutensornet.create() + elif name == 'dscr': + tn, dtype, input_form, output_form = self.tn, self.dtype, self.input_form, self.output_form + einsum, shapes = tn # unpack + tn = TensorNetworkFactory(einsum, shapes, dtype) + i_n_inputs, i_n_modes, i_extents, i_strides, i_modes, i_alignments = \ + tn.get_input_metadata(**input_form) + o_n_modes, o_extents, o_strides, o_modes, o_alignments = \ + tn.get_output_metadata(**output_form) + h = cutensornet.create_network_descriptor( + self.handle, + i_n_inputs, i_n_modes, i_extents, i_strides, i_modes, i_alignments, + o_n_modes, o_extents, o_strides, o_modes, o_alignments, + dtype_to_data_type[dtype], dtype_to_compute_type[dtype]) + # we also need to keep the tn data alive + self.tn = tn + elif name == 'config': + h = cutensornet.create_contraction_optimizer_config(self.handle) + elif name == 'info': + h = cutensornet.create_contraction_optimizer_info( + self.handle, self.dscr) + elif name == 'autotune': + h = cutensornet.create_contraction_autotune_preference(self.handle) + else: + assert False, f'name "{name}" not recognized' + setattr(self, name, h) + impl(self, *args, **kwargs) + except: + print(f'managing resource {name} failed') + raise + finally: + if name == 'handle' and hasattr(self, name): + cutensornet.destroy(self.handle) + del self.handle + elif name == 'dscr' and hasattr(self, name): + cutensornet.destroy_network_descriptor(self.dscr) + del self.dscr + elif name == 'config' and hasattr(self, name): + cutensornet.destroy_contraction_optimizer_config(self.config) + del self.config + elif name == 'info' and
hasattr(self, name): + cutensornet.destroy_contraction_optimizer_info(self.info) + del self.info + elif name == 'autotune' and hasattr(self, name): + cutensornet.destroy_contraction_autotune_preference(self.autotune) + del self.autotune + return test_func + return decorator + + +class TestLibHelper: + + def test_get_version(self): + ver = cutensornet.get_version() + assert ver == (cutensornet.MAJOR_VER * 10000 + + cutensornet.MINOR_VER * 100 + + cutensornet.PATCH_VER) + assert ver == cutensornet.VERSION + + def test_get_cudart_version(self): + ver = cutensornet.get_cudart_version() + assert ver == cupy.cuda.runtime.runtimeGetVersion() + + +class TestHandle: + + @manage_resource('handle') + def test_handle_create_destroy(self): + # simple round-trip test + pass + + +class TensorNetworkFactory: + + # TODO(leofang): replace the utilities here by high-level private APIs + + # This factory CANNOT be reused; once a TN descriptor uses it, it must + # be discarded. + + def __init__(self, einsum, shapes, dtype): + inputs, output = einsum.split('->') if "->" in einsum else (einsum, None) + i_shapes, o_shape = shapes[:-1], shapes[-1] + inputs = tuple(tuple(_input) for _input in inputs.split(",")) + assert all([len(i) == len(s) for i, s in zip(inputs, i_shapes)]) + assert len(output) == len(o_shape) + + self.input_tensors = [ + testing.shaped_random(s, cupy, dtype) for s in i_shapes] + self.input_n_modes = [len(i) for i in inputs] + self.input_extents = i_shapes + self.input_strides = [arr.strides for arr in self.input_tensors] + self.input_modes = [tuple([ord(m) for m in i]) for i in inputs] + self.input_alignments = [256] * len(i_shapes) + + self.output_tensor = cupy.empty(o_shape, dtype=dtype) + self.output_n_modes = len(o_shape) + self.output_extent = o_shape + self.output_stride = self.output_tensor.strides + self.output_mode = tuple([ord(m) for m in output]) + self.output_alignment = 256 + + def _get_data_type(self, category): + if 'n_modes' in category: + return numpy.int32 + elif 'extent' in category: + return numpy.int64 + elif 'stride' in category: + return numpy.int64 + elif 'mode' in category: + return numpy.int32 + elif 'alignment' in category: + return numpy.uint32 + elif 'tensor' in category: + return None # unused + else: + assert False + + def _return_data(self, category, return_value): + data = getattr(self, category) + + if return_value == 'int': + if len(data) == 0: + # empty, give it a NULL + return 0 + elif category == 'input_tensors': + # special case for device arrays, return int as void** + data = numpy.asarray([d.data.ptr for d in data], + dtype=numpy.intp) + setattr(self, f'{category}_ptrs', data) # keep data alive + # some data are not nested
in nature, so we peek at the first + # element to determine + elif isinstance(data[0], abc.Sequence): + data = [numpy.asarray(d, dtype=self._get_data_type(category)) + for d in data] + setattr(self, category, data) # keep data alive + else: + # data itself is already a flat sequence + pass + return data + elif return_value == 'nested_seq': + return data + else: + assert False + + def get_input_metadata(self, **kwargs): + n_inputs = len(self.input_tensors) + n_modes = self._return_data('input_n_modes', kwargs.pop('n_modes')) + extents = self._return_data('input_extents', kwargs.pop('extent')) + strides = self._return_data('input_strides', kwargs.pop('stride')) + modes = self._return_data('input_modes', kwargs.pop('mode')) + alignments = self._return_data( + 'input_alignments', kwargs.pop('alignment')) + return n_inputs, n_modes, extents, strides, modes, alignments + + def get_output_metadata(self, **kwargs): + n_modes = self.output_n_modes + extent = self._return_data('output_extent', kwargs.pop('extent')) + stride = self._return_data('output_stride', kwargs.pop('stride')) + mode = self._return_data('output_mode', kwargs.pop('mode')) + alignment = self.output_alignment + return n_modes, extent, stride, mode, alignment + + def get_input_tensors(self, **kwargs): + data = self._return_data('input_tensors', kwargs['data']) + return data + + def get_output_tensor(self): + return self.output_tensor.data.ptr + + +@testing.parameterize(*testing.product({ + 'tn': ( + ('ab,bc->ac', [(2, 3), (3, 2), (2, 2)]), + ('ab,ba->', [(2, 3), (3, 2), ()]), + ('abc,bca->', [(2, 3, 4), (3, 4, 2), ()]), + ('ab,bc,cd->ad', [(2, 3), (3, 1), (1, 5), (2, 5)]), + ), + 'dtype': ( + numpy.float32, numpy.float64, numpy.complex64, numpy.complex128 + ), + 'input_form': ( + {'n_modes': 'int', 'extent': 'int', 'stride': 'int', + 'mode': 'int', 'alignment': 'int', 'data': 'int'}, + {'n_modes': 'int', 'extent': 'seq', 'stride': 'seq', + 'mode': 'seq', 'alignment': 'int', 'data': 'seq'}, + {'n_modes': 'seq', 'extent': 'nested_seq', 'stride': 'nested_seq', + 'mode': 'seq', 'alignment': 'seq', 'data': 'seq'}, + ), + 'output_form': ( + {'extent': 'int', 'stride': 'int', 'mode': 'int'}, + {'extent': 'seq', 'stride': 'seq', 'mode': 'seq'}, + ) +})) +class TestTensorNetworkBase: + + # Use this class as the base to share all common test parametrizations + pass + + +class TestTensorNetworkDescriptor(TestTensorNetworkBase): + + @manage_resource('handle') + @manage_resource('dscr') + def test_descriptor_create_destroy(self): + # simple round-trip test + pass + + +class TestOptimizerInfo(TestTensorNetworkBase): + + @manage_resource('handle') + @manage_resource('dscr') + @manage_resource('info') + def test_optimizer_info_create_destroy(self): + # simple round-trip test + pass + + @pytest.mark.parametrize( + 'attr', [val for val in cutensornet.ContractionOptimizerInfoAttribute] + ) + @manage_resource('handle') + @manage_resource('dscr') + @manage_resource('info') + def test_optimizer_info_get_set_attribute(self, attr): + if attr in ( + cutensornet.ContractionOptimizerInfoAttribute.NUM_SLICES, + cutensornet.ContractionOptimizerInfoAttribute.PHASE1_FLOP_COUNT, + cutensornet.ContractionOptimizerInfoAttribute.FLOP_COUNT, + cutensornet.ContractionOptimizerInfoAttribute.LARGEST_TENSOR, + cutensornet.ContractionOptimizerInfoAttribute.SLICING_OVERHEAD, + ): + pytest.skip("setter not supported") + elif attr in ( + cutensornet.ContractionOptimizerInfoAttribute.PATH, + cutensornet.ContractionOptimizerInfoAttribute.SLICED_MODE, + 
cutensornet.ContractionOptimizerInfoAttribute.SLICED_EXTENT, + ): + pytest.skip("TODO") + handle, info = self.handle, self.info + dtype = cutensornet.contraction_optimizer_info_get_attribute_dtype(attr) + # Hack: assume this is a valid value for all attrs + factor = numpy.asarray([30], dtype=dtype) + cutensornet.contraction_optimizer_info_set_attribute( + handle, info, attr, + factor.ctypes.data, factor.dtype.itemsize) + # do a round-trip test as a sanity check + factor2 = numpy.zeros_like(factor) + cutensornet.contraction_optimizer_info_get_attribute( + handle, info, attr, + factor2.ctypes.data, factor2.dtype.itemsize) + assert factor == factor2 + + +class TestOptimizerConfig: + + @manage_resource('handle') + @manage_resource('config') + def test_optimizer_config_create_destroy(self): + # simple round-trip test + pass + + @pytest.mark.parametrize( + # TODO(leofang): enable this when the getter bug is fixed + 'attr', [val for val in cutensornet.ContractionOptimizerConfigAttribute] + #'attr', [cutensornet.ContractionOptimizerConfigAttribute.GRAPH_IMBALANCE_FACTOR] + ) + @manage_resource('handle') + @manage_resource('config') + def test_optimizer_config_get_set_attribute(self, attr): + if attr == cutensornet.ContractionOptimizerConfigAttribute.SIMPLIFICATION_DISABLE_DR: + pytest.skip("pending on MR 275") + handle, config = self.handle, self.config + dtype = cutensornet.contraction_optimizer_config_get_attribute_dtype(attr) + # Hack: assume this is a valid value for all attrs + if attr in (cutensornet.ContractionOptimizerConfigAttribute.GRAPH_ALGORITHM, + cutensornet.ContractionOptimizerConfigAttribute.SLICER_MEMORY_MODEL, + cutensornet.ContractionOptimizerConfigAttribute.SLICER_DISABLE_SLICING): + factor = numpy.asarray([1], dtype=dtype) + else: + factor = numpy.asarray([30], dtype=dtype) + cutensornet.contraction_optimizer_config_set_attribute( + handle, config, attr, + factor.ctypes.data, factor.dtype.itemsize) + # do a round-trip test as a sanity check + factor2 = numpy.zeros_like(factor) + cutensornet.contraction_optimizer_config_get_attribute( + handle, config, attr, + factor2.ctypes.data, factor2.dtype.itemsize) + assert factor == factor2 + + +class TestAutotunePreference: + + @manage_resource('handle') + @manage_resource('autotune') + def test_autotune_preference_create_destroy(self): + # simple round-trip test + pass + + @pytest.mark.parametrize( + 'attr', [val for val in cutensornet.ContractionAutotunePreferenceAttribute] + ) + @manage_resource('handle') + @manage_resource('autotune') + def test_autotune_preference_get_set_attribute(self, attr): + handle, pref = self.handle, self.autotune + dtype = cutensornet.contraction_autotune_preference_get_attribute_dtype(attr) + # Hack: assume this is a valid value for all attrs + factor = numpy.asarray([10], dtype=dtype) + cutensornet.contraction_autotune_preference_set_attribute( + handle, pref, attr, + factor.ctypes.data, factor.dtype.itemsize) + # do a round-trip test as a sanity check + factor2 = numpy.zeros_like(factor) + cutensornet.contraction_autotune_preference_get_attribute( + handle, pref, attr, + factor2.ctypes.data, factor2.dtype.itemsize) + assert factor == factor2 + + +@pytest.mark.parametrize( + 'get_workspace_size', (True, False) +) +@pytest.mark.parametrize( + 'autotune', (True, False) +) +@pytest.mark.parametrize( + 'contract', (True, False) +) +@pytest.mark.parametrize( + 'stream', (cupy.cuda.Stream.null, cupy.cuda.Stream(non_blocking=True)) +) +class TestContraction(TestTensorNetworkBase): + + # There is no easy way for 
us to test each API independently, so we instead + # parametrize the steps and test the whole workflow + @manage_resource('handle') + @manage_resource('dscr') + @manage_resource('info') + @manage_resource('config') + @manage_resource('autotune') + def test_contraction_workflow( + self, get_workspace_size, autotune, contract, stream): + # unpack + handle, dscr, info, config, pref = self.handle, self.dscr, self.info, self.config, self.autotune + tn, input_form, output_form = self.tn, self.input_form, self.output_form + + workspace_size = 4*1024**2 # large enough for our test cases + # we have to run this API in any case in order to create a path + cutensornet.contraction_optimize( + handle, dscr, config, workspace_size, info) + if get_workspace_size: + workspace_size = cutensornet.contraction_get_workspace_size( + handle, dscr, info) + workspace = cupy.cuda.alloc(workspace_size) + + plan = None + try: + plan = cutensornet.create_contraction_plan( + handle, dscr, info, workspace_size) + if autotune: + cutensornet.contraction_autotune( + handle, plan, + tn.get_input_tensors(**input_form), + tn.get_output_tensor(), + workspace.ptr, workspace_size, pref, stream.ptr) + if contract: + # assume no slicing for simple test cases! + cutensornet.contraction( + handle, plan, + tn.get_input_tensors(**input_form), + tn.get_output_tensor(), + workspace.ptr, workspace_size, 0, stream.ptr) + # TODO(leofang): check correctness? + stream.synchronize() + finally: + if plan is not None: + cutensornet.destroy_contraction_plan(plan) diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_network.py b/python/tests/cuquantum_tests/cutensornet_tests/test_network.py new file mode 100644 index 0000000..bf824af --- /dev/null +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_network.py @@ -0,0 +1,150 @@ +import pytest +import itertools +from copy import deepcopy + +from .testutils import * + + +class NetworkProxyFixture(ProxyFixtureBase): + def __init__(self, network_options_pack): + super().__init__(network_options_pack) + + def test_contract_path( + self, + options_constructor_mode, + optimize_constructor_mode, + ): + network_einsum = network_dispatcher( + self.einsum_expr, + self.data_operands, + self.options, + mode=options_constructor_mode + ) + + network_einsum.contract_path( + optimize=optimizer_options_dispatcher( + self.optimize, + mode=optimize_constructor_mode + ) + ) + + network_einsum.free() + + network_interleaved = network_dispatcher( + None, + None, + self.options, + mode=options_constructor_mode, + interleaved_inputs=self.interleaved_inputs + ) + + network_interleaved.contract_path( + optimize=optimizer_options_dispatcher( + self.optimize, + mode=optimize_constructor_mode + ) + ) + + network_interleaved.free() + + def _test_contract( + self, + options_constructor_mode, + optimize_constructor_mode, + skip_sync, + use_numpy_einsum_path + ): + network_einsum = network_dispatcher( + self.einsum_expr, + self.data_operands, + self.options, + mode=options_constructor_mode, + ) + + network_interleaved = network_dispatcher( + self.einsum_expr, + self.data_operands, + self.options, + mode=options_constructor_mode, + interleaved_inputs=self.interleaved_inputs + ) + + optimize = deepcopy(self.optimize) + + if use_numpy_einsum_path: + optimize["path"] = self.numpy_einsum_path[0][1:] + + network_einsum.contract_path( + optimize=optimizer_options_dispatcher( + optimize, + mode=optimize_constructor_mode + ) + ) + + network_interleaved.contract_path( + optimize=optimizer_options_dispatcher( + optimize, + 
mode=optimize_constructor_mode + ) + ) + + for stream_name in stream_names: + network_einsum.autotune(iterations=self.iterations, stream=streams[stream_name]) # if iterations=0, autotune is skipped + stream_name_sync_dispatcher(stream_name, skip=skip_sync) + cutensornet_contract = network_einsum.contract(stream=streams[stream_name]) + stream_name_sync_dispatcher(stream_name, skip=skip_sync) + allclose(self.source, self.dtype_name, cutensornet_contract, self.einsum) + + network_einsum.free() + + for stream_name in stream_names: + network_interleaved.autotune(iterations=self.iterations, stream=streams[stream_name]) # if iterations=0, autotune is skipped + stream_name_sync_dispatcher(stream_name, skip=skip_sync) + cutensornet_contract = network_interleaved.contract(stream=streams[stream_name]) + stream_name_sync_dispatcher(stream_name, skip=skip_sync) + allclose(self.source, self.dtype_name, cutensornet_contract, self.einsum) + + network_interleaved.free() + + def test_contract( + self, + skip_sync, + use_numpy_einsum_path + ): + self._test_contract( + self.options_cmode, + self.optimize_cmode, + skip_sync, + use_numpy_einsum_path + ) + + def run_tests(self): + self.test_contract_path(self.options_cmode, self.optimize_cmode) + + self.test_contract(False, False) + self.test_contract(False, True) + self.test_contract(True, True) + self.test_contract(True, False) + +@pytest.fixture +def NetworkFixture(request): + return NetworkProxyFixture(request.param) + +class TestNetwork: + + @pytest.mark.parametrize( + "NetworkFixture", + itertools.product( + sources_devices_dtype_names, + array_orders, + einsum_expressions, + network_options, + optimizer_options, + opt_cmodes, + opt_cmodes, + iterations + ), + indirect=["NetworkFixture"] + ) + def test_network(self, NetworkFixture): + NetworkFixture.run_tests() diff --git a/python/tests/cuquantum_tests/cutensornet_tests/testutils.py b/python/tests/cuquantum_tests/cutensornet_tests/testutils.py new file mode 100644 index 0000000..57ddc6d --- /dev/null +++ b/python/tests/cuquantum_tests/cutensornet_tests/testutils.py @@ -0,0 +1,217 @@ +import cupy +import torch +import numpy +import functools + +from cuquantum import Network +from cuquantum import NetworkOptions, OptimizerOptions + +from .data import * + +torch.backends.cuda.matmul.allow_tf32 = False +torch.backends.cudnn.allow_tf32 = False + + +def dtype_name_dispatcher(source, dtype_name): + import sys + return getattr(sys.modules[source], dtype_name) + +stream_names = [ + "default", + "cupy", + "torch" +] + +streams = dict(zip( + stream_names, + [None, cupy.cuda.Stream(), torch.cuda.Stream()] +)) + +def stream_name_sync_dispatcher(stream_name, skip=False): + stream = streams[stream_name] + if not skip: + if stream: + stream.synchronize() + +# TODO: record seed +def generate_data_dispatcher(source, device, shape, dtype_name, array_order): + data = None + dtype = dtype_name_dispatcher(source, dtype_name) + if source == "numpy": + if "int" in dtype_name: + data = numpy.random.randint(-1, high=2, size=shape).astype(dtype, order=array_order) + elif "complex" in dtype_name: + data = (numpy.random.random(shape) + + 1.j * numpy.random.random(shape)).astype(dtype, order=array_order) + else: + data = numpy.random.random(shape).astype(dtype, order=array_order) + elif source == "cupy": + if "int" in dtype_name: + data = cupy.random.randint(-1, high=2, size=shape).astype(dtype, order=array_order) + elif "complex" in dtype_name: + data = (cupy.random.random(shape) + + 1.j * cupy.random.random(shape)).astype(dtype, order=array_order)
+ else: + data = cupy.random.random(shape).astype(dtype, order=array_order) + elif source == "torch": + if "int" in dtype_name: + data = torch.randint(-1, 2, shape, dtype=dtype, device=device) + else: + data = torch.rand(shape, dtype=dtype, device=device) + return data + +def generate_data(source, device, shape, dtype_name, array_order): + return generate_data_dispatcher(source, device, shape, dtype_name, array_order) + +def generate_data_operands(source, device, dtype_name, array_order, einsum_expression): + einsum_expr, orders, shapes = einsum_expression + data = [generate_data(source, device, shape, dtype_name, array_order) for shape in shapes] + return [data[order] for order in orders] + +def data_to_numpy(source, data): + if source == "numpy": + return data + elif source == "cupy": + return cupy.asnumpy(data) + elif source == "torch": + return data.cpu().numpy() + +def data_operands_to_numpy(source, data_operands): + return [data_to_numpy(source, data) for data in data_operands] + +def interleaved_format_from_einsum(einsum_expr, data_operands): + einsum_tuples = einsum_expr.split("->")[0].split(",") + index_tuples = [[it for it in einsum_tuple] for einsum_tuple in einsum_tuples] + inputs = [] + for index, data in enumerate(data_operands): + inputs.append(data) + inputs.append(index_tuples[index]) + return inputs + +def einsum_dispatcher(source, einsum_expr, data_operands): + if source == "numpy": + return numpy.einsum(einsum_expr, *data_operands, optimize="optimal") + elif source == "cupy": + return cupy.einsum(einsum_expr, *data_operands) + elif source == "torch": + return torch.einsum(einsum_expr, *data_operands) + +def network_options_dispatcher(network_options, mode=None): + if mode is None: + return None + elif mode == "dict": + return network_options + elif mode == "object": + return NetworkOptions(**network_options) + +def optimizer_options_dispatcher(optimizer_options, mode=None): + if mode is None: + return None + elif mode == "dict": + return optimizer_options + elif mode == "object": + return OptimizerOptions(**optimizer_options) + +def network_dispatcher(einsum_expr, data_operands, network_options, mode=None, interleaved_inputs=None): + if interleaved_inputs: + return Network( + *interleaved_inputs, + options=network_options_dispatcher(network_options, mode=mode) + ) + else: + return Network( + einsum_expr, + *data_operands, + options=network_options_dispatcher(network_options, mode=mode) + ) + +def machine_epsilon(dtype_name): + dtype = dtype_name_dispatcher("numpy", dtype_name) + return numpy.finfo(dtype).eps + +machine_epsilon_values = [machine_epsilon(dtype_name) for dtype_name in dtype_names] + +rtol_mapper = dict(zip( + dtype_names, + [numpy.sqrt(m_eps) for m_eps in machine_epsilon_values] +)) + +atol_mapper = dict(zip( + dtype_names, + [10 * m_eps for m_eps in machine_epsilon_values] +)) + +def allclose_dispatcher(source, dtype_name): + if source == "numpy": + return functools.partial( + numpy.allclose, rtol=rtol_mapper[dtype_name], + atol=atol_mapper[dtype_name] + ) + elif source == "cupy": + return functools.partial( + cupy.allclose, rtol=rtol_mapper[dtype_name], + atol=atol_mapper[dtype_name] + ) + elif source == "torch": + return functools.partial( + torch.allclose, rtol=rtol_mapper[dtype_name], + atol=atol_mapper[dtype_name] + ) + +def allclose(source, dtype_name, tensor, ref_tensor): + allclose_func = allclose_dispatcher(source, dtype_name) + assert allclose_func(tensor, ref_tensor) + +def tensor_class_dispatcher(data_operands): + return type(data_operands[0]) 
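For reference, the tolerance scheme defined above derives both tolerances from each dtype's machine epsilon (rtol = sqrt(eps), atol = 10 * eps). A minimal standalone sketch of the concrete values this yields, using only NumPy (the two dtype names here are illustrative, not tied to the suite's `dtype_names` list):

```
import numpy

# Derive the comparison tolerances from machine epsilon, as testutils.py does:
# rtol = sqrt(eps), atol = 10 * eps, per dtype.
for dtype_name in ("float32", "float64"):
    eps = numpy.finfo(getattr(numpy, dtype_name)).eps
    print(dtype_name, "rtol =", numpy.sqrt(eps), "atol =", 10 * eps)

# float32: rtol ~ 3.45e-04, atol ~ 1.19e-06
# float64: rtol ~ 1.49e-08, atol ~ 2.22e-15
```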
+ +def tensor_class_equal(tensor_class, data_operands, result): + for data_operand in data_operands: + assert issubclass(tensor_class, type(data_operand)) + assert issubclass(tensor_class, type(result)) + +# TODO: document rationale ... torch inconsistent with numpy +def dtypes_equal(dtype, data_operands, result): + for data_operand in data_operands: + assert data_operand.dtype == dtype + assert result.dtype == dtype + +class NetworkRuntimeOptions: + def __init__(self, runtime_options_pack): + (sources_devices_dtype_name, + array_order, + einsum_expression, + options, + optimize, + options_cmode, + optimize_cmode, + iterations) = runtime_options_pack + self.source, self.device, self.dtype_name = sources_devices_dtype_name + self.dtype = dtype_name_dispatcher(self.source, self.dtype_name) + self.array_order = array_order + self.einsum_expression = einsum_expression + self.einsum_expr, self.orders, self.shapes = einsum_expression # verbose, convenient + self.options = options + self.options_cmode = options_cmode + self.optimize = optimize + self.optimize_cmode = optimize_cmode + self.iterations = iterations + +class ProxyFixtureBase(NetworkRuntimeOptions): + def __init__(self, network_options_pack): + super().__init__(network_options_pack) + self.data_operands = generate_data_operands( + self.source, + self.device, + self.dtype_name, + self.array_order, + self.einsum_expression + ) + self.numpy_data_operands = data_operands_to_numpy( + self.source, + self.data_operands + ) + self.tensor_class = tensor_class_dispatcher(self.data_operands) + self.interleaved_inputs = interleaved_format_from_einsum(self.einsum_expr, self.data_operands) + self.numpy_einsum_path = numpy.einsum_path(self.einsum_expr, *self.numpy_data_operands) + self.einsum = einsum_dispatcher(self.source, self.einsum_expr, self.data_operands) diff --git a/python/tests/samples_tests/__init__.py b/python/tests/samples_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/tests/samples_tests/test_samples.py b/python/tests/samples_tests/test_samples.py new file mode 100644 index 0000000..be6a844 --- /dev/null +++ b/python/tests/samples_tests/test_samples.py @@ -0,0 +1,41 @@ +import glob +import os +import subprocess +import sys + +import pytest + + +class cuQuantumSampleTestError(Exception): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + +samples_path = os.path.join( + os.path.dirname(__file__), '..', '..', 'samples') +sample_files = glob.glob(samples_path+'/**/*.py', recursive=True) + + +def run_sample(path, *args): + fullpath = os.path.join(samples_path, path) + result = subprocess.run( + (sys.executable, fullpath) + args, capture_output=True, env=os.environ) + if result.returncode: + msg = f'Got error:\n' + msg += f'{result.stderr.decode()}' + if "ModuleNotFoundError: No module named 'torch'" in msg: + pytest.skip('PyTorch uninstalled, skipping related tests') + else: + raise cuQuantumSampleTestError(msg) + else: + print(result.stdout.decode()) + + +@pytest.mark.parametrize( + 'sample', sample_files +) +class TestSamples: + + def test_sample(self, sample): + run_sample(sample) diff --git a/samples/custatevec/CMakeLists.txt b/samples/custatevec/CMakeLists.txt index 7b0b0bc..32b6e0f 100644 --- a/samples/custatevec/CMakeLists.txt +++ b/samples/custatevec/CMakeLists.txt @@ -53,8 +53,6 @@ endif() # cuStateVec_example dependencies # ########################################## -find_package(CUDA 10.1 REQUIRED) - if (NOT CUSTATEVEC_ROOT) set(CUSTATEVEC_ROOT 
${CUDA_TOOLKIT_ROOT_DIR}) endif() @@ -77,6 +75,13 @@ set(CMAKE_CUDA_STANDARD 11) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_EXTENSIONS OFF) +set(CMAKE_CUDA_FLAGS_ARCH_SM60 "-gencode arch=compute_60,code=sm_60") +set(CMAKE_CUDA_FLAGS_ARCH_SM70 "-gencode arch=compute_70,code=sm_70") +set(CMAKE_CUDA_FLAGS_ARCH_SM75 "-gencode arch=compute_75,code=sm_75") +set(CMAKE_CUDA_FLAGS_ARCH_SM80 "-gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80") +set(CMAKE_CUDA_FLAGS_ARCH "${CMAKE_CUDA_FLAGS_ARCH_SM60} ${CMAKE_CUDA_FLAGS_ARCH_SM70} ${CMAKE_CUDA_FLAGS_ARCH_SM75} ${CMAKE_CUDA_FLAGS_ARCH_SM80}") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_ARCH}") + # ########################################## # cuStateVec_example target # ########################################## @@ -105,9 +110,14 @@ endfunction() add_custom_target(custatevec_examples) -add_custatevec_example(custatevec_examples "cuStateVec.example.gate_application" gate_application.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.exponential_pauli" exponential_pauli.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.expectation" expectation.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.sampler" sampler.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.measure_zbasis" measure_zbasis.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.batch_measure" batch_measure.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.gate_application" gate_application.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.permutation_matrix" permutation_matrix.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.diagonal_matrix" diagonal_matrix.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.exponential_pauli" exponential_pauli.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.expectation" expectation.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.expectation_pauli" expectation_pauli.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.sampler" sampler.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.measure_zbasis" measure_zbasis.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.batch_measure" batch_measure.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.accessor_get" accessor_get.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.accessor_set" accessor_set.cu) diff --git a/samples/custatevec/Makefile b/samples/custatevec/Makefile index d2e80a7..40ca7d5 100644 --- a/samples/custatevec/Makefile +++ b/samples/custatevec/Makefile @@ -35,17 +35,27 @@ LIBS := -L$(CUSTATEVEC_ROOT)/lib64 -lcudart -lcustatevec CXX_FLAGS= -std=c++11 $(INC) $(LIBS) all: - nvcc gate_application.cu -o gate_application ${CXX_FLAGS} - nvcc exponential_pauli.cu -o exponential_pauli ${CXX_FLAGS} - nvcc expectation.cu -o expectation ${CXX_FLAGS} - nvcc sampler.cu -o sampler ${CXX_FLAGS} - nvcc measure_zbasis.cu -o measure_zbasis ${CXX_FLAGS} - nvcc batch_measure.cu -o batch_measure ${CXX_FLAGS} + nvcc gate_application.cu -o gate_application ${CXX_FLAGS} + nvcc permutation_matrix.cu -o permutation_matrix ${CXX_FLAGS} + nvcc diagonal_matrix.cu -o diagonal_matrix ${CXX_FLAGS} + nvcc exponential_pauli.cu -o exponential_pauli ${CXX_FLAGS} + nvcc expectation.cu -o expectation ${CXX_FLAGS} + nvcc expectation_pauli.cu -o expectation_pauli ${CXX_FLAGS} + nvcc sampler.cu -o sampler
${CXX_FLAGS} + nvcc measure_zbasis.cu -o measure_zbasis ${CXX_FLAGS} + nvcc batch_measure.cu -o batch_measure ${CXX_FLAGS} + nvcc accessor_get.cu -o accessor_get ${CXX_FLAGS} + nvcc accessor_set.cu -o accessor_set ${CXX_FLAGS} clean: rm -f gate_application \ + permutation_matrix \ + diagonal_matrix \ exponential_pauli \ expectation \ + expectation_pauli \ sampler \ measure_zbasis \ - batch_measure + batch_measure \ + accessor_get \ + accessor_set diff --git a/samples/custatevec/README.md b/samples/custatevec/README.md index bcb639f..6f5ec4b 100644 --- a/samples/custatevec/README.md +++ b/samples/custatevec/README.md @@ -25,10 +25,10 @@ make -j8 * **Supported SM Architectures:** SM 7.0, SM 7.5, SM 8.0, SM 8.6 * **Supported OSes:** Linux -* **Supported CPU Architectures**: x86_64, arm64 +* **Supported CPU Architectures**: x86_64, arm64, ppc64le * **Language**: `C++11` # Prerequisites * [CUDA 11.4 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). -* [CMake 3.13](https://cmake.org/download/) or above on Windows +* [CMake 3.13](https://cmake.org/download/) or above diff --git a/samples/custatevec/accessor_get.cu b/samples/custatevec/accessor_get.cu new file mode 100644 index 0000000..d816ff2 --- /dev/null +++ b/samples/custatevec/accessor_get.cu @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - Neither the name(s) of the copyright holder(s) nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
+#include <cuComplex.h> // cuDoubleComplex +#include <custatevec.h> // custatevecApplyMatrix +#include <stdio.h> // printf +#include <stdlib.h> // EXIT_FAILURE + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const int nIndexBits = 3; + const int nSvSize = (1 << nIndexBits); + + const int bitOrderingLen = 2; + const int bitOrdering[] = {2, 1}; + + const int maskLen = 1; + const int maskBitString[] = {1}; + const int maskOrdering[] = {0}; + + const int bufferSize = 3; + const int accessBegin = 1; + const int accessEnd = 4; + + cuDoubleComplex h_sv[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}}; + cuDoubleComplex buffer[] = {{ 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}}; + cuDoubleComplex buffer_result[] = {{ 0.3, 0.3}, { 0.1, 0.2}, { 0.4, 0.5}}; + + custatevecAccessorDescriptor_t accessor; + + cuDoubleComplex *d_sv; + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_sv, nSvSize * sizeof(cuDoubleComplex)) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(d_sv, h_sv, nSvSize * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice) ); + + //---------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + void* extraWorkspace = nullptr; + size_t extraWorkspaceSizeInBytes = 0; + + // create accessor and check the size of external workspace + HANDLE_ERROR( custatevecAccessor_createReadOnly( + handle, d_sv, CUDA_C_64F, nIndexBits, &accessor, bitOrdering, bitOrderingLen, + maskBitString, maskOrdering, maskLen, &extraWorkspaceSizeInBytes) ); + + // allocate external workspace if necessary + if (extraWorkspaceSizeInBytes > 0) + HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes) ); + + // set external workspace + HANDLE_ERROR( custatevecAccessor_setExtraWorkspace( + handle, &accessor, extraWorkspace, extraWorkspaceSizeInBytes) ); + + // get state vector components + HANDLE_ERROR( custatevecAccessor_get( + handle, &accessor, buffer, accessBegin, accessEnd) ); + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //---------------------------------------------------------------------------------------------- + + HANDLE_CUDA_ERROR( cudaMemcpy(h_sv, d_sv, nSvSize * sizeof(cuDoubleComplex), + cudaMemcpyDeviceToHost) ); + + bool correct = true; + for (int i = 0; i < bufferSize; i++) { + if (!almost_equal(buffer[i], buffer_result[i])) { + correct = false; + break; + } + } + + HANDLE_CUDA_ERROR( cudaFree(d_sv) ); + if (extraWorkspaceSizeInBytes) + HANDLE_CUDA_ERROR( cudaFree(extraWorkspace) ); + + if (correct) { + printf("accessor_get example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("accessor_get example FAILED: wrong result\n"); + return EXIT_FAILURE; + } + +} diff --git a/samples/custatevec/accessor_set.cu b/samples/custatevec/accessor_set.cu new file mode 100644 index 0000000..19da9d9 --- /dev/null +++ b/samples/custatevec/accessor_set.cu @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - Neither the name(s) of the copyright holder(s) nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc. +#include <cuComplex.h> // cuDoubleComplex +#include <custatevec.h> // custatevecApplyMatrix +#include <stdio.h> // printf +#include <stdlib.h> // EXIT_FAILURE + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const int nIndexBits = 3; + const int nSvSize = (1 << nIndexBits); + + const int bitOrderingLen = 3; + const int bitOrdering[] = {1, 2, 0}; + + const int maskLen = 0; + + cuDoubleComplex h_sv[] = {{ 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, + { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}}; + cuDoubleComplex h_sv_result[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}}; + cuDoubleComplex buffer[] = {{ 0.0, 0.0}, { 0.1, 0.1}, { 0.2, 0.2}, { 0.3, 0.4}, + { 0.0, 0.1}, { 0.1, 0.2}, { 0.3, 0.3}, { 0.4, 0.5}}; + + custatevecAccessorDescriptor_t accessor; + + cuDoubleComplex *d_sv; + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_sv, nSvSize * sizeof(cuDoubleComplex)) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(d_sv, h_sv, nSvSize * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice) ); + + //---------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + void* extraWorkspace = nullptr; + size_t extraWorkspaceSizeInBytes = 0; + + // create accessor and check the size of external workspace + HANDLE_ERROR( custatevecAccessor_create( + handle, d_sv, CUDA_C_64F, nIndexBits, &accessor, bitOrdering, bitOrderingLen, + nullptr, nullptr, maskLen, &extraWorkspaceSizeInBytes) ); + + // allocate external workspace if necessary + if (extraWorkspaceSizeInBytes > 0) + HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes) ); + + // set external workspace + HANDLE_ERROR( custatevecAccessor_setExtraWorkspace( + handle, &accessor, extraWorkspace, extraWorkspaceSizeInBytes) ); + + // set state vector components + HANDLE_ERROR( custatevecAccessor_set( + handle, &accessor, buffer, 0, nSvSize) ); + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //---------------------------------------------------------------------------------------------- + + HANDLE_CUDA_ERROR( cudaMemcpy(h_sv, d_sv, nSvSize * sizeof(cuDoubleComplex),
cudaMemcpyDeviceToHost) ); + + bool correct = true; + for (int i = 0; i < nSvSize; i++) { + if (!almost_equal(h_sv[i], h_sv_result[i])) { + correct = false; + break; + } + } + + HANDLE_CUDA_ERROR( cudaFree(d_sv) ); + if (extraWorkspaceSizeInBytes) + HANDLE_CUDA_ERROR( cudaFree(extraWorkspace) ); + + if (correct) { + printf("accessor_set example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("accessor_set example FAILED: wrong result\n"); + return EXIT_FAILURE; + } + +} diff --git a/samples/custatevec/batch_measure.cu b/samples/custatevec/batch_measure.cu index 530f166..a55df73 100644 --- a/samples/custatevec/batch_measure.cu +++ b/samples/custatevec/batch_measure.cu @@ -97,11 +97,11 @@ int main(void) { HANDLE_CUDA_ERROR( cudaFree(d_sv) ); if (correct) { - printf("example PASSED\n"); + printf("batch_measure example PASSED\n"); return EXIT_SUCCESS; } else { - printf("example FAILED: wrong result\n"); + printf("batch_measure example FAILED: wrong result\n"); return EXIT_FAILURE; } diff --git a/samples/custatevec/diagonal_matrix.cu b/samples/custatevec/diagonal_matrix.cu new file mode 100644 index 0000000..af9e798 --- /dev/null +++ b/samples/custatevec/diagonal_matrix.cu @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - Neither the name(s) of the copyright holder(s) nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
+#include <cuComplex.h> // cuDoubleComplex +#include <custatevec.h> // custatevecApplyMatrix +#include <stdio.h> // printf +#include <stdlib.h> // EXIT_FAILURE + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const int nIndexBits = 3; + const int nSvSize = (1 << nIndexBits); + const int nBasisBits = 1; + const int maskLen = 0; + const int adjoint = 0; + + const int basisBits[] = {2}; + + cuDoubleComplex h_sv[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}}; + cuDoubleComplex h_sv_result[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2,-0.2}, { 0.3,-0.3}, { 0.4,-0.3}, { 0.5,-0.4}}; + cuDoubleComplex diagonals[] = {{1.0, 0.0}, {0.0, -1.0}}; + + cuDoubleComplex *d_sv; + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_sv, nSvSize * sizeof(cuDoubleComplex)) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(d_sv, h_sv, nSvSize * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice) ); + + //---------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + void* extraWorkspace = nullptr; + size_t extraWorkspaceSizeInBytes = 0; + + // check the size of external workspace + HANDLE_ERROR( custatevecApplyGeneralizedPermutationMatrix_bufferSize( + handle, CUDA_C_64F, nIndexBits, nullptr, diagonals, CUDA_C_64F, basisBits, + nBasisBits, maskLen, &extraWorkspaceSizeInBytes) ); + + // allocate external workspace if necessary + if (extraWorkspaceSizeInBytes > 0) + HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes) ); + + // apply matrix + HANDLE_ERROR( custatevecApplyGeneralizedPermutationMatrix( + handle, d_sv, CUDA_C_64F, nIndexBits, nullptr, diagonals, CUDA_C_64F, adjoint, + basisBits, nBasisBits, nullptr, nullptr, maskLen, extraWorkspace, + extraWorkspaceSizeInBytes) ); + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //---------------------------------------------------------------------------------------------- + + HANDLE_CUDA_ERROR( cudaMemcpy(h_sv, d_sv, nSvSize * sizeof(cuDoubleComplex), + cudaMemcpyDeviceToHost) ); + + bool correct = true; + for (int i = 0; i < nSvSize; i++) { + if (!almost_equal(h_sv[i], h_sv_result[i])) { + correct = false; + break; + } + } + + HANDLE_CUDA_ERROR( cudaFree(d_sv) ); + if (extraWorkspaceSizeInBytes) + HANDLE_CUDA_ERROR( cudaFree(extraWorkspace) ); + + if (correct) { + printf("diagonal_matrix example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("diagonal_matrix example FAILED: wrong result\n"); + return EXIT_FAILURE; + } + +} diff --git a/samples/custatevec/expectation.cu b/samples/custatevec/expectation.cu index c54cb93..bea4815 100644 --- a/samples/custatevec/expectation.cu +++ b/samples/custatevec/expectation.cu @@ -43,8 +43,8 @@ int main(void) { const int basisBits[] = {1}; - cuDoubleComplex expect; - cuDoubleComplex expect_result = {4.1, 0.0}; + cuDoubleComplex expectationValue; + cuDoubleComplex expectationValueResult = {4.1, 0.0}; cuDoubleComplex h_sv[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}}; @@ -77,8 +77,8 @@ int main(void) { // compute expectation HANDLE_ERROR( custatevecExpectation( - handle, d_sv, CUDA_C_64F, nIndexBits, &expect, CUDA_C_64F, nullptr, matrix, - CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW, basisBits, nBasisBits, + handle, d_sv, CUDA_C_64F, nIndexBits, &expectationValue, CUDA_C_64F, nullptr, + matrix, CUDA_C_64F,
CUSTATEVEC_MATRIX_LAYOUT_ROW, basisBits, nBasisBits, CUSTATEVEC_COMPUTE_64F, extraWorkspace, extraWorkspaceSizeInBytes) ); // destroy handle @@ -90,7 +90,7 @@ int main(void) { cudaMemcpyDeviceToHost) ); bool correct = true; - if ( !almost_equal(expect, expect_result) ) { + if ( !almost_equal(expectationValue, expectationValueResult) ) { correct = false; } @@ -99,11 +99,11 @@ int main(void) { HANDLE_CUDA_ERROR( cudaFree(extraWorkspace) ); if (correct) { - printf("example PASSED\n"); + printf("expectation example PASSED\n"); return EXIT_SUCCESS; } else { - printf("example FAILED: wrong result\n"); + printf("expectation example FAILED: wrong result\n"); return EXIT_FAILURE; } diff --git a/samples/custatevec/expectation_pauli.cu b/samples/custatevec/expectation_pauli.cu new file mode 100644 index 0000000..e25f99c --- /dev/null +++ b/samples/custatevec/expectation_pauli.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - Neither the name(s) of the copyright holder(s) nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
+#include <cuComplex.h> // cuDoubleComplex +#include <custatevec.h> // custatevecApplyMatrix +#include <stdio.h> // printf +#include <stdlib.h> // EXIT_FAILURE +#include <cmath> // acos + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const int nIndexBits = 3; + const int nSvSize = (1 << nIndexBits); + + const int nPauliOperatorArrays = 2; + const custatevecPauli_t pauliOperatorsI[] = {CUSTATEVEC_PAULI_I}; + const custatevecPauli_t pauliOperatorsXY[] = {CUSTATEVEC_PAULI_X, CUSTATEVEC_PAULI_Y}; + const custatevecPauli_t* pauliOperatorsArray[] = {pauliOperatorsI, pauliOperatorsXY}; + + const unsigned nBasisBitsArray[] = {1, 2}; + const int basisBitsI[] = {1}; + const int basisBitsXY[] = {1, 2}; + const int* basisBitsArray[] = {basisBitsI, basisBitsXY}; + + double expectationValues[nPauliOperatorArrays]; + double expectationValues_result[] = {1.0, -0.14}; + cuDoubleComplex h_sv[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}}; + + cuDoubleComplex *d_sv; + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_sv, nSvSize * sizeof(cuDoubleComplex)) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(d_sv, h_sv, nSvSize * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice) ); + + //---------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + // compute expectation values on the Pauli bases + HANDLE_ERROR( custatevecExpectationsOnPauliBasis( + handle, d_sv, CUDA_C_64F, nIndexBits, expectationValues, pauliOperatorsArray, + basisBitsArray, nBasisBitsArray, nPauliOperatorArrays) ); + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //---------------------------------------------------------------------------------------------- + + bool correct = true; + for (int i = 0; i < nPauliOperatorArrays; i++) { + if (!almost_equal(expectationValues[i], expectationValues_result[i]) ) { + correct = false; + break; + } + } + + HANDLE_CUDA_ERROR( cudaFree(d_sv) ); + + if (correct) { + printf("expectation_pauli example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("expectation_pauli example FAILED: wrong result\n"); + return EXIT_FAILURE; + } + +} diff --git a/samples/custatevec/exponential_pauli.cu b/samples/custatevec/exponential_pauli.cu index b7f81af..5b097ed 100644 --- a/samples/custatevec/exponential_pauli.cu +++ b/samples/custatevec/exponential_pauli.cu @@ -92,11 +92,11 @@ int main(void) { HANDLE_CUDA_ERROR( cudaFree(d_sv) ); if (correct) { - printf("example PASSED\n"); + printf("exponential_pauli example PASSED\n"); return EXIT_SUCCESS; } else { - printf("example FAILED: wrong result\n"); + printf("exponential_pauli example FAILED: wrong result\n"); return EXIT_FAILURE; } diff --git a/samples/custatevec/gate_application.cu b/samples/custatevec/gate_application.cu index b3edb0b..b679802 100644 --- a/samples/custatevec/gate_application.cu +++ b/samples/custatevec/gate_application.cu @@ -104,11 +104,11 @@ int main(void) { HANDLE_CUDA_ERROR( cudaFree(extraWorkspace) ); if (correct) { - printf("example PASSED\n"); + printf("gate_application example PASSED\n"); return EXIT_SUCCESS; } else { - printf("example FAILED: wrong result\n"); + printf("gate_application example FAILED: wrong result\n"); return EXIT_FAILURE; } diff --git a/samples/custatevec/helper.hpp b/samples/custatevec/helper.hpp index 778e586..6d3ed63 100644 --- a/samples/custatevec/helper.hpp +++ b/samples/custatevec/helper.hpp @@ -46,3 +46,9 @@ bool almost_equal(cuDoubleComplex x, cuDoubleComplex y) { const
cuDoubleComplex diff = cuCsub(x, y); return (cuCabs(diff) < eps); } + +bool almost_equal(double x, double y) { + const double eps = 1.0e-5; + const double diff = x - y; + return (fabs(diff) < eps); +} diff --git a/samples/custatevec/measure_zbasis.cu b/samples/custatevec/measure_zbasis.cu index fee408f..792a590 100644 --- a/samples/custatevec/measure_zbasis.cu +++ b/samples/custatevec/measure_zbasis.cu @@ -94,11 +94,11 @@ int main(void) { HANDLE_CUDA_ERROR( cudaFree(d_sv) ); if (correct) { - printf("example PASSED\n"); + printf("measure_zbasis example PASSED\n"); return EXIT_SUCCESS; } else { - printf("example FAILED: wrong result\n"); + printf("measure_zbasis example FAILED: wrong result\n"); return EXIT_FAILURE; } diff --git a/samples/custatevec/permutation_matrix.cu b/samples/custatevec/permutation_matrix.cu new file mode 100644 index 0000000..59c5b14 --- /dev/null +++ b/samples/custatevec/permutation_matrix.cu @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - Neither the name(s) of the copyright holder(s) nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
+#include <cuComplex.h> // cuDoubleComplex +#include <custatevec.h> // custatevecApplyMatrix +#include <stdio.h> // printf +#include <stdlib.h> // EXIT_FAILURE + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const int nIndexBits = 3; + const int nSvSize = (1 << nIndexBits); + const int nBasisBits = 2; + const int maskLen = 1; + const int adjoint = 0; + + const int basisBits[] = {0, 1}; + const int maskOrdering[] = {2}; + const int maskBitString[] = {1}; + custatevecIndex_t permutation[] = {0, 2, 1, 3}; + + cuDoubleComplex h_sv[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}}; + cuDoubleComplex h_sv_result[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, {-0.4, 0.3}, {-0.3, 0.3}, { 0.4, 0.5}}; + cuDoubleComplex diagonals[] = {{1.0, 0.0}, {0.0, 1.0}, {0.0, 1.0}, {1.0, 0.0}}; + + cuDoubleComplex *d_sv; + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_sv, nSvSize * sizeof(cuDoubleComplex)) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(d_sv, h_sv, nSvSize * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice) ); + + //---------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + void* extraWorkspace = nullptr; + size_t extraWorkspaceSizeInBytes = 0; + + // check the size of external workspace + HANDLE_ERROR( custatevecApplyGeneralizedPermutationMatrix_bufferSize( + handle, CUDA_C_64F, nIndexBits, permutation, diagonals, CUDA_C_64F, basisBits, + nBasisBits, maskLen, &extraWorkspaceSizeInBytes) ); + + // allocate external workspace if necessary + if (extraWorkspaceSizeInBytes > 0) + HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes) ); + + // apply matrix + HANDLE_ERROR( custatevecApplyGeneralizedPermutationMatrix( + handle, d_sv, CUDA_C_64F, nIndexBits, permutation, diagonals, CUDA_C_64F, + adjoint, basisBits, nBasisBits, maskBitString, maskOrdering, maskLen, + extraWorkspace, extraWorkspaceSizeInBytes) ); + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //---------------------------------------------------------------------------------------------- + + HANDLE_CUDA_ERROR( cudaMemcpy(h_sv, d_sv, nSvSize * sizeof(cuDoubleComplex), + cudaMemcpyDeviceToHost) ); + + bool correct = true; + for (int i = 0; i < nSvSize; i++) { + if (!almost_equal(h_sv[i], h_sv_result[i])) { + correct = false; + break; + } + } + + HANDLE_CUDA_ERROR( cudaFree(d_sv) ); + if (extraWorkspaceSizeInBytes) + HANDLE_CUDA_ERROR( cudaFree(extraWorkspace) ); + + if (correct) { + printf("permutation_matrix example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("permutation_matrix example FAILED: wrong result\n"); + return EXIT_FAILURE; + } + +} diff --git a/samples/custatevec/sampler.cu b/samples/custatevec/sampler.cu index 478b1b6..f7206ee 100644 --- a/samples/custatevec/sampler.cu +++ b/samples/custatevec/sampler.cu @@ -110,11 +110,11 @@ int main(void) { HANDLE_CUDA_ERROR( cudaFree(extraWorkspace) ); if (correct) { - printf("example PASSED\n"); + printf("sampler example PASSED\n"); return EXIT_SUCCESS; } else { - printf("example FAILED: wrong result\n"); + printf("sampler example FAILED: wrong result\n"); return EXIT_FAILURE; } } diff --git a/samples/cutensornet/CMakeLists.txt b/samples/cutensornet/CMakeLists.txt new file mode 100644 index 0000000..0ea8036 --- /dev/null +++ b/samples/cutensornet/CMakeLists.txt @@ -0,0 +1,186 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION &
AFFILIATES. All rights reserved. +# +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# ---[ Check cmake version. +cmake_minimum_required(VERSION 3.12.0 FATAL_ERROR) + +include(GNUInstallDirs) + +# ---[ Project specification. +project(cutensornet_example LANGUAGES C CXX CUDA) + +# ########################################## +# cuTENSOR_example build mode +# ########################################## + +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "Setting build type to 'Debug' as none was specified.") + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Choose the type of build."
FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +else() + message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") +endif() + +# ########################################## +# cuTENSOR_example dependencies +# ########################################## + +find_package(CUDA 10.1 REQUIRED) +include_directories("${CUDA_INCLUDE_DIRS}") + +if(NOT DEFINED ENV{CUTENSOR_ROOT} AND NOT DEFINED CUTENSOR_ROOT) + message(FATAL_ERROR "CUTENSOR_ROOT not set!") +else() + if(DEFINED ENV{CUTENSOR_ROOT}) + set(CUTENSOR_ROOT "$ENV{CUTENSOR_ROOT}") + endif() + message("-- Looking for cuTENSOR in ${CUTENSOR_ROOT}") + if(NOT EXISTS ${CUTENSOR_ROOT}) + message(FATAL_ERROR "Cannot find CUTENSOR_ROOT") + endif() +endif() + +if(NOT DEFINED ENV{CUTENSORNET_ROOT} AND NOT DEFINED CUTENSORNET_ROOT) + message(FATAL_ERROR "CUTENSORNET_ROOT not set!") +else() + if(DEFINED ENV{CUTENSORNET_ROOT}) + set(CUTENSORNET_ROOT "$ENV{CUTENSORNET_ROOT}") + endif() + message("-- Looking for cuTENSORNet in ${CUTENSORNET_ROOT}") + if(NOT EXISTS ${CUTENSORNET_ROOT}) + message(FATAL_ERROR "Cannot find CUTENSORNET_ROOT") + endif() +endif() + +if(NOT TARGET cutensor) + add_library(cutensor SHARED IMPORTED) + if(WIN32) + set(CUTENSOR_LIBRARY_NAME "cutensor.dll") + set(CUTENSOR_LIBRARY_DEF "cutensor.lib") + else() + set(CUTENSOR_LIBRARY_NAME "libcutensor.so") + endif() + set_target_properties(cutensor PROPERTIES + IMPORTED_LOCATION "${CUTENSOR_ROOT}/lib/${CUTENSOR_LIBRARY_NAME}" + IMPORTED_IMPLIB "${CUTENSOR_ROOT}/lib/${CUTENSOR_LIBRARY_DEF}" + INTERFACE_INCLUDE_DIRECTORIES "${CUTENSOR_ROOT}/include") +endif() + +if(NOT TARGET cutensornet) + add_library(cutensornet SHARED IMPORTED) + if(WIN32) + set(CUTENSORNET_LIBRARY_NAME "cutensornet.dll") + set(CUTENSORNET_LIBRARY_DEF "cutensornet.lib") + else() + set(CUTENSORNET_LIBRARY_NAME "libcutensornet.so") + endif() + set_target_properties(cutensornet PROPERTIES + IMPORTED_LOCATION "${CUTENSORNET_ROOT}/lib/${CUTENSORNET_LIBRARY_NAME}" + IMPORTED_IMPLIB "${CUTENSORNET_ROOT}/lib/${CUTENSORNET_LIBRARY_DEF}" + INTERFACE_INCLUDE_DIRECTORIES "${CUTENSORNET_ROOT}/include") +endif() + +# Installation directories +if(NOT DEFINED CUTENSORNET_EXAMPLE_BINARY_INSTALL_DIR) + message(WARNING "CUTENSORNET_EXAMPLE_BINARY_INSTALL_DIR not set, setting to cutensornet_example/bin") + set(CUTENSORNET_EXAMPLE_BINARY_INSTALL_DIR "cutensornet_example/bin") +endif() + +# ########################################## +# cuTENSOR_example building flags +# ########################################## + +# Global CXX/CUDA flags + +# Global CXX flags/options +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +# Global CUDA CXX flags/options +set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) +set(CMAKE_CUDA_STANDARD 11) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_EXTENSIONS OFF) + +# ########################################## +# cuTENSOR_example target +# ########################################## + +function(add_cutensornet_example GROUP_TARGET EXAMPLE_NAME EXAMPLE_SOURCES) + list(GET EXAMPLE_SOURCES 0 EXAMPLE_MAIN_SOURCE) + get_filename_component(EXAMPLE_TARGET ${EXAMPLE_MAIN_SOURCE} NAME_WE) + add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCES}) + target_include_directories(${EXAMPLE_TARGET} + PUBLIC + ${CUDA_INCLUDE_DIRS} + ${CUTENSOR_ROOT}/include + ${CUTENSORNET_ROOT}/include + ) + target_link_libraries(${EXAMPLE_TARGET} + PUBLIC + cutensornet + cutensor + cudart + cublasLt + ) + set_target_properties(${EXAMPLE_TARGET} 
PROPERTIES + POSITION_INDEPENDENT_CODE ON + CUDA_SEPARABLE_COMPILATION ON + ) + + # Install example + install( + TARGETS ${EXAMPLE_TARGET} + RUNTIME + DESTINATION ${CUTENSORNET_EXAMPLE_BINARY_INSTALL_DIR} + PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ + ) + + add_dependencies(${GROUP_TARGET} ${EXAMPLE_TARGET}) +endfunction() + +add_custom_target(cutensornet_examples) + +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet" tensornet_example.cu) + +# ########################################## +# cuTENSOR_example directories +# ########################################## + +# By default put binaries in build/bin (pre-install) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +# ########################################## +# Install examples +# ########################################## + +IF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + SET(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR} CACHE PATH "" FORCE) +ENDIF() diff --git a/samples/cutensornet/LICENSE b/samples/cutensornet/LICENSE new file mode 100644 index 0000000..c33765e --- /dev/null +++ b/samples/cutensornet/LICENSE @@ -0,0 +1,27 @@ +SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: BSD-3-Clause + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
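The Makefile and README below build the C sample, tensornet_example.cu, which walks through the cuTensorNet flow step by step (network descriptor, path optimization, contraction plan, optional autotuning, contraction). For comparison, the cuQuantum Python `Network` API exercised by the tests earlier in this diff wraps the same flow. A minimal sketch, assuming the `cuquantum` package is installed and a GPU is available (the einsum expression and shapes here are made up for illustration):

```
import numpy as np
from cuquantum import Network

# Three small operands for D_{m,n} = A_{m,k} B_{k,l} C_{l,n}
a, b, c = (np.random.rand(8, 8) for _ in range(3))

net = Network("mk,kl,ln->mn", a, b, c)  # create the network (descriptor)
net.contract_path()                     # find a contraction path (optimizer)
net.autotune(iterations=5)              # optional: pick the fastest kernels
result = net.contract()                 # execute the contraction (plan + contract)
net.free()                              # release library resources
```

Each method maps onto one stage of the C workflow described in the README that follows.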
diff --git a/samples/cutensornet/Makefile b/samples/cutensornet/Makefile new file mode 100644 index 0000000..9390b06 --- /dev/null +++ b/samples/cutensornet/Makefile @@ -0,0 +1,7 @@ +CXX_FLAGS=-std=c++11 -I${CUTENSORNET_ROOT}/include -L${CUTENSORNET_ROOT}/lib64 -L${CUTENSORNET_ROOT}/lib -lcutensornet -lcudart -lstdc++ + +all: + nvcc tensornet_example.cu -o tensornet_example ${CXX_FLAGS} + +clean: + rm -f tensornet_example tensornet_example.o diff --git a/samples/cutensornet/README.md b/samples/cutensornet/README.md new file mode 100644 index 0000000..5aa6cee --- /dev/null +++ b/samples/cutensornet/README.md @@ -0,0 +1,37 @@ +# cuTensorNet - Samples + +* [Documentation](https://docs.nvidia.com/cuda/cutensornet/index.html) + +# Install + +## Linux + +You can use make to compile the cuTensorNet samples. The variable CUTENSORNET_ROOT needs to be defined if cuTensorNet is not installed in the CUDA installation folder. + +With make + +``` +export CUTENSORNET_ROOT= +make -j8 +``` + +# Support + +* **Supported SM Architectures:** SM 7.0, SM 7.5, SM 8.0, SM 8.6 +* **Supported OSes:** Linux +* **Supported CPU Architectures**: x86_64, arm64 +* **Language**: `C++11` + +# Prerequisites + +* [CUDA 1X.X toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). + +# Description +This sample helps users get familiar with cuTensorNet. +It provides an example of calling cuTensorNet to find a contraction path as well as to perform the contraction. +The sample consists of: +* Defining a tensor network (create a contraction descriptor using "cutensornetCreateNetworkDescriptor"). +* Finding a close-to-optimal contraction order via "cutensornetContractionOptimize". Users can control some parameters of "cutensornetContractionOptimize" (e.g., the path finder) using the "cutensornetContractionOptimizerConfigSetAttribute" function. Users can also provide their own path and use the SetAttribute tool to set the Info structure to that path. +* Creating a plan for performing the contraction using "cutensornetCreateContractionPlan". This step prepares a plan for executing the list of pairwise contractions provided by the path. +* Optionally calling "cutensornetContractionAutotune" to perform autotuning, which chooses the best-performing kernels for the given path so that the winning kernels are used for all subsequent calls to "cutensornetContraction". Autotuning pays off in particular when "cutensornetContraction" is called multiple times with the same plan. +* Performing the contraction using "cutensornetContraction". diff --git a/samples/cutensornet/tensornet_example.cu b/samples/cutensornet/tensornet_example.cu new file mode 100644 index 0000000..4f57081 --- /dev/null +++ b/samples/cutensornet/tensornet_example.cu @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - Neither the name(s) of the copyright holder(s) nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <stdio.h> + +#include <unordered_map> +#include <vector> +#include <cassert> + +#include <cuda_runtime.h> +#include <cutensornet.h> +#include <cutensor.h> + +#define HANDLE_ERROR(x) \ +{ const auto err = x; \ +if( err != CUTENSORNET_STATUS_SUCCESS ) \ +{ printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); return err; } \ +}; + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); return err; } \ +}; + +struct GPUTimer +{ + GPUTimer() + { + cudaEventCreate(&start_); + cudaEventCreate(&stop_); + cudaEventRecord(start_, 0); + } + + ~GPUTimer() + { + cudaEventDestroy(start_); + cudaEventDestroy(stop_); + } + + void start() + { + cudaEventRecord(start_, 0); + } + + float seconds() + { + cudaEventRecord(stop_, 0); + cudaEventSynchronize(stop_); + float time; + cudaEventElapsedTime(&time, start_, stop_); + return time * 1e-3; + } + private: + cudaEvent_t start_, stop_; +}; + + +int main() +{ + const size_t cuTensornetVersion = cutensornetGetVersion(); + printf("cuTensorNet-vers:%ld\n",cuTensornetVersion); + + cudaDeviceProp prop; + int32_t deviceId = -1; + HANDLE_CUDA_ERROR( cudaGetDevice(&deviceId) ); + HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) ); + + printf("===== device info ======\n"); + printf("GPU-name:%s\n", prop.name); + printf("GPU-clock:%d\n", prop.clockRate); + printf("GPU-memoryClock:%d\n", prop.memoryClockRate); + printf("GPU-nSM:%d\n", prop.multiProcessorCount); + printf("GPU-major:%d\n", prop.major); + printf("GPU-minor:%d\n", prop.minor); + printf("========================\n"); + + typedef float floatType; + + cudaDataType_t typeData = CUDA_R_32F; + cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F; + + printf("Include headers and define data types\n"); + + /********************** + * Computing: D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y} + **********************/ + + constexpr int32_t numInputs = 3; + + // Create vector of modes + std::vector<int32_t> modesA{'m','h','k','n'}; + std::vector<int32_t> modesB{'u','k','h'}; + std::vector<int32_t> modesC{'x','u','y'}; + std::vector<int32_t> modesD{'m','x','n','y'}; + + // Extents + std::unordered_map<int32_t, int64_t> extent; + extent['m'] = 96; + extent['n'] = 96; + extent['u'] = 96; + extent['h'] = 64; + extent['k'] = 64; + extent['x'] = 64; + extent['y'] = 64; + + // Create a vector
+    // Create a vector of extents for each tensor
+    std::vector<int64_t> extentA;
+    for (auto mode : modesA)
+        extentA.push_back(extent[mode]);
+    std::vector<int64_t> extentB;
+    for (auto mode : modesB)
+        extentB.push_back(extent[mode]);
+    std::vector<int64_t> extentC;
+    for (auto mode : modesC)
+        extentC.push_back(extent[mode]);
+    std::vector<int64_t> extentD;
+    for (auto mode : modesD)
+        extentD.push_back(extent[mode]);
+
+    printf("Define network, modes, and extents\n");
+
+    /**********************
+    * Allocating data
+    **********************/
+
+    size_t elementsA = 1;
+    for (auto mode : modesA)
+        elementsA *= extent[mode];
+    size_t elementsB = 1;
+    for (auto mode : modesB)
+        elementsB *= extent[mode];
+    size_t elementsC = 1;
+    for (auto mode : modesC)
+        elementsC *= extent[mode];
+    size_t elementsD = 1;
+    for (auto mode : modesD)
+        elementsD *= extent[mode];
+
+    size_t sizeA = sizeof(floatType) * elementsA;
+    size_t sizeB = sizeof(floatType) * elementsB;
+    size_t sizeC = sizeof(floatType) * elementsC;
+    size_t sizeD = sizeof(floatType) * elementsD;
+    printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC + sizeD)/1024./1024./1024.);
+
+    void* rawDataIn_d[numInputs];
+    void* D_d;
+    HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[0], sizeA));
+    HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[1], sizeB));
+    HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[2], sizeC));
+    HANDLE_CUDA_ERROR(cudaMalloc((void**) &D_d, sizeD));
+
+    floatType *A = (floatType*) malloc(sizeof(floatType) * elementsA);
+    floatType *B = (floatType*) malloc(sizeof(floatType) * elementsB);
+    floatType *C = (floatType*) malloc(sizeof(floatType) * elementsC);
+    floatType *D = (floatType*) malloc(sizeof(floatType) * elementsD);
+
+    if (A == NULL || B == NULL || C == NULL || D == NULL)
+    {
+        printf("Error: Host allocation of A, B, C, or D.\n");
+        return -1;
+    }
+
+    /**********************
+    * Allocate workspace
+    **********************/
+
+    size_t freeMem, totalMem;
+    HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
+
+    uint64_t worksize = freeMem * 0.9;
+
+    void *work = nullptr;
+    HANDLE_CUDA_ERROR( cudaMalloc(&work, worksize) );
+
+    /*******************
+    * Initialize data
+    *******************/
+
+    for (uint64_t i = 0; i < elementsA; i++)
+        A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+    for (uint64_t i = 0; i < elementsB; i++)
+        B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+    for (uint64_t i = 0; i < elementsC; i++)
+        C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+
+    HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[0], A, sizeA, cudaMemcpyHostToDevice));
+    HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[1], B, sizeB, cudaMemcpyHostToDevice));
+    HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[2], C, sizeC, cudaMemcpyHostToDevice));
+
+    printf("Allocate memory for data and workspace, and initialize data.\n");
+
+    /*************************
+    * cuTensorNet
+    *************************/
+
+    cudaStream_t stream;
+    HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
+
+    cutensornetHandle_t handle;
+    HANDLE_ERROR(cutensornetCreate(&handle));
+
+    const int32_t nmodeA = modesA.size();
+    const int32_t nmodeB = modesB.size();
+    const int32_t nmodeC = modesC.size();
+    const int32_t nmodeD = modesD.size();
+
+    /*******************************
+    * Create Contraction Descriptor
+    *******************************/
+
+    const int32_t* modesIn[] = {modesA.data(), modesB.data(), modesC.data()};
+    int32_t const numModesIn[] = {nmodeA, nmodeB, nmodeC};
+    const int64_t* extentsIn[] = {extentA.data(), extentB.data(), extentC.data()};
+    const int64_t* stridesIn[] = {NULL, NULL, NULL}; // strides are optional; if no stride is provided,
+                                                     // cuTensorNet assumes a generalized column-major data layout
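+    // As an illustration only (hypothetical helper code, not used below):
+    // under the generalized column-major convention assumed above, explicit
+    // strides for, e.g., tensor A could be computed as
+    //     std::vector<int64_t> stridesA(nmodeA);
+    //     int64_t s = 1;
+    //     for (int32_t i = 0; i < nmodeA; ++i) { stridesA[i] = s; s *= extentA[i]; }
+    // and passed via stridesIn[0] = stridesA.data();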
+
+    // Note that pointers allocated via cudaMalloc are aligned to 256-byte
+    // boundaries by default; however, here we check the pointer alignment explicitly
+    // to demonstrate how one would determine the alignment of arbitrary pointers.
+
+    auto getMaximalPointerAlignment = [](const void* ptr) {
+        const uint64_t ptrAddr = reinterpret_cast<uint64_t>(ptr);
+        uint32_t alignment = 1;
+        while(ptrAddr % (alignment * 2) == 0 &&
+              alignment < 256) // we stop once the alignment reaches 256 bytes
+                               // (we could keep going, but any alignment of 256 bytes or more is equally fine)
+        {
+            alignment *= 2;
+        }
+        return alignment;
+    };
+    const uint32_t alignmentsIn[] = {getMaximalPointerAlignment(rawDataIn_d[0]),
+                                     getMaximalPointerAlignment(rawDataIn_d[1]),
+                                     getMaximalPointerAlignment(rawDataIn_d[2])};
+    const uint32_t alignmentOut = getMaximalPointerAlignment(D_d);
+
+    // setup tensor network
+    cutensornetNetworkDescriptor_t descNet;
+    HANDLE_ERROR(cutensornetCreateNetworkDescriptor(handle,
+                 numInputs, numModesIn, extentsIn, stridesIn, modesIn, alignmentsIn,
+                 nmodeD, extentD.data(), /*stridesOut = */NULL, modesD.data(), alignmentOut,
+                 typeData, typeCompute,
+                 &descNet));
+
+    printf("Initialize the cuTensorNet library and create a network descriptor.\n");
+
+    /*******************************
+    * Find "optimal" contraction order and slicing
+    *******************************/
+
+    cutensornetContractionOptimizerConfig_t optimizerConfig;
+    HANDLE_ERROR(cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig));
+
+    // Set the value of the partitioner imbalance factor, if desired
+    int imbalance_factor = 30;
+    HANDLE_ERROR(cutensornetContractionOptimizerConfigSetAttribute(
+                 handle,
+                 optimizerConfig,
+                 CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_IMBALANCE_FACTOR,
+                 &imbalance_factor,
+                 sizeof(imbalance_factor)));
+
+    cutensornetContractionOptimizerInfo_t optimizerInfo;
+    HANDLE_ERROR(cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo));
+
+    HANDLE_ERROR(cutensornetContractionOptimize(handle,
+                 descNet,
+                 optimizerConfig,
+                 worksize,
+                 optimizerInfo));
+
+    int64_t numSlices = 0;
+    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(
+                 handle,
+                 optimizerInfo,
+                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
+                 &numSlices,
+                 sizeof(numSlices)));
+
+    assert(numSlices > 0);
+
+    printf("Find an optimized contraction path with cuTensorNet optimizer.\n");
+
+    /*******************************
+    * Initialize all pair-wise contraction plans (for cuTENSOR)
+    *******************************/
+    cutensornetContractionPlan_t plan;
+    HANDLE_ERROR(cutensornetCreateContractionPlan(handle,
+                 descNet,
+                 optimizerInfo,
+                 worksize,
+                 &plan));
+
+    /*******************************
+    * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
+    *******************************/
+    cutensornetContractionAutotunePreference_t autotunePref;
+    HANDLE_ERROR(cutensornetCreateContractionAutotunePreference(handle,
+                 &autotunePref));
+
+    const int numAutotuningIterations = 5; // may be 0
+    HANDLE_ERROR(cutensornetContractionAutotunePreferenceSetAttribute(
+                 handle,
+                 autotunePref,
+                 CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
+                 &numAutotuningIterations,
+                 sizeof(numAutotuningIterations)));
+
+    // modify the plan again to find the best pair-wise contractions
+    HANDLE_ERROR(cutensornetContractionAutotune(handle,
+                 plan,
+                 rawDataIn_d,
+                 D_d,
+                 work, worksize,
+                 autotunePref,
+                 stream));
+
+    HANDLE_ERROR(cutensornetDestroyContractionAutotunePreference(autotunePref));
+
+    printf("Create a contraction plan for cuTENSOR and optionally auto-tune it.\n");
+
+    /**********************
+    * Run
+    **********************/
+    GPUTimer timer;
+    double minTimeCUTENSOR = 1e100;
+    const int numRuns = 3; // to get stable performance results
+    for (int i=0; i < numRuns; ++i)
+    {
+        cudaMemcpy(D_d, D, sizeD, cudaMemcpyHostToDevice); // restore the output buffer
+        cudaDeviceSynchronize();
+
+        /*
+        * Contract over all slices.
+        *
+        * A user may choose to parallelize this loop across multiple devices.
+        */
+        for(int64_t sliceId=0; sliceId < numSlices; ++sliceId)
+        {
+            timer.start();
+
+            HANDLE_ERROR(cutensornetContraction(handle,
+                         plan,
+                         rawDataIn_d,
+                         D_d,
+                         work, worksize, sliceId, stream));
+
+            // Synchronize and measure timing
+            auto time = timer.seconds();
+            minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
+        }
+    }
+
+    printf("Contract the network, each slice uses the same contraction plan.\n");
+
+    /*************************/
+
+    double flops = -1;
+
+    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(
+                 handle,
+                 optimizerInfo,
+                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
+                 &flops,
+                 sizeof(flops)));
+
+    printf("numSlices: %ld\n", numSlices);
+    printf("%.2f ms / slice\n", minTimeCUTENSOR * 1000.f);
+    printf("%.2f GFLOPS/s\n", flops/1e9/minTimeCUTENSOR);
+
+    HANDLE_ERROR(cutensornetDestroyContractionPlan(plan));
+    HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
+    HANDLE_ERROR(cutensornetDestroyContractionOptimizerConfig(optimizerConfig));
+    HANDLE_ERROR(cutensornetDestroyNetworkDescriptor(descNet));
+    HANDLE_ERROR(cutensornetDestroy(handle));
+
+    if (A) free(A);
+    if (B) free(B);
+    if (C) free(C);
+    if (D) free(D);
+    if (rawDataIn_d[0]) cudaFree(rawDataIn_d[0]);
+    if (rawDataIn_d[1]) cudaFree(rawDataIn_d[1]);
+    if (rawDataIn_d[2]) cudaFree(rawDataIn_d[2]);
+    if (D_d) cudaFree(D_d);
+    if (work) cudaFree(work);
+
+    printf("Free resources and exit.\n");
+
+    return 0;
+}