diff --git a/benchmarks/cuquantum_benchmarks/__init__.py b/benchmarks/cuquantum_benchmarks/__init__.py index 336ad30..1f9e1ee 100644 --- a/benchmarks/cuquantum_benchmarks/__init__.py +++ b/benchmarks/cuquantum_benchmarks/__init__.py @@ -2,4 +2,4 @@ # # SPDX-License-Identifier: BSD-3-Clause -__version__ = '0.3.0' +__version__ = '0.3.1' diff --git a/benchmarks/cuquantum_benchmarks/_utils.py b/benchmarks/cuquantum_benchmarks/_utils.py index 76ecbde..5797be6 100644 --- a/benchmarks/cuquantum_benchmarks/_utils.py +++ b/benchmarks/cuquantum_benchmarks/_utils.py @@ -229,7 +229,7 @@ def get_cpu_name(): if m: return m.group(0).split(':')[-1].strip() else: - assert False, f"getting cpu info failed" + return f"unknown" def get_gpu_driver_version(): diff --git a/benchmarks/cuquantum_benchmarks/backends/__init__.py b/benchmarks/cuquantum_benchmarks/backends/__init__.py index 0cb2cf2..e3de1ef 100644 --- a/benchmarks/cuquantum_benchmarks/backends/__init__.py +++ b/benchmarks/cuquantum_benchmarks/backends/__init__.py @@ -4,15 +4,10 @@ from .backend_cirq import Cirq from .backend_cutn import cuTensorNet -from .backend_pny import (Pny, PnyLightningGpu, PnyLightningCpu, - PnyLightningKokkos, PnyDumper) +from .backend_pny import Pny, PnyLightningGpu, PnyLightningCpu, PnyLightningKokkos from .backend_qsim import Qsim, QsimCuda, QsimCusv, QsimMgpu from .backend_qiskit import Aer, AerCuda, AerCusv, CusvAer from .backend_qulacs import QulacsGpu, QulacsCpu -try: - from .backend_naive import Naive -except ImportError: - Naive = None backends = { @@ -30,12 +25,9 @@ 'pennylane-lightning-gpu': PnyLightningGpu, 'pennylane-lightning-qubit': PnyLightningCpu, 'pennylane-lightning-kokkos': PnyLightningKokkos, - 'pennylane-dumper': PnyDumper, 'qulacs-cpu': QulacsCpu, 'qulacs-gpu': QulacsGpu, } -if Naive: - backends['naive'] = Naive def createBackend(backend_name, ngpus, ncpu_threads, precision, *args, **kwargs): diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_cirq.py 
b/benchmarks/cuquantum_benchmarks/backends/backend_cirq.py index 7e1b087..429b64f 100644 --- a/benchmarks/cuquantum_benchmarks/backends/backend_cirq.py +++ b/benchmarks/cuquantum_benchmarks/backends/backend_cirq.py @@ -2,19 +2,24 @@ # # SPDX-License-Identifier: BSD-3-Clause +import functools import warnings try: import cirq except ImportError: cirq = None - + +try: + from .. import _internal_utils +except ImportError: + _internal_utils = None from .backend import Backend -class Cirq(Backend): +class _Cirq(Backend): - def __init__(self, ngpus, ncpu_threads, precision, *args, **kwargs): + def __init__(self, ngpus, ncpu_threads, precision, *args, identifier=None, **kwargs): if cirq is None: raise RuntimeError("cirq is not installed") if ngpus > 0: @@ -25,7 +30,15 @@ def __init__(self, ngpus, ncpu_threads, precision, *args, **kwargs): raise ValueError("the cirq backend only supports single precision") self.backend = cirq.Simulator() - + self.identifier = identifier + self.version = cirq.__version__ + + def preprocess_circuit(self, circuit, *args, **kwargs): + if _internal_utils is not None: + _internal_utils.preprocess_circuit(self.identifier, circuit, *args, **kwargs) + + return {} + def run(self, circuit, nshots=1024): run_data = {} if nshots > 0: @@ -34,3 +47,6 @@ def run(self, circuit, nshots=1024): results = self.backend.simulate(circuit) post_res = results.measurements['result'] return {'results': results, 'post_results': post_res, 'run_data': run_data} + + +Cirq = functools.partial(_Cirq, identifier='cirq') diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py b/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py index ba58fb7..b7f5925 100644 --- a/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py +++ b/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py @@ -51,6 +51,7 @@ def __init__(self, ngpus, ncpu_threads, precision, **kwargs): opts = cutn.NetworkOptions(handle=self.handle) self.network_opts = opts self.n_samples = 
kwargs.pop('nhypersamples') + self.version = cutn.get_version() def __del__(self): cutn.destroy(self.handle) diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_pny.py b/benchmarks/cuquantum_benchmarks/backends/backend_pny.py index 31f3920..a94b7c5 100644 --- a/benchmarks/cuquantum_benchmarks/backends/backend_pny.py +++ b/benchmarks/cuquantum_benchmarks/backends/backend_pny.py @@ -7,7 +7,6 @@ import os import time import warnings -import sys import numpy as np try: @@ -15,8 +14,11 @@ except ImportError: pennylane = None +try: + from .. import _internal_utils +except ImportError: + _internal_utils = None from .backend import Backend -from .._utils import call_by_root, EarlyReturnError, is_running_mpi # set up a logger @@ -35,16 +37,36 @@ def __init__(self, ngpus, ncpu_threads, precision, *args, identifier=None, **kwa self.ncpu_threads = ncpu_threads self.nqubits = kwargs.pop('nqubits') self.circuit = None + self.version = self.find_version(identifier) - def _make_qnode(self, circuit, nshots=1024, **kwargs): - if self.identifier == "pennylane-lightning-gpu": + def find_version(self, identifier): + if identifier == "pennylane-lightning-gpu": if self.ngpus == 1: try: import pennylane_lightning_gpu except ImportError as e: raise RuntimeError("PennyLane-Lightning-GPU plugin is not installed") from e else: - raise ValueError(f"cannot specify --ngpus > 1 for the backend {self.identifier}") + raise ValueError(f"cannot specify --ngpus > 1 for the backend {identifier}") + ver = pennylane_lightning_gpu.__version__ + elif identifier == "pennylane-lightning-kokkos": + try: + import pennylane_lightning_kokkos + except ImportError as e: + raise RuntimeError("PennyLane-Lightning-Kokkos plugin is not installed") from e + ver = pennylane_lightning_kokkos.__version__ + elif identifier == "pennylane-lightning-qubit": + try: + from pennylane_lightning import lightning_qubit + except ImportError as e: + raise RuntimeError("PennyLane-Lightning plugin is not installed") from e + 
ver = lightning_qubit.__version__ + else: # identifier == "pennylane" + ver = pennylane.__version__ + return ver + + def _make_qnode(self, circuit, nshots=1024, **kwargs): + if self.identifier == "pennylane-lightning-gpu": dev = pennylane.device("lightning.gpu", wires=self.nqubits, shots=nshots, c_dtype=self.dtype) elif self.identifier == "pennylane-lightning-kokkos": # there's no way for us to query what execution space (=backend) that kokkos supports at runtime, @@ -67,10 +89,6 @@ def _make_qnode(self, circuit, nshots=1024, **kwargs): sync=False, kokkos_args=args) elif self.identifier == "pennylane-lightning-qubit": - try: - import pennylane_lightning - except ImportError as e: - raise RuntimeError("PennyLane-Lightning plugin is not installed") from e if self.ngpus != 0: raise ValueError(f"cannot specify --ngpus for the backend {self.identifier}") if self.ncpu_threads > 1 and self.ncpu_threads != int(os.environ.get("OMP_NUM_THREADS", "-1")): @@ -81,23 +99,6 @@ def _make_qnode(self, circuit, nshots=1024, **kwargs): if self.ngpus != 0: raise ValueError(f"cannot specify --ngpus for the backend {self.identifier}") dev = pennylane.device("default.qubit", wires=self.nqubits, shots=nshots, c_dtype=self.dtype) - elif self.identifier == "pennylane-dumper": - import cloudpickle - import cuquantum_benchmarks - cloudpickle.register_pickle_by_value(cuquantum_benchmarks) - - # note: before loading the pickle, one should check if the Python version agrees - # (probably pennylane's version too) - py_major_minor = f'{sys.version_info.major}.{sys.version_info.minor}' - circuit_filename = kwargs.pop('circuit_filename') - circuit_filename += f"_pny_raw_py{py_major_minor}.pickle" - def dump(): - logger.info(f"dumping pennylane (raw) circuit as {circuit_filename} ...") - with open(circuit_filename, 'wb') as f: - cloudpickle.dump(circuit, f) # use highest protocol - logger.info("early exiting as the dumper task is completed") - call_by_root(dump) - raise EarlyReturnError else: raise 
ValueError(f"the backend {self.identifier} is not recognized") @@ -105,6 +106,9 @@ def dump(): return qnode def preprocess_circuit(self, circuit, *args, **kwargs): + if _internal_utils is not None: + _internal_utils.preprocess_circuit(self.identifier, circuit, *args, **kwargs) + nshots = kwargs.get('nshots', 1024) t1 = time.perf_counter() self.circuit = self._make_qnode(circuit, nshots, **kwargs) @@ -125,4 +129,3 @@ def run(self, circuit, nshots=1024): PnyLightningCpu = functools.partial(Pennylane, identifier='pennylane-lightning-qubit') PnyLightningKokkos = functools.partial(Pennylane, identifier='pennylane-lightning-kokkos') Pny = functools.partial(Pennylane, identifier='pennylane') -PnyDumper = functools.partial(Pennylane, identifier='pennylane-dumper') diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py b/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py index 2b1bde5..245e63d 100644 --- a/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py +++ b/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py @@ -15,8 +15,13 @@ except ImportError: qiskit = None +try: + from .. 
import _internal_utils +except ImportError: + _internal_utils = None from .backend import Backend from .._utils import get_mpi_size, get_mpi_rank +from .._utils import call_by_root, EarlyReturnError # set up a logger @@ -32,9 +37,18 @@ def __init__(self, ngpus, ncpu_threads, precision, *args, identifier=None, **kwa self.precision = precision self.identifier = identifier self.nqubits = kwargs.pop('nqubits') - self.backend = self.create_aer_backend(identifier, ngpus, ncpu_threads, *args, **kwargs) + self.version = self.find_version(identifier) + self.backend = self.create_aer_backend(self.identifier, ngpus, ncpu_threads, *args, **kwargs) + def find_version(self, identifier): + if identifier == 'cusvaer': + return version('cusvaer') + return qiskit.__qiskit_version__['qiskit-aer'] + def preprocess_circuit(self, circuit, *args, **kwargs): + if _internal_utils is not None: + _internal_utils.preprocess_circuit(self.identifier, circuit, *args, **kwargs) + t0 = time.perf_counter() self.transpiled_qc = qiskit.transpile(circuit, self.backend) # (circuit, basis_gates=['u3', 'cx'], backend=self.backend) t1 = time.perf_counter() @@ -165,8 +179,7 @@ def create_aer_backend(self, identifier, ngpus, ncpu_threads, *args, **kwargs): blocking_enable=blocking_enable, blocking_qubits=blocking_qubits, fusion_max_qubit=nfused, precision=self.precision) else: - raise ValueError(f"the backend {identifier} is not recognized") - + backend = None return backend def get_aer_blocking_setup(self, ngpus=None): diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_qsim.py b/benchmarks/cuquantum_benchmarks/backends/backend_qsim.py index 3959103..f5164bc 100644 --- a/benchmarks/cuquantum_benchmarks/backends/backend_qsim.py +++ b/benchmarks/cuquantum_benchmarks/backends/backend_qsim.py @@ -22,6 +22,7 @@ def __init__(self, ngpus, ncpu_threads, precision, *args, identifier=None, **kwa raise ValueError("all qsim backends only support single precision") self.identifier = identifier qsim_options = 
self.create_qsim_options(identifier, ngpus, ncpu_threads, **kwargs) + self.version = qsimcirq.__version__ self.backend = qsimcirq.QSimSimulator(qsim_options=qsim_options) def run(self, circuit, nshots=1024): diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_qulacs.py b/benchmarks/cuquantum_benchmarks/backends/backend_qulacs.py index dad0659..8d7c309 100644 --- a/benchmarks/cuquantum_benchmarks/backends/backend_qulacs.py +++ b/benchmarks/cuquantum_benchmarks/backends/backend_qulacs.py @@ -27,6 +27,7 @@ def __init__(self, ngpus, ncpu_threads, precision, *args, identifier=None, **kwa self.ncpu_threads = ncpu_threads self.nqubits = kwargs.pop('nqubits') self.state = self.create_qulacs_state() + self.version = qulacs.__version__ def create_qulacs_state(self): if self.identifier == 'qulacs-gpu': diff --git a/benchmarks/cuquantum_benchmarks/config.py b/benchmarks/cuquantum_benchmarks/config.py index 04581c6..5572b23 100644 --- a/benchmarks/cuquantum_benchmarks/config.py +++ b/benchmarks/cuquantum_benchmarks/config.py @@ -195,16 +195,6 @@ }, }, - 'naive': { - 'config': { - 'nshots': 1024, - 'nfused': None, - 'ngpus': 1, - 'ncputhreads': 0, - 'precision': 'single', - }, - }, - 'pennylane': { 'config': { 'nshots': 1024, @@ -245,17 +235,6 @@ }, }, - # dummy - 'pennylane-dumper': { - 'config': { - 'nshots': 1024, - 'nfused': None, - 'ngpus': 0, - 'ncputhreads': 1, - 'precision': 'single', - }, - }, - 'qulacs-gpu': { 'config': { 'nshots': 1024, diff --git a/benchmarks/cuquantum_benchmarks/frontends/__init__.py b/benchmarks/cuquantum_benchmarks/frontends/__init__.py index 3f0ae06..965adca 100644 --- a/benchmarks/cuquantum_benchmarks/frontends/__init__.py +++ b/benchmarks/cuquantum_benchmarks/frontends/__init__.py @@ -6,10 +6,6 @@ from .frontend_qiskit import Qiskit from .frontend_pny import Pennylane from .frontend_qulacs import Qulacs -try: - from .frontend_naive import Naive -except ImportError: - Naive = None frontends = { @@ -18,8 +14,7 @@ 'pennylane': Pennylane, 
'qulacs': Qulacs } -if Naive: - frontends['naive'] = Naive + def createFrontend(frontend_name, nqubits, config): return frontends[frontend_name](nqubits, config) diff --git a/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py b/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py index 1265262..fe5c32b 100644 --- a/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py +++ b/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py @@ -25,7 +25,7 @@ def __init__(self, nqubits, config): def generateCircuit(self, gateSeq): last_g = gateSeq[-1] assert last_g.id == "measure" # TODO: relax this? - + def circuit(): measured_qs = None @@ -71,5 +71,5 @@ def circuit(): raise NotImplementedError(f"The gate type {g.id} is not defined") return pennylane.sample(wires=measured_qs) - - return circuit + + return circuit \ No newline at end of file diff --git a/benchmarks/cuquantum_benchmarks/run.py b/benchmarks/cuquantum_benchmarks/run.py index 6f7daa3..f73f40d 100644 --- a/benchmarks/cuquantum_benchmarks/run.py +++ b/benchmarks/cuquantum_benchmarks/run.py @@ -14,6 +14,14 @@ from .run_interface import BenchApiRunner, BenchCircuitRunner from ._utils import (EarlyReturnError, MPHandler, RawTextAndDefaultArgFormatter, str_to_seq,) +try: + from . 
import _internal_utils +except ImportError: + _internal_utils = None + + +if _internal_utils is not None: + _internal_utils.init() frontend_names = [f for f in frontends.keys()] @@ -300,8 +308,8 @@ def run(args=None): del args.benchmark config = backend_config[args.backend] - if ((args.frontend == 'cirq' and args.backend not in ('cirq', 'cutn', *[k for k in backends.keys() if k.startswith('qsim')])) - or (args.frontend == 'qiskit' and args.backend not in ('cutn', *[k for k in backends.keys() if 'aer' in k])) + if ((args.frontend == 'cirq' and args.backend not in (*[k for k in backends.keys() if k.startswith('cirq')], 'cutn', *[k for k in backends.keys() if k.startswith('qsim')])) + or (args.frontend == 'qiskit' and args.backend not in ('cutn', *[k for k in backends.keys() if k.startswith('qiskit')], *[k for k in backends.keys() if 'aer' in k])) or (args.frontend == 'naive' and args.backend != 'naive') or (args.frontend == 'pennylane' and not args.backend.startswith('pennylane')) or (args.frontend == 'qulacs' and not args.backend.startswith('qulacs'))): diff --git a/benchmarks/cuquantum_benchmarks/run_interface.py b/benchmarks/cuquantum_benchmarks/run_interface.py index c6fd005..92c7e09 100644 --- a/benchmarks/cuquantum_benchmarks/run_interface.py +++ b/benchmarks/cuquantum_benchmarks/run_interface.py @@ -203,43 +203,6 @@ def _fix_filename_for_cutn(self, circuit_filename, nqubits): circuit_filename += f"_{''.join(pauli)}" return circuit_filename, target, pauli - def extract_backend_version(self): - if 'aer' in self.backend: - import qiskit - version = qiskit.__qiskit_version__['qiskit-aer'] - elif 'qsim' in self.backend: - import qsimcirq - version = qsimcirq.__version__ - elif self.backend == 'cutn': - import cuquantum - version = cuquantum.cutensornet.get_version() - elif self.backend == 'cirq': - import cirq - version = cirq.__version__ - elif self.backend == 'naive': - from .backends import backends - version = backends['naive'].version - elif self.backend == 
'pennylane': - import pennylane - version = pennylane.__version__ - elif self.backend == 'pennylane-lightning-gpu': - import pennylane_lightning_gpu - version = pennylane_lightning_gpu.__version__ - elif self.backend == 'pennylane-lightning-qubit': - import pennylane_lightning - version = pennylane_lightning.__version__ - elif self.backend == 'pennylane-lightning-kokkos': - import pennylane_lightning_kokkos - version = pennylane_lightning_kokkos.__version__ - elif self.backend == 'pennylane-dumper': - version = '0' # dummy - elif self.backend in ('qulacs-gpu', 'qulacs-cpu'): - import qulacs - version = qulacs.__version__ - else: - assert False - return version - def extract_frontend_version(self): if self.frontend == 'qiskit': import qiskit @@ -315,7 +278,7 @@ def _run(self): # get versions; it's assumed up to this point, the existence of Python modules for # both frontend and backend is confirmed - backend_version = self.extract_backend_version() + backend_version = backend.version frontend_version = self.extract_frontend_version() glue_layer_version = self.extract_glue_layer_version() if glue_layer_version is not None: @@ -337,7 +300,7 @@ def _run(self): # only cutn needs these, TODO: backend config circuit_filename=os.path.join(self.cache_dir, circuit_filename), target=target, - pauli=pauli, + pauli=pauli ) for k in preprocess_data.keys(): diff --git a/python/builder/pep517.py b/python/builder/pep517.py index 66f7c09..3df2bcf 100644 --- a/python/builder/pep517.py +++ b/python/builder/pep517.py @@ -30,8 +30,8 @@ def get_requires_for_build_wheel(config_settings=None): # set up version constraints: note that CalVer like 22.03 is normalized to # 22.3 by setuptools, so we must follow the same practice in the constraints; # also, we don't need the patch number here - cuqnt_require = [f'custatevec-cu{utils.cuda_major_ver}~=1.4', # ">=1.4.0,<2" - f'cutensornet-cu{utils.cuda_major_ver}~=2.2', # ">=2.2.0,<3" + cuqnt_require = [f'custatevec-cu{utils.cuda_major_ver}~=1.5', 
# ">=1.5.0,<2" + f'cutensornet-cu{utils.cuda_major_ver}~=2.3', # ">=2.3.0,<3" ] return _build_meta.get_requires_for_build_wheel(config_settings) + cuqnt_require diff --git a/python/cuquantum/_version.py b/python/cuquantum/_version.py index 19b2289..d67bed9 100644 --- a/python/cuquantum/_version.py +++ b/python/cuquantum/_version.py @@ -5,4 +5,4 @@ # Note: cuQuantum Python follows the cuQuantum SDK version, which is now # switched to YY.MM and is different from individual libraries' (semantic) # versioning scheme. -__version__ = '23.06.0' +__version__ = '23.10.0' diff --git a/python/cuquantum/custatevec/custatevec.pxd b/python/cuquantum/custatevec/custatevec.pxd index c654a5a..e32b2aa 100644 --- a/python/cuquantum/custatevec/custatevec.pxd +++ b/python/cuquantum/custatevec/custatevec.pxd @@ -53,6 +53,7 @@ cdef extern from '' nogil: _Index transferSize ctypedef void* _DistIndexBitSwapSchedulerDescriptor 'custatevecDistIndexBitSwapSchedulerDescriptor_t' ctypedef void* _SVSwapWorkerDescriptor 'custatevecSVSwapWorkerDescriptor_t' + ctypedef void* _SubSVMigratorDescriptor 'custatevecSubSVMigratorDescriptor_t' # cuStateVec enums diff --git a/python/cuquantum/custatevec/custatevec.pyx b/python/cuquantum/custatevec/custatevec.pyx index 86fca61..93e92c4 100644 --- a/python/cuquantum/custatevec/custatevec.pyx +++ b/python/cuquantum/custatevec/custatevec.pyx @@ -176,7 +176,7 @@ cdef extern from * nogil: _Handle, _DistIndexBitSwapSchedulerDescriptor, int32_t, int32_t, _SVSwapParameters*) int custatevecSVSwapWorkerCreate( - _Handle, _SVSwapWorkerDescriptor, _CommunicatorDescriptor, void*, + _Handle, _SVSwapWorkerDescriptor*, _CommunicatorDescriptor, void*, int32_t, Event, DataType, Stream, size_t*, size_t*) int custatevecSVSwapWorkerDestroy( _Handle, _SVSwapWorkerDescriptor) @@ -190,6 +190,12 @@ cdef extern from * nogil: _Handle, _SVSwapWorkerDescriptor, _SVSwapParameters*, int) int custatevecSVSwapWorkerExecute( _Handle, _SVSwapWorkerDescriptor, _Index, _Index) + int 
custatevecSubSVMigratorCreate( + _Handle, _SubSVMigratorDescriptor*, void*, DataType, int, int) + int custatevecSubSVMigratorDestroy( + _Handle, _SubSVMigratorDescriptor) + int custatevecSubSVMigratorMigrate( + _Handle, _SubSVMigratorDescriptor, int, const void*, void*, _Index, _Index) # TODO: make this cpdef? @@ -3094,6 +3100,74 @@ cpdef sv_swap_worker_execute( check_status(status) +cpdef intptr_t sub_sv_migrator_create( + intptr_t handle, intptr_t device_slots, int sv_data_type, + int n_device_slots, int n_local_index_bits) except*: + """Create a cuStateVec sub state vector migrator. + + Args: + handle (intptr_t): The library handle. + device_slots (intptr_t): The pointer address to a device slots. + sv_data_type (cuquantum.cudaDataType): The data type of the device slots + n_device_slots (int): The number of device slots + n_local_index_bits (int): The number of index bits of sub state vectors. + + Returns: + An instance of the opaque migrator descriptor (as Python :class:`int`). + + .. seealso:: `custatevecSubSVMigratorCreate` + """ + cdef _SubSVMigratorDescriptor migrator + with nogil: + status = custatevecSubSVMigratorCreate( + <_Handle>handle, &migrator, device_slots, + sv_data_type, n_device_slots, n_local_index_bits) + check_status(status) + return migrator + + +cpdef sub_sv_migrator_destroy( + intptr_t handle, intptr_t migrator): + """Destroy the sub state vector migrator. + + Args: + handle (intptr_t): The library handle. + migrator (intptr_t): The sub state vector migrator descriptor. + + .. 
seealso:: `custatevecSubSVMigratorDestroy` + """ + with nogil: + status = custatevecSubSVMigratorDestroy( + <_Handle>handle, <_SubSVMigratorDescriptor>migrator) + check_status(status) + + +cpdef sub_sv_migrator_migrate( + intptr_t handle, intptr_t migrator, int device_slot_idx, + intptr_t src_sub_sv, intptr_t dst_sub_sv, _Index begin, _Index end): + """Performs state vector migration between device slots and given sub state vectors + + Args: + handle (intptr_t): The library handle. + migrator (intptr_t): The sub state vector migrator descriptor. + device_slot_idx (int): The slot index of a device slot + src_sub_sv (intptr_t): The pointer address (as Python :class:`int`) to the + src sub state vector pointer. + dst_sub_sv (intptr_t): The pointer address (as Python :class:`int`) to the + dst sub state vector pointer. + begin (int64_t): The index in a device slot to start sub state vector migration + end (int64_t): The index in a device slot to end sub state vector migration + + .. seealso:: `custatevecSubSVMigratorMigrate` + """ + with nogil: + status = custatevecSubSVMigratorMigrate( + <_Handle>handle, <_SubSVMigratorDescriptor>migrator, + device_slot_idx, src_sub_sv, dst_sub_sv, + begin, end) + check_status(status) + + # can't be cpdef because args & kwargs can't be handled in a C signature def logger_set_callback_data(callback, *args, **kwargs): """Set the logger callback along with arguments. 
diff --git a/python/cuquantum/cutensornet/_internal/decomposition_utils.py b/python/cuquantum/cutensornet/_internal/decomposition_utils.py index 56c911f..814fdae 100644 --- a/python/cuquantum/cutensornet/_internal/decomposition_utils.py +++ b/python/cuquantum/cutensornet/_internal/decomposition_utils.py @@ -44,7 +44,8 @@ 'rel_cutoff': cutn.TensorSVDConfigAttribute.REL_CUTOFF, 'partition': cutn.TensorSVDConfigAttribute.S_PARTITION, 'normalization': cutn.TensorSVDConfigAttribute.S_NORMALIZATION, - 'algorithm': cutn.TensorSVDConfigAttribute.ALGO} + 'algorithm': cutn.TensorSVDConfigAttribute.ALGO, + 'discarded_weight_cutoff': cutn.TensorSVDConfigAttribute.DISCARDED_WEIGHT_CUTOFF} SVD_INFO_MAP = {'full_extent': cutn.TensorSVDInfoAttribute.FULL_EXTENT, 'reduced_extent': cutn.TensorSVDInfoAttribute.REDUCED_EXTENT, @@ -294,20 +295,20 @@ def get_svd_info_dict(handle, svd_info): return info -def parse_decompose_operands_options(options, wrapped_operands, allowed_dtype_names=None): +def parse_decompose_operands_options(options, wrapped_operands, stream, allowed_dtype_names=None): """ Given initially wrapped tensors and network options, wrap the operands to device and create an internal NetworkOptions object. If cutensornet library handle is not provided in `options`, one will be created in the internal options. 
""" - device_id = utils.get_network_device_id(wrapped_operands) - logger = logging.getLogger() if options.logger is None else options.logger + package = utils.get_operands_package(wrapped_operands) operands_location = 'cuda' + device_id = utils.get_network_device_id(wrapped_operands) if device_id is None: + package = wrapped_operands[0].name + if package == 'numpy': + package = 'cupy' operands_location = 'cpu' device_id = options.device_id - logger.info(f"Begin transferring input data from host to device {device_id}") - wrapped_operands = tensor_wrapper.to(wrapped_operands, device_id) - logger.info("Input data transfer finished") # initialize handle once if not provided if options.handle is not None: @@ -323,7 +324,14 @@ def parse_decompose_operands_options(options, wrapped_operands, allowed_dtype_na raise ValueError(f"dtype {dtype_name} not supported") compute_type = options.compute_type if options.compute_type is not None else typemaps.NAME_TO_COMPUTE_TYPE[dtype_name] - package = utils.get_operands_package(wrapped_operands) + stream_holder = utils.get_or_create_stream(options.device_id, stream, package) + + logger = logging.getLogger() if options.logger is None else options.logger + if operands_location == 'cpu': + logger.info(f"Begin transferring input data from host to device {device_id}") + wrapped_operands = tensor_wrapper.to(wrapped_operands, device_id, stream_holder) + logger.info("Input data transfer finished") + allocator = options.allocator if options.allocator is not None else memory._MEMORY_MANAGER[package](device_id, logger) internal_options = options.__class__(device_id=device_id, @@ -334,17 +342,17 @@ def parse_decompose_operands_options(options, wrapped_operands, allowed_dtype_na memory_limit=options.memory_limit, allocator=allocator) - return wrapped_operands, internal_options, own_handle, operands_location + return wrapped_operands, internal_options, own_handle, operands_location, stream_holder -def allocate_and_set_workspace(handle, allocator, 
workspace_desc, pref, mem_space, workspace_kind, device_id, stream, stream_ctx, logger, task_name=''): +def allocate_and_set_workspace(handle, allocator, workspace_desc, pref, mem_space, workspace_kind, device_id, stream_holder, logger, task_name=''): """ Allocate and set the workspace in the workspace descriptor. """ workspace_size = cutn.workspace_get_memory_size(handle, workspace_desc, pref, mem_space, workspace_kind) # Allocate and set workspace if mem_space == cutn.Memspace.DEVICE: - with utils.device_ctx(device_id), stream_ctx: + with utils.device_ctx(device_id), stream_holder.ctx: try: logger.debug(f"Allocating device memory for {task_name}") workspace_ptr = allocator.memalloc(workspace_size) @@ -353,7 +361,7 @@ def allocate_and_set_workspace(handle, allocator, workspace_desc, pref, mem_spac "'BaseCUDAMemoryManager' protocol." raise TypeError(message) from e - logger.debug(f"Finished allocating device memory of size {formatters.MemoryStr(workspace_size)} for decomposition in the context of stream {stream}.") + logger.debug(f"Finished allocating device memory of size {formatters.MemoryStr(workspace_size)} for decomposition in the context of stream {stream_holder.obj}.") device_ptr = utils.get_ptr_from_memory_pointer(workspace_ptr) cutn.workspace_set_memory(handle, workspace_desc, mem_space, workspace_kind, device_ptr, workspace_size) logger.debug(f"The workspace memory (device pointer = {device_ptr}) has been set in the workspace descriptor.") @@ -376,7 +384,8 @@ def _destroy_tensor_descriptors(desc_tensors): cutn.destroy_tensor_descriptor(t) -def create_operands_and_descriptors(handle, wrapped_operands, size_dict, inputs, outputs, mid_extent, method, device_id, stream_ctx, logger): +def create_operands_and_descriptors( + handle, wrapped_operands, size_dict, inputs, outputs, mid_extent, method, device_id, stream_holder, logger): """ Create empty tensor operands and corresponding tensor descriptors for a decomposition problem. 
""" @@ -402,7 +411,7 @@ def create_operands_and_descriptors(handle, wrapped_operands, size_dict, inputs, output_operands = [] with utils.device_ctx(device_id): for extent, tensor_modes in zip(output_extents, outputs): - operand = utils.create_empty_tensor(output_class, extent, dtype_name, device_id, stream_ctx) + operand = utils.create_empty_tensor(output_class, extent, dtype_name, device_id, stream_holder) output_operands.append(operand) output_tensor_descriptors.append(operand.create_tensor_descriptor(handle, tensor_modes)) @@ -413,7 +422,7 @@ def create_operands_and_descriptors(handle, wrapped_operands, size_dict, inputs, s_dtype_name = 'float64' else: raise ValueError(f"{dtype_name} data type not supported") - s = utils.create_empty_tensor(output_class, (mid_extent, ), s_dtype_name, device_id, stream_ctx) + s = utils.create_empty_tensor(output_class, (mid_extent, ), s_dtype_name, device_id, stream_holder) s_ptr = s.data_ptr logger.debug("The output tensors and descriptors have been created.") except: @@ -423,13 +432,13 @@ def create_operands_and_descriptors(handle, wrapped_operands, size_dict, inputs, return input_tensor_descriptors, output_operands, output_tensor_descriptors, s, s_ptr -def get_return_operand_data(tensor, target_location): +def get_return_operand_data(tensor, target_location, stream_holder): """ Given wrapped tensors, fetch the return operands based on target location. """ if tensor is None: # potentially for s return tensor if target_location == 'cpu': - return tensor.to('cpu') + return tensor.to('cpu', stream_holder=stream_holder) else: # already on device return tensor.tensor diff --git a/python/cuquantum/cutensornet/_internal/einsum_parser.py b/python/cuquantum/cutensornet/_internal/einsum_parser.py index 44da696..0bd4204 100644 --- a/python/cuquantum/cutensornet/_internal/einsum_parser.py +++ b/python/cuquantum/cutensornet/_internal/einsum_parser.py @@ -371,6 +371,8 @@ def parse_einsum(*operands): # Map data to ordinals for cutensornet. 
inputs, output, mode_map_user_to_ord, mode_map_ord_to_user, label_end = map_modes(inputs, output, num_extra_labels, morpher) + has_user_output = (output is not None) + mapper = ModeLabelMapper(mode_map_ord_to_user) mapping_morpher = select_morpher(interleaved, mapper) @@ -383,4 +385,4 @@ def parse_einsum(*operands): # Create mode-extent map based on internal mode numbers. size_dict = create_size_dict(inputs, operands) - return operands, inputs, output, size_dict, mode_map_user_to_ord, mode_map_ord_to_user, interleaved or ellipses + return operands, inputs, output, has_user_output, size_dict, mode_map_user_to_ord, mode_map_ord_to_user, interleaved, ellipses diff --git a/python/cuquantum/cutensornet/_internal/grad_torch.py b/python/cuquantum/cutensornet/_internal/grad_torch.py new file mode 100644 index 0000000..d25d712 --- /dev/null +++ b/python/cuquantum/cutensornet/_internal/grad_torch.py @@ -0,0 +1,73 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import itertools + +# TODO: right now, we use try-except throughout the codebase to check the +# presence of PyTorch, so if it exists it'd get imported. We should switch +# to use importlib.util.find_spec('torch') so as to reduce the import time. +try: + import torch +except ImportError: + torch = None + + +if torch is not None: + + class _TorchContract(torch.autograd.Function): + + @staticmethod + def forward(context, network, optimize, stream, return_info, *operands): + + # Save objects needed in the backward pass. + context.network = network + context.stream = stream + + # Compute path. + opt_info = network.contract_path(optimize=optimize) + + # Skip autotuning since the network is contracted only once. + + # Contraction. + out = network.contract(stream=stream) + + if return_info: + return out, opt_info + else: + return out + + @staticmethod + def backward(context, *output_grad): + + try: + # Retrieve cached objects. 
+ network = context.network + stream = context.stream + + # Regardless of return_info, we only care about the gradient of + # the first return value. + output_grad = output_grad[0] + + # Compute backprop. + input_grads = network.gradients(output_grad, stream=stream) + + # Rearrange return values based on the input format. + if network.is_interleaved: + out = [None, None, None, None, + *itertools.chain(*itertools.zip_longest(input_grads, [None]))] + if network.has_user_output: + out.append(None) + out = tuple(out) + else: + out = (None, None, None, None, + None, *input_grads) + finally: + # Free network resources explicitly. + network.free() + + return out + +else: + + _TorchContract = None diff --git a/python/cuquantum/cutensornet/_internal/optimizer_ifc.py b/python/cuquantum/cutensornet/_internal/optimizer_ifc.py index a92dc98..b0f583f 100644 --- a/python/cuquantum/cutensornet/_internal/optimizer_ifc.py +++ b/python/cuquantum/cutensornet/_internal/optimizer_ifc.py @@ -157,7 +157,8 @@ def path(self, path): if num_contraction != len(network.operands) - 1: raise ValueError(f"The length of the contraction path ({num_contraction}) must be one less than the number of operands ({len(network.operands)}).") - path = reduce(operator.concat, path) + if num_contraction > 0: + path = reduce(operator.concat, path) path_array = np.asarray(path, dtype=np.int32) # Construct the path type. 
@@ -235,7 +236,7 @@ def intermediate_modes(self): cutn.contraction_optimizer_info_get_attribute(network.handle, network.optimizer_info_ptr, InfoEnum.INTERMEDIATE_MODES, intermediate_modes.ctypes.data, size) count, out = 0, list() - mode_type = tuple if network.is_interleaved else ''.join + mode_type = tuple if (network.is_interleaved or network.has_ellipses) else ''.join for n in num_intermediate_modes: out.append(mode_type(map(lambda m: network.mode_map_ord_to_user[m], intermediate_modes[count:count+n]))) # Convert to user mode labels count += n diff --git a/python/cuquantum/cutensornet/_internal/package_ifc.py b/python/cuquantum/cutensornet/_internal/package_ifc.py index 6b94752..1309e5e 100644 --- a/python/cuquantum/cutensornet/_internal/package_ifc.py +++ b/python/cuquantum/cutensornet/_internal/package_ifc.py @@ -9,6 +9,9 @@ __all__ = ['Package'] from abc import ABC, abstractmethod +from dataclasses import dataclass +from contextlib import nullcontext +from typing import Any class Package(ABC): @@ -68,3 +71,21 @@ def create_stream(device_id): device_id: The id (ordinal) of the device. """ raise NotImplementedError + + +@dataclass +class StreamHolder: + """A data class for easing CUDA stream manipulation. + + Attributes: + ctx: A context manager for using the specified stream. + device_id (int): The ID of the device on which the encapsulated stream resides. + obj: A foreign object that holds the stream alive. + package (str): The name of the package that the stream is associated with. + ptr (int): The address of the underlying ``cudaStream_t`` object.
+ """ + ctx: Any = nullcontext() + device_id: int = -2 + obj: Any = None + package: str = "" + ptr: int = 0 diff --git a/python/cuquantum/cutensornet/_internal/package_ifc_torch.py b/python/cuquantum/cutensornet/_internal/package_ifc_torch.py index 798a5f8..42fd66b 100644 --- a/python/cuquantum/cutensornet/_internal/package_ifc_torch.py +++ b/python/cuquantum/cutensornet/_internal/package_ifc_torch.py @@ -35,4 +35,3 @@ def create_external_stream(device_id, stream_ptr): def create_stream(device_id): stream = torch.cuda.Stream(device=device_id) return stream - diff --git a/python/cuquantum/cutensornet/_internal/tensor_ifc.py b/python/cuquantum/cutensornet/_internal/tensor_ifc.py index 921ae1d..bc6df7d 100644 --- a/python/cuquantum/cutensornet/_internal/tensor_ifc.py +++ b/python/cuquantum/cutensornet/_internal/tensor_ifc.py @@ -45,7 +45,7 @@ def empty(cls, shape, **context): raise NotImplementedError @abstractmethod - def numpy(self): + def numpy(self, stream_holder): raise NotImplementedError @property @@ -64,7 +64,11 @@ def empty(cls, shape, **context): raise NotImplementedError @abstractmethod - def to(self, device='cpu'): + def to(self, device='cpu', stream_holder=None): + raise NotImplementedError + + @abstractmethod + def copy_(self, src, stream_holder=None): raise NotImplementedError @staticmethod diff --git a/python/cuquantum/cutensornet/_internal/tensor_ifc_cupy.py b/python/cuquantum/cutensornet/_internal/tensor_ifc_cupy.py index efe69e0..6d99b3e 100644 --- a/python/cuquantum/cutensornet/_internal/tensor_ifc_cupy.py +++ b/python/cuquantum/cutensornet/_internal/tensor_ifc_cupy.py @@ -12,6 +12,7 @@ import numpy from . import utils +from .package_ifc import StreamHolder from .tensor_ifc import Tensor from .. 
import cutensornet as cutn @@ -52,8 +53,13 @@ def shape(self): def strides(self): return tuple(stride_in_bytes // self.tensor.itemsize for stride_in_bytes in self.tensor.strides) - def numpy(self): - return self.tensor.get() + def numpy(self, stream_holder=StreamHolder()): + stream = stream_holder.obj + out = self.tensor.get(stream=stream) + # cupy/cupy#7820 + if stream is not None: + stream.synchronize() + return out @classmethod def empty(cls, shape, **context): @@ -63,6 +69,7 @@ def empty(cls, shape, **context): name = context.get('dtype', 'float32') dtype = CupyTensor.name_to_dtype[name] device = context.get('device', None) + strides = context.get('strides', None) if isinstance(device, cupy.cuda.Device): device_id = device.id @@ -72,32 +79,42 @@ def empty(cls, shape, **context): raise ValueError(f"The device must be specified as an integer or cupy.cuda.Device instance, not '{device}'.") with utils.device_ctx(device_id): - tensor = cupy.empty(shape, dtype=dtype) + if strides: + # need an explicit allocation due to cupy/cupy#7818 + size = dtype.itemsize + for s in shape: + size = size * s + ptr = cupy.cuda.alloc(size) + # when strides is not None, it should be of unit counts not bytes + strides = tuple(s * dtype.itemsize for s in strides) + tensor = cupy.ndarray(shape, dtype=dtype, strides=strides, memptr=ptr) + else: + tensor = cupy.ndarray(shape, dtype=dtype) return tensor - def to(self, device='cpu'): + def to(self, device='cpu', stream_holder=StreamHolder()): """ Create a copy of the tensor on the specified device (integer or 'cpu'). Copy to Numpy ndarray if CPU, otherwise return Cupy type. 
""" if device == 'cpu': - return self.numpy() + return self.numpy(stream_holder=stream_holder) if not isinstance(device, int): raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device}'.") - with utils.device_ctx(device): + with utils.device_ctx(device), stream_holder.ctx: tensor_device = cupy.asarray(self.tensor) return tensor_device - def copy_(self, src): + def copy_(self, src, stream_holder=StreamHolder()): """ Inplace copy of src (copy the data from src into self). """ - - cupy.copyto(self.tensor, src) + with stream_holder.ctx: + cupy.copyto(self.tensor, src) def istensor(self): """ @@ -110,4 +127,3 @@ def reshape_to_match_tensor_descriptor(self, handle, desc_tensor): if tuple(extents) != self.shape: strides = [i * self.tensor.itemsize for i in strides] self.tensor = cupy.ndarray(extents, dtype=self.tensor.dtype, memptr=self.tensor.data, strides=strides) - diff --git a/python/cuquantum/cutensornet/_internal/tensor_ifc_numpy.py b/python/cuquantum/cutensornet/_internal/tensor_ifc_numpy.py index ed6a0f2..c161a6b 100644 --- a/python/cuquantum/cutensornet/_internal/tensor_ifc_numpy.py +++ b/python/cuquantum/cutensornet/_internal/tensor_ifc_numpy.py @@ -12,6 +12,7 @@ import numpy from . 
import utils +from .package_ifc import StreamHolder from .tensor_ifc import Tensor @@ -51,7 +52,7 @@ def shape(self): def strides(self): return tuple(stride_in_bytes // self.tensor.itemsize for stride_in_bytes in self.tensor.strides) - def numpy(self): + def numpy(self, stream_holder=StreamHolder()): return self.tensor @classmethod @@ -61,9 +62,11 @@ def empty(cls, shape, **context): """ name = context.get('dtype', 'float32') dtype = NumpyTensor.name_to_dtype[name] - return cls(module.empty(shape, dtype=dtype)) + strides = context.get('strides', None) + # when strides is not None, it should be of unit counts not bytes + return cls(module.ndarray(shape, dtype=dtype, strides=(tuple(s * dtype.itemsize for s in strides) if strides else None))) - def to(self, device='cpu'): + def to(self, device='cpu', stream_holder=StreamHolder()): """ Create a copy of the tensor on the specified device (integer or 'cpu'). Copy to Cupy ndarray on the specified device if it @@ -75,11 +78,14 @@ def to(self, device='cpu'): if not isinstance(device, int): raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device}'.") - with utils.device_ctx(device): + with utils.device_ctx(device), stream_holder.ctx: tensor_device = cupy.asarray(self.tensor) return tensor_device + def copy_(self, src, stream_holder=StreamHolder()): + raise NotImplementedError + def istensor(self): """ Check if the object is ndarray-like. 
@@ -88,4 +94,4 @@ def istensor(self): def reshape_to_match_tensor_descriptor(self, handle, desc_tensor): #NOTE: this method is only called for CupyTensor and TorchTensor - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/python/cuquantum/cutensornet/_internal/tensor_ifc_torch.py b/python/cuquantum/cutensornet/_internal/tensor_ifc_torch.py index a7e3a87..ece81e8 100644 --- a/python/cuquantum/cutensornet/_internal/tensor_ifc_torch.py +++ b/python/cuquantum/cutensornet/_internal/tensor_ifc_torch.py @@ -11,6 +11,7 @@ import torch from . import typemaps +from .package_ifc import StreamHolder from .tensor_ifc import Tensor from .. import cutensornet as cutn @@ -51,8 +52,9 @@ def shape(self): def strides(self): return self.tensor.stride() - def numpy(self): - return self.tensor.get() + def numpy(self, stream_holder=StreamHolder()): + # We currently do not use this. + raise NotImplementedError @classmethod def empty(cls, shape, **context): @@ -62,28 +64,36 @@ def empty(cls, shape, **context): name = context.get('dtype', 'float32') dtype = TorchTensor.name_to_dtype[name] device = context.get('device', None) - tensor = torch.empty(shape, dtype=dtype, device=device) + strides = context.get('strides', None) + if strides: + # note: torch strides is not scaled by bytes + tensor = torch.empty_strided(shape, strides, dtype=dtype, device=device) + else: + tensor = torch.empty(shape, dtype=dtype, device=device) return tensor - def to(self, device='cpu'): + def to(self, device='cpu', stream_holder=StreamHolder()): """ Create a copy of the tensor on the specified device (integer or 'cpu'). Copy to Numpy ndarray if CPU, otherwise return Cupy type. 
""" - if not(device == 'cpu' or isinstance(device, int)): + if not (device == 'cpu' or isinstance(device, int)): raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device}'.") - tensor_device = self.tensor.to(device=device) + non_blocking = False if device == 'cpu' else True + + with stream_holder.ctx: + tensor_device = self.tensor.to(device=device, non_blocking=non_blocking) return tensor_device - def copy_(self, src): + def copy_(self, src, stream_holder=StreamHolder()): """ Inplace copy of src (copy the data from src into self). """ - - self.tensor.copy_(src) + with stream_holder.ctx: + self.tensor.copy_(src) def istensor(self): """ @@ -94,6 +104,5 @@ def istensor(self): def reshape_to_match_tensor_descriptor(self, handle, desc_tensor): _, _, extents, strides = cutn.get_tensor_details(handle, desc_tensor) if tuple(extents) != self.shape: - #note: torch strides is not scaled by bytes + # note: torch strides is not scaled by bytes self.tensor = torch.as_strided(self.tensor, tuple(extents), tuple(strides)) - diff --git a/python/cuquantum/cutensornet/_internal/tensor_wrapper.py b/python/cuquantum/cutensornet/_internal/tensor_wrapper.py index b1beeaf..10a7822 100644 --- a/python/cuquantum/cutensornet/_internal/tensor_wrapper.py +++ b/python/cuquantum/cutensornet/_internal/tensor_wrapper.py @@ -6,7 +6,7 @@ Entry point to using tensors from different libraries seamlessly. """ -__all__ = [ 'infer_tensor_package', 'wrap_operands', 'wrap_operands', 'to', 'copy_'] +__all__ = [ 'infer_tensor_package', 'wrap_operand', 'wrap_operands', 'to', 'copy_'] import functools @@ -103,23 +103,21 @@ def wrap_operands(native_operands): return wrapped_operands -def to(operands, device): +def to(operands, device, stream_holder): """ Copy the wrapped operands to the specified device ('cpu' or int) and return the wrapped operands on the device. 
""" - operands = tuple(o.to(device) for o in operands) + operands = tuple(o.to(device, stream_holder) for o in operands) return wrap_operands(operands) -def copy_(src, dest): +def copy_(src, dest, stream_holder): """ Copy the wrapped operands in dest to the corresponding wrapped operands in src. """ for s, d in zip(src, dest): if s.device_id is None: - s = wrap_operand(s.to(d.device_id)) - d.copy_(s.tensor) - - + s = wrap_operand(s.to(d.device_id, stream_holder=stream_holder)) + d.copy_(s.tensor, stream_holder=stream_holder) diff --git a/python/cuquantum/cutensornet/_internal/utils.py b/python/cuquantum/cutensornet/_internal/utils.py index 7e628c2..d9ea8b1 100644 --- a/python/cuquantum/cutensornet/_internal/utils.py +++ b/python/cuquantum/cutensornet/_internal/utils.py @@ -18,6 +18,7 @@ from . import mem_limit from . import package_wrapper from . import tensor_wrapper +from .package_ifc import StreamHolder def infer_object_package(obj): @@ -98,30 +99,34 @@ def get_or_create_stream(device_id, stream, op_package): op_package: The package the tensor network operands belong to. Returns: - tuple: CuPy stream object, package stream context, stream pointer. + StreamHolder: Hold a CuPy stream object, package stream context, stream pointer, ... """ op_package_ifc = package_wrapper.PACKAGE[op_package] if stream is None: stream = op_package_ifc.get_current_stream(device_id) - return _create_stream_ctx_ptr_cupy_stream(op_package_ifc, stream) + obj, ctx, ptr = _create_stream_ctx_ptr_cupy_stream(op_package_ifc, stream) + return StreamHolder( + **{'ctx': ctx, 'obj': obj, 'ptr': ptr, 'device_id': device_id, 'package': op_package}) if isinstance(stream, int): - stream_ptr = stream + ptr = stream if op_package == 'torch': message = "A stream object must be provided for PyTorch operands, not stream pointer." 
raise TypeError(message) - stream_ctx = op_package_ifc.to_stream_context(stream) - stream = cp.cuda.ExternalStream(stream_ptr) - - return stream, stream_ctx, stream_ptr + obj = cp.cuda.ExternalStream(ptr) + ctx = op_package_ifc.to_stream_context(obj) + return StreamHolder( + **{'ctx': ctx, 'obj': obj, 'ptr': ptr, 'device_id': device_id, 'package': op_package}) stream_package = infer_object_package(stream) if stream_package != op_package: message = "The stream object must belong to the same package as the tensor network operands." raise TypeError(message) - return _create_stream_ctx_ptr_cupy_stream(op_package_ifc, stream) + obj, ctx, ptr = _create_stream_ctx_ptr_cupy_stream(op_package_ifc, stream) + return StreamHolder( + **{'ctx': ctx, 'obj': obj, 'ptr': ptr, 'device_id': device_id, 'package': op_package}) def get_memory_limit(memory_limit, device): @@ -167,11 +172,11 @@ def get_operands_data(operands): """ Get the raw data pointer of the input operands for cuTensorNet. """ - op_data = tuple(o.data_ptr for o in operands) + op_data = tuple(o.data_ptr if o is not None else 0 for o in operands) return op_data -def create_empty_tensor(cls, extents, dtype, device_id, stream_ctx): +def create_empty_tensor(cls, extents, dtype, device_id, stream_holder, strides=None): """ Create a wrapped tensor of the same type as (the wrapped) cls on the specified device having the specified extents and dtype. @@ -179,13 +184,13 @@ def create_empty_tensor(cls, extents, dtype, device_id, stream_ctx): The tensor is created within a stream context to allow for asynchronous memory allocators like CuPy's MemoryAsyncPool. 
""" - with stream_ctx: - tensor = cls.empty(extents, dtype=dtype, device=device_id) + with stream_holder.ctx: + tensor = cls.empty(extents, dtype=dtype, device=device_id, strides=strides) tensor = tensor_wrapper.wrap_operand(tensor) return tensor -def create_output_tensor(cls, package, output, size_dict, device_id, stream, data_type): +def create_output_tensor(cls, output, size_dict, device_id, stream_holder, data_type): """ Create output tensor and associated data (modes, extents, strides). This operation is ordered through events and is safe to use with asynchronous memory pools. @@ -193,11 +198,9 @@ def create_output_tensor(cls, package, output, size_dict, device_id, stream, dat modes = tuple(m for m in output) extents = tuple(size_dict[m] for m in output) - stream, stream_ctx, _ = get_or_create_stream(device_id, stream, package) - with device_ctx(device_id): - output = create_empty_tensor(cls, extents, data_type, device_id, stream_ctx) - output_event = stream.record() + output = create_empty_tensor(cls, extents, data_type, device_id, stream_holder) + output_event = stream_holder.obj.record() strides = output.strides return output, output_event, modes, extents, strides @@ -375,7 +378,7 @@ def check_and_set_options(required: Mapping[str, Value], provided: Mapping[str, @contextlib.contextmanager -def cuda_call_ctx(stream, blocking=True, timing=True): +def cuda_call_ctx(stream_holder, blocking=True, timing=True): """ A simple context manager that provides (non-)blocking behavior depending on the `blocking` parameter for CUDA calls. The call is timed only for blocking behavior when timing is requested. @@ -384,6 +387,8 @@ def cuda_call_ctx(stream, blocking=True, timing=True): event is returned together with a `Value` object that stores the elapsed time if the call is blocking and timing is requested, or None otherwise. 
""" + stream = stream_holder.obj + if blocking: start = cp.cuda.Event(disable_timing = False if timing else True) stream.record(start) diff --git a/python/cuquantum/cutensornet/cutensornet.pxd b/python/cuquantum/cutensornet/cutensornet.pxd index 3a21b92..ff79bff 100644 --- a/python/cuquantum/cutensornet/cutensornet.pxd +++ b/python/cuquantum/cutensornet/cutensornet.pxd @@ -34,8 +34,11 @@ cdef extern from '' nogil: ctypedef void* _TensorSVDConfig 'cutensornetTensorSVDConfig_t' ctypedef void* _TensorSVDInfo 'cutensornetTensorSVDInfo_t' ctypedef void* _State 'cutensornetState_t' + ctypedef void* _StateAccessor 'cutensornetStateAccessor_t' + ctypedef void* _StateExpectation 'cutensornetStateExpectation_t' ctypedef void* _StateMarginal 'cutensornetStateMarginal_t' ctypedef void* _StateSampler 'cutensornetStateSampler_t' + ctypedef void* _NetworkOperator 'cutensornetNetworkOperator_t' # cuTensorNet structs ctypedef struct _NodePair 'cutensornetNodePair_t': @@ -175,6 +178,7 @@ cdef extern from '' nogil: CUTENSORNET_TENSOR_SVD_CONFIG_S_PARTITION CUTENSORNET_TENSOR_SVD_CONFIG_ALGO CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS + CUTENSORNET_TENSOR_SVD_CONFIG_DISCARDED_WEIGHT_CUTOFF ctypedef enum _TensorSVDAlgo 'cutensornetTensorSVDAlgo_t': CUTENSORNET_TENSOR_SVD_ALGO_GESVD @@ -208,6 +212,12 @@ cdef extern from '' nogil: ctypedef enum _StatePurity 'cutensornetStatePurity_t': CUTENSORNET_STATE_PURITY_PURE + ctypedef enum _ExpectationAttribute 'cutensornetExpectationAttributes_t': + CUTENSORNET_EXPECTATION_OPT_NUM_HYPER_SAMPLES + + ctypedef enum _AccessorAttribute 'cutensornetAccessorAttributes_t': + CUTENSORNET_ACCESSOR_OPT_NUM_HYPER_SAMPLES + ctypedef enum _MarginalAttribute 'cutensornetMarginalAttributes_t': CUTENSORNET_MARGINAL_OPT_NUM_HYPER_SAMPLES @@ -221,3 +231,16 @@ cdef extern from '' nogil: CUTENSORNET_NETWORK_INPUT_TENSORS_CONJUGATED CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_REQUIRE_GRAD CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD + + ctypedef enum _BoundaryCondition 
'cutensornetBoundaryCondition_t': + CUTENSORNET_BOUNDARY_CONDITION_OPEN + + ctypedef enum _StateAttribute 'cutensornetStateAttributes_t': + CUTENSORNET_STATE_MPS_CANONICAL_CENTER + CUTENSORNET_STATE_MPS_SVD_CONFIG_ABS_CUTOFF + CUTENSORNET_STATE_MPS_SVD_CONFIG_REL_CUTOFF + CUTENSORNET_STATE_MPS_SVD_CONFIG_S_NORMALIZATION + CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO + CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO_PARAMS + CUTENSORNET_STATE_MPS_SVD_CONFIG_DISCARDED_WEIGHT_CUTOFF + CUTENSORNET_STATE_NUM_HYPER_SAMPLES diff --git a/python/cuquantum/cutensornet/cutensornet.pyx b/python/cuquantum/cutensornet/cutensornet.pyx index a5d0c09..c83d197 100644 --- a/python/cuquantum/cutensornet/cutensornet.pyx +++ b/python/cuquantum/cutensornet/cutensornet.pyx @@ -15,6 +15,7 @@ from cuquantum.utils cimport cuqnt_alloc_wrapper from cuquantum.utils cimport cuqnt_free_wrapper from cuquantum.utils cimport get_buffer_pointer from cuquantum.utils cimport logger_callback_with_data +from cuquantum.utils cimport cuDoubleComplex from enum import IntEnum import warnings @@ -244,7 +245,37 @@ cdef extern from * nogil: const int64_t*, const int32_t, const int32_t, const int32_t, int64_t*) int cutensornetStateUpdateTensor( const _Handle, _State, int64_t, void*, int32_t) + int cutensornetStateConfigure( + const _Handle, _State, _StateAttribute, const void*, size_t) + int cutensornetStatePrepare( + const _Handle, _State, size_t, _WorkspaceDescriptor, Stream) + int cutensornetStateCompute( + const _Handle, _State, _WorkspaceDescriptor, int64_t*[], int64_t*[], void*[], Stream) + int cutensornetGetOutputStateDetails( + const _Handle, const _State, int32_t*, int32_t*, int64_t*[], int64_t*[]) + # expectation value + int cutensornetCreateExpectation( + const _Handle, _State, _NetworkOperator, _StateExpectation*) + int cutensornetExpectationConfigure( + const _Handle, _StateExpectation, _ExpectationAttribute, const void*, size_t) + int cutensornetExpectationPrepare( + const _Handle, _StateExpectation, size_t, 
_WorkspaceDescriptor, Stream) + int cutensornetExpectationCompute( + const _Handle, _StateExpectation, _WorkspaceDescriptor, void*, void*, Stream) + int cutensornetDestroyExpectation(_StateExpectation) + # accessor + int cutensornetCreateAccessor( + const _Handle, _State, int32_t, const int32_t*, + const int64_t*, _StateAccessor*) + int cutensornetAccessorConfigure( + const _Handle, _StateAccessor, _AccessorAttribute, const void*, size_t) + int cutensornetAccessorPrepare( + const _Handle, _StateAccessor, size_t, _WorkspaceDescriptor, Stream) + int cutensornetAccessorCompute( + const _Handle, _StateAccessor, const int64_t*, _WorkspaceDescriptor, void*, void*, Stream) + int cutensornetDestroyAccessor(_StateAccessor) + # marginals int cutensornetCreateMarginal( const _Handle, _State, int32_t, const int32_t*, @@ -269,6 +300,16 @@ cdef extern from * nogil: _WorkspaceDescriptor, int64_t*, Stream) int cutensornetDestroySampler(_StateSampler) + # mps-specific + int cutensornetStateFinalizeMPS( + const _Handle handle, _State, _BoundaryCondition, const int64_t* const[], const int64_t* const[]) + + # network operator + int cutensornetCreateNetworkOperator( + const _Handle, int32_t, const int64_t[], DataType, _NetworkOperator*) + int cutensornetNetworkOperatorAppendProduct( + const _Handle, _NetworkOperator, cuDoubleComplex, int32_t, const int32_t[], const int32_t* const[], const int64_t* const[], const void* const[], int64_t*) + int cutensornetDestroyNetworkOperator(_NetworkOperator) class cuTensorNetError(RuntimeError): def __init__(self, status): @@ -2213,6 +2254,7 @@ cdef dict tensor_svd_cfg_sizes = { CUTENSORNET_TENSOR_SVD_CONFIG_S_NORMALIZATION: _numpy.int32, # = sizeof(enum value) CUTENSORNET_TENSOR_SVD_CONFIG_S_PARTITION: _numpy.int32, # = sizeof(enum value) CUTENSORNET_TENSOR_SVD_CONFIG_ALGO: _numpy.int32, # = sizeof(enum value) + CUTENSORNET_TENSOR_SVD_CONFIG_DISCARDED_WEIGHT_CUTOFF: _numpy.float64, } cdef dict svd_algo_params_sizes = { @@ -3003,7 +3045,7 @@ cpdef 
intptr_t create_marginal( cdef dict marginal_attribute_sizes = { - CUTENSORNET_MARGINAL_OPT_NUM_HYPER_SAMPLES: _numpy.int64 + CUTENSORNET_MARGINAL_OPT_NUM_HYPER_SAMPLES: _numpy.int32 } @@ -3030,8 +3072,7 @@ cpdef marginal_configure(intptr_t handle, intptr_t marginal, int attr, intptr_t handle (intptr_t): The library handle. marginal (intptr_t): The tensor network marginal computation handle. attr (MarginalAttribute): The attribute to configure. - buf (intptr_t): The pointer address (as Python :class:`int`) for storing - the returned attribute value. + buf (intptr_t): The pointer address (as Python :class:`int`) of the attribute value. size (size_t): The size of ``buf`` (in bytes). .. note:: To compute ``size``, use the itemsize of the corresponding data @@ -3161,7 +3202,7 @@ cpdef intptr_t create_sampler( cdef dict sampler_attribute_sizes = { - CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES: _numpy.int64 + CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES: _numpy.int32 } @@ -3266,6 +3307,767 @@ cpdef destroy_sampler(intptr_t sampler): check_status(status) +cdef dict state_attribute_sizes = { + CUTENSORNET_STATE_MPS_CANONICAL_CENTER: _numpy.int32, + CUTENSORNET_STATE_MPS_SVD_CONFIG_ABS_CUTOFF: _numpy.float64, + CUTENSORNET_STATE_MPS_SVD_CONFIG_REL_CUTOFF: _numpy.float64, + CUTENSORNET_STATE_MPS_SVD_CONFIG_S_NORMALIZATION: _numpy.int32, # = sizeof(enum value) + CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO: _numpy.int32, # = sizeof(enum value) + CUTENSORNET_STATE_MPS_SVD_CONFIG_DISCARDED_WEIGHT_CUTOFF: _numpy.float64, + CUTENSORNET_STATE_NUM_HYPER_SAMPLES: _numpy.int32 +} + + +cpdef state_get_attribute_dtype(int attr): + """Get the Python data type of the corresponding state attribute. + + Args: + attr (StateAttribute): The attribute to query. The enum CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO_PARAMS is not supported, + the dtype of which can be queried by :func:`tensor_svd_algo_params_get_dtype`. + + Returns: + The data type of the queried attribute.
The returned dtype is always + a valid NumPy dtype object. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`state_configure`. + """ + if attr == CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO_PARAMS: + raise ValueError("For CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO_PARAMS, use `tensor_svd_algo_params_get_dtype` to get the dtype") + dtype = state_attribute_sizes[attr] + if attr == CUTENSORNET_STATE_MPS_SVD_CONFIG_S_NORMALIZATION: + if _numpy.dtype(dtype).itemsize != sizeof(_TensorSVDNormalization): + warnings.warn("binary size may be incompatible") + elif attr == CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO: + if _numpy.dtype(dtype).itemsize != sizeof(_TensorSVDAlgo): + warnings.warn("binary size may be incompatible") + return dtype + + +cpdef state_configure(intptr_t handle, intptr_t state, int attr, intptr_t buf, size_t size): + """Configures computation of the tensor network state. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state. + attr (StateAttribute): The attribute to configure. + buf (intptr_t): The pointer address (as Python :class:`int`) of the + attribute value. + size (size_t): The size of ``buf`` (in bytes). + + .. note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`state_get_attribute_dtype`. + + .. seealso:: `cutensornetStateConfigure` + """ + with nogil: + status = cutensornetStateConfigure( + <_Handle>handle, <_State>state, + <_StateAttribute>attr, + buf, size) + check_status(status) + + +cpdef state_prepare( + intptr_t handle, intptr_t state, + size_t max_workspace_size_device, intptr_t workspace, intptr_t stream): + """Prepares computation of the tensor network state representation. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state handle.
+ max_workspace_size_device (size_t): The maximal device workspace size (in bytes) allowed + for the state computation. + workspace (intptr_t): The workspace descriptor. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + .. seealso:: `cutensornetStatePrepare` + """ + with nogil: + status = cutensornetStatePrepare( + <_Handle>handle, <_State>state, + max_workspace_size_device, <_WorkspaceDescriptor>workspace, stream) + check_status(status) + + +cpdef tuple state_compute( + intptr_t handle, intptr_t state, intptr_t workspace, + state_tensors_out, intptr_t stream): + """Computes the tensor network state representation. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state. + workspace (intptr_t): The workspace descriptor. + state_tensors_out: A host array of pointer addresses (as Python :class:`int`) for + each output tensor (on device). It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + Returns: + tuple: + The metadata of the output tensors: ``(extents_out, strides_out)``. + + ..
seealso:: `cutensornetStateCompute` + """ + cdef int32_t num_tensors = 0 + with nogil: + status = cutensornetGetOutputStateDetails( + <_Handle>handle, <_State>state, + &num_tensors, NULL, NULL, NULL) + check_status(status) + + num_modes = _numpy.empty(num_tensors, dtype=_numpy.int32) + cdef int32_t* numModesPtr = num_modes.ctypes.data + with nogil: + status = cutensornetGetOutputStateDetails( + <_Handle>handle, <_State>state, + &num_tensors, numModesPtr, NULL, NULL) + check_status(status) + + extents_out_py = [_numpy.empty(num_modes[i], dtype=_numpy.int64) for i in range(num_tensors)] + strides_out_py = [_numpy.empty(num_modes[i], dtype=_numpy.int64) for i in range(num_tensors)] + + cdef vector[intptr_t] extentsOut + cdef vector[intptr_t] stridesOut + for i in range(num_tensors): + extentsOut.push_back(extents_out_py[i].ctypes.data) + stridesOut.push_back(strides_out_py[i].ctypes.data) + + cdef int64_t** extentsOutPtr = (extentsOut.data()) + cdef int64_t** stridesOutPtr = (stridesOut.data()) + + cdef vector[intptr_t] stateTensorsOutData + cdef void** stateTensorsOutPtr + if cpython.PySequence_Check(state_tensors_out): + stateTensorsOutData = state_tensors_out + stateTensorsOutPtr = (stateTensorsOutData.data()) + else: # a pointer address + stateTensorsOutPtr = state_tensors_out + + with nogil: + status = cutensornetStateCompute( + <_Handle>handle, <_State>state, <_WorkspaceDescriptor>workspace, + extentsOutPtr, stridesOutPtr, stateTensorsOutPtr, stream) + check_status(status) + return (extents_out_py, strides_out_py) + + +cpdef tuple get_output_state_details(intptr_t handle, intptr_t state): + """Get the output state tensors' metadata. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state. + + Returns: + tuple: + The metadata of the output tensor: ``(num_tensors, num_modes, extents, + strides)``. + + .. 
seealso:: `cutensornetGetOutputStateDetails` + """ + cdef int32_t num_tensors = 0 + with nogil: + status = cutensornetGetOutputStateDetails( + <_Handle>handle, <_State>state, + &num_tensors, NULL, NULL, NULL) + check_status(status) + + num_modes = _numpy.empty(num_tensors, dtype=_numpy.int32) + cdef int32_t* numModesPtr = num_modes.ctypes.data + with nogil: + status = cutensornetGetOutputStateDetails( + <_Handle>handle, <_State>state, + &num_tensors, numModesPtr, NULL, NULL) + check_status(status) + extents_out_py = [_numpy.empty(num_modes[i], dtype=_numpy.int64) for i in range(num_tensors)] + strides_out_py = [_numpy.empty(num_modes[i], dtype=_numpy.int64) for i in range(num_tensors)] + + cdef vector[intptr_t] extentsOut + cdef vector[intptr_t] stridesOut + for i in range(num_tensors): + extentsOut.push_back(extents_out_py[i].ctypes.data) + stridesOut.push_back(strides_out_py[i].ctypes.data) + + cdef int64_t** extentsOutPtr = (extentsOut.data()) + cdef int64_t** stridesOutPtr = (stridesOut.data()) + with nogil: + status = cutensornetGetOutputStateDetails( + <_Handle>handle, <_State>state, + &num_tensors, NULL, extentsOutPtr, stridesOutPtr) + check_status(status) + return (num_tensors, num_modes, extents_out_py, strides_out_py) + +cpdef state_finalize_mps( + intptr_t handle, intptr_t state, int boundary_condition, extents_out, strides_out): + """Set the target MPS representation. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state. + boundary_condition (BoundaryCondition): The boundary condition of the initial MPS state. + extents_out: A host array of extents for all target MPS tensors. It can be + + - an :class:`int` as the pointer address to the nested sequence + - a Python sequence of :class:`int`, each of which is a pointer address + to the corresponding tensor's extents + - a nested Python sequence of :class:`int` + + strides_out: A host array of strides for all target MPS tensors. 
It can be + + - an :class:`int` as the pointer address to the nested sequence + - a Python sequence of :class:`int`, each of which is a pointer address + to the corresponding tensor's strides + - a nested Python sequence of :class:`int` + + + .. seealso:: `cutensornetStateFinalizeMPS` + """ + # extents_out can be: + # - a plain pointer address + # - a Python sequence (of pointer addresses) + # - a nested Python sequence (of int64_t) + # Note: it cannot be a mix of sequences and ints. + cdef vector[intptr_t] extentsOutCData + cdef int64_t** extentsOutPtr + if is_nested_sequence(extents_out): + # flatten the 2D sequence + extentsOutPyData = [] + for i in extents_out: + # too bad a Python list can't hold C++ vectors, so we use NumPy + # arrays as the container here to keep data alive + data = _numpy.asarray(i, dtype=_numpy.int64) + assert data.ndim == 1 + extentsOutPyData.append(data) + extentsOutCData.push_back(data.ctypes.data) + extentsOutPtr = (extentsOutCData.data()) + elif cpython.PySequence_Check(extents_out): + # handle 1D sequence + extentsOutCData = extents_out + extentsOutPtr = (extentsOutCData.data()) + else: + # a pointer address, take it as is + extentsOutPtr = extents_out + + # strides_out can be: + # - a plain pointer address + # - a Python sequence (of pointer addresses) + # - a nested Python sequence (of int64_t) + # Note: it cannot be a mix of sequences and ints. 
+ cdef vector[intptr_t] stridesOutCData + cdef int64_t** stridesOutPtr + if is_nested_sequence(strides_out): + # flatten the 2D sequence + stridesOutPyData = [] + for i in strides_out: + # too bad a Python list can't hold C++ vectors, so we use NumPy + # arrays as the container here to keep data alive + data = _numpy.asarray(i, dtype=_numpy.int64) + assert data.ndim == 1 + stridesOutPyData.append(data) + stridesOutCData.push_back(data.ctypes.data) + stridesOutPtr = (stridesOutCData.data()) + elif cpython.PySequence_Check(strides_out): + # handle 1D sequence + stridesOutCData = strides_out + stridesOutPtr = (stridesOutCData.data()) + else: + # a pointer address, take it as is + stridesOutPtr = strides_out + + with nogil: + status = cutensornetStateFinalizeMPS( + <_Handle>handle, <_State>state, <_BoundaryCondition>boundary_condition, + extentsOutPtr, stridesOutPtr) + check_status(status) + + +cpdef intptr_t create_network_operator( + intptr_t handle, int32_t n_state_modes, state_mode_extents, int data_type) except*: + """Create a tensor network operator of a given shape. + + Args: + handle (intptr_t): The library handle. + n_state_modes (int32_t): The total number of state modes the operator will act on. + state_mode_extents: A host array of extents of each state mode. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + data_type (cuquantum.cudaDataType): The data type of the operator. + Returns: + intptr_t: An opaque tensor network operator handle (as Python :class:`int`). + + .. 
seealso:: `cutensornetCreateNetworkOperator` + """ + # state_mode_extents can be a pointer address, or a Python sequence + cdef vector[int64_t] stateModeExtentsData + cdef int64_t* stateModeExtentsPtr + if cpython.PySequence_Check(state_mode_extents): + if len(state_mode_extents) != n_state_modes: + raise ValueError("size of state_mode_extents not matching num_state_modes") + stateModeExtentsData = state_mode_extents + stateModeExtentsPtr = stateModeExtentsData.data() + else: # a pointer address + stateModeExtentsPtr = state_mode_extents + + cdef _NetworkOperator operator + with nogil: + status = cutensornetCreateNetworkOperator( + <_Handle>handle, n_state_modes, stateModeExtentsPtr, data_type + , &operator) + check_status(status) + return operator + + +cpdef int64_t network_operator_append_product( + intptr_t handle, intptr_t network_operator, coefficient, + int32_t num_tensors, num_modes, state_modes, tensor_mode_strides, + tensor_data) except*: + """Appends a tensor product component to the tensor network operator. + + Args: + handle (intptr_t): The library handle. + network_operator (intptr_t): The tensor network operator the product will be appended to. + coefficient: Complex coefficient associated with the appended operator component. + num_tensors: Number of tensor factors in the tensor product. + num_modes: A host array of number of state modes each appended tensor factor acts on. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + state_modes: A host array of modes each appended tensor factor acts on (length = nModes). It can be + + - an :class:`int` as the pointer address to the nested sequence + - a Python sequence of :class:`int`, each of which is a pointer address + to the corresponding tensor's modes + - a nested Python sequence of :class:`int` + + tensor_mode_strides: Tensor mode strides for each tensor factor (length = nModes * 2). 
It can be + + - an :class:`int` as the pointer address to the nested sequence + - a Python sequence of :class:`int`, each of which is a pointer address + to the corresponding tensor's strides + - a nested Python sequence of :class:`int` + + tensor_data: A host array of pointer addresses (as Python :class:`int`) for + each tensor data (on device). It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + Returns: + int64_t: A unique sequential integer identifier of the appended tensor network operator component. + + .. seealso:: `cutensornetNetworkOperatorAppendProduct` + """ + # num_modes can be a pointer address, or a Python sequence + cdef vector[int32_t] numModesData + cdef const int32_t* numModesPtr + if cpython.PySequence_Check(num_modes): + numModesData = num_modes + numModesPtr = numModesData.data() + else: # a pointer address + numModesPtr = num_modes + + # state_modes can be: + # - a plain pointer address + # - a Python sequence (of pointer addresses) + # - a nested Python sequence (of int32_t) + # Note: it cannot be a mix of sequences and ints. 
+ cdef vector[intptr_t] stateModesCData + cdef const int32_t** stateModesPtr + if is_nested_sequence(state_modes): + # flatten the 2D sequence + stateModesPyData = [] + for i in state_modes: + # too bad a Python list can't hold C++ vectors, so we use NumPy + # arrays as the container here to keep data alive + data = _numpy.asarray(i, dtype=_numpy.int32) + assert data.ndim == 1 + stateModesPyData.append(data) + stateModesCData.push_back(data.ctypes.data) + stateModesPtr = (stateModesCData.data()) + elif cpython.PySequence_Check(state_modes): + # handle 1D sequence + stateModesCData = state_modes + stateModesPtr = (stateModesCData.data()) + else: + # a pointer address, take it as is + stateModesPtr = state_modes + + # tensor_mode_strides can be: + # - a plain pointer address + # - a Python sequence (of pointer addresses) + # - a nested Python sequence (of int64_t) + # Note: it cannot be a mix of sequences and ints. + cdef vector[intptr_t] tensorModeStridesCData + cdef const int64_t** tensorModeStridesPtr + if is_nested_sequence(tensor_mode_strides): + # flatten the 2D sequence + tensorModeStridesPyData = [] + for i in tensor_mode_strides: + # too bad a Python list can't hold C++ vectors, so we use NumPy + # arrays as the container here to keep data alive + data = _numpy.asarray(i, dtype=_numpy.int64) + assert data.ndim == 1 + tensorModeStridesPyData.append(data) + tensorModeStridesCData.push_back(data.ctypes.data) + tensorModeStridesPtr = (tensorModeStridesCData.data()) + elif cpython.PySequence_Check(tensor_mode_strides): + # handle 1D sequence + tensorModeStridesCData = tensor_mode_strides + tensorModeStridesPtr = (tensorModeStridesCData.data()) + else: + # a pointer address, take it as is + tensorModeStridesPtr = tensor_mode_strides + + # tensor_data can be a pointer address, or a Python sequence + cdef vector[intptr_t] tensorDataData + cdef const void** tensorDataPtr + if cpython.PySequence_Check(tensor_data): + tensorDataData = tensor_data + tensorDataPtr = 
(tensorDataData.data()) + else: # a pointer address + tensorDataPtr = tensor_data + + cdef cuDoubleComplex coeff + coeff.x = coefficient.real + coeff.y = coefficient.imag + + cdef int64_t componentId = 0 + with nogil: + status = cutensornetNetworkOperatorAppendProduct( + <_Handle>handle, <_NetworkOperator>network_operator, + coeff, num_tensors, numModesPtr, stateModesPtr, + tensorModeStridesPtr, tensorDataPtr + , &componentId) + check_status(status) + return componentId + + +cpdef intptr_t create_accessor( + intptr_t handle, intptr_t state, + int32_t n_projected_modes, projected_modes, amplitudes_tensor_strides) except*: + """Create a representation for the tensor network state accessor. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state. + n_projected_modes (int32_t): The number of modes that are projected out for the state. + projected_modes: A host array of projected modes for the marginal. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + amplitudes_tensor_strides: A host array of strides for the amplitudes tensor. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + Returns: + intptr_t: An opaque tensor network state accessor handle (as Python :class:`int`). + + .. 
seealso:: `cutensornetCreateAccessor` + """ + + # projected_modes can be a pointer address, or a Python sequence + cdef vector[int32_t] projectedModesData + cdef int32_t* projectedModesPtr + if cpython.PySequence_Check(projected_modes): + if len(projected_modes) != n_projected_modes: + raise ValueError("size of projected_modes not matching n_projected_modes") + projectedModesData = projected_modes + projectedModesPtr = projectedModesData.data() + else: # a pointer address + projectedModesPtr = projected_modes + + # amplitudes_tensor_strides can be a pointer address, or a Python sequence + cdef vector[int64_t] amplitudesTensorStridesData + cdef int64_t* amplitudesTensorStridesPtr + if cpython.PySequence_Check(amplitudes_tensor_strides): + amplitudesTensorStridesData = amplitudes_tensor_strides + amplitudesTensorStridesPtr = amplitudesTensorStridesData.data() + else: # a pointer address + amplitudesTensorStridesPtr = amplitudes_tensor_strides + + cdef _StateAccessor accessor + with nogil: + status = cutensornetCreateAccessor( + <_Handle>handle, <_State>state, + n_projected_modes, projectedModesPtr, + amplitudesTensorStridesPtr, &accessor) + check_status(status) + return accessor + + +cdef dict accessor_attribute_sizes = { + CUTENSORNET_ACCESSOR_OPT_NUM_HYPER_SAMPLES: _numpy.int32 +} + + +cpdef accessor_get_attribute_dtype(int attr): + """Get the Python data type of the corresponding accessor attribute. + + Args: + attr (AccessorAttribute): The attribute to query. + + Returns: + The data type of the queried attribute. The returned dtype is always + a valid NumPy dtype object. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`accessor_configure`. + """ + return accessor_attribute_sizes[attr] + + +cpdef accessor_configure(intptr_t handle, intptr_t accessor, int attr, intptr_t buf, size_t size): + """Configures computation of the tensor network state accessor. + + Args: + handle (intptr_t): The library handle. 
+ accessor (intptr_t): The tensor network state accessor computation handle. + attr (AccessorAttribute): The attribute to configure. + buf (intptr_t): The pointer address (as Python :class:`int`) for storing + the returned attribute value. + size (size_t): The size of ``buf`` (in bytes). + + .. note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`accessor_get_attribute_dtype`. + + .. seealso:: `cutensornetAccessorConfigure` + """ + with nogil: + status = cutensornetAccessorConfigure( + <_Handle>handle, <_StateAccessor>accessor, + <_AccessorAttribute>attr, + buf, size) + check_status(status) + + +cpdef accessor_prepare( + intptr_t handle, intptr_t accessor, + size_t max_workspace_size_device, intptr_t workspace, intptr_t stream): + """Prepares computation of the tensor network state accessor. + + Args: + handle (intptr_t): The library handle. + accessor (intptr_t): The tensor network state accessor handle. + max_workspace_size_device (size_t): The maximal device workspace size (in bytes) allowed + for the accessor computation. + workspace (intptr_t): The workspace descriptor. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + .. seealso:: `cutensornetAccessorPrepare` + """ + with nogil: + status = cutensornetAccessorPrepare( + <_Handle>handle, <_StateAccessor>accessor, + max_workspace_size_device, <_WorkspaceDescriptor>workspace, stream) + check_status(status) + + +cpdef accessor_compute( + intptr_t handle, intptr_t accessor, projected_mode_values, + intptr_t workspace, intptr_t amplitudes_tensor, intptr_t state_norm, intptr_t stream): + """Computes the tensor network state amplitudes. + + Args: + handle (intptr_t): The library handle. + accessor (intptr_t): The tensor network state accessor handle. + projected_mode_values: A host array of values for the projected modes. 
It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + workspace (intptr_t): The workspace descriptor. + amplitudes_tensor (intptr_t): The pointer address (as Python :class:`int`) for storing + the computed amplitudes. + state_norm (intptr_t): The pointer address (as Python :class:`int`) for storing + the 2-norm of the underlying state. If set to 0 (`NULL` pointer), the norm calculation will be ignored. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + .. seealso:: `cutensornetAccessorCompute` + """ + # projected_mode_values can be a pointer address, or a Python sequence + cdef vector[int64_t] projectedModeValuesData + cdef int64_t* projectedModeValuesPtr + if cpython.PySequence_Check(projected_mode_values): + projectedModeValuesData = projected_mode_values + projectedModeValuesPtr = projectedModeValuesData.data() + else: # a pointer address + projectedModeValuesPtr = projected_mode_values + + with nogil: + status = cutensornetAccessorCompute( + <_Handle>handle, <_StateAccessor>accessor, + projectedModeValuesPtr, <_WorkspaceDescriptor>workspace, + amplitudes_tensor, state_norm, stream) + check_status(status) + + +cpdef destroy_accessor(intptr_t accessor): + """Destroy a tensor network state accessor handle. + + Args: + accessor (intptr_t): The tensor network state accessor handle. + + .. seealso:: `cutensornetDestroyAccessor` + """ + with nogil: + status = cutensornetDestroyAccessor(<_StateAccessor>accessor) + check_status(status) + + +cpdef destroy_network_operator(intptr_t network_operator): + """Destroy a tensor network operator. + + Args: + network_operator (intptr_t): The tensor network operator. + + .. 
seealso:: `cutensornetDestroyNetworkOperator` + """ + with nogil: + status = cutensornetDestroyNetworkOperator(<_NetworkOperator>network_operator) + check_status(status) + + +cpdef intptr_t create_expectation( + intptr_t handle, intptr_t state, intptr_t operator) except*: + """Create a representation for the tensor network state expectation value. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state. + + Returns: + intptr_t: An opaque tensor network state expectation handle (as Python :class:`int`). + + .. seealso:: `cutensornetCreateExpectation` + """ + cdef _StateExpectation expectation + with nogil: + status = cutensornetCreateExpectation( + <_Handle>handle, <_State>state, <_NetworkOperator>operator + , &expectation) + check_status(status) + return expectation + + +cdef dict expectation_attribute_sizes = { + CUTENSORNET_EXPECTATION_OPT_NUM_HYPER_SAMPLES: _numpy.int32 +} + + +cpdef expectation_get_attribute_dtype(int attr): + """Get the Python data type of the corresponding expectation attribute. + + Args: + attr (ExpectationAttribute): The attribute to query. + + Returns: + The data type of the queried attribute. The returned dtype is always + a valid NumPy dtype object. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`expectation_configure`. + """ + return expectation_attribute_sizes[attr] + + +cpdef expectation_configure(intptr_t handle, intptr_t expectation, int attr, intptr_t buf, size_t size): + """Configures computation of the tensor network state expectation value. + + Args: + handle (intptr_t): The library handle. + expectation (intptr_t): The tensor network expectation computation handle. + attr (ExpectationAttribute): The attribute to configure. + buf (intptr_t): The pointer address (as Python :class:`int`) of the attribute value. + size (size_t): The size of ``buf`` (in bytes). + + .. 
note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`expectation_get_attribute_dtype`. + + .. seealso:: `cutensornetExpectationConfigure` + """ + with nogil: + status = cutensornetExpectationConfigure( + <_Handle>handle, <_StateExpectation>expectation, + <_ExpectationAttribute>attr, + buf, size) + check_status(status) + + +cpdef expectation_prepare( + intptr_t handle, intptr_t expectation, + size_t max_workspace_size_device, intptr_t workspace, intptr_t stream): + """Prepares computation of the tensor network state expectation. + + Args: + handle (intptr_t): The library handle. + expectation (intptr_t): The tensor network expectation computation handle. + max_workspace_size_device (size_t): The maximal device workspace size (in bytes) allowed + for the expectation value computation. + workspace (intptr_t): The workspace descriptor. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + .. seealso:: `cutensornetExpectationPrepare` + """ + with nogil: + status = cutensornetExpectationPrepare( + <_Handle>handle, <_StateExpectation>expectation, + max_workspace_size_device, <_WorkspaceDescriptor>workspace, stream) + check_status(status) + + +cpdef expectation_compute( + intptr_t handle, intptr_t expectation, + intptr_t workspace, intptr_t expectation_value, intptr_t state_norm, intptr_t stream): + """Computes the tensor network state expectation value. + + Args: + handle (intptr_t): The library handle. + expectation (intptr_t): The tensor network expectation computation handle. + workspace (intptr_t): The workspace descriptor. + expectation_value (intptr_t): The pointer address (as Python :class:`int`) for storing + the computed expectation_value (stored on host). + state_norm (intptr_t): The pointer address (as Python :class:`int`) for storing + the 2-norm of the underlying state. If set to 0 (`NULL` pointer), the norm calculation will be ignored. 
+ stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + .. seealso:: `cutensornetExpectationCompute` + """ + with nogil: + status = cutensornetExpectationCompute( + <_Handle>handle, <_StateExpectation>expectation, + <_WorkspaceDescriptor>workspace, expectation_value, state_norm, stream) + check_status(status) + + +cpdef destroy_expectation(intptr_t expectation): + """Destroy a tensor network expectation value representation. + + Args: + expectation (intptr_t): The tensor network expectation value representation. + + .. seealso:: `cutensornetDestroyExpectation` + """ + with nogil: + status = cutensornetDestroyExpectation(<_StateExpectation>expectation) + check_status(status) + + class NetworkAttribute(IntEnum): """See `cutensornetNetworkAttributes_t`.""" INPUT_TENSORS_NUM_CONSTANT = CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_CONSTANT @@ -3360,6 +4162,7 @@ class TensorSVDConfigAttribute(IntEnum): S_PARTITION = CUTENSORNET_TENSOR_SVD_CONFIG_S_PARTITION ALGO = CUTENSORNET_TENSOR_SVD_CONFIG_ALGO ALGO_PARAMS = CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS + DISCARDED_WEIGHT_CUTOFF = CUTENSORNET_TENSOR_SVD_CONFIG_DISCARDED_WEIGHT_CUTOFF class TensorSVDNormalization(IntEnum): """See `cutensornetTensorSVDNormalization_t`.""" @@ -3407,6 +4210,29 @@ class SamplerAttribute(IntEnum): """See `cutensornetSamplerAttributes_t`.""" OPT_NUM_HYPER_SAMPLES = CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES +class AccessorAttribute(IntEnum): + """See `cutensornetAccessorAttributes_t`.""" + OPT_NUM_HYPER_SAMPLES = CUTENSORNET_ACCESSOR_OPT_NUM_HYPER_SAMPLES + +class ExpectationAttribute(IntEnum): + """See `cutensornetExpectationAttributes_t`.""" + OPT_NUM_HYPER_SAMPLES = CUTENSORNET_EXPECTATION_OPT_NUM_HYPER_SAMPLES + +class BoundaryCondition(IntEnum): + """See `cutensornetBoundaryCondition_t`.""" + OPEN = CUTENSORNET_BOUNDARY_CONDITION_OPEN + +class StateAttribute(IntEnum): + """See `cutensornetStateAttributes_t`.""" + MPS_CANONICAL_CENTER = 
CUTENSORNET_STATE_MPS_CANONICAL_CENTER + MPS_SVD_CONFIG_ABS_CUTOFF = CUTENSORNET_STATE_MPS_SVD_CONFIG_ABS_CUTOFF + MPS_SVD_CONFIG_REL_CUTOFF = CUTENSORNET_STATE_MPS_SVD_CONFIG_REL_CUTOFF + MPS_SVD_CONFIG_S_NORMALIZATION = CUTENSORNET_STATE_MPS_SVD_CONFIG_S_NORMALIZATION + MPS_SVD_CONFIG_ALGO = CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO + MPS_SVD_CONFIG_ALGO_PARAMS = CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO_PARAMS + MPS_SVD_CONFIG_DISCARDED_WEIGHT_CUTOFF = CUTENSORNET_STATE_MPS_SVD_CONFIG_DISCARDED_WEIGHT_CUTOFF + NUM_HYPER_SAMPLES = CUTENSORNET_STATE_NUM_HYPER_SAMPLES + del IntEnum diff --git a/python/cuquantum/cutensornet/experimental/tensor_network.py b/python/cuquantum/cutensornet/experimental/tensor_network.py index 9c8c450..00fae61 100644 --- a/python/cuquantum/cutensornet/experimental/tensor_network.py +++ b/python/cuquantum/cutensornet/experimental/tensor_network.py @@ -44,24 +44,24 @@ def _gate_split(wrapped_operands, inputs, outputs, size_dict, max_mid_extent, al # placeholder to help avoid resource leak handle = workspace_desc = svd_config = svd_info = None input_tensor_descriptors = output_tensor_descriptors = [] + workspaces = dict() + own_handle = False try: # Options converted to an internal option - wrapped_operands, options, own_handle, operands_location = decomposition_utils.parse_decompose_operands_options( - options, wrapped_operands, allowed_dtype_names=decomposition_utils.DECOMPOSITION_DTYPE_NAMES) + wrapped_operands, options, own_handle, operands_location, stream_holder = decomposition_utils.parse_decompose_operands_options( + options, wrapped_operands, stream, allowed_dtype_names=decomposition_utils.DECOMPOSITION_DTYPE_NAMES) mid_extent = max_mid_extent if algorithm.svd_method.max_extent is None else min(max_mid_extent, algorithm.svd_method.max_extent) handle = options.handle - - package = utils.infer_object_package(wrapped_operands[0].tensor) - stream, stream_ctx, stream_ptr = utils.get_or_create_stream(options.device_id, stream, package) + 
stream_ptr = stream_holder.ptr # this exists as we always use the ExternalStream from CuPy internally... options.logger.info("Calling specicialized kernel `cutensornetGateSplit` for contraction and decomposition.") # Create input/output tensor descriptors and empty output operands input_tensor_descriptors, output_operands, output_tensor_descriptors, s, s_ptr = decomposition_utils.create_operands_and_descriptors( handle, wrapped_operands, size_dict, inputs, outputs, - mid_extent, algorithm.svd_method, options.device_id, stream_ctx, options.logger) + mid_extent, algorithm.svd_method, options.device_id, stream_holder, options.logger) # Parse SVDConfig svd_config = cutn.create_tensor_svd_config(handle) @@ -75,17 +75,16 @@ def _gate_split(wrapped_operands, inputs, outputs, size_dict, max_mid_extent, al workspace_ptr = None options.logger.debug("Querying workspace size...") - + cutn.workspace_compute_gate_split_sizes(handle, *input_tensor_descriptors, *output_tensor_descriptors, gate_algorithm, svd_config, options.compute_type, workspace_desc) - + # Allocate and set workspace - workspaces = dict() for mem_space in (cutn.Memspace.DEVICE, cutn.Memspace.HOST): workspaces[mem_space] = decomposition_utils.allocate_and_set_workspace(handle, options.allocator, workspace_desc, cutn.WorksizePref.MIN, mem_space, cutn.WorkspaceKind.SCRATCH, options.device_id, - stream, stream_ctx, options.logger, task_name='contract decomposition') + stream_holder, options.logger, task_name='contract decomposition') options.logger.info("Starting contract-decompose (gate split)...") timing = bool(options.logger and options.logger.handlers) @@ -96,7 +95,7 @@ def _gate_split(wrapped_operands, inputs, outputs, size_dict, max_mid_extent, al options.logger.info("This call is non-blocking and will return immediately after the operation is launched on the device.") svd_info = cutn.create_tensor_svd_info(handle) - with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream, blocking, timing) as 
(last_compute_event, elapsed): + with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream_holder, blocking, timing) as (last_compute_event, elapsed): cutn.gate_split(handle, input_tensor_descriptors[0], wrapped_operands[0].data_ptr, input_tensor_descriptors[1], wrapped_operands[1].data_ptr, @@ -126,8 +125,8 @@ def _gate_split(wrapped_operands, inputs, outputs, size_dict, max_mid_extent, al s.tensor = s.tensor[:reduced_extent] finally: # when host workspace is allocated, synchronize stream before return - if workspaces[cutn.Memspace.HOST] is not None: - stream.synchronize() + if workspaces.get(cutn.Memspace.HOST) is not None: + stream_holder.obj.synchronize() # Free resources decomposition_utils._destroy_tensor_descriptors(input_tensor_descriptors) decomposition_utils._destroy_tensor_descriptors(output_tensor_descriptors) @@ -141,7 +140,7 @@ def _gate_split(wrapped_operands, inputs, outputs, size_dict, max_mid_extent, al if own_handle and handle is not None: cutn.destroy(handle) - u, v, s = [decomposition_utils.get_return_operand_data(t, operands_location) for t in output_operands + [s, ]] + u, v, s = [decomposition_utils.get_return_operand_data(t, operands_location, stream_holder) for t in output_operands + [s, ]] if return_info: info = ContractDecomposeInfo(qr_method=algorithm.qr_method, @@ -288,9 +287,15 @@ def contract_decompose(subscripts, *operands, algorithm=None, options=None, opti algorithm = utils.check_or_create_options(ContractDecomposeAlgorithm, algorithm, "Contract Decompose Algorithm") options = utils.check_or_create_options(NetworkOptions, options, "Network Options") + # Get cuTensorNet version (as seen at run-time) + cutn_ver = cutn.get_version() + cutn_major = cutn_ver // 10000 + cutn_minor = (cutn_ver % 10000) // 100 + cutn_patch = cutn_ver % 100 + + # Logger logger = logging.getLogger() if options.logger is None else options.logger - logger.info(f"CUDA runtime version = {cutn.get_cudart_version()}") - logger.info(f"cuTensorNet version 
= {cutn.MAJOR_VER}.{cutn.MINOR_VER}.{cutn.PATCH_VER}") + logger.info(f"cuTensorNet version = {cutn_major}.{cutn_minor}.{cutn_patch}") logger.info("Beginning operands parsing...") # Parse subscipts and operands @@ -299,11 +304,12 @@ def contract_decompose(subscripts, *operands, algorithm=None, options=None, opti if is_gate_split(inputs, outputs, algorithm): # dedicated kernel for GateSplit problem return _gate_split(wrapped_operands, inputs, outputs, size_dict, max_mid_extent, algorithm, options, stream, return_info) - + + own_handle = False try: # contraction followed by decomposition - wrapped_operands, options, own_handle, operands_location = decomposition_utils.parse_decompose_operands_options( - options, wrapped_operands, allowed_dtype_names=decomposition_utils.DECOMPOSITION_DTYPE_NAMES) + wrapped_operands, options, own_handle, operands_location, stream_holder = decomposition_utils.parse_decompose_operands_options( + options, wrapped_operands, stream, allowed_dtype_names=decomposition_utils.DECOMPOSITION_DTYPE_NAMES) intermediate_modes = einsum_parser.infer_output_mode_labels(outputs) @@ -348,17 +354,21 @@ def contract_decompose(subscripts, *operands, algorithm=None, options=None, opti logger.info("Beginning decomposition of the intermediate tensor...") if algorithm.qr_method and algorithm.svd_method is False: # contract and QR decompose - results = decompose(decompose_subscripts, intm_output, method=algorithm.qr_method, options=dataclasses.asdict(options), stream=stream, return_info=False) + results = decompose( + decompose_subscripts, intm_output, method=algorithm.qr_method, options=dataclasses.asdict(options), + stream=stream, return_info=False) results = maybe_truncate_qr_output_operands(results, outputs, max_mid_extent) if operands_location == 'cpu': - results = [tensor_wrapper.wrap_operand(o).to('cpu') for o in results] + results = [tensor_wrapper.wrap_operand(o).to('cpu', stream_holder=stream_holder) for o in results] elif algorithm.svd_method and 
algorithm.qr_method is False: # contract and SVD decompose use_max_mid_extent = algorithm.svd_method.max_extent is None if use_max_mid_extent: algorithm.svd_method.max_extent = max_mid_extent - results = decompose(decompose_subscripts, intm_output, method=algorithm.svd_method, options=dataclasses.asdict(options), stream=stream, return_info=return_info) + results = decompose( + decompose_subscripts, intm_output, method=algorithm.svd_method, options=dataclasses.asdict(options), + stream=stream, return_info=return_info) if use_max_mid_extent: # revert back algorithm.svd_method.max_extent = None @@ -366,7 +376,9 @@ def contract_decompose(subscripts, *operands, algorithm=None, options=None, opti if return_info: results, info_dict['svd_info'] = results[:-1], results[-1] if operands_location == 'cpu': - results = [o if o is None else tensor_wrapper.wrap_operand(o).to('cpu') for o in results] + results = [o if o is None else tensor_wrapper.wrap_operand(o).to( + 'cpu', stream_holder=stream_holder) + for o in results] else: raise NotImplementedError("contract_decompose currently doesn't support QR assisted SVD contract decomposition for more than 3 operands") logger.info("Decomposition of the intermediate tensor is completed.") diff --git a/python/cuquantum/cutensornet/tensor.py b/python/cuquantum/cutensornet/tensor.py index 52f13d3..b6f5fc0 100644 --- a/python/cuquantum/cutensornet/tensor.py +++ b/python/cuquantum/cutensornet/tensor.py @@ -230,13 +230,15 @@ def decompose( # placeholder to help avoid resource leak handle = workspace_desc = svd_config = svd_info = None input_descriptors = output_descriptors = [] - + workspaces = dict() + own_handle = False try: # wrap operands to be consistent with options. 
# options is a new instance of DecompositionOptions with all entries initialized - wrapped_operands, options, own_handle, operands_location = decomposition_utils.parse_decompose_operands_options(options, - wrapped_operands, allowed_dtype_names=decomposition_utils.DECOMPOSITION_DTYPE_NAMES) + wrapped_operands, options, own_handle, operands_location, stream_holder = decomposition_utils.parse_decompose_operands_options( + options, wrapped_operands, stream, allowed_dtype_names=decomposition_utils.DECOMPOSITION_DTYPE_NAMES) handle = options.handle + stream_ptr = stream_holder.ptr # this exists as we always use the ExternalStream from CuPy internally... if isinstance(method, QRMethod): mid_extent = max_mid_extent @@ -248,10 +250,8 @@ def decompose( raise ValueError("method must be either SVDMethod or QRMethod") # # Create input/output tensor descriptors and empty output operands - package = utils.infer_object_package(wrapped_operands[0].tensor) - stream, stream_ctx, stream_ptr = utils.get_or_create_stream(options.device_id, stream, package) - input_descriptors, output_operands, output_descriptors, s, s_ptr = decomposition_utils.create_operands_and_descriptors(options.handle, - wrapped_operands, size_dict, inputs, outputs, mid_extent, method, options.device_id, stream_ctx, options.logger) + input_descriptors, output_operands, output_descriptors, s, s_ptr = decomposition_utils.create_operands_and_descriptors( + options.handle, wrapped_operands, size_dict, inputs, outputs, mid_extent, method, options.device_id, stream_holder, options.logger) # Create workspace descriptor workspace_desc = cutn.create_workspace_descriptor(handle) @@ -270,11 +270,10 @@ def decompose( ValueError("method must be either a QRMethod/SVDMethod object or a dict that can be used to construct QRMethod/SVDMethod") # Allocate and set workspace - workspaces = dict() for mem_space in (cutn.Memspace.DEVICE, cutn.Memspace.HOST): workspaces[mem_space] = decomposition_utils.allocate_and_set_workspace(handle, 
options.allocator, workspace_desc, - cutn.WorksizePref.MIN, mem_space, cutn.WorkspaceKind.SCRATCH, options.device_id, - stream, stream_ctx, options.logger, task_name='tensor decomposition') + cutn.WorksizePref.MIN, mem_space, cutn.WorkspaceKind.SCRATCH, options.device_id, + stream_holder, options.logger, task_name='tensor decomposition') svd_info_obj = None @@ -287,7 +286,7 @@ def decompose( logger.info("This call is non-blocking and will return immediately after the operation is launched on the device.") timing = bool(logger and logger.handlers) if isinstance(method, QRMethod): - with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream, blocking, timing) as (last_compute_event, elapsed): + with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream_holder, blocking, timing) as (last_compute_event, elapsed): cutn.tensor_qr(handle, *input_descriptors, wrapped_operands[0].data_ptr, output_descriptors[0], output_operands[0].data_ptr, @@ -298,7 +297,7 @@ def decompose( logger.info(f"The QR decomposition took {elapsed.data:.3f} ms to complete.") elif isinstance(method, SVDMethod): svd_info = cutn.create_tensor_svd_info(handle) - with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream, blocking, timing) as (last_compute_event, elapsed): + with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream_holder, blocking, timing) as (last_compute_event, elapsed): cutn.tensor_svd(handle, *input_descriptors, wrapped_operands[0].data_ptr, output_descriptors[0], output_operands[0].data_ptr, @@ -318,8 +317,8 @@ def decompose( s.tensor = s.tensor[:reduced_extent] finally: # when host workspace is allocated, synchronize stream before return - if workspaces[cutn.Memspace.HOST] is not None: - stream.synchronize() + if workspaces.get(cutn.Memspace.HOST) is not None: + stream_holder.obj.synchronize() # Free resources if svd_config is not None: cutn.destroy_tensor_svd_config(svd_config) @@ -335,7 +334,9 @@ def decompose( cutn.destroy(handle) 
logger.info(f"All resources for the decomposition are freed.") - left_output, right_output, s = [decomposition_utils.get_return_operand_data(o, operands_location) for o in output_operands + [s, ]] + left_output, right_output, s = [decomposition_utils.get_return_operand_data( + o, operands_location, stream_holder) + for o in output_operands + [s, ]] if isinstance(method, QRMethod): return left_output, right_output @@ -407,6 +408,7 @@ class SVDMethod: max_extent: Keep no more than the largest ``max_extent`` singular values in the output operands (the rest will be truncated). abs_cutoff: Singular values below this value will be trimmed in the output operands. rel_cutoff: Singular values below the product of this value and the largest singular value will be trimmed in the output operands. + discarded_weight_cutoff: Singular values with discarded weight (square sum divided by total square sum) below this value will be trimmed. partition: Singular values S will be explictly returned by default (``partition=None``). Alternatively, singular values may be factorized onto output tensor U (``partition="U"``), output tensor V (``partition="V"``) or equally onto output tensor U and output tensor V (``partition="UV"``). When any of these three partition schemes is selected, @@ -420,6 +422,10 @@ class SVDMethod: gesvdr_oversampling: The size of oversampling when ``algorithm`` is set to ``"gesvdr"``. Default 0 denotes the lower of 4 times ``max_extent`` and the difference between full rank and ``max_extent``. gesvdr_niters: The number of iteration of power method when ``algorithm`` is set to ``"gesvdr"`` and the default (0) is 10. + .. note:: + + If multiple truncation parameters are set, e.g., ``max_extent`` and ``discarded_weight_cutoff``, the truncated extent will be determined as the lowest among all. + ..
note:: For detailed explanation on the different SVD algorithms and the corresponding parameters, @@ -434,6 +440,7 @@ class SVDMethod: max_extent: Optional[int] = None abs_cutoff: Optional[float] = 0.0 rel_cutoff: Optional[float] = 0.0 + discarded_weight_cutoff: Optional[float] = 0.0 partition: Optional[str] = None normalization: Optional[str] = None algorithm: Optional[str] = 'gesvd' @@ -459,6 +466,7 @@ def __str__(self): Maxmial number of singular values = {self.max_extent} Absolute value cutoff = {self.abs_cutoff} Relative value cutoff = {self.rel_cutoff} + Discarded weight cutoff = {self.discarded_weight_cutoff} Singular values partition = {self.partition} Singular values normalization = {self.normalization}""" @@ -473,6 +481,9 @@ def __post_init__(self): if (self.gesvdr_oversampling !=0 or self.gesvdr_niters !=0) and self.algorithm != 'gesvdr': raise ValueError(f"gesvdr_oversample and gesvdr_niters can only be set when algorithm is set to gesvdr, found algorithm {self.algorithm}") + + if self.algorithm == 'gesvdr' and self.discarded_weight_cutoff != 0 and self.max_extent is not None: + raise ValueError("Discarded weight truncation is not supported for gesvdr algorithm with fixed extent truncation") def _get_algo_params(self): initialized = False diff --git a/python/cuquantum/cutensornet/tensor_network.py b/python/cuquantum/cutensornet/tensor_network.py index c6f4528..19613dc 100644 --- a/python/cuquantum/cutensornet/tensor_network.py +++ b/python/cuquantum/cutensornet/tensor_network.py @@ -11,6 +11,7 @@ import collections import dataclasses import logging +import warnings import cupy as cp import numpy as np @@ -20,6 +21,7 @@ from . 
import memory from ._internal import einsum_parser from ._internal import formatters +from ._internal import grad_torch from ._internal import optimizer_ifc from ._internal import tensor_wrapper from ._internal import typemaps @@ -33,7 +35,7 @@ class InvalidNetworkState(Exception): class Network: """ - Network(subscripts, *operands, options=None) + Network(subscripts, *operands, qualifiers=None, options=None, stream=None) Create a tensor network object specified as an Einstein summation expression. @@ -78,6 +80,9 @@ class Network: options: Specify options for the tensor network as a :class:`~cuquantum.NetworkOptions` object. Alternatively, a `dict` containing the parameters for the ``NetworkOptions`` constructor can also be provided. If not specified, the value will be set to the default-constructed ``NetworkOptions`` object. + stream: Provide the CUDA stream to use for network construction, which is needed for stream-ordered operations such as allocating memory. Acceptable inputs include ``cudaStream_t`` (as + Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, the + current stream will be used. See Also: :meth:`~Network.contract_path`, :meth:`autotune`, :meth:`~Network.contract`, :meth:`reset_operands` @@ -98,28 +103,28 @@ class Network: Create a :class:`Network` object: - >>> n = Network(expr, *operands) + >>> tn = Network(expr, *operands) Find the best contraction order: - >>> path, info = n.contract_path({'samples': 500}) + >>> path, info = tn.contract_path({'samples': 500}) Autotune the network: - >>> n.autotune(iterations=5) + >>> tn.autotune(iterations=5) Perform the contraction. 
The result is of the same type and on the same device as the operands: - >>> r1 = n.contract() + >>> r1 = tn.contract() Reset operands to new values: >>> operands = [i*operand for i, operand in enumerate(operands, start=1)] - >>> n.reset_operands(*operands) + >>> tn.reset_operands(*operands) Get the result of the new contraction: - >>> r2 = n.contract() + >>> r2 = tn.contract() >>> from math import factorial >>> np.allclose(r2, factorial(len(operands))*r1) True @@ -128,7 +133,7 @@ class Network: network is large) since the memory will be released only when the object goes out of scope. (*To avoid having to explicitly make this call, it is recommended to use the* :class:`Network` *object as a context manager*.) - >>> n.free() + >>> tn.free() If the operands are on the GPU, they can also be updated using in-place operations. In this case, the call to :meth:`reset_operands` can be skipped -- subsequent :meth:`~Network.contract` calls will use the same @@ -140,19 +145,19 @@ class Network: >>> shapes = [(8, 2, 5), (5, 7), (8, 8, 2, 5), (8, 6, 3), (8,), (6,), (5,), (6, 5, 5, 7), (6, 3), (3,)] >>> operands = [cp.random.rand(*shape) for shape in shapes] >>> - >>> with Network(expr, *operands) as n: - ... path, info = n.contract_path({'samples': 500}) - ... n.autotune(iterations=5) + >>> with Network(expr, *operands) as tn: + ... path, info = tn.contract_path({'samples': 500}) + ... tn.autotune(iterations=5) ... ... # Perform the contraction - ... r1 = n.contract() + ... r1 = tn.contract() ... ... # Update the operands in place ... for i, operand in enumerate(operands, start=1): ... operand *= i ... ... # Perform the contraction with the updated operand values - ... r2 = n.contract() + ... r2 = tn.contract() ... ... # The resources used by the network are automatically released when the context ends. >>> @@ -162,34 +167,83 @@ class Network: PyTorch CPU and GPU tensors can be passed as input operands in the same fashion. + To compute the gradients of the network w.r.t. 
the input operands (NumPy/CuPy/PyTorch), the :meth:`gradients` method can + be used. To enable the gradient computation, one should + + 1. create the network with the ``qualifiers`` argument + 2. call the :meth:`contract` method prior to the :meth:`gradients` method + 3. seed the :meth:`gradients` method with the output gradient (see the docs for the requirements) + + Below is a minimal example: + + >>> from cuquantum import cutensornet as cutn + >>> expr = "ijk,jkl,klm,lmn" + >>> shapes = ((3, 4, 5), (4, 5, 3), (5, 3, 2), (3, 2, 6)) + >>> operands = [cp.random.rand(*shape) for shape in shapes] + >>> qualifiers = np.zeros(len(shapes), dtype=cutn.tensor_qualifiers_dtype) + >>> qualifiers[:]["requires_gradient"] = 1 # request gradients for all input tensors + >>> + >>> with Network(expr, *operands, qualifiers=qualifiers) as tn: + ... path, info = tn.contract_path() + ... + ... # Perform the contraction + ... r = tn.contract() + ... + ... # Perform the backprop + ... input_grads = tn.gradients(cp.ones_like(r)) + ... + >>> + + For PyTorch CPU/GPU tensors with the ``requires_grad`` attribute set up, one does not need to pass the ``qualifiers`` + argument. Note that this :class:`Network` class and its methods are **not** PyTorch operators and do **not** add any + node to PyTorch's autograd graph. For a native, differentiable PyTorch operator, use the :func:`cuquantum.contract` + function. + See :func:`contract` for more examples on specifying the Einstein summation expression as well as specifying options for the tensor network and the optimizer. 
""" - def __init__(self, *operands, qualifiers=None, options=None): + def __init__(self, *operands, qualifiers=None, options=None, stream=None): """ - __init__(subscripts, *operands, options=None) + __init__(subscripts, *operands, qualifiers=None, options=None, stream=None) """ options = utils.check_or_create_options(configuration.NetworkOptions, options, "network options") self.options = options + # Get cuTensorNet version (as seen at run-time). + cutn_ver = cutn.get_version() + cutn_major = cutn_ver // 10000 + cutn_minor = (cutn_ver % 10000) // 100 + cutn_patch = cutn_ver % 100 + # Logger. self.logger = options.logger if options.logger is not None else logging.getLogger() - self.logger.info(f"CUDA runtime version = {cutn.get_cudart_version()}") - self.logger.info(f"cuTensorNet version = {cutn.MAJOR_VER}.{cutn.MINOR_VER}.{cutn.PATCH_VER}") + self.logger.info(f"cuTensorNet version = {cutn_major}.{cutn_minor}.{cutn_patch}") self.logger.info("Beginning network creation...") # Parse Einsum expression. - self.operands, self.inputs, self.output, self.size_dict, self.mode_map_user_to_ord, self.mode_map_ord_to_user, self.is_interleaved = einsum_parser.parse_einsum(*operands) + self.operands, self.inputs, self.output, self.has_user_output, \ + self.size_dict, self.mode_map_user_to_ord, self.mode_map_ord_to_user, \ + self.is_interleaved, self.has_ellipses = einsum_parser.parse_einsum(*operands) - # Copy operands to device if needed. + # Infer the library package & device ID the operands belong to. + self.package = utils.get_operands_package(self.operands) self.network_location = 'cuda' self.device_id = utils.get_network_device_id(self.operands) if self.device_id is None: + self.package = self.operands[0].name + if self.package == 'numpy': + self.package = 'cupy' self.network_location = 'cpu' self.device_id = options.device_id - self.operands = tensor_wrapper.to(self.operands, self.device_id) + + # Allocate device memory (in stream context) if needed. 
+ stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) + + # Copy operands to device if needed. + if self.network_location == 'cpu': + self.operands = tensor_wrapper.to(self.operands, self.device_id, stream_holder) # Set blocking or non-blocking behavior. self.blocking = self.options.blocking is True or self.network_location == 'cpu' @@ -198,9 +252,6 @@ def __init__(self, *operands, qualifiers=None, options=None): else: self.call_prologue = "This call is non-blocking and will return immediately after the operation is launched on the device." - # Infer the library package the operands belong to. - self.package = utils.get_operands_package(self.operands) - # The output class is that of the first wrapped device operand. self.output_class = self.operands[0].__class__ @@ -226,18 +277,32 @@ def __init__(self, *operands, qualifiers=None, options=None): num_inputs = len(self.inputs) num_modes_out = len(self.output) - extents_in = tuple(o.shape for o in self.operands) - strides_in = tuple(o.strides for o in self.operands) + extents_in = self.extents_in = tuple(o.shape for o in self.operands) + strides_in = self.strides_in = tuple(o.strides for o in self.operands) self.operands_data = utils.get_operands_data(self.operands) modes_in = tuple(tuple(m for m in _input) for _input in self.inputs) num_modes_in = tuple(len(m) for m in modes_in) self.qualifiers_in = utils.check_tensor_qualifiers(qualifiers, cutn.tensor_qualifiers_dtype, num_inputs) + # For torch tensors, if qualifiers are explicitly passed, we ignore the tensor attrs. + # Otherwise, we look up the tensor attrs and populate qualifiers. 
+ if self.package == 'torch' and isinstance(self.qualifiers_in, int): # = 0 + self.qualifiers_in = np.zeros(num_inputs, dtype=cutn.tensor_qualifiers_dtype) + self.logger.debug("Checking input tensors' requires_grad attribute") + for i, t in enumerate(self.operands): + self.qualifiers_in[i]['requires_gradient'] = self.operands[i].tensor.requires_grad + self.qualifiers_in[i]['is_conjugate'] = self.operands[i].tensor.is_conj() + + # Check if gradient computation is required + if isinstance(self.qualifiers_in, np.ndarray): + self.require_grad = any(self.qualifiers_in['requires_gradient']) + else: + self.require_grad = False + # Create the output in the context of the current stream to work around a performance issue with CuPy's memory pool. - stream = None self.logger.debug("Beginning output tensor creation...") self.contraction, self.contraction_output_event, modes_out, extents_out, strides_out = utils.create_output_tensor( - self.output_class, self.package, self.output, self.size_dict, self.device_id, stream, self.data_type) + self.output_class, self.output, self.size_dict, self.device_id, stream_holder, self.data_type) self.logger.debug("The output tensor has been created.") # Create/set handle. @@ -257,6 +322,7 @@ def __init__(self, *operands, qualifiers=None, options=None): # Keep output extents for creating new tensors, if needed. self.extents_out = extents_out + self.strides_out = strides_out # Path optimization attributes. self.optimizer_config_ptr, self.optimizer_info_ptr = None, None @@ -264,7 +330,10 @@ def __init__(self, *operands, qualifiers=None, options=None): # Workspace attributes. 
self.workspace_desc = cutn.create_workspace_descriptor(self.handle) - self.workspace_ptr, self.workspace_size = None, None + self.workspace_scratch_ptr, self.workspace_scratch_size = None, None + self.workspace_cache_ptr, self.workspace_cache_size = None, None + self.workspace_h_scratch_ptr, self.workspace_h_scratch_size = None, None + self.workspace_h_cache_ptr, self.workspace_h_cache_size = None, None # Contraction plan attributes. self.plan = None @@ -279,6 +348,7 @@ def __init__(self, *operands, qualifiers=None, options=None): self.last_compute_event = None self.valid_state = True + self.contracted = False self.logger.info("The network has been created.") @@ -308,6 +378,21 @@ def _check_planned(self, *args, **kwargs): if not self.planned: raise RuntimeError(f"Internal Error: {what} cannot be performed before planning has been done.") + def _check_contracted(self, *args, **kwargs): + """ + """ + what = kwargs['what'] + if not self.contracted: + raise RuntimeError(f"{what} cannot be performed before contraction has been done.") + + def _check_qualifiers(self, *args, **kwargs): + """ + """ + what = kwargs['what'] + # cannot perform equality check (a == 0) if a is a numpy ndarray + if isinstance(self.qualifiers_in, int): + raise RuntimeError(f"{what} cannot be performed without creating the Network object with tensor qualifiers") + def _free_plan_resources(self, exception=None): """ Free resources allocated in network contraction planning. @@ -323,7 +408,10 @@ def _free_workspace_memory(self, exception=None): """ Free workspace by releasing the MemoryPointer object. 
""" - self.workspace_ptr = None + self.workspace_scratch_ptr = None + self.workspace_cache_ptr = None + self.workspace_h_scratch_ptr = None + self.workspace_h_cache_ptr = None return True @@ -341,7 +429,10 @@ def _free_path_resources(self, exception=None): self.optimizer_info_ptr = None self._free_workspace_memory() - self.workspace_size = None + self.workspace_scratch_size = None + self.workspace_cache_size = None + self.workspace_h_scratch_size = None + self.workspace_h_cache_size = None self._free_plan_resources() @@ -350,26 +441,50 @@ def _free_path_resources(self, exception=None): @utils.precondition(_check_valid_network) @utils.precondition(_check_optimized, "Workspace memory allocation") @utils.atomic(_free_workspace_memory, method=True) - def _allocate_workspace_memory_perhaps(self, stream, stream_ctx): - if self.workspace_ptr is not None: + def _allocate_workspace_memory_perhaps(self, stream_holder, kind): + assert kind == "scratch" or kind == "cache", "Internal Error." + + if getattr(self, f"workspace_{kind}_ptr") is not None and getattr(self, f"workspace_h_{kind}_ptr") is not None: return - assert self.workspace_size is not None, "Internal Error." + assert getattr(self, f"workspace_{kind}_size") is not None, "Internal Error." + assert getattr(self, f"workspace_h_{kind}_size") is not None, "Internal Error." + + self.logger.debug(f"Allocating {kind} workspace for contracting the tensor network...") - self.logger.debug("Allocating memory for contracting the tensor network...") - with utils.device_ctx(self.device_id), stream_ctx: + # Allocate device workspace. 
+ device_size = getattr(self, f"workspace_{kind}_size") + with utils.device_ctx(self.device_id), stream_holder.ctx: try: - self.workspace_ptr = self.allocator.memalloc(self.workspace_size) + setattr(self, f"workspace_{kind}_ptr", self.allocator.memalloc(device_size)) except TypeError as e: message = "The method 'memalloc' in the allocator object must conform to the interface in the "\ "'BaseCUDAMemoryManager' protocol." raise TypeError(message) from e - self.workspace_stream = stream - self.logger.debug(f"Finished allocating memory of size {formatters.MemoryStr(self.workspace_size)} for contraction in the context of stream {self.workspace_stream}.") - - device_ptr = utils.get_ptr_from_memory_pointer(self.workspace_ptr) - cutn.workspace_set_memory(self.handle, self.workspace_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, device_ptr, self.workspace_size) - self.logger.debug(f"The workspace memory (device pointer = {device_ptr}) has been set in the workspace descriptor.") + self.workspace_stream = stream_holder.obj + # Allocate host workspace. + # TODO: ideally we should use a memory manager, as we did for device memory, but... + host_size = getattr(self, f"workspace_h_{kind}_size") + setattr(self, f"workspace_h_{kind}_ptr", np.empty(host_size, dtype=np.int8)) + self.logger.debug("Finished allocating " + f"device memory of size {formatters.MemoryStr(device_size)} and " + f"host memory of size {formatters.MemoryStr(host_size)} " + f"for contraction in the context of stream {self.workspace_stream}.") + + # Set device workspace. + device_ptr = utils.get_ptr_from_memory_pointer(getattr(self, f"workspace_{kind}_ptr")) + cutn.workspace_set_memory(self.handle, self.workspace_desc, cutn.Memspace.DEVICE, + cutn.WorkspaceKind.SCRATCH if kind == "scratch" else cutn.WorkspaceKind.CACHE, + device_ptr, device_size) + # Set host workspace. + # TODO: ideally we should be manipulating a MemoryPointer object here, but ... 
+ host_ptr = getattr(self, f"workspace_h_{kind}_ptr").ctypes.data + cutn.workspace_set_memory(self.handle, self.workspace_desc, cutn.Memspace.HOST, + cutn.WorkspaceKind.SCRATCH if kind == "scratch" else cutn.WorkspaceKind.CACHE, + # WAR: empty numpy arrays still have nonzero ptr addresses + host_ptr if host_size > 0 else 0, host_size) + self.logger.debug(f"The {kind} workspace memory (device pointer = {device_ptr}, " + f"host pointer = {host_ptr}) has been set in the workspace descriptor.") @utils.precondition(_check_valid_network) @utils.precondition(_check_optimized, "Workspace size calculation") @@ -379,26 +494,59 @@ def _calculate_workspace_size(self): """ # Release workspace already allocated, if any, because the new requirements are likely different. - self.workspace_ptr = None + self.workspace_scratch_ptr = None + self.workspace_cache_ptr = None + self.workspace_h_scratch_ptr = None + self.workspace_h_cache_ptr = None cutn.workspace_compute_contraction_sizes(self.handle, self.network, self.optimizer_info_ptr, self.workspace_desc) - min_size = cutn.workspace_get_memory_size(self.handle, self.workspace_desc, cutn.WorksizePref.MIN, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) - max_size = cutn.workspace_get_memory_size(self.handle, self.workspace_desc, cutn.WorksizePref.MAX, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) - - if self.memory_limit < min_size: + # Deal with device workspaces. 
+ min_scratch_size = cutn.workspace_get_memory_size( + self.handle, self.workspace_desc, cutn.WorksizePref.MIN, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + max_scratch_size = cutn.workspace_get_memory_size( + self.handle, self.workspace_desc, cutn.WorksizePref.MAX, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + min_cache_size = cutn.workspace_get_memory_size( + self.handle, self.workspace_desc, cutn.WorksizePref.MIN, cutn.Memspace.DEVICE, cutn.WorkspaceKind.CACHE) + max_cache_size = cutn.workspace_get_memory_size( + self.handle, self.workspace_desc, cutn.WorksizePref.MAX, cutn.Memspace.DEVICE, cutn.WorkspaceKind.CACHE) + + if self.memory_limit < min_scratch_size + min_cache_size: message = f"""Insufficient memory. -The memory limit specified is {self.memory_limit}, while the minimum workspace size needed is {min_size}. +The memory limit specified is {self.memory_limit}, while the minimum workspace size needed is {min_scratch_size + min_cache_size}. """ raise RuntimeError(message) - self.workspace_size = max_size if max_size < self.memory_limit else self.memory_limit - self.logger.info(f"The workspace size requirements range from {formatters.MemoryStr(min_size)} to "\ - f"{formatters.MemoryStr(max_size)}.") - self.logger.info(f"The workspace size has been set to {formatters.MemoryStr(self.workspace_size)}.") + if self.memory_limit - max_scratch_size >= min_cache_size: + self.workspace_scratch_size = max_scratch_size + else: + self.workspace_scratch_size = min_scratch_size + if min_cache_size > 0 and self.require_grad: + self.workspace_cache_size = min(max_cache_size, self.memory_limit - self.workspace_scratch_size) + else: + self.workspace_cache_size = 0 + self.logger.info(f"The workspace size requirements range from {formatters.MemoryStr(min_scratch_size + min_cache_size)} to "\ + f"{formatters.MemoryStr(max_scratch_size + max_cache_size)}.") + self.logger.info(f"The scratch workspace size has been set to 
{formatters.MemoryStr(self.workspace_scratch_size)}.") + self.logger.info(f"The cache workspace size has been set to {formatters.MemoryStr(self.workspace_cache_size)}.") # Set workspace size to enable contraction planning. The device pointer will be set later during allocation. - cutn.workspace_set_memory(self.handle, self.workspace_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, 0, self.workspace_size) + cutn.workspace_set_memory( + self.handle, self.workspace_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, 0, self.workspace_scratch_size) + cutn.workspace_set_memory( + self.handle, self.workspace_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.CACHE, 0, self.workspace_cache_size) + + # Deal with host workspaces. For now we don't care how much host memory is used. + self.workspace_h_scratch_size = cutn.workspace_get_memory_size( + self.handle, self.workspace_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.HOST, cutn.WorkspaceKind.SCRATCH) + self.workspace_h_cache_size = cutn.workspace_get_memory_size( + self.handle, self.workspace_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.HOST, cutn.WorkspaceKind.CACHE) + + # Set workspace size to enable contraction planning. The host pointer will be set later during allocation. + cutn.workspace_set_memory( + self.handle, self.workspace_desc, cutn.Memspace.HOST, cutn.WorkspaceKind.SCRATCH, 0, self.workspace_h_scratch_size) + cutn.workspace_set_memory( + self.handle, self.workspace_desc, cutn.Memspace.HOST, cutn.WorkspaceKind.CACHE, 0, self.workspace_h_cache_size) @utils.precondition(_check_valid_network) @utils.precondition(_check_optimized, "Planning") @@ -637,34 +785,35 @@ def autotune(self, *, iterations=3, stream=None): self._set_autotune_options(options) # Allocate device memory (in stream context) if needed.
- stream, stream_ctx, stream_ptr = utils.get_or_create_stream(self.device_id, stream, self.package) - self._allocate_workspace_memory_perhaps(stream, stream_ctx) + stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) + self._allocate_workspace_memory_perhaps(stream_holder, "scratch") + self._allocate_workspace_memory_perhaps(stream_holder, "cache") # Check if we still hold an output tensor; if not, create a new one. if self.contraction is None: self.logger.debug("Beginning output (empty) tensor creation...") - self.contraction = utils.create_empty_tensor(self.output_class, self.extents_out, self.data_type, self.device_id, stream_ctx) + self.contraction = utils.create_empty_tensor(self.output_class, self.extents_out, self.data_type, self.device_id, stream_holder) self.logger.debug("The output (empty) tensor has been created.") elif self.contraction_output_event is not None: - stream.wait_event(self.contraction_output_event) + stream_holder.obj.wait_event(self.contraction_output_event) self.contraction_output_event = None self.logger.debug("Established ordering with output tensor creation event.") timing = bool(self.logger and self.logger.handlers) self.logger.info(f"Starting autotuning...") self.logger.info(f"{self.call_prologue}") - with utils.device_ctx(self.device_id), utils.cuda_call_ctx(stream, self.blocking, timing) as (self.last_compute_event, elapsed): - cutn.contraction_autotune(self.handle, self.plan, self.operands_data, self.contraction.data_ptr, - self.workspace_desc, self.autotune_pref_ptr, stream_ptr) + with utils.device_ctx(self.device_id), utils.cuda_call_ctx(stream_holder, self.blocking, timing) as (self.last_compute_event, elapsed): + cutn.contraction_autotune( + self.handle, self.plan, self.operands_data, self.contraction.data_ptr, + self.workspace_desc, self.autotune_pref_ptr, stream_holder.ptr) if elapsed.data is not None: self.logger.info(f"The autotuning took {elapsed.data:.3f} ms to complete.") self.autotuned = 
True - @utils.precondition(_check_valid_network) - def reset_operands(self, *operands): + def reset_operands(self, *operands, stream=None): """Reset the operands held by this :class:`Network` instance. This method is not needed when the operands @@ -678,12 +827,20 @@ def reset_operands(self, *operands): Args: operands: See :class:`Network`'s documentation. + stream: Provide the CUDA stream to use for resetting operands (this is used to copy the operands to the GPU if they are provided on the CPU). Acceptable inputs include ``cudaStream_t`` + (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, + the current stream will be used. """ if len(operands) != len(self.operands): message = f"Mismatch in the number of operands ({len(operands)} provided, need {len(self.operands)})." raise ValueError(message) + # Future operations on the workspace stream should be ordered after the computation. + # Also, we should ensure self.operands is overwritten only after work using them is done. + if self.last_compute_event is not None: + self.workspace_stream.wait_event(self.last_compute_event) + self.logger.info("Resetting operands...") # First wrap operands. operands = tensor_wrapper.wrap_operands(operands) @@ -691,10 +848,12 @@ def reset_operands(self, *operands): utils.check_operands_match(self.operands, operands, 'dtype', "data type") utils.check_operands_match(self.operands, operands, 'shape', 'shape') + stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) + device_id = utils.get_network_device_id(operands) if device_id is None: - # Copy to existing device pointers because the new operands are on the CPU. - tensor_wrapper.copy_(operands, self.operands) + # In-place copy to existing device pointers because the new operands are on the CPU. 
+ tensor_wrapper.copy_(operands, self.operands, stream_holder) else: utils.check_operands_match(self.operands, operands, 'strides', 'strides') package = utils.get_operands_package(operands) @@ -708,8 +867,17 @@ def reset_operands(self, *operands): # Finally, replace the original data pointers by the new ones. self.operands_data = utils.get_operands_data(operands) + self.operands = operands self.logger.info("The operands have been reset.") + self.contracted = False + if not self.require_grad: + return + + # The cache workspace is invalidated. + cutn.workspace_purge_cache(self.handle, self.workspace_desc, cutn.Memspace.DEVICE) + cutn.workspace_purge_cache(self.handle, self.workspace_desc, cutn.Memspace.HOST) + @utils.precondition(_check_valid_network) @utils.precondition(_check_optimized, "Contraction") @utils.precondition(_check_planned, "Contraction") @@ -728,16 +896,18 @@ def contract(self, *, slices=None, stream=None): """ # Allocate device memory (in stream context) if needed. - stream, stream_ctx, stream_ptr = utils.get_or_create_stream(self.device_id, stream, self.package) - self._allocate_workspace_memory_perhaps(stream, stream_ctx) + stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) + self._allocate_workspace_memory_perhaps(stream_holder, "scratch") + self._allocate_workspace_memory_perhaps(stream_holder, "cache") # Check if we still hold an output tensor; if not, create a new one. 
if self.contraction is None: self.logger.debug("Beginning output (empty) tensor creation...") - self.contraction = utils.create_empty_tensor(self.output_class, self.extents_out, self.data_type, self.device_id, stream_ctx) + self.contraction = utils.create_empty_tensor( + self.output_class, self.extents_out, self.data_type, self.device_id, stream_holder) self.logger.debug("The output (empty) tensor has been created.") elif self.contraction_output_event is not None: - stream.wait_event(self.contraction_output_event) + stream_holder.obj.wait_event(self.contraction_output_event) self.contraction_output_event = None self.logger.debug("Established ordering with output tensor creation event.") @@ -759,9 +929,10 @@ def contract(self, *, slices=None, stream=None): timing = bool(self.logger and self.logger.handlers) self.logger.info("Starting network contraction...") self.logger.info(f"{self.call_prologue}") - with utils.device_ctx(self.device_id), utils.cuda_call_ctx(stream, self.blocking, timing) as (self.last_compute_event, elapsed): - cutn.contract_slices(self.handle, self.plan, self.operands_data, self.contraction.data_ptr, False, - self.workspace_desc, slice_group, stream_ptr) + with utils.device_ctx(self.device_id), utils.cuda_call_ctx(stream_holder, self.blocking, timing) as (self.last_compute_event, elapsed): + cutn.contract_slices( + self.handle, self.plan, self.operands_data, self.contraction.data_ptr, False, + self.workspace_desc, slice_group, stream_holder.ptr) if elapsed.data is not None: self.logger.info(f"The contraction took {elapsed.data:.3f} ms to complete.") @@ -772,13 +943,102 @@ def contract(self, *, slices=None, stream=None): self.logger.debug(f"Slice group ({slice_group}) has been destroyed.") if self.network_location == 'cpu': - out = self.contraction.to('cpu') + out = self.contraction.to('cpu', stream_holder=stream_holder) else: out = self.contraction.tensor self.contraction = None # We cannot overwrite what we've already handed to users. 
+ self.contracted = True return out + @utils.precondition(_check_valid_network) + @utils.precondition(_check_optimized, "Gradient") + @utils.precondition(_check_planned, "Gradient") + @utils.precondition(_check_contracted, "Gradient") + @utils.precondition(_check_qualifiers, "Gradient") + def gradients(self, output_gradient, *, stream=None): + """Compute the gradients of the network (w.r.t. the input operands whose gradients are required). + + Before calling this method, a full contraction must have been performed (by calling :meth:`contract`), otherwise an + error is raised. + + Args: + output_gradient: A tensor of the same package (NumPy/CuPy/PyTorch), shape, dtype, strides, and location (CPU/GPU) + as the contraction output (as returned by :meth:`contract`), which in turn shares the same properties with the + input operands. In a chain-rule setting, ``output_gradient`` is the gradient w.r.t. the output tensor. + stream: Provide the CUDA stream to use for the gradient computation. Acceptable inputs include ``cudaStream_t`` + (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, + the current stream will be used. + + Returns: + A sequence of gradient tensors. The result is of the same length and type and on the same device as the input operands. + For the gradient components that are not requested, ``None`` is returned. + + .. note:: For PyTorch operands, calling this method is **not** tracked by the autograd graph. + + .. warning:: This API is experimental and subject to future changes. + """ + warnings.warn("Network.gradients() is an experimental API and subject to future changes", + stacklevel=2) + + # At this point, both scratch and cache workspaces are allocated/populated. + assert self.workspace_scratch_ptr is not None and self.workspace_cache_ptr is not None, "Internal error." + assert self.workspace_h_scratch_ptr is not None and self.workspace_h_cache_ptr is not None, "Internal error." 
+ + stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) + # Future operations on the workspace stream should be ordered after the computation. + if self.last_compute_event is not None: + self.workspace_stream.wait_event(self.last_compute_event) + stream_holder.obj.wait_event(self.last_compute_event) + + # Wrap output_gradient + output_grad = tensor_wrapper.wrap_operand(output_gradient) + if output_grad.device_id != self.device_id: + output_grad = tensor_wrapper.to([output_grad], self.device_id, stream_holder)[0] + if output_grad.shape != self.extents_out: + raise ValueError(f"output_gradient shape incorrect (given {output_grad.shape}, expected {self.extents_out})") + if output_grad.dtype != self.data_type: + raise ValueError(f"output_gradient dtype incorrect (given {output_grad.dtype}, expected {self.data_type}") + if output_grad.strides != self.strides_out: + # output_gradient could be a view, but we need a full buffer for now + if any(s == 0 for s in output_grad.strides): + buf = utils.create_empty_tensor( + self.output_class, self.extents_out, self.data_type, self.device_id, stream_holder, strides=self.strides_out) + buf.copy_(output_grad.tensor, stream_holder=stream_holder) + output_grad = buf + else: + raise ValueError(f"output_gradient strides incorrect (given {output_grad.strides}, expected {self.strides_out}") + + # Allocate grad tensors, as needed + input_grads = [] + for i, extents, strides, requires_grad in zip( + range(len(self.inputs)), self.extents_in, self.strides_in, self.qualifiers_in['requires_gradient']): + if requires_grad: + input_grads.append( + utils.create_empty_tensor(self.output_class, extents, self.data_type, self.device_id, stream_holder, strides=strides) + ) + else: + input_grads.append(None) + input_grads_data = utils.get_operands_data(input_grads) + + timing = bool(self.logger and self.logger.handlers) + self.logger.info("Starting gradient computation...") + self.logger.info(f"{self.call_prologue}") + with 
utils.device_ctx(self.device_id), utils.cuda_call_ctx(stream_holder, self.blocking, timing) as (self.last_compute_event, elapsed): + cutn.compute_gradients_backward( + self.handle, self.plan, self.operands_data, output_grad.data_ptr, + input_grads_data, False, self.workspace_desc, stream_holder.ptr) + + if elapsed.data is not None: + self.logger.info(f"The backprop took {elapsed.data:.3f} ms to complete.") + + if self.network_location == 'cpu': + op = lambda t: t.to('cpu', stream_holder=stream_holder) if t is not None else None + else: + op = lambda t: t.tensor if t is not None else None + + return tuple(map(op, input_grads)) + def free(self): """Free network resources. @@ -950,7 +1210,11 @@ def contract(*operands, qualifiers=None, options=None, optimize=None, stream=Non ... b = cupy.arange(6.).reshape(2, 3) >>> r = contract('ij,jk', a, b) - Use PyTorch operands. The result ``r`` is a PyTorch tensor on the same device (``dev``) as the operands: + For PyTorch operands, this function **works like a native PyTorch operator** out of box that will be tracked by the + autograd graph so as to enable backpropagation. The result ``r`` is a PyTorch tensor on the same device (``dev``) as + the operands. To enable gradient computation, just set the target operands' ``requires_grad`` attribute to ``True``, + as usual. If ``stream`` is explicitly passed, the user must establish the stream ordering following the requirements + outlined in PyTorch's `CUDA Semantics `_. .. 
doctest:: :skipif: torch is None @@ -958,12 +1222,28 @@ def contract(*operands, qualifiers=None, options=None, optimize=None, stream=Non >>> import torch >>> dev = 0 >>> a = torch.arange(6., device=f'cuda:{dev}').reshape(3, 2) + >>> a.requires_grad_(True) >>> b = torch.arange(6., device=f'cuda:{dev}').reshape(2, 3) + >>> b.requires_grad_(True) >>> r = contract('ij,jk', a, b) + >>> r.backward(torch.ones_like(r)) # gradient w.r.t self is 1 + >>> a.grad + tensor([[ 3., 12.], + [ 3., 12.], + [ 3., 12.]], device='cuda:0') + >>> b.grad + tensor([[6., 6., 6.], + [9., 9., 9.]], device='cuda:0') """ # Create network. - with Network(*operands, qualifiers=qualifiers, options=options) as network: + network = Network(*operands, qualifiers=qualifiers, options=options, stream=stream) + + # For PyTorch tensors, we ensure contract() is differentiable. + if network.package == "torch": + return grad_torch._TorchContract.apply(network, optimize, stream, return_info, *operands) + + with network: # Compute path. opt_info = network.contract_path(optimize=optimize) @@ -1094,6 +1374,8 @@ def einsum(*operands, out=None, dtype=None, order='K', casting='safe', optimize= output: A tensor (ndarray-like object) of the same type and on the same device as the operands containing the result of the contraction. + + .. note:: For PyTorch operands, calling this method is **not** tracked by the autograd graph. 
""" _check_einsum_options(out, dtype, order, casting, optimize) diff --git a/python/cuquantum/utils.pxd b/python/cuquantum/utils.pxd index 9cfa200..a5a3d57 100644 --- a/python/cuquantum/utils.pxd +++ b/python/cuquantum/utils.pxd @@ -20,6 +20,11 @@ cdef extern from "vector_types.h" nogil: ctypedef struct int2 'int2': pass +cdef extern from "cuComplex.h" nogil: + ctypedef struct cuDoubleComplex: + double x + double y + # Cython limitation: need standalone typedef if we wanna use it for casting ctypedef int (*DeviceAllocType)(void*, void**, size_t, Stream) diff --git a/python/samples/custatevec/subsv_migration.py b/python/samples/custatevec/subsv_migration.py new file mode 100644 index 0000000..69801b6 --- /dev/null +++ b/python/samples/custatevec/subsv_migration.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import cupyx as cpx +import numpy as np + +from cuquantum import custatevec as cusv +from cuquantum import cudaDataType + +dtype = np.complex128 +sv_data_type = cudaDataType.CUDA_C_64F; + +n_local_index_bits = 3; + +sub_sv_size = 2 ** n_local_index_bits + +# allocate host sub state vectors +n_sub_svs = 2 +sub_svs = [None] * n_sub_svs +sub_svs[0] = cpx.empty_pinned(sub_sv_size, dtype=dtype) +sub_svs[0][:] = 0.25 + 0.j +sub_svs[1] = cpx.zeros_pinned(sub_sv_size, dtype=dtype) + +# allocate device slots + +n_device_slots = 1; +device_slots_size = sub_sv_size * n_device_slots +device_slots = cp.zeros([device_slots_size], dtype=dtype) + +# initialize custatevec handle +handle = cusv.create() + +# create migrator +migrator = cusv.sub_sv_migrator_create(handle, device_slots.data.ptr, sv_data_type, + n_device_slots, n_local_index_bits) + +device_slot_index = 0 +src_sub_sv = sub_svs[0] +dst_sub_sv = sub_svs[1] + +# migrate sub_svs[0] into device_slots +cusv.sub_sv_migrator_migrate(handle, migrator, device_slot_index, + src_sub_sv.ctypes.data, 0, 0, sub_sv_size) + +# migrate 
device_slots into sub_svs[1] +cusv.sub_sv_migrator_migrate(handle, migrator, device_slot_index, + 0, dst_sub_sv.ctypes.data, 0, sub_sv_size) + +# destroy migrator +cusv.sub_sv_migrator_destroy(handle, migrator) + +# destroy custatevec handle +cusv.destroy(handle) + +# check if sub_svs[1] has expected values +correct = np.all(sub_svs[1] == 0.25 + 0.j) + +if correct: + print('subsv_migration example PASSED') +else: + raise RuntimeError('subsv_migration example FAILED: wrong result') diff --git a/python/samples/cutensornet/circuit_converter/qiskit_advanced.ipynb b/python/samples/cutensornet/circuit_converter/qiskit_advanced.ipynb index 18856e7..bd6b60d 100644 --- a/python/samples/cutensornet/circuit_converter/qiskit_advanced.ipynb +++ b/python/samples/cutensornet/circuit_converter/qiskit_advanced.ipynb @@ -57,9 +57,9 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "execution_count": 2, @@ -86,9 +86,9 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "execution_count": 3, @@ -174,14 +174,12 @@ }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" } ], @@ -227,9 +225,9 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -246,7 +244,7 @@ "\n", "G = nx.random_regular_graph(reg, n, seed=seed)\n", "weights = {(i, j): 1 for i, j in G.edges}\n", - "nx.draw(G)" + "nx.draw_networkx(G)" ] }, { @@ -257,9 +255,9 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "execution_count": 7, @@ -374,7 +372,7 @@ " \n", " cutn.destroy(handle)\n", " \n", - " return e\n", + " return e.get()\n", " \n", " return expectation\n", "\n", @@ -469,7 +467,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " fun: -5.974513781450166\n", + " fun: -5.974513781450138\n", " maxcv: 0.0\n", " message: 'Optimization terminated successfully.'\n", " nfev: 126\n", @@ -599,7 +597,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/python/samples/cutensornet/coarse/example23_torch_grad.py b/python/samples/cutensornet/coarse/example23_torch_grad.py new file mode 100644 index 0000000..11361eb --- /dev/null +++ b/python/samples/cutensornet/coarse/example23_torch_grad.py @@ -0,0 +1,56 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +""" +Example: Demo using cuquantum.contract() in a PyTorch compute graph out of box. + +This sample requires PyTorch. 
+""" + +import torch + +import cuquantum +from cuquantum import cutensornet as cutn + + +# random-initialize input tensors on GPU, and require gradient computation of all inputs +kwargs = {'device': 'cuda', + 'requires_grad': True, + 'dtype': torch.complex128} +a = torch.rand((3, 4), **kwargs) +b = torch.rand((4, 5, 6, 3), **kwargs) +c = torch.rand((3, 3), **kwargs) + +# create a hypothetical workload using PyTorch operators +def compute(func, expr, a, b, c): + # note: cannot perform in-place ops on leaf nodes + a = a * a + b = -10 + b + c = torch.cos(c) + d = func(expr, a, b, c) + return torch.sum(d, dim=0, keepdim=True) + +# use cuquantum.contract() in the workload to compute gradients +out_cuqnt = compute(cuquantum.contract, "ab,bcde,ef->acdf", a, b, c) + +# backprop to fill the gradients +output_grad = torch.ones_like(out_cuqnt) +out_cuqnt.backward(output_grad) + +# store the computed gradients for later verification +input_grads_cuqnt = (a.grad, b.grad, c.grad) + +# now let's reset the gradients and redo the computation using +# torch.einsum() for comparison +a.grad, b.grad, c.grad = None, None, None +out_torch = compute(torch.einsum, "ab,bcde,ef->acdf", a, b, c) +out_torch.backward(output_grad) +input_grads_torch = (a.grad, b.grad, c.grad) + +# check results +assert all( + torch.allclose(grad_cuqnt, grad_torch) + for grad_cuqnt, grad_torch in zip(input_grads_cuqnt, input_grads_torch) +) +print("all checked!") diff --git a/python/samples/cutensornet/experimental/example01-pairwise_canonicalization.py b/python/samples/cutensornet/experimental/example01-pairwise_canonicalization.py index 7ff69c0..70eb727 100644 --- a/python/samples/cutensornet/experimental/example01-pairwise_canonicalization.py +++ b/python/samples/cutensornet/experimental/example01-pairwise_canonicalization.py @@ -5,16 +5,15 @@ """ Example of pairwise tensor canonicalization with contract_decompose -NumPy ndarrays are used as inputs. +CuPy ndarrays are used as inputs. 
""" -import numpy as np +import cupy as cp from cuquantum import contract from cuquantum.cutensornet.experimental import contract_decompose - -a = np.ones((2,2,2)) -b = np.ones((2,2,2)) +a = cp.ones((2,2,2)) +b = cp.ones((2,2,2)) # use QR to canonicalize two tensors: # i k m i k m @@ -28,8 +27,10 @@ a_qr, b_qr = contract_decompose('ijk,klm->ijk,klm', a, b, algorithm=canonicalize_algorithm) # compare the difference after canonicalization -diff = contract('ijk,klm', a, b) - contract('ijk,klm', a_qr, b_qr) +out1 = contract('ijk,klm', a, b) +out2 = contract('ijk,klm', a_qr, b_qr) + +assert cp.allclose(out1, out2) print("After canonicalization") print(f" Shape of A, B: {a_qr.shape} {b_qr.shape}") -print(f" Maxdiff error: {abs(diff).max()}") \ No newline at end of file diff --git a/python/samples/cutensornet/fine/example5_cupy_grad.py b/python/samples/cutensornet/fine/example5_cupy_grad.py new file mode 100644 index 0000000..4f3f72d --- /dev/null +++ b/python/samples/cutensornet/fine/example5_cupy_grad.py @@ -0,0 +1,79 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +""" +Example: Computing the gradients of a tensor network with CuPy ndarrays. + +The gradients are returned as CuPy ndarrays. + +This example is also straightforwardly applicable to NumPy ndarrays and PyTorch tensors. 
+""" + +import cupy as cp +import numpy as np + +import cuquantum +from cuquantum import cutensornet as cutn + + +# random-initialize input tensors +a = cp.random.random((3, 4, 5)) +b = cp.random.random((4, 5, 6)) +c = cp.random.random((6, 5, 2)) + +# create tensor qualifiers for all input tenors +qualifiers = np.zeros(3, dtype=cutn.tensor_qualifiers_dtype) + +# require gradient computation of all inputs +qualifiers[:]['requires_gradient'] = 1 + +# create a network +tn = cuquantum.Network("abc,bcd,dce->ae", a, b, c, qualifiers=qualifiers) + +# perform contraction as usual +# this would prepare the internal cache for gradient computation +tn.contract_path() +out = tn.contract() + +# prepare the seed gradient (w.r.t. the output tensor itself, so it's 1) +output_grad = cp.ones_like(out) + +# compute the gradients +input_grads = tn.gradients(output_grad) + +# the gradient tensors have the same type as the input tensors +assert all(isinstance(arr, cp.ndarray) for arr in input_grads) + +# free the network +tn.free() + +# check results against PyTorch (if installed) +try: + import torch +except ImportError: + torch = None +if torch is not None and torch.cuda.is_available(): + # create torch tenros via zero-copy + a_t = torch.as_tensor(a, device='cuda') + b_t = torch.as_tensor(b, device='cuda') + c_t = torch.as_tensor(c, device='cuda') + + # require gradient computation of all inputs + a_t.requires_grad_(True) + b_t.requires_grad_(True) + c_t.requires_grad_(True) + + # compute the contraction + out_t = torch.einsum("abc,bcd,dce->ae", a_t, b_t, c_t) + + # backprop to fill the gradients + output_grad_t = torch.ones_like(out_t) + out_t.backward(output_grad_t) + + # check results (zero-copy torch tensors as cupy arrays) + assert cp.allclose(out_t.detach(), out) # non-leaf nodes need to be detached first + assert cp.allclose(a_t.grad, input_grads[0]) + assert cp.allclose(b_t.grad, input_grads[1]) + assert cp.allclose(c_t.grad, input_grads[2]) + print("all checked!") diff --git 
a/python/samples/cutensornet/high_level/amplitudes_example.py b/python/samples/cutensornet/high_level/amplitudes_example.py new file mode 100755 index 0000000..aca6c57 --- /dev/null +++ b/python/samples/cutensornet/high_level/amplitudes_example.py @@ -0,0 +1,128 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import numpy as np + +import cuquantum +from cuquantum import cutensornet as cutn + + +print("cuTensorNet-vers:", cutn.get_version()) +dev = cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:", props["multiProcessorCount"]) +print("GPU-major:", props["major"]) +print("GPU-minor:", props["minor"]) +print("========================") + +################################################# +# Accessor computation of a quantum circuit state +################################################# + +# Quantum state configuration +num_qubits = 6 +dim = 2 +qubits_dims = (dim, ) * num_qubits # qubit size +fixed_modes = (0, 1) # open qubits +num_fixed_modes = len(fixed_modes) +fixed_values = (1, 1) +print(f"Quantum circuit with {num_qubits} qubits") + +############# +# cuTensorNet +############# + +handle = cutn.create() +stream = cp.cuda.Stream() +data_type = cuquantum.cudaDataType.CUDA_C_64F + +# Define quantum gate tensors on device +gate_h = 2**-0.5 * cp.asarray([[1,1], [1,-1]], dtype='complex128', order='F') +gate_h_strides = 0 + +gate_cx = cp.asarray([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0]], dtype='complex128').reshape(2,2,2,2, order='F') +gate_cx_strides = 0 + +# Allocate device memory for the specified slice of the quantum circuit amplitudes tensor +amplitudes_shape = [qubits_dims[i] for i in range(num_qubits) if i not in fixed_modes] +amplitudes = 
cp.empty(amplitudes_shape, dtype='complex128') +amplitudes_strides = [stride_in_bytes // amplitudes.itemsize for stride_in_bytes in amplitudes.strides] + +free_mem = dev.mem_info[0] +# use half of the totol free size +scratch_size = free_mem // 2 +scratch_space = cp.cuda.alloc(scratch_size) +print(f"Allocated {scratch_size} bytes of scratch memory on GPU") + +# Create the initial quantum state +quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) +print("Created the initial quantum state") + +# Construct the quantum circuit state with gate application +tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 1, (0, ), + gate_h.data.ptr, gate_h_strides, 1, 0, 1) + +for i in range(1, num_qubits): + tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 2, (i-1, i), # target on i-1 while control on i + gate_cx.data.ptr, gate_cx_strides, 1, 0, 1) +print("Quantum gates applied") + +# Specify the quantum circuit amplitudes accessor +accessor = cutn.create_accessor(handle, + quantum_state, num_fixed_modes, fixed_modes, amplitudes_strides) + +# Configure the computation of the specified slice of the quantum circuit amplitudes tensor +num_hyper_samples_dtype = cutn.accessor_get_attribute_dtype(cutn.AccessorAttribute.OPT_NUM_HYPER_SAMPLES) +num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) +cutn.accessor_configure(handle, accessor, + cutn.AccessorAttribute.OPT_NUM_HYPER_SAMPLES, + num_hyper_samples.ctypes.data, num_hyper_samples.dtype.itemsize) + +# Prepare the computation of the specified slice of the quantum circuit amplitudes tensor +work_desc = cutn.create_workspace_descriptor(handle) +cutn.accessor_prepare(handle, accessor, scratch_size, work_desc, stream.ptr) +print("Prepare the computation of the specified slice of the quantum circuit amplitudes tensor") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, 
cutn.WorkspaceKind.SCRATCH) + +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + cutn.destroy_accessor(accessor) + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch + print("Free resource and exit.") + exit() +print("Set the workspace buffer") + +# Compute the specified slice of the quantum circuit amplitudes tensor +state_norm = np.empty(1, dtype='complex128') +cutn.accessor_compute(handle, accessor, + fixed_values, work_desc, amplitudes.data.ptr, state_norm.ctypes.data, stream.ptr) +stream.synchronize() +print("Computed the specified quantum circuit state amplitudes") + +print(amplitudes) +print(f"norm of the state = {state_norm.item()}") + +cutn.destroy_workspace_descriptor(work_desc) +cutn.destroy_accessor(accessor) +cutn.destroy_state(quantum_state) +cutn.destroy(handle) +del scratch_space +print("Free resource and exit.") \ No newline at end of file diff --git a/python/samples/cutensornet/high_level/expectation_example.py b/python/samples/cutensornet/high_level/expectation_example.py new file mode 100755 index 0000000..f313d12 --- /dev/null +++ b/python/samples/cutensornet/high_level/expectation_example.py @@ -0,0 +1,159 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import numpy as np + +import cuquantum +from cuquantum import cutensornet as cutn + + +print("cuTensorNet-vers:", cutn.get_version()) +dev = cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:", props["multiProcessorCount"]) +print("GPU-major:", 
props["major"]) +print("GPU-minor:", props["minor"]) +print("========================") + +#################################################### +# Expectation computation of a quantum circuit state +#################################################### + +# Quantum state configuration +num_qubits = 16 +dim = 2 +qubits_dims = (dim, ) * num_qubits # qubit size +print(f"Quantum circuit with {num_qubits} qubits") + +############# +# cuTensorNet +############# + +handle = cutn.create() +stream = cp.cuda.Stream() +data_type = cuquantum.cudaDataType.CUDA_C_64F + +# Define quantum gate tensors on device +gate_h = 2**-0.5 * cp.asarray([[1,1], [1,-1]], dtype='complex128', order='F') +gate_h_strides = 0 + +# Pauli X gate +gate_x = cp.asarray([[0, 1], [1, 0]]).T.astype('complex128', order='F') +# Pauli Y gate +gate_y = cp.asarray([[0, -1j], [1j, 0]]).T.astype('complex128', order='F') +# Pauli Z gate +gate_z = cp.asarray([[1, 0], [0, -1]]).T.astype('complex128', order='F') + +gate_cx = cp.asarray([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0]], dtype='complex128').reshape(2,2,2,2, order='F') +gate_cx_strides = 0 + +free_mem = dev.mem_info[0] +# use half of the totol free size +scratch_size = free_mem // 2 +scratch_space = cp.cuda.alloc(scratch_size) +print(f"Allocated {scratch_size} bytes of scratch memory on GPU") + +# Create the initial quantum state +quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) +print("Created the initial quantum state") + +# Construct the quantum circuit state with gate application +tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 1, (0, ), + gate_h.data.ptr, gate_h_strides, 1, 0, 1) + +for i in range(1, num_qubits): + tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 2, (i-1, i), # target on i-1 while control on i + gate_cx.data.ptr, gate_cx_strides, 1, 0, 1) +print("Quantum gates applied") + +# Create an empty tensor network operator +hamiltonian = 
cutn.create_network_operator(handle, + num_qubits, qubits_dims, data_type) +# Append component (0.5 * Z1 * Z2) to the tensor network operator +num_modes = (1, 1) # Z1 acts on 1 mode, Z2 acts on 1 mode +modes_Z1 = (1, ) # state modes Z1 acts on +modes_Z2 = (2, ) # state modes Z2 acts on +state_modes = (modes_Z1, modes_Z2) # state modes (Z1 * Z2) acts on +gate_data = (gate_z.data.ptr, gate_z.data.ptr) # GPU pointers to gate data +operator_id = cutn.network_operator_append_product(handle, hamiltonian, 0.5, + 2, num_modes, state_modes, 0, gate_data) +# Append component (0.25 * Y3) to the tensor network operator +num_modes = (1, ) # Y3 acts on 1 mode +modes_Y3 = (3, ) # state modes Y3 acts on +state_modes = (modes_Y3, ) # state modes (Y3) acts on +gate_data = (gate_y.data.ptr, ) # GPU pointers to gate data +operator_id = cutn.network_operator_append_product(handle, hamiltonian, 0.25, + 1, num_modes, state_modes, 0, gate_data) + +# Append component (0.13 * Y0 X2 Z3) to the tensor network operator +num_modes = (1, 1, 1) # Y0 acts on 1 mode, X2 acts on 1 mode, Z3 acts on 1 mode +modes_Y0 = (0, ) # state modes Y0 acts on +modes_X2 = (2, ) # state modes X2 acts on +modes_Z3 = (3, ) # state modes Z3 acts on +state_modes = (modes_Y0, modes_X2, modes_Z3) # state modes (Y0 * X2 * Z3) acts on +gate_data = (gate_y.data.ptr, gate_x.data.ptr, gate_z.data.ptr) # GPU pointers to gate data +operator_id = cutn.network_operator_append_product(handle, hamiltonian, 0.13, + 3, num_modes, state_modes, 0, gate_data) +print("Constructed a tensor network operator: (0.5 * Z1 * Z2) + (0.25 * Y3) + (0.13 * Y0 * X2 * Z3)") + +# Specify the quantum circuit expectation value computation +expectation = cutn.create_expectation(handle, quantum_state, hamiltonian) + +# Configure the quantum circuit expectation value computation +num_hyper_samples_dtype = cutn.expectation_get_attribute_dtype(cutn.ExpectationAttribute.OPT_NUM_HYPER_SAMPLES) +num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) 
+cutn.expectation_configure(handle, expectation, + cutn.ExpectationAttribute.OPT_NUM_HYPER_SAMPLES, + num_hyper_samples.ctypes.data, num_hyper_samples.dtype.itemsize) + +# Prepare the computation of the specified quantum circuit expectation value +work_desc = cutn.create_workspace_descriptor(handle) +cutn.expectation_prepare(handle, expectation, scratch_size, work_desc, stream.ptr) +print("Prepare the computation of the specified quantum circuit expectation value") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + cutn.destroy_expectation(expectation) + cutn.destroy_network_operator(hamiltonian) + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch + print("Free resource and exit.") + exit() +print("Set the workspace buffer") + +# Compute the specified quantum circuit expectation value +expectation_value = np.empty(1, dtype='complex128') +state_norm = np.empty(1, dtype='complex128') +cutn.expectation_compute(handle, expectation, + work_desc, expectation_value.ctypes.data, state_norm.ctypes.data, stream.ptr) +stream.synchronize() +print("Computed the specified quantum circuit state amplitudes") + +print(f"expectation value = {expectation_value.item()}") +print(f"norm of the state = {state_norm.item()}") + +cutn.destroy_workspace_descriptor(work_desc) +cutn.destroy_expectation(expectation) +cutn.destroy_network_operator(hamiltonian) +cutn.destroy_state(quantum_state) +cutn.destroy(handle) +del scratch_space +print("Free resource and exit.") \ No newline at end of file diff --git a/python/samples/cutensornet/high_level/marginal_example.py 
b/python/samples/cutensornet/high_level/marginal_example.py index c23e208..e06aece 100755 --- a/python/samples/cutensornet/high_level/marginal_example.py +++ b/python/samples/cutensornet/high_level/marginal_example.py @@ -56,6 +56,12 @@ rdm = cp.empty(rdm_shape, dtype='complex128') rdm_strides = [stride_in_bytes // rdm.itemsize for stride_in_bytes in rdm.strides] +free_mem = dev.mem_info[0] +# use half of the totol free size +scratch_size = free_mem // 2 +scratch_space = cp.cuda.alloc(scratch_size) +print(f"Allocated {scratch_size} bytes of scratch memory on GPU") + # Create the initial quantum state quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) print("Created the initial quantum state") @@ -74,12 +80,7 @@ # Specify the desired reduced density matrix (marginal) marginal = cutn.create_marginal(handle, quantum_state, num_marginal_modes, marginal_modes, 0, 0, rdm_strides) -free_mem = dev.mem_info[0] -# use half of the totol free size -scratch_size = free_mem // 2 -scratch_space = cp.cuda.alloc(scratch_size) -print(f"Allocated {scratch_size} bytes of scratch memory on GPU") - +# Configure the computation of the desired reduced density matrix (marginal) num_hyper_samples_dtype = cutn.marginal_get_attribute_dtype(cutn.MarginalAttribute.OPT_NUM_HYPER_SAMPLES) num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) cutn.marginal_configure(handle, marginal, diff --git a/python/samples/cutensornet/high_level/mps_amplitudes_example.py b/python/samples/cutensornet/high_level/mps_amplitudes_example.py new file mode 100755 index 0000000..f6183be --- /dev/null +++ b/python/samples/cutensornet/high_level/mps_amplitudes_example.py @@ -0,0 +1,185 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import numpy as np + +import cuquantum +from cuquantum import cutensornet as cutn + + +print("cuTensorNet-vers:", cutn.get_version()) +dev = 
cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:", props["multiProcessorCount"]) +print("GPU-major:", props["major"]) +print("GPU-minor:", props["minor"]) +print("========================") + +################################################# +# Accessor computation of a quantum circuit state +################################################# + +# Quantum state configuration +num_qubits = 6 +dim = 2 +qubits_dims = (dim, ) * num_qubits # qubit size +fixed_modes = (0, 1) # open qubits +num_fixed_modes = len(fixed_modes) +fixed_values = (1, 1) +print(f"Quantum circuit with {num_qubits} qubits") + +############# +# cuTensorNet +############# + +handle = cutn.create() +stream = cp.cuda.Stream() +data_type = cuquantum.cudaDataType.CUDA_C_64F + +# Define quantum gate tensors on device +gate_h = 2**-0.5 * cp.asarray([[1,1], [1,-1]], dtype='complex128', order='F') +gate_h_strides = 0 + +gate_cx = cp.asarray([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0]], dtype='complex128').reshape(2,2,2,2, order='F') +gate_cx_strides = 0 + +# Allocate device memory for the final MPS state +max_extent = 2 +mps_tensor_extents = [] +mps_tensor_strides = [] +mps_tensors = [] +mps_tensor_ptrs = [] +for i in range(num_qubits): + if i == 0: + extents = (2, max_extent) + elif i == num_qubits - 1: + extents = (max_extent, 2) + else: + extents = (max_extent, 2, max_extent) + mps_tensor_extents.append(extents) + tensor = cp.zeros(extents, dtype='complex128') + mps_tensors.append(tensor) + mps_tensor_ptrs.append(tensor.data.ptr) + mps_tensor_strides.append([stride_in_bytes // tensor.itemsize for stride_in_bytes in tensor.strides]) + +# Allocate device memory for the specified slice of the quantum circuit amplitudes tensor +amplitudes_shape = [qubits_dims[i] 
for i in range(num_qubits) if i not in fixed_modes] +amplitudes = cp.empty(amplitudes_shape, dtype='complex128') +amplitudes_strides = [stride_in_bytes // amplitudes.itemsize for stride_in_bytes in amplitudes.strides] + +free_mem = dev.mem_info[0] +# use half of the totol free size +scratch_size = free_mem // 2 +scratch_space = cp.cuda.alloc(scratch_size) +print(f"Allocated {scratch_size} bytes of scratch memory on GPU") + +# Create the initial quantum state +quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) +print("Created the initial quantum state") + +# Construct the quantum circuit state with gate application +tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 1, (0, ), + gate_h.data.ptr, gate_h_strides, 1, 0, 1) + +for i in range(1, num_qubits): + tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 2, (i-1, i), # target on i-1 while control on i + gate_cx.data.ptr, gate_cx_strides, 1, 0, 1) +print("Quantum gates applied") + +# Specify the target MPS state +cutn.state_finalize_mps(handle, quantum_state, cutn.BoundaryCondition.OPEN, mps_tensor_extents, mps_tensor_strides) +print("Set the final MPS representation") + +# Configure the MPS computation +svd_algorithm_dtype = cutn.state_get_attribute_dtype(cutn.StateAttribute.MPS_SVD_CONFIG_ALGO) +svd_algorithm = np.array(cutn.TensorSVDAlgo.GESVDJ, dtype=svd_algorithm_dtype) +cutn.state_configure(handle, quantum_state, + cutn.StateAttribute.MPS_SVD_CONFIG_ALGO, svd_algorithm.ctypes.data, svd_algorithm.dtype.itemsize) + +# Prepare the specified quantum circuit for MPS computation +work_desc = cutn.create_workspace_descriptor(handle) +cutn.state_prepare(handle, quantum_state, scratch_size, work_desc, stream.ptr) +print("Prepared the specified quantum circuit for MPS computation") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) +if 
workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + cutn.destroy_marginal(marginal) + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch + print("Free resource and exit.") + exit() +print("Set the workspace buffer for MPS computation") + +# Compute the final MPS state +extents_out, strides_out = cutn.state_compute(handle, quantum_state, work_desc, mps_tensor_ptrs, stream.ptr) + +# If a lower extent is found during runtime, the cupy.ndarray container must be adjusted to reflect the lower extent +for i, (extent_in, extent_out) in enumerate(zip(mps_tensor_extents, extents_out)): + if extent_in != tuple(extent_out): + stride_out = [s * mps_tensors[0].itemsize for s in strides_out[i]] + mps_tensors[i] = cp.ndarray(extent_out, dtype=mps_tensors[i].dtype, memptr=mps_tensors[i].data, strides=stride_out) +print("Computed the final MPS representation") + +# Specify the quantum circuit amplitudes accessor +accessor = cutn.create_accessor(handle, + quantum_state, num_fixed_modes, fixed_modes, amplitudes_strides) + +num_hyper_samples_dtype = cutn.accessor_get_attribute_dtype(cutn.AccessorAttribute.OPT_NUM_HYPER_SAMPLES) +num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) +cutn.accessor_configure(handle, accessor, + cutn.AccessorAttribute.OPT_NUM_HYPER_SAMPLES, + num_hyper_samples.ctypes.data, num_hyper_samples.dtype.itemsize) + +# Prepare the computation of the specified slice of the quantum circuit amplitudes tensor +cutn.accessor_prepare(handle, accessor, scratch_size, work_desc, stream.ptr) +print("Prepare the computation of the specified slice of the quantum circuit amplitudes tensor") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, 
cutn.WorkspaceKind.SCRATCH) + +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + cutn.destroy_accessor(accessor) + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch + print("Free resource and exit.") + exit() +print("Set the workspace buffer") + +# Compute the specified slice of the quantum circuit amplitudes tensor +state_norm = np.empty(1, dtype='complex128') +cutn.accessor_compute(handle, accessor, + fixed_values, work_desc, amplitudes.data.ptr, state_norm.ctypes.data, stream.ptr) +stream.synchronize() +print("Computed the specified quantum circuit state amplitudes") + +print(amplitudes) +print(f"norm of the state = {state_norm.item()}") + +cutn.destroy_workspace_descriptor(work_desc) +cutn.destroy_accessor(accessor) +cutn.destroy_state(quantum_state) +cutn.destroy(handle) +del scratch_space +print("Free resource and exit.") \ No newline at end of file diff --git a/python/samples/cutensornet/high_level/mps_expectation_example.py b/python/samples/cutensornet/high_level/mps_expectation_example.py new file mode 100755 index 0000000..d8e3803 --- /dev/null +++ b/python/samples/cutensornet/high_level/mps_expectation_example.py @@ -0,0 +1,231 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import numpy as np + +import cuquantum +from cuquantum import cutensornet as cutn + + +print("cuTensorNet-vers:", cutn.get_version()) +dev = cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:", props["multiProcessorCount"]) 
+print("GPU-major:", props["major"]) +print("GPU-minor:", props["minor"]) +print("========================") + +#################################################### +# Expectation computation of a quantum circuit state +#################################################### + +# Quantum state configuration +num_qubits = 16 +dim = 2 +qubits_dims = (dim, ) * num_qubits # qubit size +print(f"Quantum circuit with {num_qubits} qubits") + +############# +# cuTensorNet +############# + +handle = cutn.create() +stream = cp.cuda.Stream() +data_type = cuquantum.cudaDataType.CUDA_C_64F + +# Define quantum gate tensors on device +gate_h = 2**-0.5 * cp.asarray([[1,1], [1,-1]], dtype='complex128', order='F') +gate_h_strides = 0 + +# Pauli X gate +gate_x = cp.asarray([[0, 1], [1, 0]]).T.astype('complex128', order='F') +# Pauli Y gate +gate_y = cp.asarray([[0, -1j], [1j, 0]]).T.astype('complex128', order='F') +# Pauli Z gate +gate_z = cp.asarray([[1, 0], [0, -1]]).T.astype('complex128', order='F') + +gate_cx = cp.asarray([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0]], dtype='complex128').reshape(2,2,2,2, order='F') +gate_cx_strides = 0 + +# Allocate device memory for the final MPS state +max_extent = 2 +mps_tensor_extents = [] +mps_tensor_strides = [] +mps_tensors = [] +mps_tensor_ptrs = [] +for i in range(num_qubits): + if i == 0: + extents = (2, max_extent) + elif i == num_qubits - 1: + extents = (max_extent, 2) + else: + extents = (max_extent, 2, max_extent) + mps_tensor_extents.append(extents) + tensor = cp.zeros(extents, dtype='complex128') + mps_tensors.append(tensor) + mps_tensor_ptrs.append(tensor.data.ptr) + mps_tensor_strides.append([stride_in_bytes // tensor.itemsize for stride_in_bytes in tensor.strides]) + +free_mem = dev.mem_info[0] +# use half of the totol free size +scratch_size = free_mem // 2 +scratch_space = cp.cuda.alloc(scratch_size) +print(f"Allocated {scratch_size} bytes of scratch memory on GPU") + +# Create the initial quantum state 
+quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) +print("Created the initial quantum state") + +# Construct the quantum circuit state with gate application +tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 1, (0, ), + gate_h.data.ptr, gate_h_strides, 1, 0, 1) + +for i in range(1, num_qubits): + tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 2, (i-1, i), # target on i-1 while control on i + gate_cx.data.ptr, gate_cx_strides, 1, 0, 1) +print("Quantum gates applied") + +# Create the vacuum quantum state +quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) +print("Created the initial quantum state") + +# Construct the quantum circuit state with gate application +tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 1, (0, ), + gate_h.data.ptr, gate_h_strides, 1, 0, 1) + +for i in range(1, num_qubits): + tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 2, (i-1, i), # target on i-1 while control on i + gate_cx.data.ptr, gate_cx_strides, 1, 0, 1) +print("Quantum gates applied") + +# Specify the target MPS state +cutn.state_finalize_mps(handle, quantum_state, cutn.BoundaryCondition.OPEN, mps_tensor_extents, mps_tensor_strides) +print("Set the final MPS representation") + +# Configure the MPS computation +svd_algorithm_dtype = cutn.state_get_attribute_dtype(cutn.StateAttribute.MPS_SVD_CONFIG_ALGO) +svd_algorithm = np.array(cutn.TensorSVDAlgo.GESVDJ, dtype=svd_algorithm_dtype) +cutn.state_configure(handle, quantum_state, + cutn.StateAttribute.MPS_SVD_CONFIG_ALGO, svd_algorithm.ctypes.data, svd_algorithm.dtype.itemsize) + +# Prepare the specified quantum circuit for MPS computation +work_desc = cutn.create_workspace_descriptor(handle) +cutn.state_prepare(handle, quantum_state, scratch_size, work_desc, stream.ptr) +print("Prepared the specified quantum circuit for MPS computation") + +workspace_size_d = 
cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + # only the state and the workspace descriptor exist at this stage + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch_space + print("Free resource and exit.") + exit() +print("Set the workspace buffer for MPS computation") + +# Compute the final MPS state +extents_out, strides_out = cutn.state_compute(handle, quantum_state, work_desc, mps_tensor_ptrs, stream.ptr) + +# If a lower extent is found during runtime, the cupy.ndarray container must be adjusted to reflect the lower extent +for i, (extent_in, extent_out) in enumerate(zip(mps_tensor_extents, extents_out)): + if extent_in != tuple(extent_out): + stride_out = [s * mps_tensors[0].itemsize for s in strides_out[i]] + mps_tensors[i] = cp.ndarray(extent_out, dtype=mps_tensors[i].dtype, memptr=mps_tensors[i].data, strides=stride_out) +print("Computed the final MPS representation") + +# Create an empty tensor network operator +hamiltonian = cutn.create_network_operator(handle, + num_qubits, qubits_dims, data_type) +# Append component (0.5 * Z1 * Z2) to the tensor network operator +num_modes = (1, 1) # Z1 acts on 1 mode, Z2 acts on 1 mode +modes_Z1 = (1, ) # state modes Z1 acts on +modes_Z2 = (2, ) # state modes Z2 acts on +state_modes = (modes_Z1, modes_Z2) # state modes (Z1 * Z2) acts on +gate_data = (gate_z.data.ptr, gate_z.data.ptr) # GPU pointers to gate data +operator_id = cutn.network_operator_append_product(handle, hamiltonian, 0.5, + 2, num_modes, state_modes, 0, gate_data) +# Append component (0.25 * Y3) to the tensor network operator +num_modes = (1, ) # Y3 acts on 1 mode +modes_Y3 = (3, ) # state modes Y3 acts on 
+state_modes = (modes_Y3, ) # state modes (Y3) acts on +gate_data = (gate_y.data.ptr, ) # GPU pointers to gate data +operator_id = cutn.network_operator_append_product(handle, hamiltonian, 0.25, + 1, num_modes, state_modes, 0, gate_data) + +# Append component (0.13 * Y0 X2 Z3) to the tensor network operator +num_modes = (1, 1, 1) # Y0 acts on 1 mode, X2 acts on 1 mode, Z3 acts on 1 mode +modes_Y0 = (0, ) # state modes Y0 acts on +modes_X2 = (2, ) # state modes X2 acts on +modes_Z3 = (3, ) # state modes Z3 acts on +state_modes = (modes_Y0, modes_X2, modes_Z3) # state modes (Y0 * X2 * Z3) acts on +gate_data = (gate_y.data.ptr, gate_x.data.ptr, gate_z.data.ptr) # GPU pointers to gate data +operator_id = cutn.network_operator_append_product(handle, hamiltonian, 0.13, + 3, num_modes, state_modes, 0, gate_data) +print("Constructed a tensor network operator: (0.5 * Z1 * Z2) + (0.25 * Y3) + (0.13 * Y0 * X2 * Z3)") + +# Specify the quantum circuit expectation value +expectation = cutn.create_expectation(handle, quantum_state, hamiltonian) + +num_hyper_samples_dtype = cutn.expectation_get_attribute_dtype(cutn.ExpectationAttribute.OPT_NUM_HYPER_SAMPLES) +num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) +cutn.expectation_configure(handle, expectation, + cutn.ExpectationAttribute.OPT_NUM_HYPER_SAMPLES, + num_hyper_samples.ctypes.data, num_hyper_samples.dtype.itemsize) + +# Prepare the computation of the specified quantum circuit expectation value +cutn.expectation_prepare(handle, expectation, scratch_size, work_desc, stream.ptr) +print("Prepare the computation of the specified quantum circuit expectation value") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + 
print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + cutn.destroy_expectation(expectation) + cutn.destroy_network_operator(hamiltonian) + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch_space + print("Free resource and exit.") + exit() +print("Set the workspace buffer") + +# Compute the specified quantum circuit expectation value +expectation_value = np.empty(1, dtype='complex128') +state_norm = np.empty(1, dtype='complex128') +cutn.expectation_compute(handle, expectation, + work_desc, expectation_value.ctypes.data, state_norm.ctypes.data, stream.ptr) +stream.synchronize() +print("Computed the specified quantum circuit state expectation value") + +print(f"expectation value = {expectation_value.item()}") +print(f"norm of the state = {state_norm.item()}") + +cutn.destroy_workspace_descriptor(work_desc) +cutn.destroy_expectation(expectation) +cutn.destroy_network_operator(hamiltonian) +cutn.destroy_state(quantum_state) +cutn.destroy(handle) +del scratch_space +print("Free resource and exit.") \ No newline at end of file diff --git a/python/samples/cutensornet/high_level/mps_marginal_example.py b/python/samples/cutensornet/high_level/mps_marginal_example.py new file mode 100755 index 0000000..7b53375 --- /dev/null +++ b/python/samples/cutensornet/high_level/mps_marginal_example.py @@ -0,0 +1,182 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import numpy as np + +import cuquantum +from cuquantum import cutensornet as cutn + + +print("cuTensorNet-vers:", cutn.get_version()) +dev = cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:", props["multiProcessorCount"]) +print("GPU-major:", props["major"]) 
+print("GPU-minor:", props["minor"]) +print("========================") + +################################################# +# Marginal computation of a quantum circuit state +################################################# + +# Quantum state configuration +num_qubits = 16 +dim = 2 +qubits_dims = (dim, ) * num_qubits # qubit size +marginal_modes = (0, 1) # open qubits +num_marginal_modes = len(marginal_modes) +print(f"Quantum circuit with {num_qubits} qubits") + +############# +# cuTensorNet +############# + +handle = cutn.create() +stream = cp.cuda.Stream() +data_type = cuquantum.cudaDataType.CUDA_C_64F + +# Define quantum gate tensors on device +gate_h = 2**-0.5 * cp.asarray([[1,1], [1,-1]], dtype='complex128', order='F') +gate_h_strides = 0 + +gate_cx = cp.asarray([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0]], dtype='complex128').reshape(2,2,2,2, order='F') +gate_cx_strides = 0 + +# Allocate device memory for the final MPS state +max_extent = 2 +mps_tensor_extents = [] +mps_tensor_strides = [] +mps_tensors = [] +mps_tensor_ptrs = [] +for i in range(num_qubits): + if i == 0: + extents = (2, max_extent) + elif i == num_qubits - 1: + extents = (max_extent, 2) + else: + extents = (max_extent, 2, max_extent) + mps_tensor_extents.append(extents) + tensor = cp.zeros(extents, dtype='complex128') + mps_tensors.append(tensor) + mps_tensor_ptrs.append(tensor.data.ptr) + mps_tensor_strides.append([stride_in_bytes // tensor.itemsize for stride_in_bytes in tensor.strides]) + +# Allocate device memory for the reduced density matrix (marginal) +rdm_shape = (dim, ) * 2 * len(marginal_modes) +rdm = cp.empty(rdm_shape, dtype='complex128') +rdm_strides = [stride_in_bytes // rdm.itemsize for stride_in_bytes in rdm.strides] + +free_mem = dev.mem_info[0] +# use half of the totol free size +scratch_size = free_mem // 2 +scratch_space = cp.cuda.alloc(scratch_size) +print(f"Allocated {scratch_size} bytes of scratch memory on GPU") + +# Create the vacuum quantum state 
+quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) +print("Created the initial quantum state") + +# Construct the quantum circuit state with gate application +tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 1, (0, ), + gate_h.data.ptr, gate_h_strides, 1, 0, 1) + +for i in range(1, num_qubits): + tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 2, (i-1, i), # target on i-1 while control on i + gate_cx.data.ptr, gate_cx_strides, 1, 0, 1) +print("Quantum gates applied") + +# Specify the target MPS state +cutn.state_finalize_mps(handle, quantum_state, cutn.BoundaryCondition.OPEN, mps_tensor_extents, mps_tensor_strides) +print("Set the final MPS representation") + +# Configure the MPS computation +svd_algorithm_dtype = cutn.state_get_attribute_dtype(cutn.StateAttribute.MPS_SVD_CONFIG_ALGO) +svd_algorithm = np.array(cutn.TensorSVDAlgo.GESVDJ, dtype=svd_algorithm_dtype) +cutn.state_configure(handle, quantum_state, + cutn.StateAttribute.MPS_SVD_CONFIG_ALGO, svd_algorithm.ctypes.data, svd_algorithm.dtype.itemsize) + +# Prepare the specified quantum circuit for MPS computation +work_desc = cutn.create_workspace_descriptor(handle) +cutn.state_prepare(handle, quantum_state, scratch_size, work_desc, stream.ptr) +print("Prepared the specified quantum circuit for MPS computation") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + # the marginal has not been created yet at this stage + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch_space + print("Free resource and exit.") + exit() +print("Set the workspace buffer for MPS 
computation") + +# Compute the final MPS state +extents_out, strides_out = cutn.state_compute(handle, quantum_state, work_desc, mps_tensor_ptrs, stream.ptr) + +# If a lower extent is found during runtime, the cupy.ndarray container must be adjusted to reflect the lower extent +for i, (extent_in, extent_out) in enumerate(zip(mps_tensor_extents, extents_out)): + if extent_in != tuple(extent_out): + stride_out = [s * mps_tensors[0].itemsize for s in strides_out[i]] + mps_tensors[i] = cp.ndarray(extent_out, dtype=mps_tensors[i].dtype, memptr=mps_tensors[i].data, strides=stride_out) +print("Computed the final MPS representation") + +# Specify the desired reduced density matrix (marginal) +marginal = cutn.create_marginal(handle, quantum_state, num_marginal_modes, marginal_modes, 0, 0, rdm_strides) + +# Configure the computation of the desired reduced density matrix (marginal) +num_hyper_samples_dtype = cutn.marginal_get_attribute_dtype(cutn.MarginalAttribute.OPT_NUM_HYPER_SAMPLES) +num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) +cutn.marginal_configure(handle, marginal, + cutn.MarginalAttribute.OPT_NUM_HYPER_SAMPLES, + num_hyper_samples.ctypes.data, num_hyper_samples.dtype.itemsize) + +# Prepare the specified quantum circuit reduced density matrix (marginal) +cutn.marginal_prepare(handle, marginal, scratch_size, work_desc, stream.ptr) +print("Prepared the specified quantum circuit reduced density matrix (marginal)") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + cutn.destroy_marginal(marginal) + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch_space + 
print("Free resource and exit.") + exit() +print("Set the workspace buffer for marginal computation") + +# Compute the specified quantum circuit reduced density matrix (marginal) +cutn.marginal_compute(handle, marginal, 0, work_desc, rdm.data.ptr, stream.ptr) +stream.synchronize() +print("Computed the specified quantum circuit reduced density matrix (marginal)") + +print(f"Reduced density matrix for {num_marginal_modes} qubits") +print(rdm.reshape(dim**num_marginal_modes, dim**num_marginal_modes)) + +cutn.destroy_workspace_descriptor(work_desc) +cutn.destroy_marginal(marginal) +cutn.destroy_state(quantum_state) +cutn.destroy(handle) +del scratch_space +print("Free resource and exit.") \ No newline at end of file diff --git a/python/samples/cutensornet/high_level/mps_sampling_example.py b/python/samples/cutensornet/high_level/mps_sampling_example.py new file mode 100755 index 0000000..67f877b --- /dev/null +++ b/python/samples/cutensornet/high_level/mps_sampling_example.py @@ -0,0 +1,178 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import numpy as np + +import cuquantum +from cuquantum import cutensornet as cutn + + +print("cuTensorNet-vers:", cutn.get_version()) +dev = cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:", props["multiProcessorCount"]) +print("GPU-major:", props["major"]) +print("GPU-minor:", props["minor"]) +print("========================") + +##################################### +# Sampling of a quantum circuit state +##################################### + +# Quantum state configuration +num_samples = 100 +num_qubits = 16 +dim = 2 +qubits_dims = (dim, ) * num_qubits # qubit size +print(f"Quantum circuit with {num_qubits} qubits") + 
+############# +# cuTensorNet +############# + +handle = cutn.create() +stream = cp.cuda.Stream() +data_type = cuquantum.cudaDataType.CUDA_C_64F + +# Define quantum gate tensors on device +gate_h = 2**-0.5 * cp.asarray([[1,1], [1,-1]], dtype='complex128', order='F') +gate_h_strides = 0 + +gate_cx = cp.asarray([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0]], dtype='complex128').reshape(2,2,2,2, order='F') +gate_cx_strides = 0 + +# Allocate device memory for the final MPS state +max_extent = 2 +mps_tensor_extents = [] +mps_tensor_strides = [] +mps_tensors = [] +mps_tensor_ptrs = [] +for i in range(num_qubits): + if i == 0: + extents = (2, max_extent) + elif i == num_qubits - 1: + extents = (max_extent, 2) + else: + extents = (max_extent, 2, max_extent) + mps_tensor_extents.append(extents) + tensor = cp.zeros(extents, dtype='complex128') + mps_tensors.append(tensor) + mps_tensor_ptrs.append(tensor.data.ptr) + mps_tensor_strides.append([stride_in_bytes // tensor.itemsize for stride_in_bytes in tensor.strides]) + +# Allocate host memory for the samples +samples = np.empty((num_qubits, num_samples), dtype='int64', order='F') # samples are stored in F order with shape (num_qubits, num_samples) + +free_mem = dev.mem_info[0] +# use half of the total free size +scratch_size = free_mem // 2 +scratch_space = cp.cuda.alloc(scratch_size) +print(f"Allocated {scratch_size} bytes of scratch memory on GPU") + +# Create the vacuum quantum state +quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) +print("Created the initial quantum state") + +# Construct the quantum circuit state with gate application +tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 1, (0, ), + gate_h.data.ptr, gate_h_strides, 1, 0, 1) + +for i in range(1, num_qubits): + tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 2, (i-1, i), # target on i-1 while control on i + gate_cx.data.ptr, gate_cx_strides, 1, 0, 1) 
+print("Quantum gates applied") + +# Specify the target MPS state +cutn.state_finalize_mps(handle, quantum_state, cutn.BoundaryCondition.OPEN, mps_tensor_extents, mps_tensor_strides) +print("Set the final MPS representation") + +# Configure the MPS computation +svd_algorithm_dtype = cutn.state_get_attribute_dtype(cutn.StateAttribute.MPS_SVD_CONFIG_ALGO) +svd_algorithm = np.array(cutn.TensorSVDAlgo.GESVDJ, dtype=svd_algorithm_dtype) +cutn.state_configure(handle, quantum_state, + cutn.StateAttribute.MPS_SVD_CONFIG_ALGO, svd_algorithm.ctypes.data, svd_algorithm.dtype.itemsize) + +# Prepare the specified quantum circuit for MPS computation +work_desc = cutn.create_workspace_descriptor(handle) +cutn.state_prepare(handle, quantum_state, scratch_size, work_desc, stream.ptr) +print("Prepared the specified quantum circuit for MPS computation") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + # the sampler has not been created yet at this stage + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch_space + print("Free resource and exit.") + exit() +print("Set the workspace buffer for MPS computation") + +# Compute the final MPS state +extents_out, strides_out = cutn.state_compute(handle, quantum_state, work_desc, mps_tensor_ptrs, stream.ptr) + +# If a lower extent is found during runtime, the cupy.ndarray container must be adjusted to reflect the lower extent +for i, (extent_in, extent_out) in enumerate(zip(mps_tensor_extents, extents_out)): + if extent_in != tuple(extent_out): + stride_out = [s * mps_tensors[0].itemsize for s in strides_out[i]] + mps_tensors[i] = cp.ndarray(extent_out, 
dtype=mps_tensors[i].dtype, memptr=mps_tensors[i].data, strides=stride_out) +print("Computed the final MPS representation") + +# Create the quantum circuit sampler +sampler = cutn.create_sampler(handle, quantum_state, num_qubits, 0) + +# Configure the quantum circuit sampler +num_hyper_samples_dtype = cutn.sampler_get_attribute_dtype(cutn.SamplerAttribute.OPT_NUM_HYPER_SAMPLES) +num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) +cutn.sampler_configure(handle, sampler, + cutn.SamplerAttribute.OPT_NUM_HYPER_SAMPLES, + num_hyper_samples.ctypes.data, num_hyper_samples.dtype.itemsize) + +# Prepare the quantum circuit sampler +cutn.sampler_prepare(handle, sampler, scratch_size, work_desc, stream.ptr) +print("Prepared the specified quantum circuit state sampler") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + cutn.destroy_sampler(sampler) + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch_space + print("Free resource and exit.") + exit() +print("Set the workspace buffer for sampling") + +# Sample the quantum circuit state +cutn.sampler_sample(handle, sampler, num_samples, work_desc, samples.ctypes.data, stream.ptr) +stream.synchronize() +print("Performed quantum circuit state sampling") +print("Bit-string samples:") +print(samples.T) + +cutn.destroy_workspace_descriptor(work_desc) +cutn.destroy_sampler(sampler) +cutn.destroy_state(quantum_state) +cutn.destroy(handle) +del scratch_space +print("Free resource and exit.") diff --git a/python/samples/cutensornet/high_level/sampling_example.py b/python/samples/cutensornet/high_level/sampling_example.py index 
948d182..e14ff8e 100755 --- a/python/samples/cutensornet/high_level/sampling_example.py +++ b/python/samples/cutensornet/high_level/sampling_example.py @@ -52,6 +52,13 @@ # Allocate device memory for the samples samples = np.empty((num_qubits, num_samples), dtype='int64', order='F') # samples are stored in F order with shape (num_qubits, num_qubits) + +free_mem = dev.mem_info[0] +# use half of the totol free size +scratch_size = free_mem // 2 +scratch_space = cp.cuda.alloc(scratch_size) +print(f"Allocated {scratch_size} bytes of scratch memory on GPU") + # Create the initial quantum state quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) print("Created the initial quantum state") @@ -71,12 +78,7 @@ # Create the quantum circuit sampler sampler = cutn.create_sampler(handle, quantum_state, num_qubits, 0) -free_mem = dev.mem_info[0] -# use half of the totol free size -scratch_size = free_mem // 2 -scratch_space = cp.cuda.alloc(scratch_size) -print(f"Allocated {scratch_size} bytes of scratch memory on GPU") - +# Configure the quantum circuit sampler num_hyper_samples_dtype = cutn.sampler_get_attribute_dtype(cutn.SamplerAttribute.OPT_NUM_HYPER_SAMPLES) num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) cutn.sampler_configure(handle, sampler, diff --git a/python/samples/cutensornet/tensor/example11-svd_algorithms.py b/python/samples/cutensornet/tensor/example11-svd_algorithms.py index 05ffdea..6df39bd 100644 --- a/python/samples/cutensornet/tensor/example11-svd_algorithms.py +++ b/python/samples/cutensornet/tensor/example11-svd_algorithms.py @@ -3,16 +3,15 @@ # SPDX-License-Identifier: BSD-3-Clause """ -truncated SVD Example using NumPy ndarray with various SVD algorithms. +truncated SVD Example using CuPy ndarray with various SVD algorithms. -The decomposition results are also NumPy ndarrays. +The decomposition results are also CuPy ndarrays. 
""" -import numpy as np +import cupy as cp from cuquantum import tensor - -a = np.ones((3,2,4,5)) +a = cp.ones((3,2,4,5)) base_options = {'max_extent': 4, 'abs_cutoff': 0.1, diff --git a/python/setup.py b/python/setup.py index 4e59fa7..4b71dc5 100644 --- a/python/setup.py +++ b/python/setup.py @@ -29,10 +29,10 @@ # - cuTENSOR version is constrained in the cutensornet-cuXX package, so we don't # need to list it install_requires = [ - 'numpy>=1.21', + 'numpy~=1.21', # ">=1.21,<2" # 'torch', # <-- PyTorch is optional; also, the PyPI version does not support GPU... - f'custatevec-cu{utils.cuda_major_ver}~=1.4', # ">=1.4.0,<2" - f'cutensornet-cu{utils.cuda_major_ver}~=2.2', # ">=2.2.0,<3" + f'custatevec-cu{utils.cuda_major_ver}~=1.5', # ">=1.5.0,<2" + f'cutensornet-cu{utils.cuda_major_ver}~=2.3', # ">=2.3.0,<3" ] if utils.cuda_major_ver == '11': # CuPy has 3+ wheels for CUDA 11.x, only the cuquantum-python meta package has diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 4c03835..61bcef6 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -6,6 +6,8 @@ # various reasons, see pytest-dev/pytest#3730. In particular, this strategy # is borrowed from https://github.com/pytest-dev/pytest/issues/3730#issuecomment-567142496. 
+from collections.abc import Iterable + def pytest_configure(config): config.addinivalue_line( @@ -17,13 +19,20 @@ def pytest_collection_modifyitems(config, items): removed = [] kept = [] for item in items: + is_removed = False m = item.get_closest_marker('uncollect_if') if m: - func = m.kwargs['func'] - if func(**item.callspec.params): - removed.append(item) - continue - kept.append(item) + funcs = m.kwargs['func'] + if not isinstance(funcs, Iterable): + funcs = (funcs,) + # loops over all deselect requirements + for func in funcs: + if func(**item.callspec.params): + removed.append(item) + is_removed = True + break + if not is_removed: + kept.append(item) if removed: config.hook.pytest_deselected(items=removed) items[:] = kept diff --git a/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py b/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py index 364d3c3..946fb7e 100644 --- a/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py +++ b/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py @@ -6,6 +6,7 @@ import cupy as cp from cupy import testing +import cupyx as cpx import numpy as np try: from mpi4py import MPI # init! 
@@ -204,18 +205,24 @@ class TestLibHelper: def test_get_version(self): ver = cusv.get_version() - assert ver == (cusv.MAJOR_VER * 1000 + major = ver // 1000 + minor = (ver % 1000) // 100 + + # run-time version must be compatible with build-time version + assert major == cusv.MAJOR_VER + assert minor >= cusv.MINOR_VER + + # sanity check (build-time versions should agree) + assert cusv.VERSION == (cusv.MAJOR_VER * 1000 + cusv.MINOR_VER * 100 + cusv.PATCH_VER) - assert ver == cusv.VERSION def test_get_property(self): + # run-time version must be compatible with build-time version assert cusv.MAJOR_VER == cusv.get_property( cuquantum.libraryPropertyType.MAJOR_VERSION) - assert cusv.MINOR_VER == cusv.get_property( + assert cusv.MINOR_VER <= cusv.get_property( cuquantum.libraryPropertyType.MINOR_VERSION) - assert cusv.PATCH_VER == cusv.get_property( - cuquantum.libraryPropertyType.PATCH_LEVEL) class TestHandle: @@ -1790,6 +1797,94 @@ def test_scheduler(self, handle, scheduler_args, input_form, param_form): pass +class TestSubSVMigrator: + ''' This class runs random tests to check all API arguemnts + are correctly passed to C-API + ''' + @classmethod + def setup_class(cls): + np.random.seed(20231003) + + @pytest.mark.parametrize( + 'dtype', (np.complex64, np.complex128) + ) + @pytest.mark.parametrize( + 'exec_num', range(5) + ) + def test_sub_sv_migrator(self, handle, dtype, exec_num): + n_local_index_bits = np.random.randint(low=1, high=22) + n_device_slots = np.random.randint(low=2, high=16) + device_slot_idx = np.random.randint(n_device_slots) + check_in_out = np.random.randint(4) + randnum_dtype = np.float32 if dtype == np.complex64 else np.float64 + + data_type = dtype_to_data_type[dtype] + sub_sv_size = 2 ** n_local_index_bits + device_slot_size = sub_sv_size * n_device_slots + + randnums = np.random.rand(device_slot_size) + 1.j * np.random.rand(device_slot_size) + host_slots_ref = np.asarray(randnums, dtype=dtype) + device_slots = cp.array(host_slots_ref) + begin = 
np.random.randint(low=0, high=sub_sv_size-1) + end = np.random.randint(low=begin + 1, high=sub_sv_size) + + src_sub_sv_ptr = 0 + dst_sub_sv_ptr = 0 + if check_in_out == 0: + # swap + check_in = check_out = True + randnums = np.random.rand(sub_sv_size) + 1.j * np.random.rand(sub_sv_size) + src_sub_sv_ref = np.asarray(randnums, dtype=dtype) + src_sub_sv = cpx.empty_pinned(sub_sv_size, dtype=dtype) + src_sub_sv[:] = src_sub_sv_ref[:] + # src and dst are the same memory chunk + dst_sub_sv = src_sub_sv + dst_sub_sv_ref = src_sub_sv_ref + src_sub_sv_ptr = dst_sub_sv_ptr = src_sub_sv.ctypes.data + else: + # check-in / check-out + check_in = (check_in_out & 1) != 0 + check_out = (check_in_out & 2) != 0 + if check_out: + randnums = np.random.rand(sub_sv_size) + 1.j * np.random.rand(sub_sv_size) + src_sub_sv_ref = np.asarray(randnums, dtype=dtype) + src_sub_sv = cpx.empty_pinned(sub_sv_size, dtype=dtype) + src_sub_sv[:] = src_sub_sv_ref[:] + src_sub_sv_ptr = src_sub_sv.ctypes.data + if check_in: + randnums = np.random.rand(sub_sv_size) + 1.j * np.random.rand(sub_sv_size) + dst_sub_sv_ref = np.asarray(randnums, dtype=dtype) + dst_sub_sv = cpx.empty_pinned(sub_sv_size, dtype=dtype) + dst_sub_sv[:] = dst_sub_sv_ref[:] + dst_sub_sv_ptr = dst_sub_sv.ctypes.data + + # create SubStateVectorMigrator + migrator = cusv.sub_sv_migrator_create( + handle, device_slots.data.ptr, data_type, n_device_slots, n_local_index_bits) + # migrate + cusv.sub_sv_migrator_migrate( + handle, migrator, device_slot_idx, src_sub_sv_ptr, dst_sub_sv_ptr, begin, end) + # destroy + cp.cuda.Stream().synchronize() + cusv.sub_sv_migrator_destroy(handle, migrator) + + # reference + offset = sub_sv_size * device_slot_idx + if check_in: + # copy values for swap + tmp = host_slots_ref[offset+begin:offset+end].copy() + if check_out: + host_slots_ref[offset+begin:offset+end] = src_sub_sv_ref[begin:end] + if check_in: + dst_sub_sv_ref[begin:end] = tmp[:] + + assert cp.all(cp.asarray(host_slots_ref) == device_slots) + 
if check_out: + assert np.all(src_sub_sv == src_sub_sv_ref) + if check_in: + assert np.all(dst_sub_sv == dst_sub_sv_ref) + + class TestMemHandler(MemHandlerTestBase): mod = cusv diff --git a/python/tests/cuquantum_tests/cutensornet_tests/approxTN_utils.py b/python/tests/cuquantum_tests/cutensornet_tests/approxTN_utils.py index 605c4a1..7a7dcde 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/approxTN_utils.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/approxTN_utils.py @@ -155,8 +155,17 @@ def is_exact_split(**split_options): max_extent = split_options.get("max_extent", 0) abs_cutoff = split_options.get("abs_cutoff", 0) rel_cutoff = split_options.get("rel_cutoff", 0) + discarded_weight_cutoff = split_options.get("discarded_weight_cutoff", 0) normalization = split_options.get("normalization", None) - return (max_extent == 0 or max_extent is None) and abs_cutoff == 0 and rel_cutoff == 0 and normalization is None + return (max_extent == 0 or max_extent is None) and \ + abs_cutoff == 0 and rel_cutoff == 0 and \ + discarded_weight_cutoff == 0 and normalization is None + +def is_dw_truncation_only(**split_options): + max_extent = split_options.get("max_extent", 0) + abs_cutoff = split_options.get("abs_cutoff", 0) + rel_cutoff = split_options.get("rel_cutoff", 0) + return (max_extent == 0 or max_extent is None) and abs_cutoff == 0 and rel_cutoff == 0 def split_contract_decompose(subscripts): @@ -170,8 +179,8 @@ def split_contract_decompose(subscripts): decompose_subscripts = f"{intm_modes}->{outputs}" return contract_subscripts, decompose_subscripts -#NOTE: torch does not have native support on F order -# We here get around this by converting to CuPy/NumPy ndarrays as a get arounds +# NOTE: torch does not have native support on F order +# We here get around this by converting to CuPy/NumPy ndarrays as a workaround # the overhead for torch tensors on GPU should be minimal as torch tensors support __cuda_array_interface__ def 
torch_support_wrapper(func): def new_func(T, *args, **kwargs): @@ -195,6 +204,7 @@ def get_einsum_kwargs(backend): #################################### ############ Execution ############# #################################### + @torch_support_wrapper def tensor_permute(T, input_modes, output_modes): axes = [input_modes.index(i) for i in output_modes] @@ -216,6 +226,7 @@ def matrix_svd( max_extent=0, abs_cutoff=0, rel_cutoff=0, + discarded_weight_cutoff=0, partition=None, normalization=None, return_info=True, @@ -234,6 +245,14 @@ def matrix_svd( if max_extent == 0 or max_extent is None: max_extent = len(s) reduced_extent = min(max_extent, int((s>cutoff).sum())) + if discarded_weight_cutoff != 0: + s_square_sum = backend.cumsum(s**2, 0) + if backend not in (cp, np): # torch + s_square_sum /= s_square_sum[-1].clone() + else: + s_square_sum /= s_square_sum[-1] + dw_reduced_extent = int((s_square_sum<(1-discarded_weight_cutoff)).sum()) + 1 + reduced_extent = min(reduced_extent, dw_reduced_extent) reduced_extent = max(reduced_extent, 1) info["reduced_extent"] = reduced_extent if reduced_extent != len(s): @@ -533,14 +552,14 @@ def verify_split_SVD( algorithm = 'gesvd' if algorithm == 'gesvdj': if dtype_name in ['float64', 'complex128']: - rtol = 1e-8 + rtol = 1e-6 if 'gesvdj_residual' not in info: logging.warning("gesvdj_residual not recorded in info; verification may fail due to unknown runtime status") else: - rtol = max(rtol, info['gesvdj_residual']) + rtol = max(rtol, info['gesvdj_residual'] * max_mid_extent) elif algorithm == 'gesvdp': if dtype_name in ['float64', 'complex128']: - rtol = 1e-8 + rtol = 1e-6 if 'gesvdp_err_sigma' not in info: logging.warning("gesvdp_err_sigma not recorded in info; verification may fail due to unknown runtime status") elif info['gesvdp_err_sigma'] > 1e-4: @@ -588,6 +607,18 @@ def verify_split_SVD( # For gesvdr, discarded weight is only computed when fix extent truncation is not enabled if info['algorithm'] != 'gesvdr' or max_extent == 
max_mid_extent: info_equal = info_equal and (abs(info["discarded_weight"]-info_ref["discarded_weight"]) < rtol) + if is_dw_truncation_only(**split_options) and max_extent == max_mid_extent: + # when only dw is in use, verify that discarded weight is less than the cutoff + dw_cutn = info["discarded_weight"] + dw_ref = info_ref["discarded_weight"] + dw_cutoff = split_options.get('discarded_weight_cutoff', 0) + if dw_cutn > dw_cutoff: + logging.error("cutensornet SVD runtime discarded weight {dw_cutn} larger than cutoff {dw_cutoff}") + return False + if dw_ref > dw_cutoff: + logging.error("reference SVD runtime discarded weight {dw_ref} larger than cutoff {dw_cutoff}") + return False + if not info_equal: info_details = "".join([f"{key}:({info.get(key)}, {info_ref.get(key)}); " for key in info.keys()]) logging.error(f"SVD Info not matching the reference: {info_details}") diff --git a/python/tests/cuquantum_tests/cutensornet_tests/circuit_utils.py b/python/tests/cuquantum_tests/cutensornet_tests/circuit_utils.py index 69f1fa1..b675d15 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/circuit_utils.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/circuit_utils.py @@ -2,17 +2,15 @@ # # SPDX-License-Identifier: BSD-3-Clause -from collections import Counter -import itertools from types import MappingProxyType try: import cirq + from cuquantum.cutensornet._internal import circuit_parser_utils_cirq except ImportError: - cirq = None + cirq = circuit_parser_utils_cirq = None import cupy as cp import numpy as np -import pytest try: import torch if not torch.cuda.is_available(): @@ -21,8 +19,9 @@ torch = None try: import qiskit + from cuquantum.cutensornet._internal import circuit_parser_utils_qiskit except ImportError: - qiskit = None + qiskit = circuit_parser_utils_qiskit = None from cuquantum import contract, CircuitToEinsum from cuquantum import cutensornet as cutn @@ -31,10 +30,19 @@ from cuquantum.cutensornet._internal.circuit_converter_utils import 
EINSUM_SYMBOLS_BASE from cuquantum.cutensornet._internal.circuit_converter_utils import get_pauli_gates from cuquantum.cutensornet._internal.circuit_converter_utils import parse_gates_to_mode_labels_operands +from cuquantum.cutensornet._internal.decomposition_utils import SVD_ALGORITHM_MAP, NORMALIZATION_MAP from cuquantum.cutensornet._internal.utils import infer_object_package -from .test_utils import atol_mapper, get_stream_for_backend, rtol_mapper +from .approxTN_utils import SVD_TOLERANCE, verify_unitary +from .mps_utils import MPS, gen_random_mps, get_mps_tolerance +from .mps_utils import amplitude_from_sv +from .mps_utils import batched_amplitude_from_sv +from .mps_utils import expectation_from_sv +from .mps_utils import reduced_density_matrix_from_sv +from .mps_utils import sample_from_sv +from .test_utils import atol_mapper, rtol_mapper from .test_cutensornet import manage_resource +from .. import dtype_to_data_type # note: this implementation would cause pytorch tests being silently skipped @@ -49,14 +57,19 @@ qiskit_circuits = [] EMPTY_DICT = MappingProxyType(dict()) +GLOBAL_RNG = np.random.default_rng(2023) +DEFAULT_NUM_RANDOM_LAYERS = 2 +EXACT_MPS_QUBIT_COUNT_LIMIT = 63 # limit the number of qubits for exact MPS to avoid extent overflowing - -def gen_qubits_map(qubits): - n_qubits = len(qubits) - if n_qubits > len(EINSUM_SYMBOLS_BASE): - raise NotImplementedError(f'test suite only supports up to {len(EINSUM_SYMBOLS_BASE)} qubits') - qubits_map = dict(zip(qubits, EINSUM_SYMBOLS_BASE[:n_qubits])) - return qubits_map +STATE_ATTRIBUTE_MAP = { + 'canonical_center' : cutn.StateAttribute.MPS_CANONICAL_CENTER, + 'abs_cutoff' : cutn.StateAttribute.MPS_SVD_CONFIG_ABS_CUTOFF, + 'rel_cutoff' : cutn.StateAttribute.MPS_SVD_CONFIG_REL_CUTOFF, + 'normalization' : cutn.StateAttribute.MPS_SVD_CONFIG_S_NORMALIZATION, + 'discarded_weight_cutoff' : cutn.StateAttribute.MPS_SVD_CONFIG_DISCARDED_WEIGHT_CUTOFF, + 'algorithm' : cutn.StateAttribute.MPS_SVD_CONFIG_ALGO, + 
#'algorithm_params' : cutn.StateAttribute.MPS_SVD_CONFIG_ALGO_PARAMS, # NOTE: special treatment required + 'num_hyper_samples' : cutn.StateAttribute.NUM_HYPER_SAMPLES} def bitstring_generator(n_qubits, nsample=1): @@ -86,20 +99,58 @@ def random_pauli_string_generator(n_qubits, num_strings=4): yield ''.join(np.random.choice(['I','X', 'Y', 'Z'], n_qubits)) -def get_partial_indices(qubits, fixed): - partial_indices = [slice(None)] * len(qubits) - index_map = {'0': slice(0, 1), - '1': slice(1, 2)} - for ix, q in enumerate(qubits): - if q in fixed: - partial_indices[ix] = index_map[fixed[q]] - return partial_indices - - ################################################ # functions to generate cirq.Circuit for testing ################################################ +def get_cirq_random_2q_gate(): + class Random2QGate(cirq.Gate): + def __init__(self): + super(Random2QGate, self) + setattr(self, '_internal_array_', cirq.testing.random_unitary(4)) + + def _num_qubits_(self): + return 2 + + def _unitary_(self): + return getattr(self, '_internal_array_') + + def __pow__(self, power): + if power == 1: + return self + elif power == -1: + new_gate = Random2QGate() + unitary = getattr(self, '_internal_array_').T.conj() + setattr(new_gate, '_internal_array_', unitary) + return new_gate + else: + return NotImplementedError + + def _circuit_diagram_info_(self, args): + return "Q1", "Q2" + + return Random2QGate() + + +def gen_random_layered_cirq_circuit(qubits, num_random_layers=2): + n_qubits = len(qubits) + operations = [] + for n in range(num_random_layers): + for i in range(n%2, n_qubits-1, 2): + operations.append(get_cirq_random_2q_gate().on(qubits[i], qubits[i+1])) + return cirq.Circuit(operations) + + +def cirq_insert_random_layers(circuit, num_random_layers=DEFAULT_NUM_RANDOM_LAYERS): + if num_random_layers == 0: + return circuit + qubits = sorted(circuit.all_qubits()) + circuit = circuit_parser_utils_cirq.remove_measurements(circuit) + pre_circuit = 
gen_random_layered_cirq_circuit(qubits, num_random_layers=num_random_layers) + post_circuit = gen_random_layered_cirq_circuit(qubits, num_random_layers=num_random_layers) + return pre_circuit.concat_ragged(circuit, post_circuit) + + def get_cirq_qft_circuit(n_qubits): qubits = cirq.LineQubit.range(n_qubits) qreg = list(qubits)[::-1] @@ -139,11 +190,46 @@ def get_cirq_random_circuit(n_qubits, n_moments, op_density=0.9, seed=3): except: pass +cirq_circuits_mps = [cirq_insert_random_layers(circuit) for circuit in cirq_circuits] ######################################################### # functions to generate qiskit.QuantumCircuit for testing ######################################################### +def get_qiskit_unitary_gate(rng=GLOBAL_RNG, control=None): + # random unitary two qubit gate + from qiskit.extensions import UnitaryGate + m = rng.standard_normal(size=(4, 4)) + 1j*rng.standard_normal(size=(4, 4)) + q, r = np.linalg.qr(m) + d = np.diag(r) + q *= d/abs(d) + gate = UnitaryGate(q) + if control is None: + return gate + else: + return gate.control(control) + + +def gen_random_layered_qiskit_circuit(qubits, num_random_layers=DEFAULT_NUM_RANDOM_LAYERS): + n_qubits = len(qubits) + circuit = qiskit.QuantumCircuit(qubits) + for n in range(num_random_layers): + for i in range(n%2, n_qubits-1, 2): + circuit.append(get_qiskit_unitary_gate(), qubits[i:i+2]) + return circuit + + +def qiskit_insert_random_layers(circuit, num_random_layers=DEFAULT_NUM_RANDOM_LAYERS): + if num_random_layers == 0: + return circuit + qubits = circuit.qubits + circuit.remove_final_measurements() + pre_circuit = gen_random_layered_qiskit_circuit(qubits, num_random_layers=num_random_layers) + post_circuit = gen_random_layered_qiskit_circuit(qubits, num_random_layers=num_random_layers) + circuit.data = pre_circuit.data + circuit.data + post_circuit.data + return circuit + + def get_qiskit_qft_circuit(n_qubits): return qiskit.circuit.library.QFT(n_qubits, do_swaps=False).decompose() @@ -189,20 
+275,6 @@ def get_qiskit_nested_circuit(): return circ -def get_cc_unitary_gate(seed=None): - # random unitary two qubit gate - from qiskit.extensions import UnitaryGate - if seed is None: - seed = 1234 - rng = np.random.default_rng(seed) - m = rng.standard_normal(size=(4, 4)) + 1j*rng.standard_normal(size=(4, 4)) - q, r = np.linalg.qr(m) - d = np.diag(r) - q *= d/abs(d) - gate = UnitaryGate(q).control(2) - return gate - - def get_qiskit_multi_control_circuit(): qubits = qiskit.QuantumRegister(5) circuit = qiskit.QuantumCircuit(qubits) @@ -211,9 +283,10 @@ def get_qiskit_multi_control_circuit(): qs = list(qubits) # 3 layers of multi-controlled qubits np.random.seed(0) + rng = np.random.default_rng(1234) for i in range(2): - np.random.shuffle(qs) - ccu_gate = get_cc_unitary_gate(i) + rng.shuffle(qs) + ccu_gate = get_qiskit_unitary_gate(rng, control=2) circuit.append(ccu_gate, qs[:4]) for q in qubits: if i % 2 == 1: @@ -250,6 +323,13 @@ def get_qiskit_multi_control_circuit(): except: pass +qiskit_circuits_mps = [qiskit_insert_random_layers(circuit) for circuit in qiskit_circuits] + +def is_converter_mps_compatible(converter): + for _, qubits in converter.gates: + if len(qubits) > 2: + return False + return True def compute_histogram_overlap(hist1, hist2, nshots): # assuming hist1 & hist2 have the same sample size (=nshots) @@ -261,142 +341,344 @@ def compute_histogram_overlap(hist1, hist2, nshots): overlap /= nshots return overlap +class _BaseComputeEngine: + + @property + def qubits(self): + raise NotImplementedError + + @property + def n_qubits(self): + return len(self.qubits) + + @property + def tolerance(self): + raise NotImplementedError + + def setup_resources(self, *args, **kwargs): + raise NotImplementedError + + def get_sv(self): + raise NotImplementedError + + def get_norm(self): + if self.norm is None: + sv = self.get_sv() + if sv is not None: + self.norm = self.backend.linalg.norm(self.get_sv()).item() ** 2 + return self.norm + + def get_amplitude(self, 
bitstring): + raise NotImplementedError + + def get_batched_amplitudes(self, fixed): + raise NotImplementedError + + def get_reduced_density_matrix(self, where, fixed=EMPTY_DICT): + r""" + For where = (a, b), reduced density matrix is formulated as: + :math: `rho_{a,b,a^{\prime},b^{\prime}} = \sum_{c,d,e,...} SV^{\star}_{a^{\prime}, b^{\prime}, c, d, e, ...} SV_{a, b, c, d, e, ...}` + """ + raise NotImplementedError + + def get_expectation(self, pauli_string): + raise NotImplementedError + + def get_sampling(self, qubits_to_sample=None, seed=None, nshots=5000): + raise NotImplementedError -################################################################### -# -# Simulator APIs inside cirq and qiskit may be subject to change. -# Version tests are needed. In cases where simulator API changes, -# the implementatitons to be modified are: -# `CirqTest._get_state_vector_from_simulator` and -# `QiskitTest._get_state_vector_from_simulator` -# -################################################################### -class BaseTester: - def __init__(self, circuit, dtype, backend, nsample, nsite_max, nfix_max, nshots=5000, seed=1024): +class BaseFrameworkComputeEngine(_BaseComputeEngine): + ################################################################### + # + # Reference implementation from framework providers. + # + # Simulator APIs inside cirq and qiskit may be subject to change. + # Version tests are needed. In cases where simulator API changes, + # the methods to be modified are: + # 1. `CirqComputeEngine._get_state_vector` + # 2. `CirqComputeEngine.get_sampling` + # 3. `QiskitComputeEngine._get_state_vector` + # 4. 
`QiskitComputeEngine.get_sampling` + # + ################################################################### + + def __init__(self, circuit, dtype, backend): self.circuit = circuit - self.converter = CircuitToEinsum(circuit, dtype=dtype, backend=backend) self.backend = backend - self.qubits = list(self.converter.qubits) - self.n_qubits = self.converter.n_qubits self.dtype = dtype self.sv = None - self.nsample = nsample - self.nsite_max = max(1, min(nsite_max, self.n_qubits-1)) - self.nfix_max = max(min(nfix_max, self.n_qubits-nsite_max-1), 0) - self.nshots = nshots - self.seed = seed - self.state_purity = cutn.StatePurity.PURE - self.state_prepared = False - - def get_state_vector_from_simulator(self): + self._tolerance = None + self.norm = None + + @property + def tolerance(self): + if self._tolerance is None: + self._tolerance = {'atol': atol_mapper[self.dtype], + 'rtol': rtol_mapper[self.dtype]} + return self._tolerance + + def setup_resources(self, *args, **kwargs): + # No additional resources needed + pass + + def _get_state_vector(self): + # implementation for different frameworks + raise NotImplementedError + + def get_sv(self): if self.sv is None: - self.sv = self._get_state_vector_from_simulator() + self.sv = self._get_state_vector() return self.sv - def get_amplitude_from_simulator(self, bitstring): - sv = self.get_state_vector_from_simulator() - index = [int(ibit) for ibit in bitstring] - return sv[tuple(index)] + def get_amplitude(self, bitstring): + return amplitude_from_sv(self.get_sv(), bitstring) - def get_batched_amplitudes_from_simulator(self, fixed): - sv = self.get_state_vector_from_simulator() - partial_indices = get_partial_indices(self.qubits, fixed) - batched_amplitudes = sv[tuple(partial_indices)] - return batched_amplitudes.reshape((2,)*(self.n_qubits-len(fixed))) + def get_batched_amplitudes(self, fixed): + fixed = dict([(self.qubits.index(q), bit) for q, bit in fixed.items()]) + return batched_amplitude_from_sv(self.get_sv(), fixed) - 
def get_reduced_density_matrix_from_simulator(self, where, fixed=EMPTY_DICT): - r""" - For where = (a, b), reduced density matrix is formulated as: - :math: `rho_{a,b,a^{\prime},b^{\prime}} = \sum_{c,d,e,...} SV^{\star}_{a^{\prime}, b^{\prime}, c, d, e, ...} SV_{a, b, c, d, e, ...}` - """ - sv = self.get_state_vector_from_simulator() - partial_indices = get_partial_indices(self.qubits, fixed) - sv = sv[tuple(partial_indices)] - - qubits_map = gen_qubits_map(self.qubits) - output_inds = ''.join([qubits_map[q] for q in where]) - output_inds += output_inds.upper() - left_inds = ''.join([qubits_map[q] for q in self.qubits]) - right_inds = '' - for q in self.qubits: - if q in where: - right_inds += qubits_map[q].upper() - else: - right_inds += qubits_map[q] - expression = left_inds + ',' + right_inds + '->' + output_inds + def get_reduced_density_matrix(self, where, fixed=EMPTY_DICT): + sv = self.get_sv() + where = [self.qubits.index(q) for q in where] + fixed = dict([(self.qubits.index(q), bit) for q, bit in fixed.items()]) + return reduced_density_matrix_from_sv(sv, where, fixed=fixed) + + def get_expectation(self, pauli_string): + return expectation_from_sv(self.get_sv(), pauli_string) + + def get_sampling(self, qubits_to_sample=None, seed=None, nshots=5000): + # implementation for different framework providers + raise NotImplementedError + + +class CirqComputeEngine(BaseFrameworkComputeEngine): + + @property + def qubits(self): + return sorted(self.circuit.all_qubits()) + + def _get_state_vector(self): + qubits = self.qubits + simulator = cirq.Simulator(dtype=self.dtype) + circuit = circuit_parser_utils_cirq.remove_measurements(self.circuit) + result = simulator.simulate(circuit, qubit_order=qubits) + statevector = result.state_vector().reshape((2,)*self.n_qubits) if self.backend is torch: - rdm = contract(expression, sv, sv.conj().resolve_conj()) + statevector = torch.as_tensor(statevector, dtype=getattr(torch, self.dtype), device='cuda') else: - rdm = 
contract(expression, sv, sv.conj()) - return rdm + statevector = self.backend.asarray(statevector, dtype=self.dtype) + return statevector - def get_expectation_from_sv(self, pauli_string): - - input_mode_labels = [[*range(self.n_qubits)]] - qubits_frontier = dict(zip(self.qubits, itertools.count())) - next_frontier = max(qubits_frontier.values()) + 1 - - pauli_map = dict(zip(self.qubits, pauli_string)) - dtype = getattr(self.backend, self.dtype) - pauli_gates = get_pauli_gates(pauli_map, dtype=dtype, backend=self.backend) - gate_mode_labels, gate_operands = parse_gates_to_mode_labels_operands(pauli_gates, - qubits_frontier, - next_frontier) - - mode_labels = input_mode_labels + gate_mode_labels + [[qubits_frontier[ix] for ix in self.qubits]] - output_mode_labels = [] - expression = convert_mode_labels_to_expression(mode_labels, output_mode_labels) - - sv = self.get_state_vector_from_simulator() + def get_sampling(self, qubits_to_sample=None, seed=None, nshots=5000): + if qubits_to_sample is None: + qubits_to_sample = self.qubits + circuit = circuit_parser_utils_cirq.remove_measurements(self.circuit) + circuit.append(cirq.measure_each(qubits_to_sample)) + circuit.append(cirq.measure(*qubits_to_sample, key='meas')) + result = cirq.sample( + circuit, repetitions=nshots, seed=seed, dtype=getattr(np, self.dtype)) + result = result.histogram(key='meas') + sampling = {} + nsamples = 0 + for bitstring, nsample in result.items(): + sampling[int(bitstring)] = nsample + nsamples += nsample + assert nsamples == nshots + return sampling + + +class QiskitComputeEngine(BaseFrameworkComputeEngine): + + @property + def qubits(self): + return list(self.circuit.qubits) + + def _get_precision(self): + precision = {'complex64': 'single', + 'complex128': 'double'}[self.dtype] + return precision + + def _get_state_vector(self): + # requires qiskit >= 0.24.0 + precision = self._get_precision() + circuit = circuit_parser_utils_qiskit.remove_measurements(self.circuit) + try: + # for qiskit 
>= 0.25.0 + simulator = qiskit.Aer.get_backend('aer_simulator_statevector', precision=precision) + circuit = qiskit.transpile(circuit, simulator) + circuit.save_statevector() + result = simulator.run(circuit).result() + except: + # for qiskit 0.24.* + simulator = qiskit.Aer.get_backend('statevector_simulator', precision=precision) + result = qiskit.execute(circuit, simulator).result() + sv = np.asarray(result.get_statevector()).reshape((2,)*circuit.num_qubits) + # statevector returned by qiskit's simulator is labelled by the inverse of :attr:`qiskit.QuantumCircuit.qubits` + # this is different from `cirq` and different from the implementation in :class:`CircuitToEinsum` + sv = sv.transpose(list(range(circuit.num_qubits))[::-1]) if self.backend is torch: - operands = [sv] + gate_operands + [sv.conj().resolve_conj()] + sv = torch.as_tensor(sv, dtype=getattr(torch, self.dtype), device='cuda') else: - operands = [sv] + gate_operands + [sv.conj()] - expec = contract(expression, *operands) - return expec + sv = self.backend.asarray(sv, dtype=self.dtype) + return sv + + def get_sampling(self, qubits_to_sample=None, seed=None, nshots=5000): + if qubits_to_sample is None: + qubits_to_sample = self.qubits + circuit = self.circuit.remove_final_measurements(inplace=False) + new_creg = circuit._create_creg(len(qubits_to_sample), "meas") + circuit.add_register(new_creg) + circuit.measure(qubits_to_sample, new_creg) + precision = self._get_precision() + backend = qiskit.Aer.get_backend('qasm_simulator', precision=precision) + result = backend.run(qiskit.transpile(circuit, backend), shots=nshots, seed=seed).result() + counts = result.get_counts(circuit) + sampling = {} + nsamples = 0 + for bitstring, nsample in counts.items(): + # little endian from qiskit + value = int(bitstring[::-1], 2) + sampling[value] = nsample + nsamples += nsample + assert nsamples == nshots + return sampling - def _get_state_vector_from_simulator(self): - raise NotImplementedError - def 
_get_sampling_from_simulator(self, qubits_to_sample=None, seed=None): - raise NotImplementedError +class CircuitToEinsumComputeEngine(_BaseComputeEngine): + + def __init__(self, converter): + self.converter = converter + self.backend = self.converter.backend + if self.backend is torch: + self.dtype = str(converter.dtype).split('.')[-1] + else: + self.dtype = converter.dtype.__name__ + self._tolerance = None + self.handle = None # Non-owning + self.sv = None + self.norm = None - def get_sampling_from_sv(self, qubits_to_sample=None, seed=None): - sv = self.get_state_vector_from_simulator() - p = abs(sv) ** 2 - # convert p to double type in case probs does not add up to 1 - if self.backend is np: - p = p.astype('float64') - elif self.backend is cp: - p = cp.asnumpy(p).astype('float64') - elif self.backend is torch: - if p.device.type == 'cpu': - p = p.numpy().astype('float64') - else: - p = p.cpu().numpy().astype('float64') - if qubits_to_sample is not None: - sorted_qubits_to_sample = [q for q in self.qubits if q in qubits_to_sample] - axis = [i for (i, q) in enumerate(self.qubits) if q not in qubits_to_sample] - if axis: - p = p.sum(tuple(axis)) - # potential transpose to match the order of qubits_to_sample - transpose_order = [sorted_qubits_to_sample.index(q) for q in qubits_to_sample] - p = p.transpose(*transpose_order) - # normalize - p /= p.sum() - if seed is not None: - np.random.seed(seed) - samples = np.random.choice(np.arange(p.size), p=p.flat, size=self.nshots) - hist_sv = np.unique(samples, return_counts=True) - return dict(zip(*hist_sv)) - - def maybe_prepare_state(self): - if not self.state_prepared: - if not hasattr(self, 'state'): - raise RuntimeError("state not initialized") - if self.backend is not cp: - raise RuntimeError("This func is only expected to be executed for cupy backend") + @property + def qubits(self): + return list(self.converter.qubits) + + @property + def tolerance(self): + if self._tolerance is None: + self._tolerance = {'atol': 
atol_mapper[self.dtype], + 'rtol': rtol_mapper[self.dtype]} + return self._tolerance + + def setup_resources(self, *args, **kwargs): + self.handle = kwargs.get('handle', None) + + def _compute_from_converter(self, task, *args, **kwargs): + assert self.handle is not None, "handle not provided" + expression, operands = getattr(self.converter, task)(*args, **kwargs) + return contract(expression, *operands, options={'handle': self.handle}) + + def get_sv(self): + if self.sv is None: + self.sv = self._compute_from_converter('state_vector') + return self.sv + + def get_reduced_density_matrix(self, where, fixed=EMPTY_DICT, lightcone=True): + return self._compute_from_converter('reduced_density_matrix', where, fixed=fixed, lightcone=lightcone) + + def get_sampling(self, qubits_to_sample=None, seed=None, nshots=5000): + sv = self.get_sv() + if qubits_to_sample is None: + modes_to_sample = list(range(self.n_qubits)) + else: + modes_to_sample = [self.qubits.index(q) for q in qubits_to_sample] + return sample_from_sv(sv, nshots, modes_to_sample=modes_to_sample, seed=seed) + + def get_amplitude(self, bitstring): + return self._compute_from_converter('amplitude', bitstring) + + def get_batched_amplitudes(self, fixed): + return self._compute_from_converter('batched_amplitudes', fixed) + + def get_expectation(self, pauli_string, lightcone=True): + return self._compute_from_converter('expectation', pauli_string, lightcone=lightcone) + + +class StateComputeEngine(_BaseComputeEngine): + ##################################################################### + # + # Implementation from cutensornetState_t APIs. + # This reference are only meant to be tested when backend is `cupy`. + # + # The methods below must have the same API signature with + # their counterer parts in `BastFrameworkComputeEngine` + # (up to the first few arguments being handle and workspace): + # 1. `StateComputeEngine.get_sv` + # 2. `StateComputeEngine.get_amplitude` + # 3. 
`StateComputeEngine.get_batched_amplitudes` + # 4. `StateComputeEngine.get_reduced_density_matrix` + # 5. `StateComputeEngine.get_expectation` + # 6. `StateComputeEngine.get_sampling` + # + ##################################################################### + + def __init__(self, converter, **options): + if converter.backend is not cp: + raise RuntimeError("This class is only expected to be executed for cupy backend") + self.converter = converter + self.state = None + self.state_computed = False + self.circuit_state_parsed = False + if converter.backend is torch: + self.dtype = str(converter.dtype).split('.')[-1] + else: + self.dtype = converter.dtype.__name__ + self.options = options + self._tolerance = None + gate_i = cp.asarray([[1,0], [0,1]], dtype=self.dtype, order=np.random.choice(['C', 'F'])) + gate_x = cp.asarray([[0,1], [1,0]], dtype=self.dtype, order=np.random.choice(['C', 'F'])) + gate_y = cp.asarray([[0,-1j], [1j,0]], dtype=self.dtype, order=np.random.choice(['C', 'F'])) + gate_z = cp.asarray([[1,0], [0,-1]], dtype=self.dtype, order=np.random.choice(['C', 'F'])) + self.pauli_map = {'I': gate_i.T, + 'X': gate_x.T, + 'Y': gate_y.T, + 'Z': gate_z.T} + self.norm = None + self.sv = None + self.handle = None # non-owning + self.workspace = None # non-owning + + @property + def qubits(self): + return list(self.converter.qubits) + + @property + def tolerance(self): + if self._tolerance is None: + self._tolerance = {'atol': atol_mapper[self.dtype], + 'rtol': rtol_mapper[self.dtype]} + return self._tolerance + + def __del__(self): + if self.state is not None: + cutn.destroy_state(self.state) + + def setup_resources(self, *args, **kwargs): + self.handle = kwargs.get('handle', None) + self.workspace = kwargs.get('workspace', None) + + def _maybe_create_state(self): + assert self.handle is not None and self.workspace is not None, f"handle or workspace not setted up" + if self.state is None: + dtype = dtype_to_data_type[getattr(np, self.dtype)] + # create the state 
object + self.state = cutn.create_state(self.handle, + cutn.StatePurity.PURE, self.n_qubits, (2,)*self.n_qubits, dtype) + + def _maybe_parse_state(self): + self._maybe_create_state() + + if not self.circuit_state_parsed: gates = self.converter.gates immutable = 0 adjoint = 0 @@ -415,15 +697,31 @@ def maybe_prepare_state(self): tensor_id = cutn.state_apply_tensor(self.handle, self.state, n_state_modes, state_modes, tmp.data.ptr, tensor_mode_strides, immutable, adjoint, unitary) - cutn.state_update_tensor(self.handle, self.state, tensor_id, operand.data.ptr, unitary) + cutn.state_update_tensor(self.handle, self.state, + tensor_id, operand.data.ptr, unitary) else: cutn.state_apply_tensor(self.handle, self.state, n_state_modes, state_modes, operand.data.ptr, tensor_mode_strides, immutable, adjoint, unitary) - self.state_prepared = True - - def _run_cutensornet_sampling_marginal(self, task, create_args, execute_args, stream): - self.maybe_prepare_state() + self.circuit_state_parsed = True + + def _maybe_parse_options(self): + if self.options: + raise NotImplementedError + + def _maybe_compute_state(self): + # Implement this for different type of simulators + # For tensor network simulator, final state is not computed + # For other types of simulator, final state must be explictly computed and stored + if not self.state_computed: + self._maybe_parse_state() + self._maybe_parse_options() + self.state_computed = True + + def _compute_target(self, task, create_args, execute_args, stream): + if task != 'state': + # avoid going into infinite loops + self._maybe_compute_state() if task == 'marginal': create_func = cutn.create_marginal configure_func = cutn.marginal_configure @@ -440,36 +738,117 @@ def _run_cutensornet_sampling_marginal(self, task, create_args, execute_args, st prepare_func = cutn.sampler_prepare execute_func = cutn.sampler_sample destroy_func = cutn.destroy_sampler + elif task == 'accessor': + create_func = cutn.create_accessor + configure_func = 
cutn.accessor_configure + hyper_sample_attr = cutn.AccessorAttribute.OPT_NUM_HYPER_SAMPLES + num_hyper_samples_dtype = cutn.accessor_get_attribute_dtype(hyper_sample_attr) + prepare_func = cutn.accessor_prepare + execute_func = cutn.accessor_compute + destroy_func = cutn.destroy_accessor + elif task == 'expectation': + create_func = cutn.create_expectation + configure_func = cutn.expectation_configure + hyper_sample_attr = cutn.ExpectationAttribute.OPT_NUM_HYPER_SAMPLES + num_hyper_samples_dtype = cutn.accessor_get_attribute_dtype(hyper_sample_attr) + prepare_func = cutn.expectation_prepare + execute_func = cutn.expectation_compute + destroy_func = cutn.destroy_expectation + elif task == 'state': + # full state_vector computation does not need to destroy state + create_func = None + configure_func = cutn.state_configure + hyper_sample_attr = cutn.StateAttribute.NUM_HYPER_SAMPLES + num_hyper_samples_dtype = cutn.state_get_attribute_dtype(hyper_sample_attr) + prepare_func = cutn.state_prepare + execute_func = cutn.state_compute + destroy_func = None else: - raise ValueError("only supports marginal and sampler") + raise ValueError("only supports marginal, sampler, accessor, expectation and state") dev = cp.cuda.Device() free_mem = dev.mem_info[0] scratch_size = free_mem // 2 # maximal usage of 50% device memory - - task_obj = create_func(self.handle, self.state, *create_args) + if create_func is None: # state vector computation + task_obj = self.state + else: + task_obj = create_func(self.handle, self.state, *create_args) num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) configure_func(self.handle, task_obj, hyper_sample_attr, num_hyper_samples.ctypes.data, num_hyper_samples.dtype.itemsize) prepare_func(self.handle, task_obj, scratch_size, self.workspace, stream.ptr) # similar args for marginal and sampler - workspace_size_d = cutn.workspace_get_memory_size(self.handle, - self.workspace, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, 
cutn.WorkspaceKind.SCRATCH) - if workspace_size_d >= scratch_size: + for memspace in (cutn.Memspace.DEVICE, cutn.Memspace.HOST): + workspace_size = cutn.workspace_get_memory_size(self.handle, + self.workspace, cutn.WorksizePref.RECOMMENDED, + memspace, cutn.WorkspaceKind.SCRATCH) + workspace_ptr = None + if memspace == cutn.Memspace.DEVICE: + if workspace_size > scratch_size: + destroy_func(task_obj) + return None + else: + workspace_ptr = cp.cuda.alloc(workspace_size).ptr + else: + workspace_ptr = np.empty(workspace_size, dtype=np.int8).ctypes.data + if workspace_size != 0: + cutn.workspace_set_memory(self.handle, + self.workspace, memspace, + cutn.WorkspaceKind.SCRATCH, workspace_ptr, workspace_size) + + output = execute_func(self.handle, task_obj, *execute_args, stream.ptr) + stream.synchronize() + if destroy_func is not None: destroy_func(task_obj) + if isinstance(output, tuple): + return output + else: + return True + + def _run_state_accessor(self, bitstring=None, fixed=None): + if bitstring is not None: + # compute a single bitstring amplitude + assert fixed is None + shape = 1 + num_fixed_modes = self.n_qubits + fixed_modes = list(range(self.n_qubits)) + fixed_values = [int(i) for i in bitstring] + elif fixed is not None: + # compute batched amplitudes + shape = (2,) * (self.n_qubits - len(fixed)) + num_fixed_modes = len(fixed) + fixed_modes = [] + fixed_values = [] + for q, bit in fixed.items(): + fixed_modes.append(self.qubits.index(q)) + fixed_values.append(int(bit)) + else: + # compute full state vector + shape = (2, ) * self.n_qubits + num_fixed_modes = fixed_modes = fixed_values = 0 + + amplitudes = cp.empty(shape, dtype=self.dtype, order=np.random.choice(('C', 'F'))) + amplitudes_strides = [stride_in_bytes // amplitudes.itemsize for stride_in_bytes in amplitudes.strides] + norm = np.empty(1, dtype=self.dtype) + + create_args = (num_fixed_modes, fixed_modes, amplitudes_strides) + execute_args = (fixed_values, self.workspace, amplitudes.data.ptr, 
norm.ctypes.data) + stream = cp.cuda.get_current_stream() + if self._compute_target('accessor', create_args, execute_args, stream): + if self.norm is None: + self.norm = norm.item() + else: + assert np.allclose(self.norm, norm.item(), **self.tolerance) + return amplitudes + else: return None - scratch_space = cp.cuda.alloc(workspace_size_d) - cutn.workspace_set_memory(self.handle, - self.workspace, cutn.Memspace.DEVICE, - cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) - - execute_func(self.handle, task_obj, *execute_args, stream.ptr) - stream.synchronize() - destroy_func(task_obj) - return True + def get_sv(self): + if self.sv is None: + self.sv = self._run_state_accessor() + return self.sv - def get_reduced_density_matrix_from_cutn(self, where, fixed=EMPTY_DICT): + def get_reduced_density_matrix(self, where, fixed=EMPTY_DICT): n_marginal_modes = len(where) marginal_modes = [self.qubits.index(q) for q in where] if fixed: @@ -482,28 +861,30 @@ def get_reduced_density_matrix_from_cutn(self, where, fixed=EMPTY_DICT): else: n_projected_modes = projected_modes = projected_mode_values = 0 - rdm = cp.empty((2,2)*n_marginal_modes, dtype=self.dtype, order=np.random.choice(['C', 'F'])) + rdm = cp.empty((2,2)*n_marginal_modes, + dtype=self.dtype, order=np.random.choice(['C', 'F'])) rdm_strides = [s // rdm.itemsize for s in rdm.strides] stream = cp.cuda.get_current_stream() create_args = (n_marginal_modes, marginal_modes, n_projected_modes, projected_modes, rdm_strides) execute_args = (projected_mode_values, self.workspace, rdm.data.ptr) - if self._run_cutensornet_sampling_marginal('marginal', create_args, execute_args, stream): + if self._compute_target('marginal', create_args, execute_args, stream): return rdm else: return None - def get_sampling_from_cutensornet(self, qubits_to_sample=None, seed=None): + def get_sampling(self, qubits_to_sample=None, seed=None, nshots=5000): if qubits_to_sample is None: qubits_to_sample = self.qubits n_modes_to_sample = 
len(qubits_to_sample) modes_to_sample = [self.qubits.index(q) for q in qubits_to_sample] - samples = np.empty((self.nshots, n_modes_to_sample), dtype='int64', order='C') # equivalent to (n_modes, nshots) in F order + samples = np.empty((nshots, n_modes_to_sample), + dtype='int64', order='C') # equivalent to (n_modes, nshots) in F order stream = cp.cuda.get_current_stream() create_args = (n_modes_to_sample, modes_to_sample) - execute_args = (self.nshots, self.workspace, samples.ctypes.data) - if self._run_cutensornet_sampling_marginal('sampler', create_args, execute_args, stream): + execute_args = (nshots, self.workspace, samples.ctypes.data) + if self._compute_target('sampler', create_args, execute_args, stream): sampling = {} for bitstring, n_sampling in zip(*np.unique(samples, axis=0, return_counts=True)): bitstring = np.array2string(bitstring, separator='')[1:-1] @@ -511,209 +892,454 @@ def get_sampling_from_cutensornet(self, qubits_to_sample=None, seed=None): return sampling else: return None + + def get_amplitude(self, bitstring): + return self._run_state_accessor(bitstring=bitstring) - def test_qubits(self): - assert len(self.qubits) == self.n_qubits + def get_batched_amplitudes(self, fixed): + return self._run_state_accessor(fixed=fixed) - def test_gates(self): - for (gate_operand, qubits) in self.converter.gates: - assert gate_operand.ndim == len(qubits) * 2 - assert infer_object_package(gate_operand) == self.backend.__name__ + # cutensornet State APIs can not compute a single expectation. 
+ # Here we compute the sum of all Pauli strings + def get_expectation_sum(self, pauli_strings): + if not isinstance(pauli_strings, dict): + raise ValueError("pauli_strings is expected to be a map from paul strings to coefficients") + dtype = dtype_to_data_type[getattr(np, self.dtype)] + hamiltonian = cutn.create_network_operator(self.handle, + self.n_qubits, (2,)*self.n_qubits, dtype) + for pauli_string, coefficient in pauli_strings.items(): + num_tensors = 0 + num_modes = [] + state_modes = [] + tensor_mode_strides = [] + tensor_data = [] + for q, pauli_char in enumerate(pauli_string): + if pauli_char == 'I': continue + operand = self.pauli_map[pauli_char] + num_tensors += 1 + num_modes.append(1) + state_modes.append([q]) + tensor_mode_strides.append([stride_in_bytes//operand.itemsize for stride_in_bytes in operand.strides]) + tensor_data.append(operand.data.ptr) + if num_tensors == 0: + # pauli string being IIIIII + num_tensors = self.n_qubits + num_modes = [1,] * num_tensors + state_modes = list(range(num_tensors)) + operand = self.pauli_map['I'] + tensor_data = [operand.data.ptr] * num_tensors + tensor_mode_strides = [stride_in_bytes//operand.itemsize for stride_in_bytes in operand.strides] * num_tensors + cutn.network_operator_append_product(self.handle, + hamiltonian, coefficient, num_tensors, + num_modes, state_modes, tensor_mode_strides, tensor_data) + + expectation_value = np.empty(1, dtype=self.dtype) + norm = np.empty(1, dtype=self.dtype) + create_args = (hamiltonian, ) + execute_args = (self.workspace, expectation_value.ctypes.data, norm.ctypes.data) + stream = cp.cuda.get_current_stream() + if self._compute_target('expectation', create_args, execute_args, stream): + output = expectation_value.item() + if self.norm is None: + self.norm = norm.item() + else: + assert np.allclose(self.norm, norm.item(), **self.tolerance) + else: + output = None + cutn.destroy_network_operator(hamiltonian) + return output + + +class 
SVStateComputeEngine(StateComputeEngine): + + def _maybe_compute_state(self): + # Implement this for different type of simulators + # For tensor network simulator, final state is not computed + # For other types of simulator, final state must be explictly computed and stored + if not self.state_computed: + self._maybe_parse_state() + self._maybe_parse_options() + order = np.random.choice(('C', 'F')) + sv = cp.empty((2,) * self.n_qubits, dtype=self.dtype, order=order) + stream = cp.cuda.get_current_stream() + create_args = () + execute_args = (self.workspace, [sv.data.ptr]) + output = self._compute_target('state', create_args, execute_args, stream) + if output: + extents = output[0][0] + strides = [s * sv.dtype.itemsize for s in output[1][0]] + if order == 'F': + self.sv = sv + else: + self.sv = cp.ndarray(extents, + dtype=sv.dtype, memptr=sv.data, strides=strides) + self.state_computed = True + else: + self.sv = None + + def get_sv(self): + self._maybe_compute_state() + return self.sv + + +class MPSStateComputeEngine(StateComputeEngine): + + @property + def tolerance(self): + if self._tolerance is None: + # tolerance for double precision is increase + self._tolerance = get_mps_tolerance(self.dtype) + return self._tolerance + + def _maybe_parse_options(self): + self._maybe_create_state() + # parse max extent + max_extent = self.options.get('max_extent', None) + if max_extent is None: + if self.n_qubits > EXACT_MPS_QUBIT_COUNT_LIMIT: + raise ValueError(f"Exact MPS will encounter overflow with n_qubits={self.n_qubits}") + else: + max_extent = 2**EXACT_MPS_QUBIT_COUNT_LIMIT + self.mps_tensors = [] + prev_extent = 1 + output_mps_extents = [] + output_mps_strides = [] + for i in range(self.n_qubits): + next_extent = min(max_extent, 2**(i+1), 2**(self.n_qubits-i-1)) + if i==0: + extents = (2, next_extent) + elif i !=self.n_qubits - 1: + extents = (prev_extent, 2, next_extent) + else: + extents = (prev_extent, 2) + prev_extent = next_extent + tensor = cp.empty(extents, 
dtype=self.dtype, order=np.random.choice(['C', 'F'])) + self.mps_tensors.append(tensor) + output_mps_extents.append(extents) + output_mps_strides.append([stride_in_bytes // tensor.itemsize for stride_in_bytes in tensor.strides]) + cutn.state_finalize_mps(self.handle, self.state, + cutn.BoundaryCondition.OPEN, output_mps_extents, output_mps_strides) + + algorithm = 'gesvd' + for key, value in self.options.items(): + if key in STATE_ATTRIBUTE_MAP: + attr = STATE_ATTRIBUTE_MAP[key] + dtype = cutn.state_get_attribute_dtype(attr) + if key == 'algorithm': + algorithm = value + value = SVD_ALGORITHM_MAP[value] + elif key == 'normalization': + value = NORMALIZATION_MAP[value] + elif key == 'canonical_center' and value is None: + continue + value = np.asarray(value, dtype=dtype) + cutn.state_configure(self.handle, self.state, attr, value.ctypes.data, value.dtype.itemsize) + + if algorithm in ('gesvdj', 'gesvdr'): + dtype = cutn.tensor_svd_algo_params_get_dtype(SVD_ALGORITHM_MAP[algorithm]) + algo_params = np.zeros(1, dtype=dtype) + + for name in dtype.names: + value = self.options.get(f'{algorithm}_{name}', 0) + if value != 0: + algo_params[name] = value + cutn.state_configure(self.handle, self.state, + cutn.StateAttribute.MPS_SVD_CONFIG_ALGO_PARAMS, + algo_params.ctypes.data, algo_params.dtype.itemsize) + + def _maybe_compute_state(self): + # Implement this for different type of simulators + # For tensor network simulator, final state is not computed + # For other types of simulator, final state must be explictly computed and stored + if not self.state_computed: + self._maybe_parse_state() + self._maybe_parse_options() + stream = cp.cuda.get_current_stream() + create_args = () + execute_args = (self.workspace, [o.data.ptr for o in self.mps_tensors]) + output = self._compute_target('state', create_args, execute_args, stream) + if output is None: + return False + else: + extents, strides = output + for i in range(self.n_qubits): + extent_in = self.mps_tensors[i].shape + 
extent_out = extents[i] + if extent_in != tuple(extent_out): + tensor_strides = [s * self.mps_tensors[i].dtype.itemsize for s in strides[i]] + self.mps_tensors[i] = cp.ndarray(extent_out, + dtype=self.mps_tensors[i].dtype, memptr=self.mps_tensors[i].data, strides=tensor_strides) + self.state_computed = True + + def check_canonicalization(self): + self._maybe_compute_state() + center = self.options.get('canonical_center', None) + if center is None: + return + for i in range(self.n_qubits): + if i == 0: + modes = 'pj' + elif i == self.n_qubits - 1: + modes = 'ip' + else: + modes = 'ipj' + if i == center: + continue + if i < center: + shared_mode = 'j' + elif i > center: + shared_mode = 'i' + else: + continue + verify_unitary(self.mps_tensors[i], modes, shared_mode, + SVD_TOLERANCE[self.dtype], tensor_name=f"Site {i} canonicalization") + + +class BaseTester: + + @property + def reference_engine(self): + raise NotImplementedError + + @property + def target_engines(self): + raise NotImplementedError + + @property + def all_engines(self): + return [self.reference_engine] + self.target_engines + + def test_misc(self): + raise NotImplementedError + + def test_norm(self): + norm1 = self.reference_engine.get_norm() + for engine in self.target_engines: + norm2 = engine.get_norm() + message = f"{engine.__class__.__name__} maxDiff={abs(norm1-norm2)}" + assert np.allclose(norm1, norm2, **engine.tolerance), message def test_state_vector(self): - expression, operands = self.converter.state_vector() - sv1 = contract(expression, *operands) - sv2 = self.get_state_vector_from_simulator() - assert self.backend.allclose( - sv1, sv2, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) + sv1 = self.reference_engine.get_sv() + for engine in self.target_engines: + sv2 = engine.get_sv() + message = f"{engine.__class__.__name__} maxDiff={abs(sv1-sv2).max()}" + assert self.backend.allclose(sv1, sv2, **engine.tolerance), message def test_amplitude(self): for bitstring in 
bitstring_generator(self.n_qubits, self.nsample): - expression, operands = self.converter.amplitude(bitstring) - amp1 = contract(expression, *operands) - amp2 = self.get_amplitude_from_simulator(bitstring) - assert self.backend.allclose( - amp1, amp2, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) + amp1 = self.reference_engine.get_amplitude(bitstring) + for engine in self.target_engines: + amp2 = engine.get_amplitude(bitstring) + message = f"{engine.__class__.__name__} maxDiff={abs(amp1-amp2).max()}" + assert self.backend.allclose(amp1, amp2, **engine.tolerance), message def test_batched_amplitudes(self): for fixed in where_fixed_generator(self.qubits, self.nfix_max): - expression, operands = self.converter.batched_amplitudes(fixed) - batched_amps1 = contract(expression, *operands) - batched_amps2 = self.get_batched_amplitudes_from_simulator(fixed) - assert self.backend.allclose( - batched_amps1, batched_amps2, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) + batched_amps1 = self.reference_engine.get_batched_amplitudes(fixed) + for engine in self.target_engines: + batched_amps2 = engine.get_batched_amplitudes(fixed) + message = f"{engine.__class__.__name__} maxDiff={abs(batched_amps1-batched_amps2).max()}" + assert self.backend.allclose(batched_amps1, batched_amps2, **engine.tolerance), message def test_reduced_density_matrix(self): for where, fixed in where_fixed_generator(self.qubits, self.nfix_max, nsite_max=self.nsite_max): - expression1, operands1 = self.converter.reduced_density_matrix(where, fixed=fixed, lightcone=True) - expression2, operands2 = self.converter.reduced_density_matrix(where, fixed=fixed, lightcone=False) + operands1 = self.converter.reduced_density_matrix(where, fixed=fixed, lightcone=True)[1] + operands2 = self.converter.reduced_density_matrix(where, fixed=fixed, lightcone=False)[1] assert len(operands1) <= len(operands2) + 2 # potential phase handling for qiskit Circuit - rdm1 = contract(expression1, *operands1) - 
rdm2 = contract(expression2, *operands2) - rdm3 = self.get_reduced_density_matrix_from_simulator(where, fixed=fixed) - - assert self.backend.allclose( - rdm1, rdm2, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) - assert self.backend.allclose( - rdm1, rdm3, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) - if self.backend is cp: - rdm4 = self.get_reduced_density_matrix_from_cutn(where, fixed=fixed) - if rdm4 is not None: - assert self.backend.allclose( - rdm1, rdm4, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) + rdm1 = self.reference_engine.get_reduced_density_matrix(where, fixed=fixed) + if isinstance(self.reference_engine, CircuitToEinsumComputeEngine): + # by default CircuitToEinsumComputeEngine.get_reduced_density_matrix uses lightcone=True + rdm2 = self.reference_engine.get_reduced_density_matrix(where, fixed=fixed, lightcone=False) + assert self.backend.allclose(rdm1, rdm2, **self.reference_engine.tolerance) + + # comparision with different references + for engine in self.target_engines: + rdm2 = engine.get_reduced_density_matrix(where, fixed=fixed) + message = f"{engine.__class__.__name__} maxDiff={abs(rdm1-rdm2).max()}" + assert self.backend.allclose(rdm1, rdm2, **engine.tolerance), message def test_expectation(self): - for pauli_string in random_pauli_string_generator(self.n_qubits, 2): - expression1, operands1 = self.converter.expectation(pauli_string, lightcone=True) - expression2, operands2 = self.converter.expectation(pauli_string, lightcone=False) + full_expectation = 0. 
+ pauli_strings = dict() + for pauli_string in random_pauli_string_generator(self.n_qubits, 6): + coefficient = np.random.random(1).item() + 1j * np.random.random(1).item() + if pauli_string not in pauli_strings: + pauli_strings[pauli_string] = coefficient + else: + # in case duplicate pauli string is reproduced by the random generator + pauli_strings[pauli_string] += coefficient + operands1 = self.converter.expectation(pauli_string, lightcone=True)[1] + operands2 = self.converter.expectation(pauli_string, lightcone=False)[1] assert len(operands1) <= len(operands2) + 2 # potential phase handling for qiskit Circuit - expec1 = contract(expression1, *operands1) - expec2 = contract(expression2, *operands2) - expec3 = self.get_expectation_from_sv(pauli_string) - - assert self.backend.allclose( - expec1, expec2, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) - assert self.backend.allclose( - expec1, expec3, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) + + expec1 = self.reference_engine.get_expectation(pauli_string) + if isinstance(self.reference_engine, CircuitToEinsumComputeEngine): + expec2 = self.reference_engine.get_expectation(pauli_string, lightcone=False) + assert self.backend.allclose(expec1, expec2, **self.reference_engine.tolerance) + + full_expectation += coefficient * expec1 + + for engine in self.target_engines: + if not isinstance(engine, StateComputeEngine): + expec2 = engine.get_expectation(pauli_string) + message = f"{engine.__class__.__name__} maxDiff={abs(expec1-expec2).max()}" + assert self.backend.allclose(expec1, expec2, **engine.tolerance), message + + for engine in self.target_engines: + if isinstance(engine, StateComputeEngine): + expec2 = engine.get_expectation_sum(pauli_strings) + message = f"{engine.__class__.__name__} maxDiff={abs(full_expectation-expec2).max()}" + assert self.backend.allclose(full_expectation, expec2, **engine.tolerance), message def test_sampling(self): full_qubits = list(self.qubits) 
np.random.shuffle(full_qubits) selected_qubits = full_qubits[:len(full_qubits)//2] - for qubits_to_sample in (None, selected_qubits): - seed = self.seed - nshots = self.nshots - max_try = 3 - overlap_best = 0. - - for counter in range(1, max_try+1): - # build a histogram for the reference impl - hist_ref = self._get_sampling_from_simulator(qubits_to_sample=qubits_to_sample, seed=seed) - - # do the same for cutensornet sampling - hist_cutn = self.get_sampling_from_cutensornet(qubits_to_sample=qubits_to_sample, seed=seed) - - # compute overlap of the histograms (cutn vs ref) - overlap = compute_histogram_overlap(hist_cutn, hist_ref, self.nshots) - if overlap > overlap_best: - overlap_best = overlap - else: - print("WARNING: overlap not improving as nshots increases!") - - # do the same for sampling from the (exactly computed) SV - hist_sv = self.get_sampling_from_sv(qubits_to_sample=qubits_to_sample, seed=seed) - - # compute overlap of the histograms (sv vs ref) - overlap_check = compute_histogram_overlap(hist_sv, hist_ref, self.nshots) - print(f"with nshots = {self.nshots}, {overlap_best = }, {overlap_check = }") - - # to reduce test time we set 95% here, but 99% will also work - if np.round(overlap, decimals=2) < 0.95: - self.nshots *= 10 - print(f"retry with nshots = {self.nshots} ...") + for engine in self.target_engines: + for qubits_to_sample in (None, selected_qubits): + seed = self.seed + nshots = self.nshots + max_try = 3 + overlap_best = 0. 
+ + for counter in range(1, max_try+1): + # build a histogram for the reference impl + hist_ref = self.reference_engine.get_sampling(qubits_to_sample=qubits_to_sample, seed=seed, nshots=self.nshots) + + # do the same for cutensornet sampling + hist_cutn = engine.get_sampling(qubits_to_sample=qubits_to_sample, seed=seed, nshots=self.nshots) + + # compute overlap of the histograms (cutn vs ref) + overlap = compute_histogram_overlap(hist_cutn, hist_ref, self.nshots) + if overlap > overlap_best: + overlap_best = overlap + else: + print(f"WARNING: overlap not improving {counter=} {overlap_best=} {overlap=} as nshots increases!") + + # to reduce test time we set 95% here, but 99% will also work + if np.round(overlap, decimals=2) < 0.95: + self.nshots *= 10 + print(f"retry with nshots = {self.nshots} ...") + else: + self.nshots = nshots # restore + break else: self.nshots = nshots # restore - break - else: - self.nshots = nshots # restore - assert False, f"{overlap_best=} after {counter} retries..." - + assert False, f"{overlap_best=} after {counter} retries..." 
+ @manage_resource("handle") - @manage_resource("state") @manage_resource("workspace") def run_tests(self): + resources = {'handle': self.handle, 'workspace': self.workspace} + # share cutensornet resources for all compute engines + for engine in self.all_engines: + engine.setup_resources(**resources) + self.test_state_vector() self.test_amplitude() self.test_batched_amplitudes() self.test_reduced_density_matrix() self.test_expectation() - self.test_gates() - self.test_qubits() + self.test_norm() + self.test_misc() if self.backend is cp: # sampling only needed to be tested for cupy backend self.test_sampling() -class CirqTester(BaseTester): - def _get_state_vector_from_simulator(self): - qubits = self.qubits - simulator = cirq.Simulator(dtype=self.dtype) - circuit = circuit_parser_utils_cirq.remove_measurements(self.circuit) - result = simulator.simulate(circuit, qubit_order=qubits) - statevector = result.state_vector().reshape((2,)*self.n_qubits) - if self.backend is torch: - statevector = torch.as_tensor(statevector, dtype=getattr(torch, self.dtype), device='cuda') +class CircuitToEinsumTester(BaseTester): + def __init__(self, circuit, dtype, backend, nsample, nsite_max, nfix_max, nshots=5000, seed=1024): + self.circuit = circuit + self.converter = CircuitToEinsum(circuit, dtype=dtype, backend=backend) + self.backend = backend + self.qubits = list(self.converter.qubits) + self.n_qubits = self.converter.n_qubits + self.dtype = dtype + self.sv = None + self.nsample = nsample + self.nsite_max = max(1, min(nsite_max, self.n_qubits-1)) + self.nfix_max = max(min(nfix_max, self.n_qubits-nsite_max-1), 0) + self.nshots = nshots + self.seed = seed + + self._reference_engine = CircuitToEinsumComputeEngine(self.converter) + + # Framework provider as reference + if qiskit and isinstance(circuit, qiskit.QuantumCircuit): + self._target_engines = [QiskitComputeEngine(circuit, dtype, backend)] + elif cirq and isinstance(circuit, cirq.Circuit): + self._target_engines = 
[CirqComputeEngine(circuit, dtype, backend)] else: - statevector = self.backend.asarray(statevector, dtype=self.dtype) - return statevector + raise ValueError(f"circuit type {type(circuit)} not supported") + + if backend == cp: + # Tensor network state simulator + self._target_engines.append(StateComputeEngine(self.converter)) + # SV state simulator + self._target_engines.append(SVStateComputeEngine(self.converter)) + # MPS simulators are only functioning if no multicontrol gates exist in the circuit. + if is_converter_mps_compatible(self.converter): + # MPS state simulator + self._target_engines.append(MPSStateComputeEngine(self.converter)) + # reference MPS implementation + self._target_engines.append(MPS.from_converter(self.converter)) - def _get_sampling_from_simulator(self, qubits_to_sample=None, seed=None): - if qubits_to_sample is None: - qubits_to_sample = list(self.qubits) - circuit = circuit_parser_utils_cirq.remove_measurements(self.circuit) - circuit.append(cirq.measure_each(qubits_to_sample)) - circuit.append(cirq.measure(*qubits_to_sample, key='meas')) - result = cirq.sample( - circuit, repetitions=self.nshots, seed=seed, dtype=getattr(np, self.dtype)) - result = result.histogram(key='meas') - sampling = {} - nsamples = 0 - for bitstring, nsample in result.items(): - sampling[int(bitstring)] = nsample - nsamples += nsample - assert nsamples == self.nshots - return sampling + @property + def reference_engine(self): + return self._reference_engine + + @property + def target_engines(self): + return self._target_engines + def test_misc(self): + self.test_qubits() + self.test_gates() + norm = self.reference_engine.get_norm() + assert np.allclose(norm, 1, **self.reference_engine.tolerance) + + def test_qubits(self): + assert len(self.qubits) == self.n_qubits + + def test_gates(self): + for (gate_operand, qubits) in self.converter.gates: + assert gate_operand.ndim == len(qubits) * 2 + assert infer_object_package(gate_operand) == self.backend.__name__ -class 
QiskitTester(BaseTester): - def _get_precision(self): - precision = {'complex64': 'single', - 'complex128': 'double'}[self.dtype] - return precision + +class ApproximateMPSTester(BaseTester): + def __init__(self, converter, nsample, nsite_max, nfix_max, nshots=5000, seed=1024, **mps_options): + self.converter = converter + self.backend = converter.backend + if self.backend is not cp: + raise ValueError("This tester is only meant for cupy testing") + self.qubits = list(self.converter.qubits) + self.n_qubits = self.converter.n_qubits + self.dtype = self.converter.dtype.__name__ + self.sv = None + self.norm = None + self.nsample = nsample + self.nsite_max = max(1, min(nsite_max, self.n_qubits-1)) + self.nfix_max = max(min(nfix_max, self.n_qubits-nsite_max-1), 0) + self.nshots = nshots + self.seed = seed + self.mps_options = mps_options + if not is_converter_mps_compatible(self.converter): + raise ValueError("circuit contains gates acting on more than 2 qubits") + self._reference_engine = MPS.from_converter(self.converter, **self.mps_options) + self._target_engines = [MPSStateComputeEngine(self.converter, **self.mps_options)] - def _get_state_vector_from_simulator(self): - # requires qiskit >= 0.24.0 - precision = self._get_precision() - circuit = circuit_parser_utils_qiskit.remove_measurements(self.circuit) - try: - # for qiskit >= 0.25.0 - simulator = qiskit.Aer.get_backend('aer_simulator_statevector', precision=precision) - circuit = qiskit.transpile(circuit, simulator) - circuit.save_statevector() - result = simulator.run(circuit).result() - except: - # for qiskit 0.24.* - simulator = qiskit.Aer.get_backend('statevector_simulator', precision=precision) - result = qiskit.execute(circuit, simulator).result() - sv = np.asarray(result.get_statevector()).reshape((2,)*circuit.num_qubits) - # statevector returned by qiskit's simulator is labelled by the inverse of :attr:`qiskit.QuantumCircuit.qubits` - # this is different from `cirq` and different from the implementation 
in :class:`CircuitToEinsum` - sv = sv.transpose(list(range(circuit.num_qubits))[::-1]) - if self.backend is torch: - sv = torch.as_tensor(sv, dtype=getattr(torch, self.dtype), device='cuda') - else: - sv = self.backend.asarray(sv, dtype=self.dtype) - return sv + @property + def reference_engine(self): + return self._reference_engine - def _get_sampling_from_simulator(self, qubits_to_sample=None, seed=None): - if qubits_to_sample is None: - qubits_to_sample = list(self.qubits) - circuit = self.circuit.remove_final_measurements(inplace=False) - new_creg = circuit._create_creg(len(qubits_to_sample), "meas") - circuit.add_register(new_creg) - circuit.measure(qubits_to_sample, new_creg) - precision = self._get_precision() - backend = qiskit.Aer.get_backend('qasm_simulator', precision=precision) - result = backend.run(qiskit.transpile(circuit, backend), shots=self.nshots, seed=seed).result() - counts = result.get_counts(circuit) - sampling = {} - nsamples = 0 - for bitstring, nsample in counts.items(): - # little endian from qiskit - value = int(bitstring[::-1], 2) - sampling[value] = nsample - nsamples += nsample - assert nsamples == self.nshots - return sampling + @property + def target_engines(self): + return self._target_engines + + def test_misc(self): + for engine in self.all_engines: + engine.check_canonicalization() \ No newline at end of file diff --git a/python/tests/cuquantum_tests/cutensornet_tests/data.py b/python/tests/cuquantum_tests/cutensornet_tests/data.py index 2ecee3c..631c599 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/data.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/data.py @@ -31,6 +31,8 @@ # the second variant is suitable for testing exotic TNs that require further customization # TODO: expand the tests einsum_expressions = ( + "ii->", + "jii->ij", "ij,jb,ah", "ea,fb,abcd,gc,hd->efgh", "ea,fb,abcd,gc,hd", diff --git a/python/tests/cuquantum_tests/cutensornet_tests/mps_utils.py 
b/python/tests/cuquantum_tests/cutensornet_tests/mps_utils.py new file mode 100644 index 0000000..04d8e80 --- /dev/null +++ b/python/tests/cuquantum_tests/cutensornet_tests/mps_utils.py @@ -0,0 +1,415 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +# Note: This file must be self-contained and not import private helpers! + +from dataclasses import asdict, dataclass +import importlib +import logging +from types import MappingProxyType +from typing import Optional + +try: + import cupy as cp +except ImportError: + cp = None +import numpy as np +import opt_einsum as oe +try: + import torch + if not torch.cuda.is_available(): + raise ImportError +except ImportError: + torch = None + +from cuquantum import CircuitToEinsum +from cuquantum.cutensornet._internal.circuit_converter_utils import get_pauli_gates +from cuquantum.cutensornet._internal.utils import infer_object_package +from cuquantum.cutensornet._internal.tensor_wrapper import _get_backend_asarray_func +from .approxTN_utils import tensor_decompose, gate_decompose, SVD_TOLERANCE, verify_unitary +from .test_utils import gen_rand_svd_method, atol_mapper, rtol_mapper + +#################################################### +################# Helper functions ################# +#################################################### + + +EMPTY_DICT = MappingProxyType(dict()) + +def get_partial_indices(n, fixed=EMPTY_DICT): + partial_indices = [slice(None)] * n + index_map = {'0': slice(0, 1), + '1': slice(1, 2)} + for q, val in fixed.items(): + partial_indices[q] = index_map[val] + return tuple(partial_indices) + + +def reduced_density_matrix_from_sv(sv, where, fixed=EMPTY_DICT): + n = sv.ndim + sv = sv[get_partial_indices(n, fixed)] + bra_modes = list(range(n)) + ket_modes = [i+n if i in where else i for i in range(n)] + output_modes = list(where) + [i+n for i in where] + if infer_object_package(sv) is torch: + inputs = [sv, bra_modes, sv.conj().resolve_conj(), 
ket_modes] + else: + inputs = [sv, bra_modes, sv.conj(), ket_modes] + inputs.append(output_modes) + return oe.contract(*inputs) + + +def batched_amplitude_from_sv(sv, fixed): + n = sv.ndim + sv = sv[get_partial_indices(n, fixed)] + return sv.reshape([2,]* (n-len(fixed))) + + +def amplitude_from_sv(sv, bitstring): + index = [int(ibit) for ibit in bitstring] + return sv[tuple(index)] + + +def expectation_from_sv(sv, pauli_string): + n = sv.ndim + pauli_map = dict(zip(range(n), pauli_string)) + backend = importlib.import_module(infer_object_package(sv)) + pauli_gates = get_pauli_gates(pauli_map, dtype=sv.dtype, backend=backend) + # tentative bra/ket indices + if backend is torch: + inputs = [sv, list(range(n)), sv.conj().resolve_conj(), list(range(n))] + else: + inputs = [sv, list(range(n)), sv.conj(), list(range(n))] + for o, qs in pauli_gates: + q = qs[0] + inputs[3][q] += n # update ket indices + inputs.extend([o, [q+n, q]]) + return oe.contract(*inputs) + + +def sample_from_sv(sv, nshots, modes_to_sample=None, seed=None): + backend = infer_object_package(sv) + p = abs(sv) ** 2 + # convert p to double type in case probs does not add up to 1 + if backend == 'numpy': + p = p.astype('float64') + elif backend == 'cupy': + p = cp.asnumpy(p).astype('float64') + elif backend == 'torch': + if p.device.type == 'cpu': + p = p.numpy().astype('float64') + else: + p = p.cpu().numpy().astype('float64') + if modes_to_sample is not None: + sorted_modes_to_sample = sorted(modes_to_sample) + axis = [q for q in range(sv.ndim) if q not in modes_to_sample] + if axis: + p = p.sum(tuple(axis)) + # NOTE: bug here + transpose_order = [sorted_modes_to_sample.index(q) for q in modes_to_sample] + p = p.transpose(*transpose_order) + # normalize + p /= p.sum() + if seed is not None: + np.random.seed(seed) + samples = np.random.choice(np.arange(p.size), p=p.flat, size=nshots) + hist_sv = np.unique(samples, return_counts=True) + return dict(zip(*hist_sv)) + + +def gen_random_mps(n_qubits, 
backend, rng, dtype, D=None): + assert backend in (cp, np), "backend not supported" + assert dtype in ('complex64', 'complex128'), f"dtype {dtype} not supported" + real_dtype = 'float32' if dtype == 'complex64' else 'float64' + mps_tensors = [] + for i in range(n_qubits): + next_D = D if D is not None else rng.integers(1, 8) + if i == 0: + shape = (2, next_D) + elif i == n_qubits - 1: + shape = (prev_D, 2) + else: + shape = (prev_D, 2, next_D) + t = rng.random(shape, dtype=real_dtype) + 1j * rng.random(shape, dtype=real_dtype) + t = backend.asarray(t, order=rng.choice(['C', 'F'])) + t /= backend.linalg.norm(t) + mps_tensors.append(t) + prev_D = next_D + return mps_tensors + + +def get_mps_tolerance(dtype): + tolerance = {'rtol': rtol_mapper[dtype], + 'atol': atol_mapper[dtype]} + if dtype in ('float64', 'complex128'): + # for double precision, relax the tolerance + tolerance['rtol'] += SVD_TOLERANCE[dtype] ** .5 + tolerance['atol'] += SVD_TOLERANCE[dtype] ** .5 + else: + tolerance['rtol'] += SVD_TOLERANCE[dtype] + tolerance['atol'] += SVD_TOLERANCE[dtype] + return tolerance + + +@dataclass +class MPSConfig: + """Class for MPS simulation.""" + # final state + canonical_center: Optional[int] = None + # svd options + max_extent: Optional[int] = None + abs_cutoff: Optional[float] = 0 + rel_cutoff: Optional[float] = 0 + discarded_weight_cutoff: Optional[float] = 0 + normalization: Optional[str] = None + algorithm: Optional[str] = 'gesvd' + gesvdj_tol: Optional[float] = 0 + gesvdj_max_sweeps: Optional[int] = 0 + gesvdr_oversampling: Optional[int] = 0 + gesvdr_niters: Optional[int] = 0 + + def __post_init__(self): + # to be parsed to reference MPS implementation, algorithm and params not supported + self.svd_options = {'max_extent': self.max_extent, + 'abs_cutoff': self.abs_cutoff, + 'rel_cutoff': self.rel_cutoff, + 'discarded_weight_cutoff': self.discarded_weight_cutoff, + 'normalization': self.normalization, + 'partition': 'UV'} # must be enforced to UV partition + + 
@staticmethod + def rand(n_qubits, rng, dtype, fixed=None, dict_format=True): + config = dict() + config['canonical_center'] = rng.integers(0, high=n_qubits) + svd_method = asdict(gen_rand_svd_method(rng, dtype, fixed=fixed)) + # MPS simulation does not take allow partition other than 'UV' + svd_method.pop('partition') + config.update(svd_method) + # if found exact MPS simulation setting, shrink it down to truncated extent + if config['max_extent'] >= 2**(n_qubits//2): + config['max_extent'] = rng.integers(1, high=2**(n_qubits//2)) + if dict_format: + return config + else: + return MPSConfig(**config) + + +class MPS: + + def __init__( + self, + mps_tensors, + qubits=None, + **mps_config + ): + self.n = len(mps_tensors) + # avoid in-place modification + self.mps_tensors = mps_tensors.copy() + # potentially insert dummy labels for boundary tensors for consistent notation in this class + if self.mps_tensors[0].ndim == 2: + self.mps_tensors[0] = self.mps_tensors[0].reshape(1, *self.mps_tensors[0].shape) + if self.mps_tensors[-1].ndim == 2: + new_shape = self.mps_tensors[-1].shape + (1, ) + self.mps_tensors[-1] = self.mps_tensors[-1].reshape(*new_shape) + self.qubits = qubits + self.dtype = mps_tensors[0].dtype.name + self.sv = None + self.norm = None + self.backend = importlib.import_module(infer_object_package(mps_tensors[0])) + self.swap_gate = None + self.mps_config = MPSConfig(**mps_config) + self._tolerance = get_mps_tolerance(self.dtype) + + @property + def tolerance(self): + return self._tolerance + + def setup_resources(self, *args, **kwargs): + pass + + def get_swap_gate(self): + if self.swap_gate is None: + asarray = _get_backend_asarray_func(self.backend) + self.swap_gate = asarray([[1,0,0,0], + [0,0,1,0], + [0,1,0,0], + [0,0,0,1]], dtype=self.dtype).reshape(2,2,2,2) + return self.swap_gate + + def __getitem__(self, key): + assert key >= 0 and key < self.n + return self.mps_tensors[key] + + def __setitem__(self, key, val): + assert key>=0 and key < self.n + 
self.mps_tensors[key] = val + # resetting SV and norm + self.sv = self.norm = None + + def get_norm(self): + if self.norm is None: + self.norm = self.backend.linalg.norm(self.get_sv()) ** 2 + return self.norm + + def get_sv(self): + if self.sv is None: + inputs = [] + output_modes = [] + for i, o in enumerate(self.mps_tensors): + modes = [2*i, 2*i+1, 2*i+2] + inputs.extend([o, modes]) + output_modes.append(2*i+1) + inputs.append(output_modes) + self.sv = oe.contract(*inputs) + return self.sv + + def get_amplitude(self, bitstring): + return amplitude_from_sv(self.get_sv(), bitstring) + + def get_batched_amplitudes(self, fixed=EMPTY_DICT): + if self.qubits is not None: + _fixed = dict([(self.qubits.index(q), bit) for q, bit in fixed.items()]) + else: + _fix = fixed + return batched_amplitude_from_sv(self.get_sv(), fixed=_fixed) + + def get_reduced_density_matrix(self, where, fixed=EMPTY_DICT): + if self.qubits is not None and not isinstance(where[0], int): + _where = [self.qubits.index(q) for q in where] + _fixed = dict([(self.qubits.index(q), bit) for q, bit in fixed.items()]) + else: + _where = where + _fixed = fixed + return reduced_density_matrix_from_sv(self.get_sv(), _where, fixed=_fixed) + + def get_expectation(self, pauli_string): + return expectation_from_sv(self.get_sv(), pauli_string) + + def get_sampling(self, qubits_to_sample=None, seed=None, nshots=5000): + if qubits_to_sample is None: + _qubits_to_sample = None + else: + _qubits_to_sample = [self.qubits.index(q) for q in qubits_to_sample] + return sample_from_sv(self.get_sv(), nshots, modes_to_sample=_qubits_to_sample, seed=seed) + + def _apply_gate_1q(self, i, operand): + self[i] = self.backend.einsum('ipj,Pp->iPj', self[i], operand) + + def _apply_gate_2q(self, i, j, operand): + if i > j: + return self._apply_gate_2q(j, i, operand.transpose(1,0,3,2)) + elif i == j: + raise ValueError(f"gate acting on the same site {i} twice") + elif i == j - 1: + self[i], _, self[j] = 
gate_decompose('ipj,jqk,PQpq->iPj,jQk', self[i], self[j], operand, **self.mps_config.svd_options) + else: + # insert swap gates recursively + swap_gate = self.get_swap_gate() + if (j - i) % 2 == 0: + self._apply_gate_2q(i, i+1, swap_gate) + self._apply_gate_2q(i+1, j, operand) + self._apply_gate_2q(i, i+1, swap_gate) + else: + self._apply_gate_2q(j-1, j, swap_gate) + self._apply_gate_2q(i, j-1, operand) + self._apply_gate_2q(j-1, j, swap_gate) + + def apply_gate(self, sites, operand): + if len(sites) == 1: + return self._apply_gate_1q(*sites, operand) + elif len(sites) == 2: + return self._apply_gate_2q(*sites, operand) + else: + raise NotImplementedError("Only single- and two- qubit gate supported") + + @staticmethod + def from_converter(converter, initial_state=None, **mps_config): + if initial_state is None: + asarray = _get_backend_asarray_func(converter.backend) + t = asarray([1,0], dtype=converter.dtype).reshape(1,2,1) + initial_state = [t, ] * len(converter.qubits) + mps = MPS(initial_state, qubits=list(converter.qubits), **mps_config) + for operand, qs in converter.gates: + sites = [converter.qubits.index(q) for q in qs] + mps.apply_gate(sites, operand) + mps.canonicalize() + return mps + + def print(self): + print([o.shape[2] for o in self.mps_tensors[:-1]]) + + def canonicalize(self): + center = self.mps_config.canonical_center + if center is None: + return + max_extent = self.mps_config.max_extent + svd_method = self.mps_config.svd_options.copy() + svd_method['partition'] = 'V' + for i in range(center): + shared_extent = self[i+1].shape[0] + if max_extent is not None and shared_extent > max_extent: + self[i], r = tensor_decompose('ipj->ipx,xj', self[i], method='svd', **svd_method) + else: + self[i], r = tensor_decompose('ipj->ipx,xj', self[i], method='qr') + self[i+1] = self.backend.einsum('xj,jqk->xqk', r, self[i+1]) + for i in range(self.n-1, center, -1): + shared_extent = self[i].shape[0] + if max_extent is not None and shared_extent > max_extent: + 
self[i], r = tensor_decompose('ipj->xpj,ix', self[i], method='svd', **svd_method) + else: + self[i], r = tensor_decompose('ipj->xpj,ix', self[i], method='qr') + self[i-1] = self.backend.einsum('mqi,ix->mqx', self[i-1], r) + + def check_canonicalization(self): + center = self.mps_config.canonical_center + if center is None: + return + modes = 'ipj' + for i in range(self.n): + if i < center: + shared_mode = 'j' + elif i > center: + shared_mode = 'i' + else: + continue + verify_unitary(self[i], modes, shared_mode, + SVD_TOLERANCE[self.dtype], tensor_name=f"Site {i} canonicalization") + + +if __name__ == '__main__': + from cuquantum_benchmarks.frontends.frontend_qiskit import Qiskit as cuqnt_qiskit + from cuquantum_benchmarks.benchmarks import qpe, quantum_volume, qaoa, random + from cuquantum import contract + generators = [qpe.QPE, quantum_volume.QuantumVolume, qaoa.QAOA] + config = {'measure': True, 'unfold': True, 'p': 4} + n_qubits = 8 + nshots = 10000 + + # exact MPS for reference + mps_config = {'abs_cutoff':1e-8, 'rel_cutoff':1e-5, 'canonical_center': 2} + for generator in generators: + seq = generator.generateGatesSequence(n_qubits, config) + circuit = cuqnt_qiskit(n_qubits, config).generateCircuit(seq) + converter = CircuitToEinsum(circuit) + expr, operands = converter.state_vector() + sv = contract(expr, *operands) + mps0 = MPS.from_converter(converter, **mps_config) + mps1 = MPS.from_converter(converter, max_extent=8, **mps_config) + + mps0.check_canonicalization() + mps1.check_canonicalization() + sv0 = mps0.get_sv() + sv1 = mps1.get_sv() / mps1.get_norm() + samples = sample_from_sv(sv, nshots, seed=1) + samples0 = mps0.get_sampling(seed=1, nshots=nshots) + samples1 = mps1.get_sampling(seed=1, nshots=nshots) + + print("Exact MPS bonds:") + mps0.print() + print("MPS bonds with max extent 8") + mps1.print() + print(f"Exact MPS SV error: {abs(sv0-sv).max()}; Approximate MPS SV error: {abs(sv1-sv).max()}") + sv_ovlp = abs(cp.dot(sv.ravel(), 
sv1.ravel().conj())) + print(f"approx MPS sv overlap: {sv_ovlp}") + \ No newline at end of file diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_circuit_converter.py b/python/tests/cuquantum_tests/cutensornet_tests/test_circuit_converter.py index 039674c..fe8b068 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_circuit_converter.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_circuit_converter.py @@ -4,25 +4,65 @@ import pytest -from .circuit_utils import backends -from .circuit_utils import cirq_circuits, CirqTester -from .circuit_utils import qiskit_circuits, QiskitTester +import cupy as cp +from cuquantum import CircuitToEinsum + +from .circuit_utils import ApproximateMPSTester, backends, CircuitToEinsumTester +from .circuit_utils import cirq_circuits, cirq_circuits_mps +from .circuit_utils import qiskit_circuits, qiskit_circuits_mps +from .circuit_utils import GLOBAL_RNG, is_converter_mps_compatible +from .mps_utils import MPSConfig class TestCircuitToEinsum: # If PyTorch/Qiskit/Cirq is not installed, the corresponding tests are silently # skipped. - @pytest.mark.parametrize("circuit", cirq_circuits) + @pytest.mark.parametrize("circuit", cirq_circuits + qiskit_circuits) @pytest.mark.parametrize("dtype", ('complex64', 'complex128',)) @pytest.mark.parametrize("backend", backends) - def test_cirq(self, circuit, dtype, backend, nsample=3, nsite_max=3, nfix_max=3): - cirq_tests = CirqTester(circuit, dtype, backend, nsample, nsite_max, nfix_max) - cirq_tests.run_tests() + def test_circuit_converter(self, circuit, dtype, backend, nsample=3, nsite_max=3, nfix_max=3): + # Results from CircuitToEinsum are compared with Cirq/Qiskit + # If the backend is set to cupy, additional references below are also tested: + # 1. Tensor network simulation based on cutensornet state APIs if backend is cupy + # 2. State vector simulation based on cutensornet state APIs + # 3. 
Exact MPS simulation based on cutensornet state APIs if no mulit-qubit gates exist in the circuit + # 4. Exact MPS simulation based on a reference cupy implementation in `mps_utils.MPS` if no multi-qubit gates exist in the circuit + circuit_tests = CircuitToEinsumTester(circuit, dtype, backend, nsample, nsite_max, nfix_max) + circuit_tests.run_tests() + +class TestMPSStateAPIs: + + @pytest.mark.parametrize("circuit", cirq_circuits_mps + qiskit_circuits_mps) + def test_exact_mps(self, circuit, nsamples=3, nsite_max=3, nfix_max=3): + # Computation results from approaches below are compared: + # 1. Exact MPS simulation based on cutensornet state APIs if no mulit-qubit gates exist in the circuit + # 2. Exact MPS simulation based on a reference cupy-gesvd implementation + # Here we only perform exact MPS simulation for double precision as + # different SVD algorithms may lead to drastically different results for single precision circuits + converter = CircuitToEinsum(circuit, dtype="complex128", backend=cp) + n_qubits = converter.n_qubits + if not is_converter_mps_compatible(converter): + pytest.skip("MPS test skipped due to multi-qubit gate") + + mps_options = {'algorithm': GLOBAL_RNG.choice(('gesvd', 'gesvdr', 'gesvdp', 'gesvdj'))} + exact_mps_tests = ApproximateMPSTester(converter, nsamples, nsite_max, nfix_max, **mps_options) + exact_mps_tests.run_tests() - @pytest.mark.parametrize("circuit", qiskit_circuits) + @pytest.mark.parametrize("circuit", cirq_circuits_mps + qiskit_circuits_mps) @pytest.mark.parametrize("dtype", ('complex64', 'complex128',)) - @pytest.mark.parametrize("backend", backends) - def test_qiskit(self, circuit, dtype, backend, nsample=3, nsite_max=3, nfix_max=3): - qiskit_tests = QiskitTester(circuit, dtype, backend, nsample, nsite_max, nfix_max) - qiskit_tests.run_tests() + def test_approximate_mps(self, circuit, dtype, nsamples=3, nsite_max=3, nfix_max=3): + # Computation results from approaches below are compared: + # 1. 
Approximate MPS simulation based on cutensornet state APIs if no mulit-qubit gates exist in the circuit + # 2. Approximate MPS simulation based on a reference cupy implementation in `mps_utils.MPS` if no multi-qubit gates exist in the circuit + converter = CircuitToEinsum(circuit, dtype=dtype, backend=cp) + n_qubits = converter.n_qubits + if not is_converter_mps_compatible(converter): + pytest.skip("MPS test skipped due to multi-qubit gate") + + # test two different types of randomly generated MPS options + for _ in range(2): + # restrict to gesvd algorithm to avoid accuracy fallout + mps_options = MPSConfig.rand(n_qubits, GLOBAL_RNG, dtype, fixed={'algorithm': 'gesvd'}, dict_format=True) + approximate_mps_tests = ApproximateMPSTester(converter, nsamples, nsite_max, nfix_max, **mps_options) + approximate_mps_tests.run_tests() diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py b/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py index 51f2efb..e0c75a7 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py @@ -18,18 +18,15 @@ from .test_utils import atol_mapper, EinsumFactory, rtol_mapper from .test_utils import compute_and_normalize_numpy_path from .test_utils import deselect_contract_tests +from .test_utils import deselect_gradient_tests from .test_utils import get_stream_for_backend from .test_utils import set_path_to_optimizer_options # TODO: parametrize compute type? 
-@pytest.mark.uncollect_if(func=deselect_contract_tests) @pytest.mark.parametrize( "use_numpy_path", (False, True) ) -@pytest.mark.parametrize( - "stream", (None, True) -) @pytest.mark.parametrize( "order", ("C", "F") ) @@ -42,11 +39,11 @@ @pytest.mark.parametrize( "einsum_expr_pack", einsum_expressions ) -class TestContract: +class _TestContractBase: def _test_runner( self, func, einsum_expr_pack, xp, dtype, order, - stream, use_numpy_path, **kwargs): + use_numpy_path, gradient, **kwargs): einsum_expr = copy.deepcopy(einsum_expr_pack) if isinstance(einsum_expr, list): einsum_expr, network_opts, optimizer_opts, _ = einsum_expr @@ -57,9 +54,20 @@ def _test_runner( factory = EinsumFactory(einsum_expr) operands = factory.generate_operands( factory.input_shapes, xp, dtype, order) + qualifiers, picks = factory.generate_qualifiers(xp, gradient) + factory.setup_torch_grads(xp, picks, operands) backend = sys.modules[infer_object_package(operands[0])] + stream = kwargs.get('stream') if stream: - stream = get_stream_for_backend(backend) + stream_obj = get_stream_for_backend(backend) + if stream == "as_int": + if backend is numpy or backend is cupy: + stream = stream_obj.ptr + else: + pytest.skip("we do not support torch operands + " + "a raw stream pointer") + else: + stream = stream_obj path = None if use_numpy_path: @@ -77,13 +85,49 @@ def _test_runner( if path is not None: optimizer_opts = set_path_to_optimizer_options( optimizer_opts, path) - out = func( - *data, options=network_opts, optimize=optimizer_opts, - stream=stream, return_info=return_info) + try: + out = func( + *data, options=network_opts, optimize=optimizer_opts, + stream=stream, return_info=return_info) + if stream: + stream_obj.synchronize() + except cutn.cuTensorNetError as e: + # differentiating some edge TNs is not yet supported + if "NOT_SUPPORTED" in str(e): + pytest.skip("this TN is currently not supported") + else: + raise + if return_info: out, (path, info) = out assert isinstance(path, list) 
assert isinstance(info, cuquantum.OptimizerInfo) + + if gradient: + # compute gradients + output_grad = backend.ones_like(out) + try: + out.backward(output_grad) + except cutn.cuTensorNetError as e: + # differentiating some edge TNs is not yet supported; + if "NOT_SUPPORTED" in str(e): + # we don't wanna skip because we can still verify + # contraction ouput + gradient = None + else: + raise + + if gradient: + input_grads = tuple(op.grad for op in operands) + + # check gradient result types + assert all((sys.modules[infer_object_package(grad)] is backend) + if grad is not None else True + for grad in input_grads) + assert all((grad.dtype == operands[0].dtype) + if grad is not None else True + for grad in input_grads) + else: # cuquantum.einsum() optimize = kwargs.pop('optimize') if optimize == 'path': @@ -97,33 +141,70 @@ def _test_runner( else: raise - if stream: - stream.synchronize() backend_out = sys.modules[infer_object_package(out)] assert backend_out is backend assert out.dtype == operands[0].dtype + # check contraction + factory.setup_torch_grads(xp, picks, operands) out_ref = opt_einsum.contract( *data, backend="torch" if "torch" in xp else xp) assert backend.allclose( out, out_ref, atol=atol_mapper[dtype], rtol=rtol_mapper[dtype]) - @pytest.mark.parametrize( - "return_info", (False, True) - ) + # check gradients + if gradient and func is cuquantum.contract: + out_ref.backward(output_grad) + + # check gradients + try: + is_close = backend.tensor(tuple( + backend.allclose( + cutn_grad, op.grad, + atol=atol_mapper[dtype], rtol=rtol_mapper[dtype]) + if cutn_grad is not None else cutn_grad is op.grad + for cutn_grad, op in zip(input_grads, operands) + )) + assert all(is_close) + except AssertionError as e: + # for easier debugging + print(tuple(op.shape for op in operands)) + print(input_grads) + print(tuple(op.grad for op in operands)) + raise + + +@pytest.mark.uncollect_if(func=(deselect_contract_tests, + deselect_gradient_tests)) +@pytest.mark.parametrize( 
+ "gradient", (False, "random", "all") +) +@pytest.mark.parametrize( + "stream", (None, True, "as_int") +) +@pytest.mark.parametrize( + "return_info", (False, True) +) +class TestContract(_TestContractBase): + def test_contract( self, einsum_expr_pack, xp, dtype, order, - stream, use_numpy_path, return_info): + use_numpy_path, gradient, stream, return_info): self._test_runner( cuquantum.contract, einsum_expr_pack, xp, dtype, order, - stream, use_numpy_path, return_info=return_info) + use_numpy_path, gradient, stream=stream, return_info=return_info) + + +# einsum does not support gradient (at some point we should deprecate it...) +@pytest.mark.uncollect_if(func=deselect_contract_tests) +@pytest.mark.parametrize( + "optimize", (False, True, "path") +) +class TestEinsum(_TestContractBase): - @pytest.mark.parametrize( - "optimize", (False, True, "path") - ) def test_einsum( self, einsum_expr_pack, xp, dtype, order, - stream, use_numpy_path, optimize): + use_numpy_path, optimize): self._test_runner( cuquantum.einsum, einsum_expr_pack, xp, dtype, order, - stream, use_numpy_path, optimize=optimize) + use_numpy_path, None, optimize=optimize) diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_contract_path.py b/python/tests/cuquantum_tests/cutensornet_tests/test_contract_path.py index 59ec90f..1783ac6 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_contract_path.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_contract_path.py @@ -62,7 +62,7 @@ def _test_runner( # sanity checks; the correctness checks are done in the contract() tests assert len(path) == len(operands)-1 - operand_ids = list(range(len(operands))) + operand_ids = list(range(len(operands))) if path else [-1] # handle single operand case. 
for i, j in path: op_i, op_j = operand_ids[i], operand_ids[j] operand_ids.remove(op_i) diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py b/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py index b2e4291..03e6f76 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py @@ -155,10 +155,17 @@ class TestLibHelper: def test_get_version(self): ver = cutn.get_version() - assert ver == (cutn.MAJOR_VER * 10000 + major = ver // 10000 + minor = (ver % 10000) // 100 + + # run-time version must be compatible with build-time version + assert major == cutn.MAJOR_VER + assert minor >= cutn.MINOR_VER + + # sanity check (build-time versions should agree) + assert cutn.VERSION == (cutn.MAJOR_VER * 10000 + cutn.MINOR_VER * 100 + cutn.PATCH_VER) - assert ver == cutn.VERSION def test_get_cudart_version(self): # CUDA runtime is statically linked, so we can't compare @@ -734,8 +741,6 @@ def test_contraction_gradient_workflow( # compare gradients for grad_cutn, in_torch in zip(tn.gradients, inputs): grad_torch = in_torch.grad - if torch.is_complex(grad_torch): - grad_torch = grad_torch.conj().resolve_conj() # zero-copy if on GPU assert cp.allclose(grad_cutn, cp.asarray(grad_torch)) @@ -959,6 +964,7 @@ def test_tensor_qr(self): 'options': ( {}, # standard exact svd {'max_extent': 4, 'normalization':'L1', 'partition':'U', 'algorithm': 'gesvdr', 'gesvdr_niters': 40}, # fix extent truncation + {'abs_cutoff': 0.1, 'discarded_weight_cutoff': 0.05, 'normalization': 'L2'}, # discarded weight truncation {'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'algorithm': 'gesvdj', 'gesvdj_tol':1e-14, 'gesvdj_max_sweeps': 80}, # value based truncation {'abs_cutoff': 0.1, 'normalization':'L2', 'partition':'V', 'algorithm': 'gesvdj'}, # absolute value based truncation {'rel_cutoff': 0.1, 'normalization':'LInf', 'partition':'UV', 'algorithm': 'gesvdp'}, # relative value based 
truncation @@ -981,6 +987,13 @@ def test_tensor_svd(self): svd_config, svd_info = self.svd_config, self.svd_info dtype = cp.dtype(self.dtype) + # relax gesvdj_tol for single precision operand + algorithm = self.options.get('algorithm', None) + if algorithm == 'gesvdj' and self.dtype in [np.float32, np.complex64]: + gesvdj_tol = self.options.get('gesvdj_tol', None) + if gesvdj_tol is not None: + self.options['gesvdj_tol'] = 1e-7 + # parse svdConfig svd_method = check_or_create_options(tensor.SVDMethod, self.options, "SVDMethod") parse_svd_config(handle, svd_config, svd_method, logger=None) @@ -1075,6 +1088,7 @@ def test_tensor_svd(self): 'options': ( {}, # standard exact svd {'max_extent': 4, 'normalization':'L1', 'partition':'U', 'algorithm': 'gesvdr', 'gesvdr_niters': 40}, # fix extent truncation + {'abs_cutoff': 0.1, 'discarded_weight_cutoff': 0.05, 'normalization': 'L2'}, # discarded weight truncation {'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'algorithm': 'gesvdj', 'gesvdj_tol':1e-14, 'gesvdj_max_sweeps': 80}, # value based truncation {'abs_cutoff': 0.1, 'normalization':'L2', 'partition':'V', 'algorithm': 'gesvdj'}, # absolute value based truncation {'rel_cutoff': 0.1, 'normalization':'LInf', 'partition':'UV', 'algorithm': 'gesvdp'}, # relative value based truncation @@ -1101,6 +1115,13 @@ def test_gate_split(self): gate_algorithm = self.GATE_ALGO_MAP[algo] svd_config, svd_info = self.svd_config, self.svd_info + # relax gesvdj_tol for single precision operand + algorithm = self.options.get('algorithm', None) + if algorithm == 'gesvdj' and self.dtype in [np.float32, np.complex64]: + gesvdj_tol = self.options.get('gesvdj_tol', None) + if gesvdj_tol is not None: + self.options['gesvdj_tol'] = 1e-7 + # parse svdConfig svd_method = check_or_create_options(tensor.SVDMethod, self.options, "SVDMethod") parse_svd_config(handle, svd_config, svd_method, logger=None) diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py 
b/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py index d0da2ec..d8f207d 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py @@ -19,7 +19,7 @@ from .approxTN_utils import split_contract_decompose, tensor_decompose, verify_split_QR, verify_split_SVD from .data import backend_names, contract_decompose_expr from .test_options import _OptionsBase -from .test_utils import DecomposeFactory, deselect_contract_decompose_algorithm_tests, deselect_decompose_tests, gen_rand_svd_method +from .test_utils import DecomposeFactory, deselect_contract_decompose_algorithm_tests, deselect_decompose_tests, get_svd_methods_for_test from .test_utils import get_stream_for_backend @@ -131,16 +131,14 @@ def test_contract_qr_decompose(self, decompose_expr, xp, dtype, order, stream): def test_contract_svd_decompose(self, decompose_expr, xp, dtype, order, stream): - rng = numpy.random.default_rng(2021) - methods = [tensor.SVDMethod()] + [gen_rand_svd_method(rng) for _ in range(10)] + methods = get_svd_methods_for_test(3, dtype) for svd_method in methods: algorithm = ContractDecomposeAlgorithm(qr_method=False, svd_method=svd_method) self._run_contract_decompose(decompose_expr, xp, dtype, order, stream, algorithm) def test_contract_qr_assisted_svd_decompose(self, decompose_expr, xp, dtype, order, stream): - rng = numpy.random.default_rng(2021) - methods = [tensor.SVDMethod()] + [gen_rand_svd_method(rng) for _ in range(10)] + methods = get_svd_methods_for_test(3, dtype) for svd_method in methods: algorithm = ContractDecomposeAlgorithm(qr_method={}, svd_method=svd_method) self._run_contract_decompose(decompose_expr, xp, dtype, order, stream, algorithm) diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_network.py b/python/tests/cuquantum_tests/cutensornet_tests/test_network.py index 4b40cbb..8619078 100644 --- 
a/python/tests/cuquantum_tests/cutensornet_tests/test_network.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_network.py @@ -7,11 +7,12 @@ import re import sys -import cupy -import numpy -import opt_einsum +import cupy as cp +import numpy as np +import opt_einsum as oe import pytest +from cuquantum import cutensornet as cutn from cuquantum import Network from cuquantum.cutensornet._internal.utils import infer_object_package @@ -26,6 +27,9 @@ # TODO: parametrize compute type? @pytest.mark.uncollect_if(func=deselect_contract_tests) +@pytest.mark.parametrize( + "gradient", (False, "random", "all") +) @pytest.mark.parametrize( "use_numpy_path", (False, True) ) @@ -51,7 +55,7 @@ class TestNetwork: def test_network( self, einsum_expr_pack, xp, dtype, order, autotune, - stream, use_numpy_path): + stream, use_numpy_path, gradient): einsum_expr = copy.deepcopy(einsum_expr_pack) if isinstance(einsum_expr, list): einsum_expr, network_opts, optimizer_opts, _ = einsum_expr @@ -59,20 +63,23 @@ def test_network( network_opts = optimizer_opts = None assert isinstance(einsum_expr, (str, tuple)) + # prepare operands and other needed test config factory = EinsumFactory(einsum_expr) operands = factory.generate_operands( factory.input_shapes, xp, dtype, order) + qualifiers, picks = factory.generate_qualifiers(xp, gradient) + factory.setup_torch_grads(xp, picks, operands) backend = sys.modules[infer_object_package(operands[0])] data = factory.convert_by_format(operands) if stream: stream = get_stream_for_backend(backend) - tn = Network(*data, options=network_opts) + tn = Network(*data, options=network_opts, qualifiers=qualifiers) # We already test tn as a context manager in the samples, so let's test # explicitly calling tn.free() here. 
try: if not use_numpy_path: - path, info = tn.contract_path(optimize=optimizer_opts) + path, info = self._setup_path(tn, optimizer_opts) uninit_f_str = re.compile("{.*}") assert uninit_f_str.search(str(info)) is None check_intermediate_modes( @@ -89,7 +96,7 @@ def test_network( else: optimizer_opts = set_path_to_optimizer_options( optimizer_opts, path_ref) - path, _ = tn.contract_path(optimizer_opts) + path, _ = self._setup_path(tn, optimizer_opts) # round-trip test # note that within each pair it could have different order assert all(map(lambda x, y: sorted(x) == sorted(y), path, path_ref)) @@ -97,30 +104,107 @@ def test_network( if autotune: tn.autotune(iterations=autotune, stream=stream) # check the result - self._verify_contract( + out, out_ref = self._verify_contract( tn, operands, backend, data, xp, dtype, stream) + self._verify_gradient( + tn, operands, backend, data, xp, dtype, + gradient, out, out_ref, picks, stream) - # generate new data and bind them to the TN + # generate new data (by picking a nonzero seed) and bind them + # to the TN operands = factory.generate_operands( - factory.input_shapes, xp, dtype, order) + factory.input_shapes, xp, dtype, order, seed=100) + factory.setup_torch_grads(xp, picks, operands) data = factory.convert_by_format(operands) - tn.reset_operands(*operands) + tn.reset_operands(*operands, stream=stream) + # check the result - self._verify_contract( + out, out_ref = self._verify_contract( tn, operands, backend, data, xp, dtype, stream) + self._verify_gradient( + tn, operands, backend, data, xp, dtype, + gradient, out, out_ref, picks, stream) finally: tn.free() + def _setup_path(self, tn, optimizer_opts): + try: + path, info = tn.contract_path(optimize=optimizer_opts) + except cutn.cuTensorNetError as e: + # differentiating some edge TNs is not yet supported + if "NOT_SUPPORTED" in str(e): + pytest.skip("this TN is currently not supported") + else: + raise + return path, info + + def _setup_gradients(self, tn, output_grad, 
stream): + try: + input_grads = tn.gradients(output_grad, stream=stream) + except cutn.cuTensorNetError as e: + # differentiating some edge TNs is not yet supported + if "NOT_SUPPORTED" in str(e): + pytest.skip("this TN is currently not supported") + else: + raise + return input_grads + def _verify_contract( self, tn, operands, backend, data, xp, dtype, stream): out = tn.contract(stream=stream) if stream: stream.synchronize() - backend_out = sys.modules[infer_object_package(out)] - assert backend_out is backend + + # check contraction result types + assert sys.modules[infer_object_package(out)] is backend assert out.dtype == operands[0].dtype - out_ref = opt_einsum.contract( - *data, backend="torch" if "torch" in xp else xp) + # check contraction + out_ref = oe.contract(*data, backend=("torch" if "torch" in xp else xp)) assert backend.allclose( out, out_ref, atol=atol_mapper[dtype], rtol=rtol_mapper[dtype]) + + return out, out_ref + + def _verify_gradient( + self, tn, operands, backend, data, xp, dtype, + gradient, out, out_ref, picks, stream): + if gradient is False: + return + + # compute gradients + output_grad = backend.ones_like(out) + input_grads = self._setup_gradients(tn, output_grad, stream) + if stream: + stream.synchronize() + + # check gradient result types + assert all((sys.modules[infer_object_package(grad)] is backend) + if grad is not None else True + for grad in input_grads) + assert all((grad.dtype == operands[0].dtype) + if grad is not None else True + for grad in input_grads) + + # given simplicity & CI time constraints we only do grad + # verification with torch tensors + if "torch" in xp: + output_grad = backend.ones_like(out_ref) + out_ref.backward(output_grad) + + # check gradients + try: + is_close = backend.tensor(tuple( + backend.allclose( + cutn_grad, op.grad, + atol=atol_mapper[dtype], rtol=rtol_mapper[dtype]) + if cutn_grad is not None else cutn_grad is op.grad + for cutn_grad, op in zip(input_grads, operands) + )) + assert 
all(is_close) + except AssertionError as e: + # for easier debugging + print(tuple(op.shape for op in operands)) + print(input_grads) + print(tuple(op.grad for op in operands)) + raise diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py b/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py index 502ebc6..7851e31 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py @@ -18,7 +18,7 @@ from .data import backend_names, tensor_decomp_expressions from .test_options import _OptionsBase, TestNetworkOptions from .test_utils import DecomposeFactory -from .test_utils import deselect_decompose_tests, gen_rand_svd_method +from .test_utils import deselect_decompose_tests, get_svd_methods_for_test from .test_utils import get_stream_for_backend @@ -107,8 +107,7 @@ def test_qr(self, decompose_expr, xp, dtype, order, stream, blocking): ) def test_svd( self, decompose_expr, xp, dtype, order, stream, return_info, blocking): - rng = numpy.random.default_rng(2021) - methods = [tensor.SVDMethod()] + [gen_rand_svd_method(rng) for _ in range(10)] + methods = get_svd_methods_for_test(3, dtype) for method in methods: self._run_decompose( decompose_expr, xp, dtype, order, stream, method, @@ -133,6 +132,9 @@ def test_abs_cutoff(self): def test_rel_cutoff(self): self.create_options({'rel_cutoff': 0.1}) + def test_discarded_weight_cutoff(self): + self.create_options({'discarded_weight_cutoff': 0.1}) + @pytest.mark.parametrize( 'partition', [None, 'U', 'V', 'UV'] ) diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py b/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py index faaabcd..42b4f81 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py @@ -5,9 +5,9 @@ import re import sys -import cupy +import cupy as cp from cupy.testing import shaped_random -import 
numpy +import numpy as np try: import torch if not torch.cuda.is_available(): @@ -15,19 +15,21 @@ except ImportError: torch = None +from cuquantum import cutensornet as cutn from cuquantum import OptimizerOptions from cuquantum import tensor from cuquantum.cutensornet._internal.circuit_converter_utils import EINSUM_SYMBOLS_BASE +from cuquantum.cutensornet._internal.decomposition_utils import DECOMPOSITION_DTYPE_NAMES from cuquantum.cutensornet._internal.einsum_parser import infer_output_mode_labels from .data import dtype_names -machine_epsilon_values = [numpy.finfo(dtype).eps for dtype in dtype_names] +machine_epsilon_values = [np.finfo(dtype).eps for dtype in dtype_names] rtol_mapper = dict(zip( dtype_names, - [numpy.sqrt(m_eps) for m_eps in machine_epsilon_values] + [np.sqrt(m_eps) for m_eps in machine_epsilon_values] )) atol_mapper = dict(zip( @@ -50,7 +52,7 @@ def set_path_to_optimizer_options(optimizer_opts, path): def compute_and_normalize_numpy_path(data, num_operands): try: # this can fail if the TN is too large (ex: containing unicode) - path, _ = numpy.einsum_path(*data, optimize=True) + path, _ = np.einsum_path(*data, optimize=True) except: raise NotImplementedError path = path[1:] @@ -220,18 +222,18 @@ def input_modes(self): def output_modes(self): return self.modes[self.num_inputs:] - def generate_operands(self, shapes, xp, dtype, order): + def generate_operands(self, shapes, xp, dtype, order, seed=0): # we always generate data from shaped_random as CuPy fixes # the RNG seed for us if xp == "torch-cpu": - _xp = numpy + _xp = np elif xp == "torch-gpu": - _xp = cupy + _xp = cp else: _xp = sys.modules[xp] operands = [ - shaped_random(shape, xp=_xp, dtype=dtype, order=order) + shaped_random(shape, xp=_xp, dtype=dtype, order=order, seed=seed) for shape in shapes ] @@ -272,7 +274,7 @@ def convert_by_format(self, operands, *, dummy=False): if dummy: # create dummy NumPy arrays to bypass the __array_function__ # dispatcher, see numpy/numpy#21379 for 
discussion - operands = [numpy.broadcast_to(0, arr.shape) for arr in operands] + operands = [np.broadcast_to(0, arr.shape) for arr in operands] if self.expr_format == "subscript": data = [self.expr, *operands] @@ -283,6 +285,44 @@ def convert_by_format(self, operands, *, dummy=False): return data + def generate_qualifiers(self, xp, gradient): + if not gradient: + qualifiers = None + picks = None + elif gradient == "random": + # picks could be all false, and torch would not be happy during + # backprop + while True: + picks = np.random.choice(2, size=self.num_inputs) + if any(picks): + break + if "torch" in xp: + # for torch, test auto-detect, will set up torch operands later + qualifiers = None + else: + qualifiers = np.zeros( + self.num_inputs, dtype=cutn.tensor_qualifiers_dtype) + qualifiers[:]["requires_gradient"] = picks + elif gradient == "all": + # for torch, test overwrite + qualifiers = np.zeros( + self.num_inputs, dtype=cutn.tensor_qualifiers_dtype) + qualifiers[:]["requires_gradient"] = True + picks = tuple(True for i in range(self.num_inputs)) + + return qualifiers, picks + + def setup_torch_grads(self, xp, picks, operands): + if not "torch" in xp or picks is None: + return + + for op, pick in zip(operands, picks): + if pick: + op.requires_grad_(True) + else: + op.requires_grad_(False) + op.grad = None # reset + class DecomposeFactory(ExpressionFactory): @@ -314,26 +354,47 @@ def modes(self): return self._modes -def gen_rand_svd_method(rng): +def gen_rand_svd_method(rng, dtype, fixed=None): + assert dtype in DECOMPOSITION_DTYPE_NAMES, f"dtype {dtype} not supported" method = {"max_extent": rng.choice(range(1, 7)), "abs_cutoff": rng.random() / 2.0, # [0, 0.5) - "rel_cutoff": 0.1 + rng.random() / 2.5 , # [0.1, 0.5) + "rel_cutoff": 0.1 + rng.random() / 2.5, # [0.1, 0.5) "normalization": rng.choice([None, "L1", "L2", "LInf"]), "partition": rng.choice([None, "U", "V", "UV"]), "algorithm": rng.choice(['gesvd', 'gesvdj', 'gesvdp', 'gesvdr'])} - if 
method["algorithm"] == 'gesvdj': - method["gesvdj_tol"] = rng.choice([0, 1e-14]) + algorithm = method["algorithm"] + if fixed is not None and "algorithm" in fixed: + algorithm = fixed["algorithm"] + if algorithm != 'gesvdr': + # gesvdr + max_extent can't be used with discarded weight truncation + method["discarded_weight_cutoff"] = rng.random() / 10.0 # [0, 0.1) + if algorithm == 'gesvdj': + if dtype in ("float32", "complex64"): + # for single precision, use a looser gesvdj_tol so that gesvdj converges + method["gesvdj_tol"] = rng.choice([0, 1e-7]) + else: + method["gesvdj_tol"] = rng.choice([0, 1e-14]) method["gesvdj_max_sweeps"] = rng.choice([0, 100]) - elif method["algorithm"] == 'gesvdr': + elif algorithm == 'gesvdr': method["gesvdr_niters"] = rng.choice([0, 40]) # we can't set oversampling as it depends on matrix size here + # apply the caller-fixed parameters last so that they override the random ones + if fixed is not None: + method.update(fixed) return tensor.SVDMethod(**method) +def get_svd_methods_for_test(num, dtype): + # single dw cutoff to verify dw < dw_cutoff + methods = [tensor.SVDMethod(), tensor.SVDMethod(discarded_weight_cutoff=0.05)] + rng = np.random.default_rng(2021) + for _ in range(num): + methods.append(gen_rand_svd_method(rng, dtype)) + return methods # We want to avoid fragmenting the stream-ordered mempools _predefined_streams = { - numpy: cupy.cuda.Stream(), # implementation detail - cupy: cupy.cuda.Stream(), + np: cp.cuda.Stream(), # implementation detail + cp: cp.cuda.Stream(), } if torch is not None: _predefined_streams[torch] = torch.cuda.Stream() @@ -362,6 +423,18 @@ def deselect_contract_tests( return True return False +def deselect_gradient_tests( + einsum_expr_pack, xp, dtype, order, stream, + use_numpy_path, gradient, *args, **kwargs): + if xp.startswith('torch') and torch is None: + return True + assert gradient in (False, "random", "all", "skip") + if gradient == "skip": + return True + if gradient and "torch" not in xp: + return True + return False + def
deselect_decompose_tests( decompose_expr, xp, dtype, *args, **kwargs): if xp.startswith('torch') and torch is None: diff --git a/python/tests/requirements.txt b/python/tests/requirements.txt index db5a3d6..51f63c5 100644 --- a/python/tests/requirements.txt +++ b/python/tests/requirements.txt @@ -2,7 +2,9 @@ pytest >=6.2 pytest-xdist opt_einsum cffi >=1.0.0 -nbmake ==1.3.0 +nbformat +nbconvert cirq >=0.6.0 qiskit >=0.24.0 +qiskit-aer pylatexenc diff --git a/python/tests/samples_tests/cutensornet_tests/test_cutensornet_samples.py b/python/tests/samples_tests/cutensornet_tests/test_cutensornet_samples.py index df4b0cb..84da894 100644 --- a/python/tests/samples_tests/cutensornet_tests/test_cutensornet_samples.py +++ b/python/tests/samples_tests/cutensornet_tests/test_cutensornet_samples.py @@ -7,10 +7,6 @@ import re import sys -try: - import nbmake -except ImportError: - nbmake = None # we could use packaging.version.Version too, but NumPy is our required # dependency, packaging is not. from numpy.lib import NumpyVersion as Version @@ -70,10 +66,6 @@ def test_sample(self, sample): notebook_files = glob.glob(samples_path+'/**/*.ipynb', recursive=True) -@pytest.mark.skipif( - nbmake is None, - reason="testing Jupyter notebooks requires nbmake" -) @pytest.mark.parametrize( 'notebook', notebook_files ) @@ -84,6 +76,4 @@ def test_notebook(self, notebook): if circuit_type in notebook_skip_messages: pytest.skip(notebook_skip_messages[circuit_type]) else: - status = pytest.main(['--nbmake', notebook]) - if status != 0: - raise cuQuantumSampleTestError(f'{notebook} failed') + run_sample(samples_path, notebook) diff --git a/python/tests/samples_tests/test_utils.py b/python/tests/samples_tests/test_utils.py index b8fa1ed..786a69d 100644 --- a/python/tests/samples_tests/test_utils.py +++ b/python/tests/samples_tests/test_utils.py @@ -2,9 +2,20 @@ # # SPDX-License-Identifier: BSD-3-Clause +import gc import os import sys +import cupy as cp +try: + import matplotlib +except 
ImportError: + matplotlib = None +else: + # disable plot windows from popping out when testing locally + matplotlib.use('Agg') +import nbformat +from nbconvert import PythonExporter import pytest @@ -12,19 +23,35 @@ class cuQuantumSampleTestError(Exception): pass +def parse_python_script(filepath): + if filepath.endswith('.py'): + with open(filepath, "r", encoding='utf-8') as f: + script = f.read() + elif filepath.endswith('.ipynb'): + # run all notebooks in the same process to avoid OOM & ABI issues + with open(filepath, "r", encoding="utf-8") as f: + nb = nbformat.reads(f.read(), nbformat.NO_CONVERT) + script = PythonExporter().from_notebook_node(nb)[0] + else: + raise ValueError(f"{filepath} not supported") + return script + + def run_sample(samples_path, filename): fullpath = os.path.join(samples_path, filename) - with open(fullpath, "r", encoding='utf-8') as f: - script = f.read() + script = parse_python_script(fullpath) try: old_argv = sys.argv sys.argv = [fullpath] exec(script, {}) except ImportError as e: - if 'torch' not in str(e): - raise + # for samples/notebooks requiring any of optional dependencies + for m in ('torch', 'qiskit', 'cirq'): + if f"No module named '{m}'" in str(e): + pytest.skip(f'{m} uninstalled, skipping related tests') + break else: - pytest.skip('PyTorch uninstalled, skipping related tests') + raise except Exception as e: msg = "\n" msg += f'Got error ({filename}):\n' @@ -32,3 +59,6 @@ def run_sample(samples_path, filename): raise cuQuantumSampleTestError(msg) from e finally: sys.argv = old_argv + # further reduce the memory watermark + gc.collect() + cp.get_default_memory_pool().free_all_blocks() diff --git a/samples/custatevec/CMakeLists.txt b/samples/custatevec/CMakeLists.txt index 520a447..2e07f89 100644 --- a/samples/custatevec/CMakeLists.txt +++ b/samples/custatevec/CMakeLists.txt @@ -145,3 +145,4 @@ add_custatevec_example(custatevec_examples "cuStateVec.example.swap_index_bits" add_custatevec_example(custatevec_examples 
"cuStateVec.example.mgpu_swap_index_bits" mgpu_swap_index_bits.cu) add_custatevec_example(custatevec_examples "cuStateVec.example.mgpu_sampler" mgpu_sampler.cu) add_custatevec_example(custatevec_examples "cuStateVec.example.mgpu_batch_measure" mgpu_batch_measure.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.subsv_migration" subsv_migration.cu) diff --git a/samples/custatevec/Makefile b/samples/custatevec/Makefile index ffc2649..067c279 100644 --- a/samples/custatevec/Makefile +++ b/samples/custatevec/Makefile @@ -41,6 +41,7 @@ all: check-env nvcc mgpu_swap_index_bits.cu -o mgpu_swap_index_bits ${CXX_FLAGS} nvcc mgpu_batch_measure.cu -o mgpu_batch_measure ${CXX_FLAGS} nvcc mgpu_sampler.cu -o mgpu_sampler ${CXX_FLAGS} + nvcc subsv_migration.cu -o subsv_migration ${CXX_FLAGS} check-env: @@ -77,4 +78,5 @@ clean: swap_index_bits \ mgpu_swap_index_bits \ mgpu_batch_measure \ - mgpu_sampler + mgpu_sampler \ + subsv_migration diff --git a/samples/custatevec/subsv_migration.cu b/samples/custatevec/subsv_migration.cu new file mode 100644 index 0000000..f7ce5ad --- /dev/null +++ b/samples/custatevec/subsv_migration.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include // cudaMalloc, cudaMemcpy, etc. 
+#include // cuDoubleComplex +#include // custatevecSubSVMigratorMigrate +#include // printf +#include // EXIT_FAILURE + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const cudaDataType_t svDataType = CUDA_C_64F; + + const int nLocalIndexBits = 3; + const int64_t subSvSize = int64_t(1) << nLocalIndexBits; + + // allocate host memory + const int nSubSvs = 2; + cuDoubleComplex* subSvs[nSubSvs]; + const size_t subSvSizeInBytes = subSvSize * sizeof(cuDoubleComplex); + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaMallocHost(&subSvs[iSv], subSvSizeInBytes) ); + } + + // fill subSvs[0] + for (int i = 0; i < subSvSize; i++) { + subSvs[0][i] = {0.25, 0.00}; + } + + // allocate device memory + const int nDeviceSlots = 1; + cuDoubleComplex* deviceSlots; + const size_t deviceSlotSizeInBytes = nDeviceSlots * subSvSize * sizeof(cuDoubleComplex); + HANDLE_CUDA_ERROR( cudaMalloc(&deviceSlots, deviceSlotSizeInBytes) ); + + //---------------------------------------------------------------------------------------------- + + // initialize custatevec handle + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + // create migrator + custatevecSubSVMigratorDescriptor_t migrator; + HANDLE_ERROR( custatevecSubSVMigratorCreate(handle, &migrator, deviceSlots, svDataType, + nDeviceSlots, nLocalIndexBits) ); + + int deviceSlotIndex = 0; + cuDoubleComplex* srcSubSv = subSvs[0]; + cuDoubleComplex* dstSubSv = subSvs[1]; + + // migrate subSvs[0] into d_subSvSlots + HANDLE_ERROR( custatevecSubSVMigratorMigrate(handle, migrator, deviceSlotIndex, srcSubSv, + nullptr, 0, subSvSize) ); + + // migrate d_subSvSlots into subSvs[1] + HANDLE_ERROR( custatevecSubSVMigratorMigrate(handle, migrator, deviceSlotIndex, nullptr, + dstSubSv, 0, subSvSize) ); + + // destroy migrator + HANDLE_ERROR( custatevecSubSVMigratorDestroy(handle, migrator)); + + // destroy custatevec handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + 
//---------------------------------------------------------------------------------------------- + + // check if subSvs[1] has expected values + bool correct = true; + for (int i = 0; i < subSvSize; i++) { + cuDoubleComplex expectedValue = {0.25, 0.00}; + if (!almost_equal(subSvs[1][i], expectedValue)) { + correct = false; + break; + } + } + + // free host memory + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaFreeHost(subSvs[iSv]) ); + } + + // free device memory + HANDLE_CUDA_ERROR( cudaFree(deviceSlots) ); + + if (correct) { + printf("subsv_migration example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("subsv_migration example FAILED: wrong result\n"); + return EXIT_FAILURE; + } +} \ No newline at end of file diff --git a/samples/cutensornet/CMakeLists.txt b/samples/cutensornet/CMakeLists.txt index 1bee4d0..65653e8 100644 --- a/samples/cutensornet/CMakeLists.txt +++ b/samples/cutensornet/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. 
# # SPDX-License-Identifier: BSD-3-Clause @@ -6,6 +6,7 @@ cmake_minimum_required(VERSION 3.13.0 FATAL_ERROR) project(cutensornet_example LANGUAGES C CXX CUDA) include(GNUInstallDirs) +find_package(CUDA 11.0 REQUIRED) # ########################################## # cutensornet_example build mode @@ -92,13 +93,19 @@ function(add_cutensornet_example GROUP_TARGET EXAMPLE_NAME EXAMPLE_SOURCES) cutensornet $<$:MPI::MPI_CXX> ) - set_target_properties( - ${EXAMPLE_TARGET} - PROPERTIES - POSITION_INDEPENDENT_CODE ON - CUDA_ARCHITECTURES - "70;75;80" - ) + if((${CUDA_VERSION_MAJOR} GREATER_EQUAL 12) OR (${CUDA_VERSION_MAJOR} EQUAL 11 AND ${CUDA_VERSION_MINOR} GREATER_EQUAL 8)) + set_target_properties( + ${EXAMPLE_TARGET} + PROPERTIES + POSITION_INDEPENDENT_CODE ON + CUDA_ARCHITECTURES "70;75;80;86;90") + else() + set_target_properties( + ${EXAMPLE_TARGET} + PROPERTIES + POSITION_INDEPENDENT_CODE ON + CUDA_ARCHITECTURES "70;75;80;86") + endif() # Install example install( TARGETS ${EXAMPLE_TARGET} @@ -118,8 +125,14 @@ add_custom_target(cutensornet_examples) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet" tensornet_example.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.reuse" tensornet_example_reuse.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.gradients" tensornet_example_gradients.cu) +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.amplitudes" high_level/amplitudes_example.cu) +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.expectation" high_level/expectation_example.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.marginal" high_level/marginal_example.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.sampler" high_level/sampling_example.cu) +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.mps_amplitudes" 
high_level/mps_amplitudes_example.cu) +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.mps_expectation" high_level/mps_expectation_example.cu) +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.mps_marginal" high_level/mps_marginal_example.cu) +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.mps_sampler" high_level/mps_sampling_example.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.svd" approxTN/tensor_svd_example.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.qr" approxTN/tensor_qr_example.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.gate" approxTN/gate_split_example.cu) @@ -129,6 +142,7 @@ find_package(MPI) if (MPI_FOUND) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.mpi" tensornet_example_mpi.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.mpi.auto" tensornet_example_mpi_auto.cu) + add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.mpi.sampling" high_level/sampling_mpi_example.cu) else () message(WARNING "An MPI installation was not detected. Please install CUDA-aware MPI if you would like to build the distributed example(s).") endif () diff --git a/samples/cutensornet/Makefile b/samples/cutensornet/Makefile index 3d7f7e7..2a4689f 100644 --- a/samples/cutensornet/Makefile +++ b/samples/cutensornet/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. 
# # SPDX-License-Identifier: BSD-3-Clause @@ -28,11 +28,18 @@ all: check-env ${CUDA_PATH}/bin/nvcc approxTN/tensor_qr_example.cu -o tensor_qr_example ${CXX_FLAGS} ${CUDA_PATH}/bin/nvcc approxTN/gate_split_example.cu -o gate_split_example ${CXX_FLAGS} ${CUDA_PATH}/bin/nvcc approxTN/mps_example.cu -o mps_example ${CXX_FLAGS} + ${CUDA_PATH}/bin/nvcc high_level/amplitudes_example.cu -o amplitudes_example ${CXX_FLAGS} + ${CUDA_PATH}/bin/nvcc high_level/expectation_example.cu -o expectation_example ${CXX_FLAGS} ${CUDA_PATH}/bin/nvcc high_level/marginal_example.cu -o marginal_example ${CXX_FLAGS} ${CUDA_PATH}/bin/nvcc high_level/sampling_example.cu -o sampling_example ${CXX_FLAGS} + ${CUDA_PATH}/bin/nvcc high_level/mps_amplitudes_example.cu -o mps_amplitudes_example ${CXX_FLAGS} + ${CUDA_PATH}/bin/nvcc high_level/mps_expectation_example.cu -o mps_expectation_example ${CXX_FLAGS} + ${CUDA_PATH}/bin/nvcc high_level/mps_marginal_example.cu -o mps_marginal_example ${CXX_FLAGS} + ${CUDA_PATH}/bin/nvcc high_level/mps_sampling_example.cu -o mps_sampling_example ${CXX_FLAGS} ifdef MPI_ROOT ${CUDA_PATH}/bin/nvcc tensornet_example_mpi.cu -Xlinker -rpath,${MPI_ROOT}/lib -L${MPI_ROOT}/lib -o tensornet_example_mpi ${CXX_FLAGS} -lmpi ${CUDA_PATH}/bin/nvcc tensornet_example_mpi_auto.cu -Xlinker -rpath,${MPI_ROOT}/lib -L${MPI_ROOT}/lib -o tensornet_example_mpi_auto ${CXX_FLAGS} -lmpi + ${CUDA_PATH}/bin/nvcc high_level/sampling_mpi_example.cu -Xlinker -rpath,${MPI_ROOT}/lib -L${MPI_ROOT}/lib -o sampling_mpi_example ${CXX_FLAGS} -lmpi endif check-env: @@ -62,7 +69,14 @@ clean: rm -f tensor_qr_example tensor_qr_example.o rm -f gate_split_example gate_split_example.o rm -f mps_example mps_example.o + rm -f amplitudes_example amplitudes_example.o + rm -f expectation_example expectation_example.o rm -f marginal_example marginal_example.o rm -f sampling_example sampling_example.o + rm -f mps_amplitudes_example mps_amplitudes_example + rm -f mps_expectation_example mps_expectation_example + rm 
-f mps_marginal_example mps_marginal_example + rm -f mps_sampling_example mps_sampling_example rm -f tensornet_example_mpi tensornet_example_mpi.o rm -f tensornet_example_mpi_auto tensornet_example_mpi_auto.o + rm -f sampling_mpi_example sampling_mpi_example.o diff --git a/samples/cutensornet/high_level/amplitudes_example.cu b/samples/cutensornet/high_level/amplitudes_example.cu new file mode 100644 index 0000000..3af8147 --- /dev/null +++ b/samples/cutensornet/high_level/amplitudes_example.cu @@ -0,0 +1,204 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +// Sphinx: Amplitudes #1 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + +#define HANDLE_CUTN_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSORNET_STATUS_SUCCESS ) \ + { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + + +int main(int argc, char **argv) +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!"); + + constexpr std::size_t fp64size = sizeof(double); + + // Sphinx: Amplitudes #2 + + // Quantum state configuration + constexpr int32_t numQubits = 6; // number of qubits + const std::vector qubitDims(numQubits,2); // qubit dimensions + const std::vector fixedModes({0,1}); // fixed modes in the output amplitude tensor (must be in ascending order) + const std::vector fixedValues({1,1}); // values of the fixed modes in the output amplitude tensor + const int32_t numFixedModes = fixedModes.size(); // number of fixed modes in the output amplitude tensor + std::cout << "Quantum circuit: " << numQubits << " qubits\n"; + + // Sphinx: Amplitudes #3 + + // Initialize
the cuTensorNet library + HANDLE_CUDA_ERROR(cudaSetDevice(0)); + cutensornetHandle_t cutnHandle; + HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle)); + std::cout << "Initialized cuTensorNet library on GPU 0\n"; + + // Sphinx: Amplitudes #4 + + // Define necessary quantum gate tensors in Host memory + const double invsq2 = 1.0 / std::sqrt(2.0); + // Hadamard gate + const std::vector> h_gateH {{invsq2, 0.0}, {invsq2, 0.0}, + {invsq2, 0.0}, {-invsq2, 0.0}}; + // CX gate + const std::vector> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}}; + + // Copy quantum gates to Device memory + void *d_gateH{nullptr}, *d_gateCX{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size))); + std::cout << "Allocated quantum gate memory on GPU\n"; + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice)); + std::cout << "Copied quantum gates to GPU memory\n"; + + // Sphinx: Amplitudes #5 + + // Allocate Device memory for the specified slice of the quantum circuit amplitudes tensor + void *d_amp{nullptr}; + std::size_t ampSize = 1; + for(const auto & qubitDim: qubitDims) ampSize *= qubitDim; // all state modes (full size) + for(const auto & fixedMode: fixedModes) ampSize /= qubitDims[fixedMode]; // fixed state modes reduce the slice size + HANDLE_CUDA_ERROR(cudaMalloc(&d_amp, ampSize * (2 * fp64size))); + std::cout << "Allocated memory for the specified slice of the quantum circuit amplitude tensor of size " + << ampSize << " elements\n"; + + // Sphinx: Amplitudes #6 + + // Query the free memory on Device + std::size_t freeSize{0}, totalSize{0}; + HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize)); + 
const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment + void *d_scratch{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize)); + std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n"; + + // Sphinx: Amplitudes #7 + + // Create the initial quantum state + cutensornetState_t quantumState; + HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(), + CUDA_C_64F, &quantumState)); + std::cout << "Created the initial quantum state\n"; + + // Sphinx: Amplitudes #8 + + // Construct the final quantum circuit state (apply quantum gates) for the GHZ circuit + int64_t id; + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 1, std::vector{{0}}.data(), + d_gateH, nullptr, 1, 0, 1, &id)); + for(int32_t i = 1; i < numQubits; ++i) { + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 2, std::vector{{i-1,i}}.data(), + d_gateCX, nullptr, 1, 0, 1, &id)); + } + std::cout << "Applied quantum gates\n"; + + // Sphinx: Amplitudes #9 + + // Specify the quantum circuit amplitudes accessor + cutensornetStateAccessor_t accessor; + HANDLE_CUTN_ERROR(cutensornetCreateAccessor(cutnHandle, quantumState, numFixedModes, fixedModes.data(), + nullptr, &accessor)); // using default strides + std::cout << "Created the specified quantum circuit amplitudes accessor\n"; + + // Sphinx: Amplitudes #10 + + // Configure the computation of the slice of the specified quantum circuit amplitudes tensor + const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder + HANDLE_CUTN_ERROR(cutensornetAccessorConfigure(cutnHandle, accessor, + CUTENSORNET_ACCESSOR_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); + + // Sphinx: Amplitudes #11 + + // Prepare the computation of the specified slice of the quantum circuit amplitudes tensor + 
cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc)); + std::cout << "Created the workspace descriptor\n"; + HANDLE_CUTN_ERROR(cutensornetAccessorPrepare(cutnHandle, accessor, scratchSize, workDesc, 0x0)); + std::cout << "Prepared the computation of the specified slice of the quantum circuit amplitudes tensor\n"; + + // Sphinx: Amplitudes #12 + + // Attach the workspace buffer + int64_t worksize {0}; + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Required scratch GPU workspace size (bytes) = " << worksize << std::endl; + if(worksize <= scratchSize) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer\n"; + + // Sphinx: Amplitudes #13 + + // Compute the specified slice of the quantum circuit amplitudes tensor + std::complex stateNorm{0.0,0.0}; + HANDLE_CUTN_ERROR(cutensornetAccessorCompute(cutnHandle, accessor, fixedValues.data(), + workDesc, d_amp, static_cast(&stateNorm), 0x0)); + std::cout << "Computed the specified slice of the quantum circuit amplitudes tensor\n"; + std::vector> h_amp(ampSize); + HANDLE_CUDA_ERROR(cudaMemcpy(h_amp.data(), d_amp, ampSize * (2 * fp64size), cudaMemcpyDeviceToHost)); + std::cout << "Amplitudes slice for " << (numQubits - numFixedModes) << " qubits:\n"; + for(std::size_t i = 0; i < ampSize; ++i) { + std::cout << " " << h_amp[i] << std::endl; + } + std::cout << "State 2-norm = (" << stateNorm.real() << ", " << stateNorm.imag() << ")\n"; + + // Sphinx: Amplitudes #14 + + // Destroy the workspace descriptor + 
HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + std::cout << "Destroyed the workspace descriptor\n"; + + // Destroy the quantum circuit amplitudes accessor + HANDLE_CUTN_ERROR(cutensornetDestroyAccessor(accessor)); + std::cout << "Destroyed the quantum circuit amplitudes accessor\n"; + + // Destroy the quantum circuit state + HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState)); + std::cout << "Destroyed the quantum circuit state\n"; + + HANDLE_CUDA_ERROR(cudaFree(d_scratch)); + HANDLE_CUDA_ERROR(cudaFree(d_amp)); + HANDLE_CUDA_ERROR(cudaFree(d_gateCX)); + HANDLE_CUDA_ERROR(cudaFree(d_gateH)); + std::cout << "Freed memory on GPU\n"; + + // Finalize the cuTensorNet library + HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle)); + std::cout << "Finalized the cuTensorNet library\n"; + + return 0; +} diff --git a/samples/cutensornet/high_level/expectation_example.cu b/samples/cutensornet/high_level/expectation_example.cu new file mode 100644 index 0000000..e3c98af --- /dev/null +++ b/samples/cutensornet/high_level/expectation_example.cu @@ -0,0 +1,242 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +// Sphinx: Expectation #1 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; // NOTE(review): plain brace block plus ';' rather than do { } while(0); every use visible in this sample is a standalone statement + +#define HANDLE_CUTN_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSORNET_STATUS_SUCCESS ) \ + { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + + +int main(int argc, char **argv) // builds a 16-qubit GHZ circuit state and computes the expectation value of a three-term Pauli operator +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!"); + + constexpr std::size_t fp64size = sizeof(double); + + // Sphinx: Expectation #2 + + // Quantum state configuration + constexpr int32_t numQubits = 16; // number of qubits + const std::vector qubitDims(numQubits,2); // qubit dimensions + std::cout << "Quantum circuit: " << numQubits << " qubits\n"; + + // Sphinx: Expectation #3 + + // Initialize the cuTensorNet library + HANDLE_CUDA_ERROR(cudaSetDevice(0)); + cutensornetHandle_t cutnHandle; + HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle)); + std::cout << "Initialized cuTensorNet library on GPU 0\n"; + + // Sphinx: Expectation #4 + + // Define necessary quantum gate tensors in Host memory + const double invsq2 = 1.0 / std::sqrt(2.0); + // Hadamard gate + const std::vector> h_gateH {{invsq2, 0.0}, {invsq2, 0.0}, + {invsq2, 0.0}, {-invsq2, 0.0}}; + // Pauli X gate + const std::vector> h_gateX {{0.0, 0.0}, {1.0, 0.0}, + {1.0, 0.0}, {0.0, 0.0}}; + // Pauli Y gate + const std::vector> h_gateY {{0.0, 0.0}, {0.0, -1.0}, + {0.0, 1.0}, {0.0, 0.0}}; + // Pauli Z gate + const std::vector> h_gateZ {{1.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {-1.0, 0.0}}; + // CX gate + const std::vector> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0},
{0.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}}; + + // Copy quantum gates to Device memory + void *d_gateH{nullptr}, *d_gateX{nullptr}, *d_gateY{nullptr}, *d_gateZ{nullptr}, *d_gateCX{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateX, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateY, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateZ, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size))); + std::cout << "Allocated quantum gate memory on GPU\n"; + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateX, h_gateX.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateY, h_gateY.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateZ, h_gateZ.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice)); + std::cout << "Copied quantum gates to GPU memory\n"; + + // Sphinx: Expectation #5 + + // Query the free memory on Device + std::size_t freeSize{0}, totalSize{0}; + HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize)); + const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment + void *d_scratch{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize)); + std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n"; + + // Sphinx: Expectation #6 + + // Create the initial quantum state + cutensornetState_t quantumState; + HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(), + CUDA_C_64F, &quantumState)); + std::cout << "Created the initial quantum state\n"; + + // Sphinx: Expectation #7 + + //
Construct the final quantum circuit state (apply quantum gates) for the GHZ circuit + int64_t id; // output argument of every ApplyTensor / AppendProduct call below; its value is never read in this sample + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 1, std::vector{{0}}.data(), + d_gateH, nullptr, 1, 0, 1, &id)); + for(int32_t i = 1; i < numQubits; ++i) { + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 2, std::vector{{i-1,i}}.data(), + d_gateCX, nullptr, 1, 0, 1, &id)); + } + std::cout << "Applied quantum gates\n"; + + // Sphinx: Expectation #8 + + // Create an empty tensor network operator + cutensornetNetworkOperator_t hamiltonian; + HANDLE_CUTN_ERROR(cutensornetCreateNetworkOperator(cutnHandle, numQubits, qubitDims.data(), CUDA_C_64F, &hamiltonian)); + // Append component (0.5 * Z1 * Z2) to the tensor network operator + { + const int32_t numModes[] = {1, 1}; // Z1 acts on 1 mode, Z2 acts on 1 mode + const int32_t modesZ1[] = {1}; // state modes Z1 acts on + const int32_t modesZ2[] = {2}; // state modes Z2 acts on + const int32_t * stateModes[] = {modesZ1, modesZ2}; // state modes (Z1 * Z2) acts on + const void * gateData[] = {d_gateZ, d_gateZ}; // GPU pointers to gate data + HANDLE_CUTN_ERROR(cutensornetNetworkOperatorAppendProduct(cutnHandle, hamiltonian, cuDoubleComplex{0.5,0.0}, + 2, numModes, stateModes, NULL, gateData, &id)); + } + // Append component (0.25 * Y3) to the tensor network operator + { + const int32_t numModes[] = {1}; // Y3 acts on 1 mode + const int32_t modesY3[] = {3}; // state modes Y3 acts on + const int32_t * stateModes[] = {modesY3}; // state modes (Y3) acts on + const void * gateData[] = {d_gateY}; // GPU pointers to gate data + HANDLE_CUTN_ERROR(cutensornetNetworkOperatorAppendProduct(cutnHandle, hamiltonian, cuDoubleComplex{0.25,0.0}, + 1, numModes, stateModes, NULL, gateData, &id)); + } + // Append component (0.13 * Y0 X2 Z3) to the tensor network operator + { + const int32_t numModes[] = {1, 1, 1}; // Y0 acts on 1 mode, X2 acts on 1 mode, Z3 acts on 1 mode + const int32_t modesY0[] =
{0}; // state modes Y0 acts on + const int32_t modesX2[] = {2}; // state modes X2 acts on + const int32_t modesZ3[] = {3}; // state modes Z3 acts on + const int32_t * stateModes[] = {modesY0, modesX2, modesZ3}; // state modes (Y0 * X2 * Z3) acts on + const void * gateData[] = {d_gateY, d_gateX, d_gateZ}; // GPU pointers to gate data + HANDLE_CUTN_ERROR(cutensornetNetworkOperatorAppendProduct(cutnHandle, hamiltonian, cuDoubleComplex{0.13,0.0}, + 3, numModes, stateModes, NULL, gateData, &id)); + } + std::cout << "Constructed a tensor network operator: (0.5 * Z1 * Z2) + (0.25 * Y3) + (0.13 * Y0 * X2 * Z3)" << std::endl; + + // Sphinx: Expectation #9 + + // Specify the quantum circuit expectation value + cutensornetStateExpectation_t expectation; + HANDLE_CUTN_ERROR(cutensornetCreateExpectation(cutnHandle, quantumState, hamiltonian, &expectation)); + std::cout << "Created the specified quantum circuit expectation value\n"; + + // Sphinx: Expectation #10 + + // Configure the computation of the specified quantum circuit expectation value + const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder + HANDLE_CUTN_ERROR(cutensornetExpectationConfigure(cutnHandle, expectation, + CUTENSORNET_EXPECTATION_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); + + // Sphinx: Expectation #11 + + // Prepare the specified quantum circuit expectation value for computation + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc)); + std::cout << "Created the workspace descriptor\n"; + HANDLE_CUTN_ERROR(cutensornetExpectationPrepare(cutnHandle, expectation, scratchSize, workDesc, 0x0)); + std::cout << "Prepared the specified quantum circuit expectation value\n"; + + // Sphinx: Expectation #12 + + // Attach the workspace buffer + int64_t worksize {0}; + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, +
CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Required scratch GPU workspace size (bytes) = " << worksize << std::endl; + if(worksize <= scratchSize) { // NOTE(review): compares int64_t worksize with std::size_t scratchSize (mixed signedness) + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer\n"; + + // Sphinx: Expectation #13 + + // Compute the specified quantum circuit expectation value + std::complex expectVal{0.0,0.0}, stateNorm{0.0,0.0}; + HANDLE_CUTN_ERROR(cutensornetExpectationCompute(cutnHandle, expectation, workDesc, + static_cast(&expectVal), static_cast(&stateNorm), 0x0)); + std::cout << "Computed the specified quantum circuit expectation value\n"; + std::cout << "Expectation value = (" << expectVal.real() << ", " << expectVal.imag() << ")\n"; + std::cout << "State 2-norm = (" << stateNorm.real() << ", " << stateNorm.imag() << ")\n"; + + // Sphinx: Expectation #14 + + // Destroy the workspace descriptor + HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + std::cout << "Destroyed the workspace descriptor\n"; + + // Destroy the quantum circuit expectation value + HANDLE_CUTN_ERROR(cutensornetDestroyExpectation(expectation)); + std::cout << "Destroyed the quantum circuit state expectation value\n"; + + // Destroy the tensor network operator + HANDLE_CUTN_ERROR(cutensornetDestroyNetworkOperator(hamiltonian)); + std::cout << "Destroyed the tensor network operator\n"; + + // Destroy the quantum circuit state + HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState)); + std::cout << "Destroyed the quantum circuit state\n"; + + HANDLE_CUDA_ERROR(cudaFree(d_scratch)); + HANDLE_CUDA_ERROR(cudaFree(d_gateCX)); + HANDLE_CUDA_ERROR(cudaFree(d_gateZ)); + HANDLE_CUDA_ERROR(cudaFree(d_gateY)); +
HANDLE_CUDA_ERROR(cudaFree(d_gateX)); + HANDLE_CUDA_ERROR(cudaFree(d_gateH)); + std::cout << "Freed memory on GPU\n"; + + // Finalize the cuTensorNet library + HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle)); + std::cout << "Finalized the cuTensorNet library\n"; + + return 0; +} diff --git a/samples/cutensornet/high_level/marginal_example.cu b/samples/cutensornet/high_level/marginal_example.cu index 8de8720..a5ebc1f 100644 --- a/samples/cutensornet/high_level/marginal_example.cu +++ b/samples/cutensornet/high_level/marginal_example.cu @@ -32,15 +32,17 @@ int main(int argc, char **argv) { + static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!"); + constexpr std::size_t fp64size = sizeof(double); // Sphinx: Marginal #2 // Quantum state configuration - constexpr int32_t numQubits = 16; + constexpr int32_t numQubits = 16; // number of qubits const std::vector qubitDims(numQubits,2); // qubit dimensions - constexpr int32_t numMarginalModes = 2; // rank of the marginal (reduced density matrix) - const std::vector marginalModes({0,1}); // open qubits (must be in acsending order) + const std::vector marginalModes({0,1}); // open qubits defining the marginal (must be in ascending order) + const int32_t numMarginalModes = marginalModes.size(); // rank of the marginal (reduced density matrix) std::cout << "Quantum circuit: " << numQubits << " qubits\n"; // Sphinx: Marginal #3 @@ -75,12 +77,14 @@ int main(int argc, char **argv) // Sphinx: Marginal #5 - // Allocate the specified quantum circuit reduced density matrix (marginal) in Device memory + // Allocate Device memory for the specified quantum circuit reduced density matrix (marginal) void *d_rdm{nullptr}; std::size_t rdmDim = 1; for(const auto & mode: marginalModes) rdmDim *= qubitDims[mode]; const std::size_t rdmSize = rdmDim * rdmDim; HANDLE_CUDA_ERROR(cudaMalloc(&d_rdm, rdmSize * (2 * fp64size))); + std::cout << "Allocated memory for the specified quantum circuit reduced
density matrix (marginal) of size " + << rdmSize << " elements\n"; // Sphinx: Marginal #6 @@ -114,7 +118,7 @@ int main(int argc, char **argv) // Sphinx: Marginal #9 - // Specify the desired reduced density matrix (marginal) + // Specify the quantum circuit reduced density matrix (marginal) cutensornetStateMarginal_t marginal; HANDLE_CUTN_ERROR(cutensornetCreateMarginal(cutnHandle, quantumState, numMarginalModes, marginalModes.data(), 0, nullptr, std::vector{{1,2,4,8}}.data(), &marginal)); // using explicit strides @@ -129,12 +133,12 @@ int main(int argc, char **argv) // Sphinx: Marginal #11 - // Prepare the specified quantum circuit reduced densitry matrix (marginal) + // Prepare the computation of the specified quantum circuit reduced density matrix (marginal) cutensornetWorkspaceDescriptor_t workDesc; HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc)); std::cout << "Created the workspace descriptor\n"; HANDLE_CUTN_ERROR(cutensornetMarginalPrepare(cutnHandle, marginal, scratchSize, workDesc, 0x0)); - std::cout << "Prepared the specified quantum circuit reduced density matrix (marginal)\n"; + std::cout << "Prepared the computation of the specified quantum circuit reduced density matrix (marginal)\n"; // Sphinx: Marginal #12 @@ -159,7 +163,7 @@ int main(int argc, char **argv) // Sphinx: Marginal #13 // Compute the specified quantum circuit reduced densitry matrix (marginal) - HANDLE_CUTN_ERROR(cutensornetMarginalCompute(cutnHandle, marginal, nullptr, workDesc, d_rdm, 0)); + HANDLE_CUTN_ERROR(cutensornetMarginalCompute(cutnHandle, marginal, nullptr, workDesc, d_rdm, 0x0)); std::cout << "Computed the specified quantum circuit reduced density matrix (marginal)\n"; std::vector> h_rdm(rdmSize); HANDLE_CUDA_ERROR(cudaMemcpy(h_rdm.data(), d_rdm, rdmSize * (2 * fp64size), cudaMemcpyDeviceToHost)); diff --git a/samples/cutensornet/high_level/mps_amplitudes_example.cu b/samples/cutensornet/high_level/mps_amplitudes_example.cu new file mode 100644
index 0000000..9125bef --- /dev/null +++ b/samples/cutensornet/high_level/mps_amplitudes_example.cu @@ -0,0 +1,272 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +// Sphinx: MPS Amplitudes #1 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; // NOTE(review): plain brace block plus ';' rather than do { } while(0); every use visible in this sample is a standalone statement + +#define HANDLE_CUTN_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSORNET_STATUS_SUCCESS ) \ + { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + + +int main(int argc, char **argv) // builds a 6-qubit GHZ circuit state, factorizes it into an MPS, and computes the amplitudes slice with qubits 0 and 1 fixed to 1 +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!"); + + constexpr std::size_t fp64size = sizeof(double); + + // Sphinx: MPS Amplitudes #2 + + // Quantum state configuration + constexpr int32_t numQubits = 6; // number of qubits + const std::vector qubitDims(numQubits,2); // qubit dimensions + const std::vector fixedModes({0,1}); // fixed modes in the output amplitude tensor (must be in ascending order) + const std::vector fixedValues({1,1}); // values of the fixed modes in the output amplitude tensor + const int32_t numFixedModes = fixedModes.size(); // number of fixed modes in the output amplitude tensor + std::cout << "Quantum circuit: " << numQubits << " qubits\n"; + + // Sphinx: MPS Amplitudes #3 + + // Initialize the cuTensorNet library + HANDLE_CUDA_ERROR(cudaSetDevice(0)); + cutensornetHandle_t cutnHandle; + HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle)); + std::cout << "Initialized cuTensorNet library on GPU 0\n"; + + // Sphinx: MPS Amplitudes #4 + + // Define necessary quantum gate tensors in Host memory + const double invsq2 = 1.0 / std::sqrt(2.0); + // Hadamard gate + const
std::vector> h_gateH {{invsq2, 0.0}, {invsq2, 0.0}, + {invsq2, 0.0}, {-invsq2, 0.0}}; + // CX gate + const std::vector> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}}; + + // Copy quantum gates to Device memory + void *d_gateH{nullptr}, *d_gateCX{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size))); + std::cout << "Allocated quantum gate memory on GPU\n"; + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice)); + std::cout << "Copied quantum gates to GPU memory\n"; + + // Sphinx: MPS Amplitudes #5 + + // Determine the MPS representation and allocate buffers for the MPS tensors + const int64_t maxExtent = 2; // GHZ state can be exactly represented with max bond dimension of 2 + std::vector> extents; + std::vector extentsPtr(numQubits); + std::vector d_mpsTensors(numQubits, nullptr); + for (int32_t i = 0; i < numQubits; i++) { + if (i == 0) { // left boundary MPS tensor + extents.push_back({2, maxExtent}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size)); // 2 * maxExtent elements of 2 * fp64size bytes each + } + else if (i == numQubits-1) { // right boundary MPS tensor + extents.push_back({maxExtent, 2}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size)); + } + else { // middle MPS tensors + extents.push_back({maxExtent, 2, maxExtent}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * 2 * fp64size)); + } + extentsPtr[i] = extents[i].data(); + } + + // Sphinx: MPS Amplitudes #6 + + // Allocate Device memory for the specified slice of the quantum circuit amplitudes tensor + void *d_amp{nullptr}; + std::size_t ampSize = 1; + for(const auto &
qubitDim: qubitDims) ampSize *= qubitDim; // all state modes (full size) + for(const auto & fixedMode: fixedModes) ampSize /= qubitDims[fixedMode]; // fixed state modes reduce the slice size + HANDLE_CUDA_ERROR(cudaMalloc(&d_amp, ampSize * (2 * fp64size))); + std::cout << "Allocated memory for the specified slice of the quantum circuit amplitude tensor of size " + << ampSize << " elements\n"; + + // Sphinx: MPS Amplitudes #7 + + // Query the free memory on Device + std::size_t freeSize{0}, totalSize{0}; + HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize)); + const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment + void *d_scratch{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize)); + std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n"; + + // Sphinx: MPS Amplitudes #8 + + // Create the initial quantum state + cutensornetState_t quantumState; + HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(), + CUDA_C_64F, &quantumState)); + std::cout << "Created the initial quantum state\n"; + + // Sphinx: MPS Amplitudes #9 + + // Construct the final quantum circuit state (apply quantum gates) for the GHZ circuit + int64_t id; // output argument of every ApplyTensor call below; its value is never read in this sample + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 1, std::vector{{0}}.data(), + d_gateH, nullptr, 1, 0, 1, &id)); + for(int32_t i = 1; i < numQubits; ++i) { + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 2, std::vector{{i-1,i}}.data(), + d_gateCX, nullptr, 1, 0, 1, &id)); + } + std::cout << "Applied quantum gates\n"; + + // Sphinx: MPS Amplitudes #10 + + // Specify the final target MPS representation (use default Fortran strides) + HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, quantumState, + CUTENSORNET_BOUNDARY_CONDITION_OPEN, extentsPtr.data(), /*strides=*/nullptr)); + std::cout << "Requested the final MPS factorization of
the quantum circuit state\n"; + + // Sphinx: MPS Amplitudes #11 + + // Optional, set up the SVD method for MPS truncation. + cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; + HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, quantumState, + CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO, &algo, sizeof(algo))); + std::cout << "Configured the MPS factorization computation\n"; + + // Sphinx: MPS Amplitudes #12 + + // Prepare the MPS computation and attach workspace + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc)); + std::cout << "Created the workspace descriptor\n"; + HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, quantumState, scratchSize, workDesc, 0x0)); + int64_t worksize {0}; + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Scratch GPU workspace size (bytes) for MPS computation = " << worksize << std::endl; + if(worksize <= scratchSize) { // NOTE(review): compares int64_t worksize with std::size_t scratchSize (mixed signedness) + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer for the MPS factorization computation\n"; + + // Sphinx: MPS Amplitudes #13 + + // Execute MPS computation + HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, quantumState, + workDesc, extentsPtr.data(), /*strides=*/nullptr, d_mpsTensors.data(), 0)); + std::cout << "Computed the MPS factorization\n"; + + // Sphinx: MPS Amplitudes #14 + + // Specify the quantum circuit amplitudes accessor + cutensornetStateAccessor_t accessor; + HANDLE_CUTN_ERROR(cutensornetCreateAccessor(cutnHandle, quantumState, numFixedModes, fixedModes.data(), + nullptr, &accessor)); // using default
strides + std::cout << "Created the specified quantum circuit amplitudes accessor\n"; + + // Sphinx: MPS Amplitudes #15 + + // Configure the computation of the slice of the specified quantum circuit amplitudes tensor + const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder + HANDLE_CUTN_ERROR(cutensornetAccessorConfigure(cutnHandle, accessor, + CUTENSORNET_ACCESSOR_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); + + // Sphinx: MPS Amplitudes #16 + + // Prepare the computation of the specified slice of the quantum circuit amplitudes tensor + HANDLE_CUTN_ERROR(cutensornetAccessorPrepare(cutnHandle, accessor, scratchSize, workDesc, 0x0)); + std::cout << "Prepared the computation of the specified slice of the quantum circuit amplitudes tensor\n"; + + // Sphinx: MPS Amplitudes #17 + + // Attach the workspace buffer + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Required scratch GPU workspace size (bytes) = " << worksize << std::endl; + if(worksize <= scratchSize) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer\n"; + + // Sphinx: MPS Amplitudes #18 + + // Compute the specified slice of the quantum circuit amplitudes tensor + std::complex stateNorm{0.0,0.0}; + HANDLE_CUTN_ERROR(cutensornetAccessorCompute(cutnHandle, accessor, fixedValues.data(), + workDesc, d_amp, static_cast(&stateNorm), 0x0)); + std::cout << "Computed the specified slice of the quantum circuit amplitudes tensor\n"; + std::vector> h_amp(ampSize); + HANDLE_CUDA_ERROR(cudaMemcpy(h_amp.data(), d_amp, ampSize * (2 *
fp64size), cudaMemcpyDeviceToHost)); + std::cout << "Amplitudes slice for " << (numQubits - numFixedModes) << " qubits:\n"; + for(std::size_t i = 0; i < ampSize; ++i) { + std::cout << " " << h_amp[i] << std::endl; + } + std::cout << "State 2-norm = (" << stateNorm.real() << ", " << stateNorm.imag() << ")\n"; + + // Sphinx: MPS Amplitudes #19 + + // Destroy the workspace descriptor + HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + std::cout << "Destroyed the workspace descriptor\n"; + + // Destroy the quantum circuit amplitudes accessor + HANDLE_CUTN_ERROR(cutensornetDestroyAccessor(accessor)); + std::cout << "Destroyed the quantum circuit amplitudes accessor\n"; + + // Destroy the quantum circuit state + HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState)); + std::cout << "Destroyed the quantum circuit state\n"; + + for (int32_t i = 0; i < numQubits; i++) { + HANDLE_CUDA_ERROR(cudaFree(d_mpsTensors[i])); + } + HANDLE_CUDA_ERROR(cudaFree(d_scratch)); + HANDLE_CUDA_ERROR(cudaFree(d_amp)); + HANDLE_CUDA_ERROR(cudaFree(d_gateCX)); + HANDLE_CUDA_ERROR(cudaFree(d_gateH)); + std::cout << "Freed memory on GPU\n"; + + // Finalize the cuTensorNet library + HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle)); + std::cout << "Finalized the cuTensorNet library\n"; + + return 0; +} diff --git a/samples/cutensornet/high_level/mps_expectation_example.cu b/samples/cutensornet/high_level/mps_expectation_example.cu new file mode 100644 index 0000000..2b5e84b --- /dev/null +++ b/samples/cutensornet/high_level/mps_expectation_example.cu @@ -0,0 +1,308 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +// Sphinx: MPS Expectation #1 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + +#define HANDLE_CUTN_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSORNET_STATUS_SUCCESS ) \ + { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + + +int main(int argc, char **argv) +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!"); + + constexpr std::size_t fp64size = sizeof(double); + + // Sphinx: MPS Expectation #2 + + // Quantum state configuration + constexpr int32_t numQubits = 16; // number of qubits + const std::vector qubitDims(numQubits,2); // qubit dimensions + std::cout << "Quantum circuit: " << numQubits << " qubits\n"; + + // Sphinx: MPS Expectation #3 + + // Initialize the cuTensorNet library + HANDLE_CUDA_ERROR(cudaSetDevice(0)); + cutensornetHandle_t cutnHandle; + HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle)); + std::cout << "Initialized cuTensorNet library on GPU 0\n"; + + // Sphinx: MPS Expectation #4 + + // Define necessary quantum gate tensors in Host memory + const double invsq2 = 1.0 / std::sqrt(2.0); + // Hadamard gate + const std::vector> h_gateH {{invsq2, 0.0}, {invsq2, 0.0}, + {invsq2, 0.0}, {-invsq2, 0.0}}; + // Pauli X gate + const std::vector> h_gateX {{0.0, 0.0}, {1.0, 0.0}, + {1.0, 0.0}, {0.0, 0.0}}; + // Pauli Y gate + const std::vector> h_gateY {{0.0, 0.0}, {0.0, -1.0}, + {0.0, 1.0}, {0.0, 0.0}}; + // Pauli Z gate + const std::vector> h_gateZ {{1.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {-1.0, 0.0}}; + // CX gate + const std::vector> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {1.0, 0.0}, 
{0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}}; + + // Copy quantum gates to Device memory + void *d_gateH{nullptr}, *d_gateX{nullptr}, *d_gateY{nullptr}, *d_gateZ{nullptr}, *d_gateCX{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateX, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateY, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateZ, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size))); + std::cout << "Allocated quantum gate memory on GPU\n"; + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateX, h_gateX.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateY, h_gateY.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateZ, h_gateZ.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice)); + std::cout << "Copied quantum gates to GPU memory\n"; + + // Sphinx: MPS Expectation #5 + + // Determine the MPS representation and allocate buffers for the MPS tensors + const int64_t maxExtent = 2; // GHZ state can be exactly represented with max bond dimension of 2 + std::vector> extents; + std::vector extentsPtr(numQubits); + std::vector d_mpsTensors(numQubits, nullptr); + for (int32_t i = 0; i < numQubits; i++) { + if (i == 0) { // left boundary MPS tensor + extents.push_back({2, maxExtent}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size)); + } + else if (i == numQubits-1) { // right boundary MPS tensor + extents.push_back({maxExtent, 2}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size)); + } + else { // middle MPS tensors + extents.push_back({maxExtent, 2, 
maxExtent}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * 2 * fp64size)); + } + extentsPtr[i] = extents[i].data(); + } + + // Sphinx: MPS Expectation #6 + + // Query the free memory on Device + std::size_t freeSize{0}, totalSize{0}; + HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize)); + const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment + void *d_scratch{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize)); + std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n"; + + // Sphinx: MPS Expectation #7 + + // Create the initial quantum state + cutensornetState_t quantumState; + HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(), + CUDA_C_64F, &quantumState)); + std::cout << "Created the initial quantum state\n"; + + // Sphinx: MPS Expectation #8 + + // Construct the final quantum circuit state (apply quantum gates) for the GHZ circuit + int64_t id; + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 1, std::vector{{0}}.data(), + d_gateH, nullptr, 1, 0, 1, &id)); + for(int32_t i = 1; i < numQubits; ++i) { + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 2, std::vector{{i-1,i}}.data(), + d_gateCX, nullptr, 1, 0, 1, &id)); + } + std::cout << "Applied quantum gates\n"; + + // Sphinx: MPS Expectation #9 + + // Specify the final target MPS representation (use default fortran strides) + HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, quantumState, + CUTENSORNET_BOUNDARY_CONDITION_OPEN, extentsPtr.data(), /*strides=*/nullptr )); + + // Sphinx: MPS Expectation #10 + + // Optional, set up the SVD method for truncation. 
+ cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; + HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, quantumState, + CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO, &algo, sizeof(algo))); + std::cout << "Configured the MPS computation\n"; + + // Sphinx: MPS Expectation #11 + + // Prepare the MPS computation and attach workspace + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc)); + std::cout << "Created the workspace descriptor\n"; + HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, quantumState, scratchSize, workDesc, 0x0)); + int64_t worksize {0}; + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Scratch GPU workspace size (bytes) for MPS computation = " << worksize << std::endl; + if(worksize <= scratchSize) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer for MPS computation\n"; + + // Sphinx: MPS Expectation #12 + + // Execute MPS computation + HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, quantumState, + workDesc, extentsPtr.data(), /*strides=*/nullptr, d_mpsTensors.data(), 0)); + + // Sphinx: MPS Expectation #13 + + // Create an empty tensor network operator + cutensornetNetworkOperator_t hamiltonian; + HANDLE_CUTN_ERROR(cutensornetCreateNetworkOperator(cutnHandle, numQubits, qubitDims.data(), CUDA_C_64F, &hamiltonian)); + // Append component (0.5 * Z1 * Z2) to the tensor network operator + { + const int32_t numModes[] = {1, 1}; // Z1 acts on 1 mode, Z2 acts on 1 mode + const int32_t modesZ1[] = {1}; // state modes Z1 acts on + const int32_t modesZ2[] 
= {2}; // state modes Z2 acts on + const int32_t * stateModes[] = {modesZ1, modesZ2}; // state modes (Z1 * Z2) acts on + const void * gateData[] = {d_gateZ, d_gateZ}; // GPU pointers to gate data + HANDLE_CUTN_ERROR(cutensornetNetworkOperatorAppendProduct(cutnHandle, hamiltonian, cuDoubleComplex{0.5,0.0}, + 2, numModes, stateModes, NULL, gateData, &id)); + } + // Append component (0.25 * Y3) to the tensor network operator + { + const int32_t numModes[] = {1}; // Y3 acts on 1 mode + const int32_t modesY3[] = {3}; // state modes Y3 acts on + const int32_t * stateModes[] = {modesY3}; // state modes (Y3) acts on + const void * gateData[] = {d_gateY}; // GPU pointers to gate data + HANDLE_CUTN_ERROR(cutensornetNetworkOperatorAppendProduct(cutnHandle, hamiltonian, cuDoubleComplex{0.25,0.0}, + 1, numModes, stateModes, NULL, gateData, &id)); + } + // Append component (0.13 * Y0 X2 Z3) to the tensor network operator + { + const int32_t numModes[] = {1, 1, 1}; // Y0 acts on 1 mode, X2 acts on 1 mode, Z3 acts on 1 mode + const int32_t modesY0[] = {0}; // state modes Y0 acts on + const int32_t modesX2[] = {2}; // state modes X2 acts on + const int32_t modesZ3[] = {3}; // state modes Z3 acts on + const int32_t * stateModes[] = {modesY0, modesX2, modesZ3}; // state modes (Y0 * X2 * Z3) acts on + const void * gateData[] = {d_gateY, d_gateX, d_gateZ}; // GPU pointers to gate data + HANDLE_CUTN_ERROR(cutensornetNetworkOperatorAppendProduct(cutnHandle, hamiltonian, cuDoubleComplex{0.13,0.0}, + 3, numModes, stateModes, NULL, gateData, &id)); + } + std::cout << "Constructed a tensor network operator: (0.5 * Z1 * Z2) + (0.25 * Y3) + (0.13 * Y0 * X2 * Z3)" << std::endl; + + // Sphinx: MPS Expectation #14 + + // Specify the quantum circuit expectation value + cutensornetStateExpectation_t expectation; + HANDLE_CUTN_ERROR(cutensornetCreateExpectation(cutnHandle, quantumState, hamiltonian, &expectation)); + std::cout << "Created the specified quantum circuit expectation value\n"; + + // 
Sphinx: MPS Expectation #15 + + // Configure the computation of the specified quantum circuit expectation value + const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder + HANDLE_CUTN_ERROR(cutensornetExpectationConfigure(cutnHandle, expectation, + CUTENSORNET_EXPECTATION_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); + + // Sphinx: MPS Expectation #16 + + // Prepare the specified quantum circuit expectation value for computation + HANDLE_CUTN_ERROR(cutensornetExpectationPrepare(cutnHandle, expectation, scratchSize, workDesc, 0x0)); + std::cout << "Prepared the specified quantum circuit expectation value\n"; + + // Sphinx: MPS Expectation #17 + + // Attach the workspace buffer + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Required scratch GPU workspace size (bytes) = " << worksize << std::endl; + if(worksize <= scratchSize) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer\n"; + + // Sphinx: MPS Expectation #18 + + // Compute the specified quantum circuit expectation value + std::complex expectVal{0.0,0.0}, stateNorm{0.0,0.0}; + HANDLE_CUTN_ERROR(cutensornetExpectationCompute(cutnHandle, expectation, workDesc, + static_cast(&expectVal), static_cast(&stateNorm), 0x0)); + std::cout << "Computed the specified quantum circuit expectation value\n"; + std::cout << "Expectation value = (" << expectVal.real() << ", " << expectVal.imag() << ")\n"; + std::cout << "State 2-norm = (" << stateNorm.real() << ", " << stateNorm.imag() << ")\n"; + + // Sphinx: MPS Expectation #19 + + // 
Destroy the workspace descriptor + HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + std::cout << "Destroyed the workspace descriptor\n"; + + // Destroy the quantum circuit expectation value + HANDLE_CUTN_ERROR(cutensornetDestroyExpectation(expectation)); + std::cout << "Destroyed the quantum circuit state expectation value\n"; + + // Destroy the tensor network operator + HANDLE_CUTN_ERROR(cutensornetDestroyNetworkOperator(hamiltonian)); + std::cout << "Destroyed the tensor network operator\n"; + + // Destroy the quantum circuit state + HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState)); + std::cout << "Destroyed the quantum circuit state\n"; + + for (int32_t i = 0; i < numQubits; i++) { + HANDLE_CUDA_ERROR(cudaFree(d_mpsTensors[i])); + } + HANDLE_CUDA_ERROR(cudaFree(d_scratch)); + HANDLE_CUDA_ERROR(cudaFree(d_gateCX)); + HANDLE_CUDA_ERROR(cudaFree(d_gateZ)); + HANDLE_CUDA_ERROR(cudaFree(d_gateY)); + HANDLE_CUDA_ERROR(cudaFree(d_gateX)); + HANDLE_CUDA_ERROR(cudaFree(d_gateH)); + std::cout << "Freed memory on GPU\n"; + + // Finalize the cuTensorNet library + HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle)); + std::cout << "Finalized the cuTensorNet library\n"; + + return 0; +} diff --git a/samples/cutensornet/high_level/mps_marginal_example.cu b/samples/cutensornet/high_level/mps_marginal_example.cu new file mode 100644 index 0000000..fbb2941 --- /dev/null +++ b/samples/cutensornet/high_level/mps_marginal_example.cu @@ -0,0 +1,270 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +// Sphinx: MPS Marginal #1 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + +#define HANDLE_CUTN_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSORNET_STATUS_SUCCESS ) \ + { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + + +int main(int argc, char **argv) +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!"); + + constexpr std::size_t fp64size = sizeof(double); + + // Sphinx: MPS Marginal #2 + + // Quantum state configuration + constexpr int32_t numQubits = 16; + const std::vector qubitDims(numQubits,2); // qubit dimensions + constexpr int32_t numMarginalModes = 2; // rank of the marginal (reduced density matrix) + const std::vector marginalModes({0,1}); // open qubits (must be in ascending order) + std::cout << "Quantum circuit: " << numQubits << " qubits\n"; + + // Sphinx: MPS Marginal #3 + + // Initialize the cuTensorNet library + HANDLE_CUDA_ERROR(cudaSetDevice(0)); + cutensornetHandle_t cutnHandle; + HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle)); + std::cout << "Initialized cuTensorNet library on GPU 0\n"; + + // Sphinx: MPS Marginal #4 + + // Define necessary quantum gate tensors in Host memory + const double invsq2 = 1.0 / std::sqrt(2.0); + // Hadamard gate + const std::vector> h_gateH {{invsq2, 0.0}, {invsq2, 0.0}, + {invsq2, 0.0}, {-invsq2, 0.0}}; + // CX gate + const std::vector> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}}; + + // Copy quantum gates to
Device memory + void *d_gateH{nullptr}, *d_gateCX{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size))); + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size))); + std::cout << "Allocated quantum gate memory on GPU\n"; + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice)); + std::cout << "Copied quantum gates to GPU memory\n"; + + // Sphinx: MPS Marginal #5 + + // Determine the MPS representation and allocate buffers for the MPS tensors + const int64_t maxExtent = 2; // GHZ state can be exactly represented with max bond dimension of 2 + std::vector> extents; + std::vector extentsPtr(numQubits); + std::vector d_mpsTensors(numQubits, nullptr); + for (int32_t i = 0; i < numQubits; i++) { + if (i == 0) { // left boundary MPS tensor + extents.push_back({2, maxExtent}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size)); + } + else if (i == numQubits-1) { // right boundary MPS tensor + extents.push_back({maxExtent, 2}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size)); + } + else { // middle MPS tensors + extents.push_back({maxExtent, 2, maxExtent}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * 2 * fp64size)); + } + extentsPtr[i] = extents[i].data(); + } + + // Sphinx: MPS Marginal #6 + + // Allocate the specified quantum circuit reduced density matrix (marginal) in Device memory + void *d_rdm{nullptr}; + std::size_t rdmDim = 1; + for(const auto & mode: marginalModes) rdmDim *= qubitDims[mode]; + const std::size_t rdmSize = rdmDim * rdmDim; + HANDLE_CUDA_ERROR(cudaMalloc(&d_rdm, rdmSize * (2 * fp64size))); + + // Sphinx: MPS Marginal #7 + + // Query the free memory on Device + std::size_t freeSize{0}, totalSize{0}; + HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize)); + const std::size_t scratchSize 
= (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment + void *d_scratch{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize)); + std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n"; + + // Sphinx: MPS Marginal #8 + + // Create the initial quantum state + cutensornetState_t quantumState; + HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(), + CUDA_C_64F, &quantumState)); + std::cout << "Created the initial quantum state\n"; + + // Sphinx: MPS Marginal #9 + + // Construct the final quantum circuit state (apply quantum gates) for the GHZ circuit + int64_t id; + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 1, std::vector{{0}}.data(), + d_gateH, nullptr, 1, 0, 1, &id)); + for(int32_t i = 1; i < numQubits; ++i) { + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 2, std::vector{{i-1,i}}.data(), + d_gateCX, nullptr, 1, 0, 1, &id)); + } + std::cout << "Applied quantum gates\n"; + + // Sphinx: MPS Marginal #10 + + // Specify the final target MPS representation (use default fortran strides) + HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, quantumState, + CUTENSORNET_BOUNDARY_CONDITION_OPEN, extentsPtr.data(), /*strides=*/nullptr)); + std::cout << "Requested the final MPS factorization of the quantum circuit state\n"; + + // Sphinx: MPS Marginal #11 + + // Optional, set up the SVD method for MPS truncation. 
+ cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; + HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, quantumState, + CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO, &algo, sizeof(algo))); + std::cout << "Configured the MPS factorization computation\n"; + + // Sphinx: MPS Marginal #12 + + // Prepare the MPS computation and attach workspace + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc)); + std::cout << "Created the workspace descriptor\n"; + HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, quantumState, scratchSize, workDesc, 0x0)); + int64_t worksize {0}; + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Scratch GPU workspace size (bytes) for MPS computation = " << worksize << std::endl; + if(worksize <= scratchSize) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer for the MPS factorization computation\n"; + + // Sphinx: MPS Marginal #13 + + // Execute MPS computation + HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, quantumState, + workDesc, extentsPtr.data(), /*strides=*/nullptr, d_mpsTensors.data(), 0)); + std::cout << "Computed the MPS factorization\n"; + + // Sphinx: MPS Marginal #14 + + // Specify the desired reduced density matrix (marginal) + cutensornetStateMarginal_t marginal; + HANDLE_CUTN_ERROR(cutensornetCreateMarginal(cutnHandle, quantumState, numMarginalModes, marginalModes.data(), + 0, nullptr, std::vector{{1,2,4,8}}.data(), &marginal)); // using explicit strides + std::cout << "Created the specified quantum circuit reduced densitry matrix 
(marginal)\n"; + + // Sphinx: MPS Marginal #15 + + // Configure the computation of the specified quantum circuit reduced density matrix (marginal) + const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder + HANDLE_CUTN_ERROR(cutensornetMarginalConfigure(cutnHandle, marginal, + CUTENSORNET_MARGINAL_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); + std::cout << "Configured the specified quantum circuit reduced density matrix (marginal) computation\n"; + + // Sphinx: MPS Marginal #16 + + // Prepare the specified quantum circuit reduced density matrix (marginal) + HANDLE_CUTN_ERROR(cutensornetMarginalPrepare(cutnHandle, marginal, scratchSize, workDesc, 0x0)); + std::cout << "Prepared the specified quantum circuit reduced density matrix (marginal)\n"; + + // Sphinx: MPS Marginal #17 + + // Attach the workspace buffer + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Required scratch GPU workspace size (bytes) for marginal computation = " << worksize << std::endl; + if(worksize <= scratchSize) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer\n"; + + // Sphinx: MPS Marginal #18 + + // Compute the specified quantum circuit reduced density matrix (marginal) + HANDLE_CUTN_ERROR(cutensornetMarginalCompute(cutnHandle, marginal, nullptr, workDesc, d_rdm, 0)); + std::cout << "Computed the specified quantum circuit reduced density matrix (marginal)\n"; + std::vector> h_rdm(rdmSize); + HANDLE_CUDA_ERROR(cudaMemcpy(h_rdm.data(), d_rdm, rdmSize * (2 * fp64size), cudaMemcpyDeviceToHost)); +
std::cout << "Reduced density matrix for " << numMarginalModes << " qubits:\n"; + for(std::size_t i = 0; i < rdmDim; ++i) { + for(std::size_t j = 0; j < rdmDim; ++j) { + std::cout << " " << h_rdm[i + j * rdmDim]; + } + std::cout << std::endl; + } + + // Sphinx: MPS Marginal #19 + + // Destroy the workspace descriptor + HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + std::cout << "Destroyed the workspace descriptor\n"; + + // Destroy the quantum circuit reduced density matrix + HANDLE_CUTN_ERROR(cutensornetDestroyMarginal(marginal)); + std::cout << "Destroyed the quantum circuit state reduced density matrix (marginal)\n"; + + // Destroy the quantum circuit state + HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState)); + std::cout << "Destroyed the quantum circuit state\n"; + + for (int32_t i = 0; i < numQubits; i++) { + HANDLE_CUDA_ERROR(cudaFree(d_mpsTensors[i])); + } + HANDLE_CUDA_ERROR(cudaFree(d_scratch)); + HANDLE_CUDA_ERROR(cudaFree(d_rdm)); + HANDLE_CUDA_ERROR(cudaFree(d_gateCX)); + HANDLE_CUDA_ERROR(cudaFree(d_gateH)); + std::cout << "Freed memory on GPU\n"; + + // Finalize the cuTensorNet library + HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle)); + std::cout << "Finalized the cuTensorNet library\n"; + + return 0; +} diff --git a/samples/cutensornet/high_level/mps_sampling_example.cu b/samples/cutensornet/high_level/mps_sampling_example.cu new file mode 100644 index 0000000..ebaf441 --- /dev/null +++ b/samples/cutensornet/high_level/mps_sampling_example.cu @@ -0,0 +1,254 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +// Sphinx: MPS Sampler #1 + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + +#define HANDLE_CUTN_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSORNET_STATUS_SUCCESS ) \ + { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + + +int main(int argc, char **argv) +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!"); + + constexpr std::size_t fp64size = sizeof(double); + + // Sphinx: MPS Sampler #2 + + // Quantum state configuration + const int64_t numSamples = 100; + const int32_t numQubits = 16; + const std::vector qubitDims(numQubits, 2); // qubit size + std::cout << "Quantum circuit: " << numQubits << " qubits; " << numSamples << " samples\n"; + + // Sphinx: MPS Sampler #3 + + // Initialize the cuTensorNet library + HANDLE_CUDA_ERROR(cudaSetDevice(0)); + cutensornetHandle_t cutnHandle; + HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle)); + std::cout << "Initialized cuTensorNet library on GPU 0\n"; + + // Sphinx: MPS Sampler #4 + + // Define necessary quantum gate tensors in Host memory + const double invsq2 = 1.0 / std::sqrt(2.0); + // Hadamard gate + const std::vector> h_gateH {{invsq2, 0.0}, {invsq2, 0.0}, + {invsq2, 0.0}, {-invsq2, 0.0}}; + // CX gate + const std::vector> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}}; + + // Copy quantum gates to Device memory + void *d_gateH{nullptr}, *d_gateCX{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size))); + std::cout << 
"H gate buffer allocated on GPU: " << d_gateH << std::endl; //debug + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size))); + std::cout << "CX gate buffer allocated on GPU: " << d_gateCX << std::endl; //debug + std::cout << "Allocated quantum gate memory on GPU\n"; + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice)); + std::cout << "Copied quantum gates to GPU memory\n"; + + // Sphinx: MPS Sampler #5 + + // Determine the MPS representation and allocate buffer for the MPS tensors + const int64_t maxExtent = 2; // GHZ state can be exactly represented with max bond dimension of 2 + std::vector> extents; + std::vector extentsPtr(numQubits); + std::vector d_mpsTensors(numQubits, nullptr); + for (int32_t i = 0; i < numQubits; i++) { + if (i == 0) { // left boundary MPS tensor + extents.push_back({2, maxExtent}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size)); + } + else if (i == numQubits-1) { // right boundary MPS tensor + extents.push_back({maxExtent, 2}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size)); + } + else { // middle MPS tensors + extents.push_back({maxExtent, 2, maxExtent}); + HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * 2 * fp64size)); + } + extentsPtr[i] = extents[i].data(); + } + + // Sphinx: MPS Sampler #6 + + // Query the free memory on Device + std::size_t freeSize {0}, totalSize {0}; + HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize)); + const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment + void *d_scratch {nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize)); + std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU: " + << "[" << d_scratch << ":" << (void*)(((char*)(d_scratch)) + scratchSize) 
<< ")\n"; + + // Sphinx: MPS Sampler #7 + + // Create the initial quantum state + cutensornetState_t quantumState; + HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(), + CUDA_C_64F, &quantumState)); + std::cout << "Created the initial quantum state\n"; + + // Sphinx: MPS Sampler #8 + + // Construct the quantum circuit state (apply quantum gates) + int64_t id; + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 1, std::vector{{0}}.data(), + d_gateH, nullptr, 1, 0, 1, &id)); + for(int32_t i = 1; i < numQubits; ++i) { + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 2, std::vector{{i-1,i}}.data(), + d_gateCX, nullptr, 1, 0, 1, &id)); + } + std::cout << "Applied quantum gates\n"; + + // Sphinx: MPS Sampler #9 + + // Specify the final target MPS representation (use default fortran strides) + HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, quantumState, + CUTENSORNET_BOUNDARY_CONDITION_OPEN, extentsPtr.data(), /*strides=*/nullptr )); + + // Sphinx: MPS Sampler #10 + + // Optional, set up the SVD method for truncation. 
+ cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; + HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, quantumState, + CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO, &algo, sizeof(algo))); + std::cout << "Configured the MPS computation\n"; + + // Sphinx: MPS Sampler #11 + + // Prepare the MPS computation and attach workspace + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc)); + std::cout << "Created the workspace descriptor\n"; + HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, quantumState, scratchSize, workDesc, 0x0)); + int64_t worksize {0}; + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Scratch GPU workspace size (bytes) for MPS computation = " << worksize << std::endl; + if(worksize <= scratchSize) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer for MPS computation\n"; + + // Sphinx: MPS Sampler #12 + + // Execute MPS computation + HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, quantumState, + workDesc, extentsPtr.data(), /*strides=*/nullptr, d_mpsTensors.data(), 0)); + + // Sphinx: MPS Sampler #13 + + // Create the quantum circuit sampler + cutensornetStateSampler_t sampler; + HANDLE_CUTN_ERROR(cutensornetCreateSampler(cutnHandle, quantumState, numQubits, nullptr, &sampler)); + std::cout << "Created the quantum circuit sampler\n"; + + // Sphinx: MPS Sampler #14 + + // Configure the quantum circuit sampler + const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder + 
HANDLE_CUTN_ERROR(cutensornetSamplerConfigure(cutnHandle, sampler, + CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); + + // Sphinx: MPS Sampler #15 + + // Prepare the quantum circuit sampler + HANDLE_CUTN_ERROR(cutensornetSamplerPrepare(cutnHandle, sampler, scratchSize, workDesc, 0x0)); + std::cout << "Prepared the quantum circuit state sampler\n"; + + // Sphinx: MPS Sampler #16 + + // Attach the workspace buffer + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Scratch GPU workspace size (bytes) for MPS Sampling = " << worksize << std::endl; + assert(worksize > 0); + if(worksize <= scratchSize) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer\n"; + + // Sphinx: MPS Sampler #17 + + // Sample the quantum circuit state + std::vector samples(numQubits * numSamples); // samples[SampleId][QubitId] reside in Host memory + HANDLE_CUTN_ERROR(cutensornetSamplerSample(cutnHandle, sampler, numSamples, workDesc, samples.data(), 0)); + std::cout << "Performed quantum circuit state sampling\n"; + std::cout << "Bit-string samples:\n"; + for(int64_t i = 0; i < numSamples; ++i) { + for(int64_t j = 0; j < numQubits; ++j) std::cout << " " << samples[i * numQubits + j]; + std::cout << std::endl; + } + + // Sphinx: MPS Sampler #18 + + // Destroy the workspace descriptor + HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + std::cout << "Destroyed the workspace descriptor\n"; + + // Destroy the quantum circuit sampler + HANDLE_CUTN_ERROR(cutensornetDestroySampler(sampler)); + std::cout << "Destroyed the quantum circuit state 
sampler\n"; + + // Destroy the quantum circuit state + HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState)); + std::cout << "Destroyed the quantum circuit state\n"; + + for (int32_t i = 0; i < numQubits; i++) { + HANDLE_CUDA_ERROR(cudaFree(d_mpsTensors[i])); + } + HANDLE_CUDA_ERROR(cudaFree(d_scratch)); + HANDLE_CUDA_ERROR(cudaFree(d_gateCX)); + HANDLE_CUDA_ERROR(cudaFree(d_gateH)); + std::cout << "Freed memory on GPU\n"; + + // Finalize the cuTensorNet library + HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle)); + std::cout << "Finalized the cuTensorNet library\n"; + + return 0; +} diff --git a/samples/cutensornet/high_level/sampling_example.cu b/samples/cutensornet/high_level/sampling_example.cu index 3c0c2e0..64376aa 100644 --- a/samples/cutensornet/high_level/sampling_example.cu +++ b/samples/cutensornet/high_level/sampling_example.cu @@ -30,6 +30,8 @@ int main(int argc, char **argv) { + static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!"); + constexpr std::size_t fp64size = sizeof(double); // Sphinx: Sampler #2 diff --git a/samples/cutensornet/high_level/sampling_mpi_example.cu b/samples/cutensornet/high_level/sampling_mpi_example.cu new file mode 100644 index 0000000..65ba2cc --- /dev/null +++ b/samples/cutensornet/high_level/sampling_mpi_example.cu @@ -0,0 +1,259 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+// Sphinx: Sampler #1
+
+#include <cstdlib>
+#include <cstdio>
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <vector>
+#include <iostream>
+
+#include <cuda_runtime.h>
+#include <cutensornet.h>
+
+#include <mpi.h>
+
+// Evaluate a CUDA runtime call; on anything other than cudaSuccess print the
+// error string with the source line, flush stdout and abort the process.
+#define HANDLE_CUDA_ERROR(x) \
+{ const auto err = x; \
+  if( err != cudaSuccess ) \
+  { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); \
+    fflush(stdout); \
+    std::abort(); \
+  } \
+};
+
+// Evaluate a cuTensorNet call; on anything other than CUTENSORNET_STATUS_SUCCESS
+// print the error string with the source line, flush stdout and abort the process.
+#define HANDLE_CUTN_ERROR(x) \
+{ const auto err = x; \
+  if( err != CUTENSORNET_STATUS_SUCCESS ) \
+  { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
+    fflush(stdout); \
+    std::abort(); \
+  } \
+};
+
+// Evaluate an MPI call; on anything other than MPI_SUCCESS print the MPI error
+// string with the source line, flush stdout and call MPI_Abort on MPI_COMM_WORLD.
+#define HANDLE_MPI_ERROR(x) \
+{ const auto err = x; \
+  if( err != MPI_SUCCESS ) \
+  { char error[MPI_MAX_ERROR_STRING]; \
+    int len; \
+    MPI_Error_string(err, error, &len); \
+    printf("MPI Error: %s in line %d\n", error, __LINE__); \
+    fflush(stdout); \
+    MPI_Abort(MPI_COMM_WORLD, err); \
+  } \
+};
+
+
+/*
+ * Sample driver: builds a 16-qubit circuit state with cuTensorNet (one Hadamard
+ * gate on qubit 0 followed by CX gates on the qubit pairs (i-1, i)), then draws
+ * numSamples bit-strings from it with the state sampler after a duplicate of
+ * MPI_COMM_WORLD has been handed to cuTensorNet for distributed execution.
+ *
+ * Progress messages are printed by MPI rank 0 only. Returns 0 on success; any
+ * failing CUDA, cuTensorNet or MPI call terminates the run through the macros above.
+ */
+int main(int argc, char **argv)
+{
+  static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
+
+  constexpr std::size_t fp64size = sizeof(double);
+
+  // Sphinx: Sampler #2
+
+  // Initialize MPI library
+  HANDLE_MPI_ERROR(MPI_Init(&argc, &argv));
+  int rank {-1};
+  HANDLE_MPI_ERROR(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+  int numProcs {0};
+  HANDLE_MPI_ERROR(MPI_Comm_size(MPI_COMM_WORLD, &numProcs));
+
+  bool verbose = (rank == 0) ? true : false;
+  if (verbose)
+  {
+    std::cout << "*** Printing is done only from the root MPI process to prevent jumbled messages ***\n";
+    std::cout << "The number of MPI processes is " << numProcs << std::endl;
+  }
+
+  // Sphinx: Sampler #3
+
+  // Quantum state configuration
+  const int64_t numSamples = 100;
+  const int32_t numQubits = 16;
+  const std::vector<int64_t> qubitDims(numQubits, 2); // qubit size
+  if (verbose)
+    std::cout << "Quantum circuit: " << numQubits << " qubits; " << numSamples << " samples\n";
+
+  // Sphinx: Sampler #4
+
+  // Initialize the cuTensorNet library
+  int numDevices {0};
+  HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
+  const int deviceId = rank % numDevices; // we assume that the processes are mapped to nodes in contiguous chunks
+  HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
+
+  cutensornetHandle_t cutnHandle;
+  HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
+  if (verbose)
+    std::cout << "Initialized cuTensorNet library on GPU 0\n"; // only rank 0 prints, and rank 0 selects device 0
+
+  // Sphinx: Sampler #5
+
+  // Activate distributed (parallel) execution
+  // HANDLE_CUTN_ERROR(cutensornetDistributedResetConfiguration(cutnHandle, NULL, 0)); // reset back to serial execution
+  MPI_Comm cutnComm;
+  HANDLE_MPI_ERROR(MPI_Comm_dup(MPI_COMM_WORLD, &cutnComm)); // duplicate MPI communicator to dedicate it to cuTensorNet
+  HANDLE_CUTN_ERROR(cutensornetDistributedResetConfiguration(cutnHandle, &cutnComm, sizeof(cutnComm)));
+  if(verbose)
+    printf("Reset cuTensorNet distributed MPI configuration\n");
+
+  // Sphinx: Sampler #6
+
+  // Define necessary quantum gate tensors in Host memory
+  const double invsq2 = 1.0 / std::sqrt(2.0);
+  //  Hadamard gate
+  const std::vector<std::complex<double>> h_gateH {{invsq2, 0.0},  {invsq2, 0.0},
+                                                   {invsq2, 0.0}, {-invsq2, 0.0}};
+  //  CX gate
+  const std::vector<std::complex<double>> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
+                                                    {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
+                                                    {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0},
+                                                    {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
+
+  // Copy quantum gates to Device memory
+  // (each complex element occupies 2 * fp64size bytes: 4 elements for H, 16 for CX)
+  void *d_gateH{nullptr}, *d_gateCX{nullptr};
+  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
+  if (verbose)
+    std::cout << "H gate buffer allocated on GPU: " << d_gateH << std::endl; //debug
+  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size)));
+  if (verbose)
+    std::cout << "CX gate buffer allocated on GPU: " << d_gateCX << std::endl; //debug
+  if (verbose)
+    std::cout << "Allocated quantum gate memory on GPU\n";
+  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
+  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
+  if (verbose)
+    std::cout << "Copied quantum gates to GPU memory\n";
+
+  // Sphinx: Sampler #7
+
+  // Create the initial quantum state
+  cutensornetState_t quantumState;
+  HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
+                    CUDA_C_64F, &quantumState));
+  if (verbose)
+    std::cout << "Created the initial quantum state\n";
+
+  // Sphinx: Sampler #8
+
+  // Construct the quantum circuit state (apply quantum gates):
+  // H on qubit 0, then CX on every pair of neighbouring qubits (i-1, i)
+  int64_t id;
+  HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 1, std::vector<int32_t>{{0}}.data(),
+                    d_gateH, nullptr, 1, 0, 1, &id));
+  for(int32_t i = 1; i < numQubits; ++i) {
+    HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 2, std::vector<int32_t>{{i-1,i}}.data(),
+                      d_gateCX, nullptr, 1, 0, 1, &id));
+  }
+  if (verbose)
+    std::cout << "Applied quantum gates\n";
+
+  // Sphinx: Sampler #9
+
+  // Create the quantum circuit sampler
+  cutensornetStateSampler_t sampler;
+  HANDLE_CUTN_ERROR(cutensornetCreateSampler(cutnHandle, quantumState, numQubits, nullptr, &sampler));
+  if (verbose)
+    std::cout << "Created the quantum circuit sampler\n";
+
+  // Sphinx: Sampler #10
+
+  // Query the free memory on Device
+  std::size_t freeSize {0}, totalSize {0};
+  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
+  std::size_t scratchSize = size_t(double(freeSize) * 0.9) / 8; // assume max of 8 GPUs per node
+  scratchSize -= scratchSize % 4096; // round down to a multiple of 4096 bytes
+  void *d_scratch {nullptr};
+  HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
+  if (verbose)
+    std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU: "
+              << "[" << d_scratch << ":" << (void*)(((char*)(d_scratch))  + scratchSize) << ")\n";
+
+  // Sphinx: Sampler #11
+
+  // Configure the quantum circuit sampler
+  const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder
+  HANDLE_CUTN_ERROR(cutensornetSamplerConfigure(cutnHandle, sampler,
+                    CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples)));
+
+  // Sphinx: Sampler #12
+
+  // Prepare the quantum circuit sampler
+  cutensornetWorkspaceDescriptor_t workDesc;
+  HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
+  HANDLE_CUTN_ERROR(cutensornetSamplerPrepare(cutnHandle, sampler, scratchSize, workDesc, 0x0));
+  if (verbose)
+    std::cout << "Prepared the quantum circuit state sampler\n";
+
+  // Sphinx: Sampler #13
+
+  // Attach the workspace buffer
+  int64_t worksize {0};
+  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
+                                                      workDesc,
+                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
+                                                      CUTENSORNET_MEMSPACE_DEVICE,
+                                                      CUTENSORNET_WORKSPACE_SCRATCH,
+                                                      &worksize));
+  assert(worksize > 0);
+  // worksize is signed and scratchSize is unsigned; spell out the conversion the
+  // comparison performs instead of leaving it implicit.
+  if(static_cast<std::size_t>(worksize) <= scratchSize) {
+    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
+                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
+  }else{
+    // Flush explicitly: aborting does not flush std::cout, so the message could
+    // otherwise be lost. MPI_Abort asks MPI to terminate the whole job instead of
+    // killing just this process.
+    std::cout << "ERROR: Insufficient workspace size on Device!\n" << std::flush;
+    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+    std::abort(); // fallback in case MPI_Abort returns
+  }
+  if (verbose)
+    std::cout << "Set the workspace buffer\n";
+
+  // Sphinx: Sampler #14
+
+  // Sample the quantum circuit state
+  std::vector<int64_t> samples(numQubits * numSamples); // samples[SampleId][QubitId] reside in Host memory
+  HANDLE_CUTN_ERROR(cutensornetSamplerSample(cutnHandle, sampler, numSamples, workDesc, samples.data(), 0));
+  if (verbose) {
+    std::cout << "Performed quantum circuit state sampling\n";
+    std::cout << "Bit-string samples:\n";
+    // one output line per sample, one value per qubit
+    for(int64_t i = 0; i < numSamples; ++i) {
+      for(int64_t j = 0; j < numQubits; ++j) std::cout << " " << samples[i * numQubits + j];
+      std::cout << std::endl;
+    }
+  }
+
+  // Sphinx: Sampler #15
+
+  // Destroy the workspace descriptor
+  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
+  if (verbose)
+    std::cout << "Destroyed the workspace descriptor\n";
+
+  // Destroy the quantum circuit sampler
+  HANDLE_CUTN_ERROR(cutensornetDestroySampler(sampler));
+  if (verbose)
+    std::cout << "Destroyed the quantum circuit state sampler\n";
+
+  // Destroy the quantum circuit state
+  HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));
+  if (verbose)
+    std::cout << "Destroyed the quantum circuit state\n";
+
+  HANDLE_CUDA_ERROR(cudaFree(d_scratch));
+  HANDLE_CUDA_ERROR(cudaFree(d_gateCX));
+  HANDLE_CUDA_ERROR(cudaFree(d_gateH));
+  if (verbose)
+    std::cout << "Freed memory on GPU\n";
+
+  // Finalize the cuTensorNet library
+  HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
+  if (verbose)
+    std::cout << "Finalized the cuTensorNet library\n";
+
+  // Release the communicator that was duplicated for cuTensorNet. This is done
+  // only after the handle has been destroyed.
+  // NOTE(review): confirm against the cutensornetDistributedResetConfiguration
+  // documentation that the library holds no reference past cutensornetDestroy.
+  HANDLE_MPI_ERROR(MPI_Comm_free(&cutnComm));
+
+  // Finalize the MPI library
+  HANDLE_MPI_ERROR(MPI_Finalize());
+
+  return 0;
+}