Skip to content

Commit

Permalink
Merge branch 'v24.08.0.1' into 'main'
Browse files Browse the repository at this point in the history
Updating benchmarks and adding back testers

See merge request cuda-hpc-libraries/cuquantum-sdk/cuquantum-public!30
  • Loading branch information
mtjrider committed Sep 11, 2024
2 parents db47013 + 0d70471 commit 8d3bbce
Show file tree
Hide file tree
Showing 14 changed files with 1,653 additions and 129 deletions.
4 changes: 2 additions & 2 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ For backends that support MPI parallelism, it is assumed that `MPI_COMM_WORLD` i

Examples:
- `cuquantum-benchmarks api --benchmark apply_matrix --targets 4,5 --controls 2,3 --nqubits 16`: Apply a random gate matrix controlled by qubits 2 & 3 to qubits 4 & 5 of a 16-qubit statevector using cuStateVec's `apply_matrix()` API
- `cuquantum-benchmarks circuit --frontend qiskit --backend cutn --benchmark qft --nqubits 8 --ngpus 1`: Construct a 8-qubit QFT circuit in Qiskit and run it with cuTensorNet on GPU
- `cuquantum-benchmarks circuit --frontend qiskit --backend cutn --compute-mode statevector --benchmark qft --nqubits 8 --ngpus 1`: Construct an 8-qubit QFT circuit in Qiskit and compute the statevector with cuTensorNet on GPU. Note that `--compute-mode` can be specified only for the `cutn` backend and supports `amplitude` (default), `statevector`, and `expectation`.
- `cuquantum-benchmarks circuit --frontend cirq --backend qsim-mgpu --benchmark qaoa --nqubits 16 --ngpus 2`: Construct a 16-qubit QAOA circuit in Cirq and run it with the (multi-GPU) `qsim-mgpu` backend on 2 GPUs (requires cuQuantum Appliance)
- `mpiexec -n 4 cuquantum-benchmarks circuit --frontend qiskit --backend cusvaer --benchmark quantum_volume --nqubits 32 --ngpus 1 --cusvaer-global-index-bits 1,1 --cusvaer-p2p-device-bits 1`: Construct a 32-qubit Quantum Volume circuit in Qiskit and run it with the (multi-GPU-multi-node) `cusvaer` backend on 2 nodes. Each node runs 2 MPI processes, each of which controls 1 GPU (requires cuQuantum Appliance)

Expand All @@ -69,9 +69,9 @@ It is recommended to loop over all recorded `sim_config_hash` to gather perf dat
Currently all environment variables are reserved for internal use only, and are subject to change in the future without notification.

* `CUTENSORNET_DUMP_TN=txt`
* `CUTENSORNET_BENCHMARK_TARGET={amplitude,state_vector,expectation}` (pick one)
* `CUTENSORNET_APPROX_TN_UTILS_PATH`
* `CUQUANTUM_BENCHMARKS_DUMP_GATES`
* `CUQUANTUM_BENCHMARKS_TCS_FULL_TENSOR`

## Development Overview

Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cuquantum_benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES
#
# SPDX-License-Identifier: BSD-3-Clause

__version__ = '0.3.2'
__version__ = '0.4.0'
5 changes: 3 additions & 2 deletions benchmarks/cuquantum_benchmarks/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
import cupy as cp
import numpy as np
import nvtx
from cuquantum import cudaDataType, ComputeType
from cuquantum.cutensornet._internal.einsum_parser import create_size_dict
import psutil


Expand Down Expand Up @@ -62,6 +60,7 @@ def precision_str_to_dtype(precision, is_complex=True):


def dtype_to_cuda_type(dtype):
from cuquantum import cudaDataType
if dtype == np.complex64:
return cudaDataType.CUDA_C_32F
elif dtype == np.complex128:
Expand All @@ -71,6 +70,7 @@ def dtype_to_cuda_type(dtype):


def dtype_to_compute_type(dtype):
from cuquantum import ComputeType
if dtype == np.complex64:
return ComputeType.COMPUTE_32F
elif dtype == np.complex128:
Expand All @@ -80,6 +80,7 @@ def dtype_to_compute_type(dtype):


def generate_size_dict_from_operands(einsum, operands):
from cuquantum.cutensornet._internal.einsum_parser import create_size_dict
inputs = einsum.split("->")[0]
inputs = inputs.split(",")
assert len(inputs) == len(operands)
Expand Down
72 changes: 40 additions & 32 deletions benchmarks/cuquantum_benchmarks/backends/backend_cutn.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,19 @@
import os
import time
import warnings
from math import log10, log2

import numpy as np
import cupy as cp
from cuquantum import contract, contract_path, CircuitToEinsum
from cuquantum import cutensornet as cutn

from .backend import Backend

try:
from cuquantum import contract, contract_path, CircuitToEinsum
from cuquantum import cutensornet as cutn
except ImportError:
cutn = None

from .._utils import convert_einsum_to_txt, generate_size_dict_from_operands, is_running_mpiexec


Expand All @@ -24,6 +30,8 @@
class cuTensorNet(Backend):

def __init__(self, ngpus, ncpu_threads, precision, **kwargs):
if cutn is None:
raise RuntimeError("cuquantum-python is not installed")
if ngpus != 1:
raise ValueError("the cutn backend must be run with --ngpus 1 (regardless if MPI is in use)")

Expand All @@ -32,6 +40,7 @@ def __init__(self, ngpus, ncpu_threads, precision, **kwargs):
self.nqubits = kwargs.pop('nqubits')
self.rank = 0
self.handle = cutn.create()
self.meta = {}
try:
# cuQuantum Python 22.11+ supports nonblocking & auto-MPI
opts = cutn.NetworkOptions(handle=self.handle, blocking="auto")
Expand All @@ -50,47 +59,49 @@ def __init__(self, ngpus, ncpu_threads, precision, **kwargs):
# cuQuantum Python 22.07 or below
opts = cutn.NetworkOptions(handle=self.handle)
self.network_opts = opts
self.n_samples = kwargs.pop('nhypersamples')
self.n_hyper_samples = kwargs.pop('nhypersamples')
self.version = cutn.get_version()

self.meta["backend"] = f"cutn-v{self.version} precision={self.precision}"
self.meta['nhypersamples'] = self.n_hyper_samples
self.meta['cpu_threads'] = self.ncpu_threads

def __del__(self):
cutn.destroy(self.handle)

def preprocess_circuit(self, circuit, *args, **kwargs):
circuit_filename = kwargs.pop('circuit_filename')
target = kwargs.pop('target')
pauli = kwargs.pop('pauli')
preprocess_data = {}
self.compute_mode = kwargs.pop('compute_mode')
self.pauli = kwargs.pop('pauli')
valid_choices = ['amplitude', 'expectation', 'statevector']
if self.compute_mode not in valid_choices:
raise ValueError(f"The string '{self.compute_mode}' is not a valid option for --compute-mode argument. Valid options are: {valid_choices}")

t1 = time.perf_counter()

if self.precision == 'single':
circuit_converter = CircuitToEinsum(circuit, dtype='complex64', backend=cp)
else:
circuit_converter = CircuitToEinsum(circuit, dtype='complex128', backend=cp)

t2 = time.perf_counter()
time_circ2einsum = t2 - t1
logger.info(f'CircuitToEinsum took {time_circ2einsum} s')


t1 = time.perf_counter()
if target == 'amplitude':
if self.compute_mode == 'amplitude':
# any bitstring would give same TN topology, so let's just pick "000...0"
self.expression, self.operands = circuit_converter.amplitude('0'*self.nqubits)
elif target == 'state_vector':
elif self.compute_mode == 'statevector':
self.expression, self.operands = circuit_converter.state_vector()
elif target == 'expectation':
elif self.compute_mode == 'expectation':
# new in cuQuantum Python 22.11
assert pauli is not None
logger.info(f"compute expectation value for Pauli string: {pauli}")
self.expression, self.operands = circuit_converter.expectation(pauli)
assert self.pauli is not None
logger.info(f"compute expectation value for Pauli string: {self.pauli}")
self.expression, self.operands = circuit_converter.expectation(self.pauli)
else:
# TODO: add other CircuitToEinsum methods?
raise NotImplementedError(f"the target {target} is not supported")
raise NotImplementedError(f"the target {self.compute_mode} is not supported")
t2 = time.perf_counter()
time_tn = t2 - t1
logger.info(f'{target}() took {time_tn} s')


tn_format = os.environ.get('CUTENSORNET_DUMP_TN')
if tn_format == 'txt':
size_dict = generate_size_dict_from_operands(
Expand All @@ -106,23 +117,20 @@ def preprocess_circuit(self, circuit, *args, **kwargs):
t1 = time.perf_counter()
path, opt_info = self.network.contract_path(
# TODO: samples may be too large for small circuits
optimize={'samples': self.n_samples, 'threads': self.ncpu_threads})
optimize={'samples': self.n_hyper_samples, 'threads': self.ncpu_threads})
t2 = time.perf_counter()
time_path = t2 - t1
logger.info(f'contract_path() took {time_path} s')
logger.debug(f'# samples: {self.n_samples}')
logger.debug(opt_info)

self.path = path
self.opt_info = opt_info
preprocess_data = {
'CircuitToEinsum': time_circ2einsum,
target: time_tn,
'contract_path': time_path,
}

return preprocess_data
self.opt_info = opt_info # cuTensorNet returns "real-number" Flops. To get the true FLOP count, multiply it by 4

self.meta['compute-mode'] = f'{self.compute_mode}()'
self.meta[f'circuit to einsum'] = f"{time_circ2einsum + time_tn} s"

logger.info(f'data: {self.meta}')
logger.info(f'log10[FLOPS]: {log10(self.opt_info.opt_cost * 4)} log2[SIZE]: {log2(opt_info.largest_intermediate)} contract_path(): {time_path} s')
pre_data = {'circuit to einsum time': time_circ2einsum + time_tn, 'contract path time': time_path,
'log2[LargestInter]': log2(opt_info.largest_intermediate), 'log10[FLOPS]': log10(self.opt_info.opt_cost * 4) }
return pre_data
def run(self, circuit, nshots=0):
if self.rank == 0 and nshots > 0:
warnings.warn("the cutn backend does not support sampling")
Expand Down
75 changes: 46 additions & 29 deletions benchmarks/cuquantum_benchmarks/backends/backend_pny.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,51 +43,66 @@ def find_version(self, identifier):
if identifier == "pennylane-lightning-gpu":
if self.ngpus == 1:
try:
import pennylane_lightning_gpu
except ImportError as e:
raise RuntimeError("PennyLane-Lightning-GPU plugin is not installed") from e
from pennylane_lightning.lightning_gpu import LightningGPU
return LightningGPU.version
except ImportError:
try: # pre pennylane_lightning 0.33.0 version
import pennylane_lightning_gpu
return pennylane_lightning_gpu.__version__
except ImportError:
raise RuntimeError("PennyLane-Lightning-GPU plugin is not installed")
else:
raise ValueError(f"cannot specify --ngpus > 1 for the backend {identifier}")
ver = pennylane_lightning_gpu.__version__
elif identifier == "pennylane-lightning-kokkos":
try:
import pennylane_lightning_kokkos
except ImportError as e:
raise RuntimeError("PennyLane-Lightning-Kokkos plugin is not installed") from e
ver = pennylane_lightning_kokkos.__version__
from pennylane_lightning.lightning_kokkos import LightningKokkos
return LightningKokkos.version
except ImportError:
try: # pre pennylane_lightning 0.33.0 version
import pennylane_lightning_kokkos
return pennylane_lightning_kokkos.__version__
except ImportError:
raise RuntimeError("PennyLane-Lightning-Kokkos plugin is not installed")
elif identifier == "pennylane-lightning-qubit":
try:
from pennylane_lightning import lightning_qubit
return lightning_qubit.__version__
except ImportError as e:
raise RuntimeError("PennyLane-Lightning plugin is not installed") from e
ver = lightning_qubit.__version__
else: # identifier == "pennylane"
ver = pennylane.__version__
return ver

return pennylane.__version__

def _make_qnode(self, circuit, nshots=1024, **kwargs):
if self.identifier == "pennylane-lightning-gpu":
dev = pennylane.device("lightning.gpu", wires=self.nqubits, shots=nshots, c_dtype=self.dtype)
elif self.identifier == "pennylane-lightning-kokkos":
# there's no way for us to query what execution space (=backend) that kokkos supports at runtime,
# so let's just set up Kokkos::InitArguments and hope Kokkos does the right thing...
dev = None
try:
import pennylane_lightning_kokkos
except ImportError as e:
raise RuntimeError("PennyLane-Lightning-Kokkos plugin is not installed") from e
args = pennylane_lightning_kokkos.lightning_kokkos.InitArguments()
args.num_threads = self.ncpu_threads
args.disable_warnings = int(logger.getEffectiveLevel() != logging.DEBUG)
## Disable MPI because it's unclear if pennylane actually supports it (at least it's untested)
# # if we're running MPI, we want to know now and get it init'd before kokkos is
# MPI = is_running_mpi()
# if MPI:
# comm = MPI.COMM_WORLD
# args.ndevices = min(comm.Get_size(), self.ngpus) # note: kokkos uses 1 GPU per process
dev = pennylane.device(
if self.ncpu_threads > 1 :
warnings.warn(f"--ncputhreads is ignored for {self.identifier}", stacklevel=2)
dev = pennylane.device(
"lightning.kokkos", wires=self.nqubits, shots=nshots, c_dtype=self.dtype,
sync=False,
kokkos_args=args)
sync=False)
except ImportError:
try: # pre pennylane_lightning 0.33.0 version
from pennylane_lightning_kokkos.lightning_kokkos import InitArguments
args = InitArguments()
args.num_threads = self.ncpu_threads
args.disable_warnings = int(logger.getEffectiveLevel() != logging.DEBUG)
## Disable MPI because it's unclear if pennylane actually supports it (at least it's untested)
# # if we're running MPI, we want to know now and get it init'd before kokkos is
# MPI = is_running_mpi()
# if MPI:
# comm = MPI.COMM_WORLD
# args.ndevices = min(comm.Get_size(), self.ngpus) # note: kokkos uses 1 GPU per process
dev = pennylane.device(
"lightning.kokkos", wires=self.nqubits, shots=nshots, c_dtype=self.dtype,
sync=False,
kokkos_args=args)
except ImportError:
raise RuntimeError("Could not load PennyLane-Lightning-Kokkos plugin. Is it installed?")
elif self.identifier == "pennylane-lightning-qubit":
if self.ngpus != 0:
raise ValueError(f"cannot specify --ngpus for the backend {self.identifier}")
Expand All @@ -98,10 +113,12 @@ def _make_qnode(self, circuit, nshots=1024, **kwargs):
elif self.identifier == "pennylane":
if self.ngpus != 0:
raise ValueError(f"cannot specify --ngpus for the backend {self.identifier}")
dev = pennylane.device("default.qubit", wires=self.nqubits, shots=nshots, c_dtype=self.dtype)
if self.dtype == np.complex64:
raise ValueError("As of version 0.33.0, Pennylane's default.qubit device only supports double precision.")
dev = pennylane.device("default.qubit", wires=self.nqubits, shots=nshots)
else:
raise ValueError(f"the backend {self.identifier} is not recognized")

qnode = pennylane.QNode(circuit, device=dev)
return qnode

Expand Down
8 changes: 6 additions & 2 deletions benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,12 @@ def find_version(self, identifier):
if identifier == 'cusvaer':
return version('cusvaer')

if hasattr(qiskit_aer, "__version__"):
return qiskit_aer.__version__
if hasattr(qiskit, "__version__") and qiskit.__version__ >= "1.0.0":
try:
from qiskit_aer import __version__ as aer_version
return aer_version
except ImportError as e:
raise RuntimeError("qiskit-aer (or qiskit-aer-gpu) is not installed") from e
else:
return qiskit.__qiskit_version__['qiskit-aer']

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/cuquantum_benchmarks/benchmarks/qpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def generateGatesSequence(nqubits, config):

# Example instantiation of QPE circuit parameterized by nqubits
phase = 1/3
U = np.mat([[1, 0], [0, np.exp(np.pi * 1j * phase)]])
U = np.asmatrix([[1, 0], [0, np.exp(np.pi * 1j * phase)]])
in_nqubits = 1
unfold = config['unfold']
measure = config['measure']
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/cuquantum_benchmarks/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@
'nfused': None,
'ngpus': 0,
'ncputhreads': 1,
'precision': 'single',
'precision': 'double',
},
},

Expand Down
Loading

0 comments on commit 8d3bbce

Please sign in to comment.