Merge branch 'v24.08.0.1' into 'main'

Updating benchmarks and adding back testers. See merge request cuda-hpc-libraries/cuquantum-sdk/cuquantum-public!30
NVIDIA · Sep 11, 2024 · 8d3bbce · 8d3bbce
2 parents db47013 + 0d70471
commit 8d3bbce
Show file tree

Hide file tree

Showing 14 changed files with 1,653 additions and 129 deletions.
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -50,7 +50,7 @@ For backends that support MPI parallelism, it is assumed that `MPI_COMM_WORLD` i
 
 Examples:
 - `cuquantum-benchmarks api --benchmark apply_matrix --targets 4,5 --controls 2,3 --nqubits 16`: Apply a random gate matrix controlled by qubits 2 & 3 to qubits 4 & 5 of a 16-qubit statevector using cuStateVec's `apply_matrix()` API
-- `cuquantum-benchmarks circuit --frontend qiskit --backend cutn --benchmark qft --nqubits 8 --ngpus 1`: Construct a 8-qubit QFT circuit in Qiskit and run it with cuTensorNet on GPU
+- `cuquantum-benchmarks circuit --frontend qiskit --backend cutn --compute-mode statevector --benchmark qft --nqubits 8 --ngpus 1`: Construct an 8-qubit QFT circuit in Qiskit and compute the statevector with cuTensorNet on GPU. Note that `--compute-mode` can be specified only for the `cutn` backend and supports `amplitude` (default), `statevector`, and `expectation`.
 - `cuquantum-benchmarks circuit --frontend cirq --backend qsim-mgpu --benchmark qaoa --nqubits 16 --ngpus 2`: Construct a 16-qubit QAOA circuit in Cirq and run it with the (multi-GPU) `qsim-mgpu` backend on 2 GPUs (requires cuQuantum Appliance)
 - `mpiexec -n 4 cuquantum-benchmarks circuit --frontend qiskit --backend cusvaer --benchmark quantum_volume --nqubits 32 --ngpus 1 --cusvaer-global-index-bits 1,1 --cusvaer-p2p-device-bits 1`: Construct a 32-qubit Quantum Volume circuit in Qiskit and run it with the (multi-GPU-multi-node) `cusvaer` backend on 2 nodes. Each node runs 2 MPI processes, each of which controls 1 GPU (requires cuQuantum Appliance)
 
@@ -69,9 +69,9 @@ It is recommended to loop over all recorded `sim_config_hash` to gather perf dat
 Currently all environment variables are reserved for internal use only, and are subject to change in the future without notification.
 
 * `CUTENSORNET_DUMP_TN=txt`
-* `CUTENSORNET_BENCHMARK_TARGET={amplitude,state_vector,expectation}` (pick one)
 * `CUTENSORNET_APPROX_TN_UTILS_PATH`
 * `CUQUANTUM_BENCHMARKS_DUMP_GATES`
+* `CUQUANTUM_BENCHMARKS_TCS_FULL_TENSOR`
 
 ## Development Overview
 

diff --git a/benchmarks/cuquantum_benchmarks/__init__.py b/benchmarks/cuquantum_benchmarks/__init__.py
@@ -1,5 +1,5 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
-__version__ = '0.3.2'
+__version__ = '0.4.0'
diff --git a/benchmarks/cuquantum_benchmarks/_utils.py b/benchmarks/cuquantum_benchmarks/_utils.py
@@ -21,8 +21,6 @@
 import cupy as cp
 import numpy as np
 import nvtx
-from cuquantum import cudaDataType, ComputeType
-from cuquantum.cutensornet._internal.einsum_parser import create_size_dict
 import psutil
 
 
@@ -62,6 +60,7 @@ def precision_str_to_dtype(precision, is_complex=True):
 
 
 def dtype_to_cuda_type(dtype):
+    from cuquantum import cudaDataType
     if dtype == np.complex64:
         return cudaDataType.CUDA_C_32F
     elif dtype == np.complex128:
@@ -71,6 +70,7 @@ def dtype_to_cuda_type(dtype):
 
 
 def dtype_to_compute_type(dtype):
+    from cuquantum import ComputeType
     if dtype == np.complex64:
         return ComputeType.COMPUTE_32F
     elif dtype == np.complex128:
@@ -80,6 +80,7 @@ def dtype_to_compute_type(dtype):
 
 
 def generate_size_dict_from_operands(einsum, operands):
+    from cuquantum.cutensornet._internal.einsum_parser import create_size_dict
     inputs = einsum.split("->")[0]
     inputs = inputs.split(",")
     assert len(inputs) == len(operands)

diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py b/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py
@@ -6,13 +6,19 @@
 import os
 import time
 import warnings
+from math import log10, log2
 
 import numpy as np
 import cupy as cp
-from cuquantum import contract, contract_path, CircuitToEinsum
-from cuquantum import cutensornet as cutn
 
 from .backend import Backend
+
+try:
+    from cuquantum import contract, contract_path, CircuitToEinsum
+    from cuquantum import cutensornet as cutn
+except ImportError:
+    cutn = None
+
 from .._utils import convert_einsum_to_txt, generate_size_dict_from_operands, is_running_mpiexec
 
 
@@ -24,6 +30,8 @@
 class cuTensorNet(Backend):
 
     def __init__(self, ngpus, ncpu_threads, precision, **kwargs):
+        if cutn is None:
+            raise RuntimeError("cuquantum-python is not installed")
         if ngpus != 1:
             raise ValueError("the cutn backend must be run with --ngpus 1 (regardless if MPI is in use)")
 
@@ -32,6 +40,7 @@ def __init__(self, ngpus, ncpu_threads, precision, **kwargs):
         self.nqubits = kwargs.pop('nqubits')
         self.rank = 0
         self.handle = cutn.create()
+        self.meta = {}
         try:
             # cuQuantum Python 22.11+ supports nonblocking & auto-MPI
             opts = cutn.NetworkOptions(handle=self.handle, blocking="auto")
@@ -50,47 +59,49 @@ def __init__(self, ngpus, ncpu_threads, precision, **kwargs):
             # cuQuantum Python 22.07 or below
             opts = cutn.NetworkOptions(handle=self.handle)
         self.network_opts = opts
-        self.n_samples = kwargs.pop('nhypersamples')
+        self.n_hyper_samples = kwargs.pop('nhypersamples')
         self.version = cutn.get_version()
 
+        self.meta["backend"] = f"cutn-v{self.version} precision={self.precision}"
+        self.meta['nhypersamples'] = self.n_hyper_samples
+        self.meta['cpu_threads'] = self.ncpu_threads
+
     def __del__(self):
         cutn.destroy(self.handle)
 
     def preprocess_circuit(self, circuit, *args, **kwargs):
         circuit_filename = kwargs.pop('circuit_filename')
-        target = kwargs.pop('target')
-        pauli = kwargs.pop('pauli')
-        preprocess_data = {}
+        self.compute_mode = kwargs.pop('compute_mode')
+        self.pauli = kwargs.pop('pauli')
+        valid_choices = ['amplitude', 'expectation', 'statevector']
+        if self.compute_mode not in valid_choices:
+            raise ValueError(f"The string '{self.compute_mode}' is not a valid option for --compute-mode argument. Valid options are: {valid_choices}")
 
         t1 = time.perf_counter()
-
         if self.precision == 'single':
             circuit_converter = CircuitToEinsum(circuit, dtype='complex64', backend=cp)
         else:
             circuit_converter = CircuitToEinsum(circuit, dtype='complex128', backend=cp)
-
         t2 = time.perf_counter()
         time_circ2einsum = t2 - t1
-        logger.info(f'CircuitToEinsum took {time_circ2einsum} s')
-
+
         t1 = time.perf_counter()
-        if target == 'amplitude':
+        if self.compute_mode == 'amplitude':
             # any bitstring would give same TN topology, so let's just pick "000...0"
             self.expression, self.operands = circuit_converter.amplitude('0'*self.nqubits)
-        elif target == 'state_vector':
+        elif self.compute_mode == 'statevector':
             self.expression, self.operands = circuit_converter.state_vector()
-        elif target == 'expectation':
+        elif self.compute_mode == 'expectation':
             # new in cuQuantum Python 22.11
-            assert pauli is not None
-            logger.info(f"compute expectation value for Pauli string: {pauli}")
-            self.expression, self.operands = circuit_converter.expectation(pauli)
+            assert self.pauli is not None
+            logger.info(f"compute expectation value for Pauli string: {self.pauli}")
+            self.expression, self.operands = circuit_converter.expectation(self.pauli)
         else:
             # TODO: add other CircuitToEinsum methods?
-            raise NotImplementedError(f"the target {target} is not supported")
+            raise NotImplementedError(f"the target {self.compute_mode} is not supported")
         t2 = time.perf_counter()
         time_tn = t2 - t1
-        logger.info(f'{target}() took {time_tn} s')
-
+
         tn_format = os.environ.get('CUTENSORNET_DUMP_TN')
         if tn_format == 'txt':
             size_dict = generate_size_dict_from_operands(
@@ -106,23 +117,20 @@ def preprocess_circuit(self, circuit, *args, **kwargs):
         t1 = time.perf_counter()
         path, opt_info = self.network.contract_path(
             # TODO: samples may be too large for small circuits
-            optimize={'samples': self.n_samples, 'threads': self.ncpu_threads})
+            optimize={'samples': self.n_hyper_samples, 'threads': self.ncpu_threads})
         t2 = time.perf_counter()
         time_path = t2 - t1
-        logger.info(f'contract_path() took {time_path} s')
-        logger.debug(f'# samples: {self.n_samples}')
-        logger.debug(opt_info)
-
-        self.path = path
-        self.opt_info = opt_info
-        preprocess_data = {
-            'CircuitToEinsum': time_circ2einsum,
-            target:            time_tn,
-            'contract_path':   time_path,
-        }
 
-        return preprocess_data
+        self.opt_info = opt_info # cuTensorNet returns "real-number" Flops. To get the true FLOP count, multiply it by 4
 
+        self.meta['compute-mode'] = f'{self.compute_mode}()'
+        self.meta[f'circuit to einsum'] = f"{time_circ2einsum + time_tn} s"
+
+        logger.info(f'data: {self.meta}')
+        logger.info(f'log10[FLOPS]: {log10(self.opt_info.opt_cost * 4)}  log2[SIZE]: {log2(opt_info.largest_intermediate)}  contract_path(): {time_path} s')
+        pre_data = {'circuit to einsum time': time_circ2einsum + time_tn, 'contract path time': time_path, 
+                    'log2[LargestInter]': log2(opt_info.largest_intermediate), 'log10[FLOPS]': log10(self.opt_info.opt_cost * 4) }
+        return pre_data
     def run(self, circuit, nshots=0):
         if self.rank == 0 and nshots > 0:
             warnings.warn("the cutn backend does not support sampling")

diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_pny.py b/benchmarks/cuquantum_benchmarks/backends/backend_pny.py
@@ -43,51 +43,66 @@ def find_version(self, identifier):
         if identifier == "pennylane-lightning-gpu":
             if self.ngpus == 1:
                 try:
-                    import pennylane_lightning_gpu
-                except ImportError as e:
-                    raise RuntimeError("PennyLane-Lightning-GPU plugin is not installed") from e
+                    from pennylane_lightning.lightning_gpu import LightningGPU
+                    return LightningGPU.version
+                except ImportError:
+                    try: # pre pennylane_lightning 0.33.0 version
+                        import pennylane_lightning_gpu
+                        return pennylane_lightning_gpu.__version__
+                    except ImportError:
+                        raise RuntimeError("PennyLane-Lightning-GPU plugin is not installed")
             else:
                 raise ValueError(f"cannot specify --ngpus > 1 for the backend {identifier}")
-            ver = pennylane_lightning_gpu.__version__
         elif identifier == "pennylane-lightning-kokkos":
             try:
-                import pennylane_lightning_kokkos
-            except ImportError as e:
-                raise RuntimeError("PennyLane-Lightning-Kokkos plugin is not installed") from e
-            ver = pennylane_lightning_kokkos.__version__
+                from pennylane_lightning.lightning_kokkos import LightningKokkos
+                return LightningKokkos.version
+            except ImportError:
+                try: # pre pennylane_lightning 0.33.0 version
+                    import pennylane_lightning_kokkos
+                    return pennylane_lightning_kokkos.__version__
+                except ImportError:
+                    raise RuntimeError("PennyLane-Lightning-Kokkos plugin is not installed")
         elif identifier == "pennylane-lightning-qubit":
             try:
                 from pennylane_lightning import lightning_qubit
+                return lightning_qubit.__version__
             except ImportError as e:
                 raise RuntimeError("PennyLane-Lightning plugin is not installed") from e
-            ver = lightning_qubit.__version__
         else: # identifier == "pennylane"
-            ver = pennylane.__version__
-        return ver
-
+            return pennylane.__version__
+
     def _make_qnode(self, circuit, nshots=1024, **kwargs):
         if self.identifier == "pennylane-lightning-gpu":
             dev = pennylane.device("lightning.gpu", wires=self.nqubits, shots=nshots, c_dtype=self.dtype)
         elif self.identifier == "pennylane-lightning-kokkos":
             # there's no way for us to query what execution space (=backend) that kokkos supports at runtime,
             # so let's just set up Kokkos::InitArguments and hope kokkos to do the right thing...
+            dev = None
             try:
-                import pennylane_lightning_kokkos
-            except ImportError as e:
-                raise RuntimeError("PennyLane-Lightning-Kokkos plugin is not installed") from e
-            args = pennylane_lightning_kokkos.lightning_kokkos.InitArguments()
-            args.num_threads = self.ncpu_threads
-            args.disable_warnings = int(logger.getEffectiveLevel() != logging.DEBUG)
-            ## Disable MPI because it's unclear if pennylane actually supports it (at least it's untested)
-            # # if we're running MPI, we want to know now and get it init'd before kokkos is
-            # MPI = is_running_mpi()
-            # if MPI:
-            #     comm = MPI.COMM_WORLD
-            #     args.ndevices = min(comm.Get_size(), self.ngpus)  # note: kokkos uses 1 GPU per process
-            dev = pennylane.device(
+                if self.ncpu_threads > 1 :
+                    warnings.warn(f"--ncputhreads is ignored for {self.identifier}", stacklevel=2)
+                dev = pennylane.device(
                 "lightning.kokkos", wires=self.nqubits, shots=nshots, c_dtype=self.dtype,
-                sync=False,
-                kokkos_args=args)
+                sync=False)
+            except ImportError:
+                try: # pre pennylane_lightning 0.33.0 version
+                    from pennylane_lightning_kokkos.lightning_kokkos import InitArguments
+                    args = InitArguments()
+                    args.num_threads = self.ncpu_threads
+                    args.disable_warnings = int(logger.getEffectiveLevel() != logging.DEBUG)
+                    ## Disable MPI because it's unclear if pennylane actually supports it (at least it's untested)
+                    # # if we're running MPI, we want to know now and get it init'd before kokkos is
+                    # MPI = is_running_mpi()
+                    # if MPI:
+                    #     comm = MPI.COMM_WORLD
+                    #     args.ndevices = min(comm.Get_size(), self.ngpus)  # note: kokkos uses 1 GPU per process
+                    dev = pennylane.device(
+                        "lightning.kokkos", wires=self.nqubits, shots=nshots, c_dtype=self.dtype,
+                        sync=False,
+                        kokkos_args=args)
+                except ImportError:
+                    raise RuntimeError("Could not load PennyLane-Lightning-Kokkos plugin. Is it installed?")
         elif self.identifier == "pennylane-lightning-qubit":
             if self.ngpus != 0:
                 raise ValueError(f"cannot specify --ngpus for the backend {self.identifier}")
@@ -98,10 +113,12 @@ def _make_qnode(self, circuit, nshots=1024, **kwargs):
         elif self.identifier == "pennylane":
             if self.ngpus != 0:
                 raise ValueError(f"cannot specify --ngpus for the backend {self.identifier}")
-            dev = pennylane.device("default.qubit", wires=self.nqubits, shots=nshots, c_dtype=self.dtype)
+            if self.dtype == np.complex64:
+                raise ValueError("As of version 0.33.0, Pennylane's default.qubit device only supports double precision.")
+            dev = pennylane.device("default.qubit", wires=self.nqubits, shots=nshots)
         else:
             raise ValueError(f"the backend {self.identifier} is not recognized")
-
+        
         qnode = pennylane.QNode(circuit, device=dev)
         return qnode
 

diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py b/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py
@@ -44,8 +44,12 @@ def find_version(self, identifier):
         if identifier == 'cusvaer':
             return version('cusvaer')
 
-        if hasattr(qiskit_aer, "__version__"):
-            return qiskit_aer.__version__
+        if hasattr(qiskit, "__version__") and qiskit.__version__ >= "1.0.0":
+            try:
+                from qiskit_aer import __version__ as aer_version
+                return aer_version
+            except ImportError as e:
+                raise RuntimeError("qiskit-aer (or qiskit-aer-gpu) is not installed") from e
         else:
             return qiskit.__qiskit_version__['qiskit-aer']
 

diff --git a/benchmarks/cuquantum_benchmarks/benchmarks/qpe.py b/benchmarks/cuquantum_benchmarks/benchmarks/qpe.py
@@ -18,7 +18,7 @@ def generateGatesSequence(nqubits, config):
 
         # Example instantiation of QPE circuit paramterized by nqubits
         phase = 1/3
-        U = np.mat([[1, 0], [0, np.exp(np.pi * 1j * phase)]])
+        U = np.asmatrix([[1, 0], [0, np.exp(np.pi * 1j * phase)]])
         in_nqubits = 1
         unfold = config['unfold']
         measure = config['measure']

diff --git a/benchmarks/cuquantum_benchmarks/config.py b/benchmarks/cuquantum_benchmarks/config.py
@@ -201,7 +201,7 @@
             'nfused': None,
             'ngpus': 0,
             'ncputhreads': 1,
-            'precision': 'single',
+            'precision': 'double',
         },
     },