Skip to content

Commit

Permalink
Merge pull request #70 from jpmorganchase/performance_c
Browse files Browse the repository at this point in the history
Improve the performance of the C simulator
  • Loading branch information
rsln-s authored Aug 20, 2024
2 parents 776c9f6 + 72f3035 commit 9965dfa
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 51 deletions.
56 changes: 54 additions & 2 deletions examples/benchmark.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
"id": "11678646-6e36-416f-aec6-39b38f1bc02f",
"metadata": {},
"source": [
"The output should be `['gpu', 'c', 'python']`. `'c'` simulator is optional and is not relevant to this benchmark. However, if it's missing, you can compile it manually by running `make -C qokit/fur/c/csim/src/` in the home directory of QOKit"
"The output should be `['gpu', 'c', 'python']`. If the `'c'` simulator is missing, you can compile it manually by running `make -C qokit/fur/c/csim/src/` in the home directory of QOKit"
]
},
{
Expand Down Expand Up @@ -238,7 +238,9 @@
"N=28\n",
"\tLABS finished in 1.3889 on average, min: 1.3856, max: 1.3953\n",
"\tMaxCut finished in 1.3112 on average, min: 1.3088, max: 1.3201\n",
"```"
"```\n",
"\n",
"On large memory CPU nodes, you can run with more qubits using the `'c'` simulator. "
]
},
{
Expand All @@ -247,6 +249,56 @@
"id": "549162cf-99c4-4473-bc01-87b616dae7bc",
"metadata": {},
"outputs": [],
"source": [
"# number of qubits\n",
"for N in [12, 16, 20, 24, 28, 32, 33, 34]:\n",
" print(f\"N={N}\")\n",
" # QAOA depth\n",
" p = 6\n",
"\n",
" theta = np.random.uniform(0,1,2*p)\n",
" G = nx.random_regular_graph(4, N, seed=42)\n",
"\n",
" # Function initialization may not be fast\n",
" f_labs = get_qaoa_labs_objective(N, p, simulator='c')\n",
"\n",
" # Function evaluation is fast\n",
" for f, label in [(f_labs, \"LABS\")]:\n",
" f(theta) # do not count the first evaluation\n",
" times = []\n",
" for _ in tqdm(range(3)):\n",
" start = timeit.default_timer()\n",
" f(theta)\n",
" end = timeit.default_timer()\n",
" times.append(end-start)\n",
" print(f\"For N = {N}, {label} finished in {np.mean(times):.4f} on average, min: {np.min(times):.4f}, max: {np.max(times):.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "ec7d4d59-03a3-4fea-8516-6d03ad410cd3",
"metadata": {},
"source": [
"Here's what I measured on `r5.24xlarge` (Intel Xeon Platinum 8175M CPU @ 2.50GHz, 24 Cores per socket with 2 sockets, 2 threads per core, 768 GB RAM):\n",
"\n",
"```\n",
"For N = 12, LABS finished in 0.0013 on average, min: 0.0012, max: 0.0016\n",
"For N = 16, LABS finished in 0.1293 on average, min: 0.1180, max: 0.1483\n",
"For N = 20, LABS finished in 0.1510 on average, min: 0.1325, max: 0.1745\n",
"For N = 24, LABS finished in 0.7461 on average, min: 0.7184, max: 0.7760\n",
"For N = 28, LABS finished in 11.4725 on average, min: 11.3852, max: 11.6067\n",
"For N = 32, LABS finished in 234.7204 on average, min: 234.3182, max: 235.0045\n",
"For N = 33, LABS finished in 482.1637 on average, min: 476.6924, max: 485.6158\n",
"For N = 34, LABS finished in 958.0517 on average, min: 946.1425, max: 968.6004\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "84c0121b-cab8-4062-a93f-426a4d3c9757",
"metadata": {},
"outputs": [],
"source": []
}
],
Expand Down
11 changes: 0 additions & 11 deletions qokit/fur/c/csim/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,6 @@
raise ImportError("You must compile the C simulator before running the code. Please follow the instructions in README.md") from e


_furx = lib.furx
_furx.restype = None
_furx.argtypes = [
ndpointer(ctypes.c_double, flags="C_CONTIGUOUS"),
ndpointer(ctypes.c_double, flags="C_CONTIGUOUS"),
ctypes.c_double,
ctypes.c_uint,
ctypes.c_size_t,
]


_apply_qaoa_furx = lib.apply_qaoa_furx
_apply_qaoa_furx.restype = None
_apply_qaoa_furx.argtypes = [
Expand Down
102 changes: 64 additions & 38 deletions qokit/fur/c/csim/src/fur.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,45 +7,53 @@
#include <stdio.h>
#include <math.h>

/**
* apply to a statevector a single-qubit Pauli-X rotation defined by
* Rx(theta) = exp(-i*theta*X)
* where X is the Pauli-X operator.
* @param sv_real array of length n containing real parts of the statevector
* @param sv_imag array of length n containing imaginary parts of the statevector
* @param theta rotation angle
* @param q index of qubit on which the rotation is applied
* @param n_states size of the statevector
*/
void furx(double* a_real, double* a_imag, double theta, unsigned int q, size_t n_states)
{
// number of groups of states on which the operation is applied locally
size_t num_groups = n_states / 2;
void furx_kernel(double* a_real, double* a_imag, const double a, const double b,
const int block_idx, const int n_q, const int q_offset, const int state_mask){

// helper digit masks for constructing the locality indices
// the mask is applied on the index through all groups of local operations
size_t mask1 = ((size_t)1<<q) - 1; // digits lower than q
size_t mask2 = mask1 ^ ((n_states-1) >> 1); // digits higher than q

// pre-compute coefficients in transformation
double cx = cos(theta), sx = sin(theta);
const unsigned int data_size = 1 << n_q;
const unsigned int stride_size = data_size >> 1;

#pragma omp parallel for
for (size_t i = 0; i < num_groups; i++)
{
size_t i1 = (i&mask1) | ((i&mask2)<<1);
size_t i2 = i1 | ((size_t)1<<q);
const unsigned int stride = 1 << q_offset;
const unsigned int index_mask = stride - 1;
const unsigned int stride_mask = ~index_mask;
const unsigned int offset = ((stride_mask & block_idx) << n_q) | (index_mask & block_idx);

double a1r = a_real[i1];
double a2r = a_real[i2];
double a1i = a_imag[i1];
double a2i = a_imag[i2];
// temporary local arrays
double real[data_size], imag[data_size];

a_real[i1] = cx * a1r + sx * a2i;
a_real[i2] = sx * a1i + cx * a2r;

a_imag[i1] = cx * a1i - sx * a2r;
a_imag[i2] = -sx * a1r + cx * a2i;
// load global data into local array
for(int idx = 0; idx < data_size; ++idx){
const unsigned int data_idx = offset + idx*stride;
real[idx] = a_real[data_idx];
imag[idx] = a_imag[data_idx];
}

// perform n_q steps
for(int q = 0; q < n_q; ++q){
const unsigned int mask1 = (1 << q) - 1;
const unsigned int mask2 = state_mask - mask1;

for(int tid = 0; tid < stride_size; ++tid){
const unsigned int ia = ((tid & mask1) | ((tid & mask2) << 1));
const unsigned int ib = (ia | (1 << q));

const double ar = real[ia];
const double ai = imag[ia];
const double br = real[ib];
const double bi = imag[ib];

real[ia] = a*ar - b*bi;
imag[ia] = a*ai + b*br;
real[ib] = a*br - b*ai;
imag[ib] = a*bi + b*ar;
}
}

// load local data into global array
for(int idx = 0; idx < data_size; ++idx){
const unsigned int data_idx = offset + idx*stride;
a_real[data_idx] = real[idx];
a_imag[data_idx] = imag[idx];
}
}

Expand All @@ -62,9 +70,27 @@ void furx(double* a_real, double* a_imag, double theta, unsigned int q, size_t n
*/
void furx_all(double* a_real, double* a_imag, double theta, unsigned int n_qubits, size_t n_states)
{
for (unsigned int i=0; i< n_qubits; i++)
{
furx(a_real, a_imag, theta, i, n_states);
const size_t state_mask = (n_states - 1) >> 1;

const double a = cos(theta);
const double b = -sin(theta);

const unsigned int group_size = 10;
const unsigned int group_blocks = n_states >> group_size;
const unsigned int last_group_size = n_qubits % group_size;
const unsigned int last_group_blocks = n_states >> last_group_size;

for(int q_offset = 0; q_offset < n_qubits - last_group_size; q_offset += group_size){
#pragma omp parallel for
for(int i = 0; i < group_blocks; ++i)
furx_kernel(a_real, a_imag, a, b,i, group_size, q_offset,state_mask);
}

if(last_group_size > 0){
const unsigned int q_offset = n_qubits - last_group_size;
#pragma omp parallel for
for(int i = 0; i < last_group_blocks; ++i)
furx_kernel(a_real, a_imag, a, b, i, last_group_size, q_offset, state_mask);
}
}

Expand Down

0 comments on commit 9965dfa

Please sign in to comment.