Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the performance of the C simulator #70

Merged
merged 6 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 54 additions & 2 deletions examples/benchmark.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
"id": "11678646-6e36-416f-aec6-39b38f1bc02f",
"metadata": {},
"source": [
"The output should be `['gpu', 'c', 'python']`. `'c'` simulator is optional and is not relevant to this benchmark. However, if it's missing, you can compile it manually by running `make -C qokit/fur/c/csim/src/` in the home directory of QOKit"
"The output should be `['gpu', 'c', 'python']`. If the `'c'` simulator is missing, you can compile it manually by running `make -C qokit/fur/c/csim/src/` in the home directory of QOKit"
]
},
{
Expand Down Expand Up @@ -238,7 +238,9 @@
"N=28\n",
"\tLABS finished in 1.3889 on average, min: 1.3856, max: 1.3953\n",
"\tMaxCut finished in 1.3112 on average, min: 1.3088, max: 1.3201\n",
"```"
"```\n",
"\n",
"On large memory CPU nodes, you can run with more qubits using the `'c'` simulator. "
]
},
{
Expand All @@ -247,6 +249,56 @@
"id": "549162cf-99c4-4473-bc01-87b616dae7bc",
"metadata": {},
"outputs": [],
"source": [
"# number of qubits\n",
"for N in [12, 16, 20, 24, 28, 32, 33, 34]:\n",
" print(f\"N={N}\")\n",
" # QAOA depth\n",
" p = 6\n",
"\n",
" theta = np.random.uniform(0,1,2*p)\n",
" G = nx.random_regular_graph(4, N, seed=42)\n",
"\n",
" # Function initialization may not be fast\n",
" f_labs = get_qaoa_labs_objective(N, p, simulator='c')\n",
"\n",
" # Function evaluation is fast\n",
" for f, label in [(f_labs, \"LABS\")]:\n",
" f(theta) # do not count the first evaluation\n",
" times = []\n",
" for _ in tqdm(range(3)):\n",
" start = timeit.default_timer()\n",
" f(theta)\n",
" end = timeit.default_timer()\n",
" times.append(end-start)\n",
" print(f\"For N = {N}, {label} finished in {np.mean(times):.4f} on average, min: {np.min(times):.4f}, max: {np.max(times):.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "ec7d4d59-03a3-4fea-8516-6d03ad410cd3",
"metadata": {},
"source": [
"Here's what I measured on `r5.24xlarge` (Intel Xeon Platinum 8175M CPU @ 2.50GHz, 24 Cores per socket with 2 sockets, 2 threads per core, 768 GB RAM):\n",
"\n",
"```\n",
"For N = 12, LABS finished in 0.0013 on average, min: 0.0012, max: 0.0016\n",
"For N = 16, LABS finished in 0.1293 on average, min: 0.1180, max: 0.1483\n",
"For N = 20, LABS finished in 0.1510 on average, min: 0.1325, max: 0.1745\n",
"For N = 24, LABS finished in 0.7461 on average, min: 0.7184, max: 0.7760\n",
"For N = 28, LABS finished in 11.4725 on average, min: 11.3852, max: 11.6067\n",
"For N = 32, LABS finished in 234.7204 on average, min: 234.3182, max: 235.0045\n",
"For N = 33, LABS finished in 482.1637 on average, min: 476.6924, max: 485.6158\n",
"For N = 34, LABS finished in 958.0517 on average, min: 946.1425, max: 968.6004\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "84c0121b-cab8-4062-a93f-426a4d3c9757",
"metadata": {},
"outputs": [],
"source": []
}
],
Expand Down
11 changes: 0 additions & 11 deletions qokit/fur/c/csim/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,6 @@
raise ImportError("You must compile the C simulator before running the code. Please follow the instructions in README.md") from e


_furx = lib.furx
_furx.restype = None
_furx.argtypes = [
ndpointer(ctypes.c_double, flags="C_CONTIGUOUS"),
ndpointer(ctypes.c_double, flags="C_CONTIGUOUS"),
ctypes.c_double,
ctypes.c_uint,
ctypes.c_size_t,
]


_apply_qaoa_furx = lib.apply_qaoa_furx
_apply_qaoa_furx.restype = None
_apply_qaoa_furx.argtypes = [
Expand Down
102 changes: 64 additions & 38 deletions qokit/fur/c/csim/src/fur.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,45 +7,53 @@
#include <stdio.h>
#include <math.h>

/**
* apply to a statevector a single-qubit Pauli-X rotation defined by
* Rx(theta) = exp(-i*theta*X)
* where X is the Pauli-X operator.
* @param sv_real array of length n containing real parts of the statevector
* @param sv_imag array of length n containing imaginary parts of the statevector
* @param theta rotation angle
* @param q index of qubit on which the rotation is applied
* @param n_states size of the statevector
*/
void furx(double* a_real, double* a_imag, double theta, unsigned int q, size_t n_states)
{
// number of groups of states on which the operation is applied locally
size_t num_groups = n_states / 2;
void furx_kernel(double* a_real, double* a_imag, const double a, const double b,
const int block_idx, const int n_q, const int q_offset, const int state_mask){

// helper digit masks for constructing the locality indices
// the mask is applied on the index through all groups of local operations
size_t mask1 = ((size_t)1<<q) - 1; // digits lower than q
size_t mask2 = mask1 ^ ((n_states-1) >> 1); // digits higher than q

// pre-compute coefficients in transformation
double cx = cos(theta), sx = sin(theta);
const unsigned int data_size = 1 << n_q;
const unsigned int stride_size = data_size >> 1;

#pragma omp parallel for
for (size_t i = 0; i < num_groups; i++)
{
size_t i1 = (i&mask1) | ((i&mask2)<<1);
size_t i2 = i1 | ((size_t)1<<q);
const unsigned int stride = 1 << q_offset;
const unsigned int index_mask = stride - 1;
const unsigned int stride_mask = ~index_mask;
const unsigned int offset = ((stride_mask & block_idx) << n_q) | (index_mask & block_idx);

double a1r = a_real[i1];
double a2r = a_real[i2];
double a1i = a_imag[i1];
double a2i = a_imag[i2];
// temporary local arrays
double real[data_size], imag[data_size];

a_real[i1] = cx * a1r + sx * a2i;
a_real[i2] = sx * a1i + cx * a2r;

a_imag[i1] = cx * a1i - sx * a2r;
a_imag[i2] = -sx * a1r + cx * a2i;
// load global data into local array
for(int idx = 0; idx < data_size; ++idx){
const unsigned int data_idx = offset + idx*stride;
real[idx] = a_real[data_idx];
imag[idx] = a_imag[data_idx];
}

// perform n_q steps
for(int q = 0; q < n_q; ++q){
const unsigned int mask1 = (1 << q) - 1;
const unsigned int mask2 = state_mask - mask1;

for(int tid = 0; tid < stride_size; ++tid){
const unsigned int ia = ((tid & mask1) | ((tid & mask2) << 1));
const unsigned int ib = (ia | (1 << q));

const double ar = real[ia];
const double ai = imag[ia];
const double br = real[ib];
const double bi = imag[ib];

real[ia] = a*ar - b*bi;
imag[ia] = a*ai + b*br;
real[ib] = a*br - b*ai;
imag[ib] = a*bi + b*ar;
}
}

// load local data into global array
for(int idx = 0; idx < data_size; ++idx){
const unsigned int data_idx = offset + idx*stride;
a_real[data_idx] = real[idx];
a_imag[data_idx] = imag[idx];
}
}

Expand All @@ -62,9 +70,27 @@ void furx(double* a_real, double* a_imag, double theta, unsigned int q, size_t n
*/
void furx_all(double* a_real, double* a_imag, double theta, unsigned int n_qubits, size_t n_states)
{
for (unsigned int i=0; i< n_qubits; i++)
{
furx(a_real, a_imag, theta, i, n_states);
const size_t state_mask = (n_states - 1) >> 1;

const double a = cos(theta);
const double b = -sin(theta);

const unsigned int group_size = 10;
const unsigned int group_blocks = n_states >> group_size;
const unsigned int last_group_size = n_qubits % group_size;
const unsigned int last_group_blocks = n_states >> last_group_size;

for(int q_offset = 0; q_offset < n_qubits - last_group_size; q_offset += group_size){
#pragma omp parallel for
for(int i = 0; i < group_blocks; ++i)
furx_kernel(a_real, a_imag, a, b,i, group_size, q_offset,state_mask);
}

if(last_group_size > 0){
const unsigned int q_offset = n_qubits - last_group_size;
#pragma omp parallel for
for(int i = 0; i < last_group_blocks; ++i)
furx_kernel(a_real, a_imag, a, b, i, last_group_size, q_offset, state_mask);
}
}

Expand Down
Loading