jpmorganchase · rsln-s · Aug 20, 2024 · Aug 6, 2024 · Aug 16, 2024 · Aug 20, 2024
diff --git a/examples/benchmark.ipynb b/examples/benchmark.ipynb
@@ -35,7 +35,7 @@
    "id": "11678646-6e36-416f-aec6-39b38f1bc02f",
    "metadata": {},
    "source": [
-    "The output should be `['gpu', 'c', 'python']`. `'c'` simulator is optional and is not relevant to this benchmark. However, if it's missing, you can compile it manually by running `make -C qokit/fur/c/csim/src/` in the home directory of QOKit"
+    "The output should be `['gpu', 'c', 'python']`. If the `'c'` simulator is missing, you can compile it manually by running `make -C qokit/fur/c/csim/src/` in the home directory of QOKit"
    ]
   },
   {
@@ -238,7 +238,9 @@
     "N=28\n",
     "\tLABS finished in 1.3889 on average, min: 1.3856, max: 1.3953\n",
     "\tMaxCut finished in 1.3112 on average, min: 1.3088, max: 1.3201\n",
-    "```"
+    "```\n",
+    "\n",
+    "On large memory CPU nodes, you can run with more qubits using the `'c'` simulator. "
    ]
   },
   {
@@ -247,6 +249,56 @@
    "id": "549162cf-99c4-4473-bc01-87b616dae7bc",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "# number of qubits\n",
+    "for N in [12, 16, 20, 24, 28, 32, 33, 34]:\n",
+    "    print(f\"N={N}\")\n",
+    "    # QAOA depth\n",
+    "    p = 6\n",
+    "\n",
+    "    theta = np.random.uniform(0,1,2*p)\n",
+    "    G = nx.random_regular_graph(4, N, seed=42)\n",
+    "\n",
+    "    # Function initialization may not be fast\n",
+    "    f_labs = get_qaoa_labs_objective(N, p, simulator='c')\n",
+    "\n",
+    "    # Function evaluation is fast\n",
+    "    for f, label in [(f_labs, \"LABS\")]:\n",
+    "        f(theta) # do not count the first evaluation\n",
+    "        times = []\n",
+    "        for _ in tqdm(range(3)):\n",
+    "            start = timeit.default_timer()\n",
+    "            f(theta)\n",
+    "            end = timeit.default_timer()\n",
+    "            times.append(end-start)\n",
+    "        print(f\"For N = {N}, {label} finished in {np.mean(times):.4f} on average, min: {np.min(times):.4f}, max: {np.max(times):.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ec7d4d59-03a3-4fea-8516-6d03ad410cd3",
+   "metadata": {},
+   "source": [
+    "Here's what I measured on `r5.24xlarge` (Intel Xeon Platinum 8175M CPU @ 2.50GHz, 24 Cores per socket with 2 sockets, 2 threads per core, 768 GB RAM):\n",
+    "\n",
+    "```\n",
+    "For N = 12, LABS finished in 0.0013 on average, min: 0.0012, max: 0.0016\n",
+    "For N = 16, LABS finished in 0.1293 on average, min: 0.1180, max: 0.1483\n",
+    "For N = 20, LABS finished in 0.1510 on average, min: 0.1325, max: 0.1745\n",
+    "For N = 24, LABS finished in 0.7461 on average, min: 0.7184, max: 0.7760\n",
+    "For N = 28, LABS finished in 11.4725 on average, min: 11.3852, max: 11.6067\n",
+    "For N = 32, LABS finished in 234.7204 on average, min: 234.3182, max: 235.0045\n",
+    "For N = 33, LABS finished in 482.1637 on average, min: 476.6924, max: 485.6158\n",
+    "For N = 34, LABS finished in 958.0517 on average, min: 946.1425, max: 968.6004\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "84c0121b-cab8-4062-a93f-426a4d3c9757",
+   "metadata": {},
+   "outputs": [],
    "source": []
   }
  ],

diff --git a/qokit/fur/c/csim/lib.py b/qokit/fur/c/csim/lib.py
@@ -14,17 +14,6 @@
     raise ImportError("You must compile the C simulator before running the code. Please follow the instructions in README.md") from e
 
 
-_furx = lib.furx
-_furx.restype = None
-_furx.argtypes = [
-    ndpointer(ctypes.c_double, flags="C_CONTIGUOUS"),
-    ndpointer(ctypes.c_double, flags="C_CONTIGUOUS"),
-    ctypes.c_double,
-    ctypes.c_uint,
-    ctypes.c_size_t,
-]
-
-
 _apply_qaoa_furx = lib.apply_qaoa_furx
 _apply_qaoa_furx.restype = None
 _apply_qaoa_furx.argtypes = [

diff --git a/qokit/fur/c/csim/src/fur.c b/qokit/fur/c/csim/src/fur.c
@@ -7,45 +7,53 @@
 #include <stdio.h>
 #include <math.h>
 
-/**
- * apply to a statevector a single-qubit Pauli-X rotation defined by
- * Rx(theta) = exp(-i*theta*X)
- * where X is the Pauli-X operator.
- * @param sv_real array of length n containing real parts of the statevector
- * @param sv_imag array of length n containing imaginary parts of the statevector
- * @param theta rotation angle
- * @param q index of qubit on which the rotation is applied
- * @param n_states size of the statevector
- */ 
-void furx(double* a_real, double* a_imag, double theta, unsigned int q, size_t n_states)
-{
-    // number of groups of states on which the operation is applied locally
-    size_t num_groups = n_states / 2;
+void furx_kernel(double* a_real, double* a_imag, const double a, const double b, 
+    const int block_idx, const int n_q, const int q_offset, const int state_mask){
 
-    // helper digit masks for constructing the locality indices
-    // the mask is applied on the index through all groups of local operations
-    size_t mask1 = ((size_t)1<<q) - 1;  // digits lower than q
-    size_t mask2 = mask1 ^ ((n_states-1) >> 1);  // digits higher than q
-
-    // pre-compute coefficients in transformation
-    double cx = cos(theta), sx = sin(theta);
+    const unsigned int data_size = 1 << n_q;
+    const unsigned int stride_size = data_size >> 1;
 
-    #pragma omp parallel for
-    for (size_t i = 0; i < num_groups; i++)
-    {
-        size_t i1 = (i&mask1) | ((i&mask2)<<1);
-        size_t i2 = i1 | ((size_t)1<<q);
+    const unsigned int stride = 1 << q_offset;
+    const unsigned int index_mask = stride - 1;
+    const unsigned int stride_mask = ~index_mask;
+    const unsigned int offset = ((stride_mask & block_idx) << n_q) | (index_mask & block_idx);
 
-        double a1r = a_real[i1];
-        double a2r = a_real[i2];
-        double a1i = a_imag[i1];
-        double a2i = a_imag[i2];
+    // temporary local arrays
+    double real[data_size], imag[data_size];
 
-        a_real[i1] = cx * a1r + sx * a2i;
-        a_real[i2] = sx * a1i + cx * a2r;
-
-        a_imag[i1] = cx * a1i - sx * a2r;
-        a_imag[i2] = -sx * a1r + cx * a2i;
+    // load global data into local array
+    for(int idx = 0; idx < data_size; ++idx){
+        const unsigned int data_idx = offset + idx*stride;
+        real[idx] = a_real[data_idx];
+        imag[idx] = a_imag[data_idx];
+    }
+
+    // perform n_q steps
+    for(int q = 0; q < n_q; ++q){
+        const unsigned int mask1 = (1 << q) - 1;
+        const unsigned int mask2 = state_mask - mask1;
+
+        for(int tid = 0; tid < stride_size; ++tid){
+            const unsigned int ia = ((tid & mask1) | ((tid & mask2) << 1));
+            const unsigned int ib = (ia | (1 << q));
+
+            const double ar = real[ia];
+            const double ai = imag[ia];
+            const double br = real[ib];
+            const double bi = imag[ib];
+
+            real[ia] = a*ar - b*bi;
+            imag[ia] = a*ai + b*br;
+            real[ib] = a*br - b*ai;
+            imag[ib] = a*bi + b*ar;
+        }
+    }
+
+    // load local data into global array
+    for(int idx = 0; idx < data_size; ++idx){
+        const unsigned int data_idx = offset + idx*stride;
+        a_real[data_idx] = real[idx];
+        a_imag[data_idx] = imag[idx];
     }
 }
 
@@ -62,9 +70,27 @@ void furx(double* a_real, double* a_imag, double theta, unsigned int q, size_t n
  */ 
 void furx_all(double* a_real, double* a_imag, double theta, unsigned int n_qubits, size_t n_states)
 {
-    for (unsigned int i=0; i< n_qubits; i++)
-    {
-        furx(a_real, a_imag, theta, i, n_states);
+    const size_t state_mask = (n_states - 1) >> 1;
+
+    const double a = cos(theta);
+    const double b = -sin(theta);
+
+    const unsigned int group_size = 10;
+    const unsigned int group_blocks = n_states >> group_size;
+    const unsigned int last_group_size = n_qubits % group_size;
+    const unsigned int last_group_blocks = n_states >> last_group_size;
+
+    for(int q_offset = 0; q_offset < n_qubits - last_group_size; q_offset += group_size){
+        #pragma omp parallel for
+        for(int i = 0; i < group_blocks; ++i)
+            furx_kernel(a_real, a_imag, a, b,i, group_size, q_offset,state_mask);
+    }
+
+    if(last_group_size > 0){
+        const unsigned int q_offset = n_qubits - last_group_size;
+        #pragma omp parallel for
+        for(int i = 0; i < last_group_blocks; ++i)
+            furx_kernel(a_real, a_imag, a, b, i, last_group_size, q_offset, state_mask);
     }
 }