diff --git a/docs/nsight_compute.ipynb b/docs/nsight_compute.ipynb
index a8ec3e97..cc861537 100644
--- a/docs/nsight_compute.ipynb
+++ b/docs/nsight_compute.ipynb
@@ -18,6 +18,10 @@
     "\n",
     "Nsight Compute (NCU) is a performance profiler for NVIDIA GPUs. NCU report files do not have a calltree, but with the NVTX Caliper service we can forward Caliper annotations to NCU. By profiling the same executable with a calltree profiler like Caliper, we can map the NCU data to the calltree profile and create a Thicket object. \n",
     "\n",
+    "In Section 6, we reproduce some of the analysis and visualizations from the paper:\n",
+    "\n",
+    "Olga Pearce, Jason Burmark, Rich Hornung, Befikir Bogale, Ian Lumsden, Michael McKinsey, Dewi Yokelson, David Boehme, Stephanie Brink, Michela Taufer, and Tom Scogland. “RAJA Performance Suite: Performance Portability Analysis with Caliper and Thicket”. SC-W 2024: Workshops of ACM/IEEE International Conference for High Performance Computing, Networking, Storage, and Analysis. Performance, Portability & Productivity in HPC. 2024.\n",
+    "\n",
     "***\n",
     "\n",
     "## 1. Import Necessary Packages\n",
@@ -163,8 +167,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "(1/2) Reading Files: 100%|██████████| 2/2 [00:00<00:00, 13.01it/s]\n",
-      "(2/2) Creating Thicket: 100%|██████████| 1/1 [00:00<00:00,  5.59it/s]\n"
+      "(1/2) Reading Files: 100%|██████████| 2/2 [00:00<00:00,  4.57it/s]\n",
+      "(2/2) Creating Thicket: 100%|██████████| 1/1 [00:00<00:00,  4.75it/s]\n"
      ]
     },
     {
@@ -459,8 +463,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Processing action 600/601: 100%|██████████| 601/601 [00:15<00:00, 38.67it/s] \n",
-      "Processing action 600/601: 100%|██████████| 601/601 [00:01<00:00, 406.66it/s]\n"
+      "Processing action 600/601: 100%|██████████| 601/601 [00:15<00:00, 39.09it/s] \n",
+      "Processing action 600/601: 100%|██████████| 601/601 [00:01<00:00, 375.43it/s]\n"
      ]
     },
     {
@@ -490,8 +494,8 @@
        "      <th>time (gpu)</th>\n",
        "      <th>name</th>\n",
        "      <th>gpu__time_duration.sum</th>\n",
-       "      <th>smsp__maximum_warps_avg_per_active_cycle</th>\n",
        "      <th>sm__throughput.avg.pct_of_peak_sustained_elapsed</th>\n",
+       "      <th>smsp__maximum_warps_avg_per_active_cycle</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>node</th>\n",
@@ -619,8 +623,8 @@
        "      <td>0.000051</td>\n",
        "      <td>void rajaperf::algorithm::memcpy&lt;128ul&gt;(double...</td>\n",
        "      <td>43232.0</td>\n",
-       "      <td>16.0</td>\n",
        "      <td>6.521123</td>\n",
+       "      <td>16.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>528105777</th>\n",
@@ -629,8 +633,8 @@
        "      <td>0.000031</td>\n",
        "      <td>void rajaperf::algorithm::memcpy&lt;128ul&gt;(double...</td>\n",
        "      <td>22880.0</td>\n",
-       "      <td>16.0</td>\n",
        "      <td>6.294607</td>\n",
+       "      <td>16.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">{'name': 'Algorithm_MEMSET', 'type': 'function'}</th>\n",
@@ -703,8 +707,8 @@
        "      <td>0.000033</td>\n",
        "      <td>void rajaperf::algorithm::memset&lt;128ul&gt;(double...</td>\n",
        "      <td>31648.0</td>\n",
-       "      <td>16.0</td>\n",
        "      <td>7.531866</td>\n",
+       "      <td>16.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>528105777</th>\n",
@@ -713,8 +717,8 @@
        "      <td>0.000020</td>\n",
        "      <td>void rajaperf::algorithm::memset&lt;128ul&gt;(double...</td>\n",
        "      <td>18016.0</td>\n",
-       "      <td>16.0</td>\n",
        "      <td>6.692635</td>\n",
+       "      <td>16.0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -813,51 +817,51 @@
        "{'name': 'void rajaperf::algorithm::memset<128u... 457195964                 31648.0   \n",
        "                                                   528105777                 18016.0   \n",
        "\n",
-       "                                                              smsp__maximum_warps_avg_per_active_cycle  \\\n",
-       "node                                               profile                                               \n",
-       "{'name': 'RAJAPerf', 'type': 'function'}           457195964                                       NaN   \n",
-       "                                                   528105777                                       NaN   \n",
-       "{'name': 'Algorithm', 'type': 'function'}          457195964                                       NaN   \n",
-       "                                                   528105777                                       NaN   \n",
-       "{'name': 'Algorithm_MEMCPY', 'type': 'function'}   457195964                                       NaN   \n",
-       "                                                   528105777                                       NaN   \n",
-       "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964                                       NaN   \n",
-       "                                                   528105777                                       NaN   \n",
-       "{'name': 'cudaLaunchKernel', 'type': 'function'}   457195964                                       NaN   \n",
-       "                                                   528105777                                       NaN   \n",
-       "{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964                                      16.0   \n",
-       "                                                   528105777                                      16.0   \n",
-       "{'name': 'Algorithm_MEMSET', 'type': 'function'}   457195964                                       NaN   \n",
-       "                                                   528105777                                       NaN   \n",
-       "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964                                       NaN   \n",
-       "                                                   528105777                                       NaN   \n",
-       "{'name': 'cudaLaunchKernel', 'type': 'function'}   457195964                                       NaN   \n",
-       "                                                   528105777                                       NaN   \n",
-       "{'name': 'void rajaperf::algorithm::memset<128u... 457195964                                      16.0   \n",
-       "                                                   528105777                                      16.0   \n",
+       "                                                              sm__throughput.avg.pct_of_peak_sustained_elapsed  \\\n",
+       "node                                               profile                                                       \n",
+       "{'name': 'RAJAPerf', 'type': 'function'}           457195964                                               NaN   \n",
+       "                                                   528105777                                               NaN   \n",
+       "{'name': 'Algorithm', 'type': 'function'}          457195964                                               NaN   \n",
+       "                                                   528105777                                               NaN   \n",
+       "{'name': 'Algorithm_MEMCPY', 'type': 'function'}   457195964                                               NaN   \n",
+       "                                                   528105777                                               NaN   \n",
+       "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964                                               NaN   \n",
+       "                                                   528105777                                               NaN   \n",
+       "{'name': 'cudaLaunchKernel', 'type': 'function'}   457195964                                               NaN   \n",
+       "                                                   528105777                                               NaN   \n",
+       "{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964                                          6.521123   \n",
+       "                                                   528105777                                          6.294607   \n",
+       "{'name': 'Algorithm_MEMSET', 'type': 'function'}   457195964                                               NaN   \n",
+       "                                                   528105777                                               NaN   \n",
+       "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964                                               NaN   \n",
+       "                                                   528105777                                               NaN   \n",
+       "{'name': 'cudaLaunchKernel', 'type': 'function'}   457195964                                               NaN   \n",
+       "                                                   528105777                                               NaN   \n",
+       "{'name': 'void rajaperf::algorithm::memset<128u... 457195964                                          7.531866   \n",
+       "                                                   528105777                                          6.692635   \n",
        "\n",
-       "                                                              sm__throughput.avg.pct_of_peak_sustained_elapsed  \n",
-       "node                                               profile                                                      \n",
-       "{'name': 'RAJAPerf', 'type': 'function'}           457195964                                               NaN  \n",
-       "                                                   528105777                                               NaN  \n",
-       "{'name': 'Algorithm', 'type': 'function'}          457195964                                               NaN  \n",
-       "                                                   528105777                                               NaN  \n",
-       "{'name': 'Algorithm_MEMCPY', 'type': 'function'}   457195964                                               NaN  \n",
-       "                                                   528105777                                               NaN  \n",
-       "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964                                               NaN  \n",
-       "                                                   528105777                                               NaN  \n",
-       "{'name': 'cudaLaunchKernel', 'type': 'function'}   457195964                                               NaN  \n",
-       "                                                   528105777                                               NaN  \n",
-       "{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964                                          6.521123  \n",
-       "                                                   528105777                                          6.294607  \n",
-       "{'name': 'Algorithm_MEMSET', 'type': 'function'}   457195964                                               NaN  \n",
-       "                                                   528105777                                               NaN  \n",
-       "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964                                               NaN  \n",
-       "                                                   528105777                                               NaN  \n",
-       "{'name': 'cudaLaunchKernel', 'type': 'function'}   457195964                                               NaN  \n",
-       "                                                   528105777                                               NaN  \n",
-       "{'name': 'void rajaperf::algorithm::memset<128u... 457195964                                          7.531866  \n",
-       "                                                   528105777                                          6.692635  "
+       "                                                              smsp__maximum_warps_avg_per_active_cycle  \n",
+       "node                                               profile                                              \n",
+       "{'name': 'RAJAPerf', 'type': 'function'}           457195964                                       NaN  \n",
+       "                                                   528105777                                       NaN  \n",
+       "{'name': 'Algorithm', 'type': 'function'}          457195964                                       NaN  \n",
+       "                                                   528105777                                       NaN  \n",
+       "{'name': 'Algorithm_MEMCPY', 'type': 'function'}   457195964                                       NaN  \n",
+       "                                                   528105777                                       NaN  \n",
+       "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964                                       NaN  \n",
+       "                                                   528105777                                       NaN  \n",
+       "{'name': 'cudaLaunchKernel', 'type': 'function'}   457195964                                       NaN  \n",
+       "                                                   528105777                                       NaN  \n",
+       "{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964                                      16.0  \n",
+       "                                                   528105777                                      16.0  \n",
+       "{'name': 'Algorithm_MEMSET', 'type': 'function'}   457195964                                       NaN  \n",
+       "                                                   528105777                                       NaN  \n",
+       "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964                                       NaN  \n",
+       "                                                   528105777                                       NaN  \n",
+       "{'name': 'cudaLaunchKernel', 'type': 'function'}   457195964                                       NaN  \n",
+       "                                                   528105777                                       NaN  \n",
+       "{'name': 'void rajaperf::algorithm::memset<128u... 457195964                                      16.0  \n",
+       "                                                   528105777                                      16.0  "
       ]
      },
      "execution_count": 4,
@@ -1148,6 +1152,10 @@
     "\n",
     "We can make roofline plots using the metrics we have collected using Nsight Compute. The Roofline sets an upper bound on performance of a kernel depending on its operational intensity. We will use roofline plots to understand the performance of RAJAPerf kernels.\n",
     "\n",
+    "In this section, we reproduce some of the analysis and visualizations from the paper:\n",
+    "\n",
+    "Olga Pearce, Jason Burmark, Rich Hornung, Befikir Bogale, Ian Lumsden, Michael McKinsey, Dewi Yokelson, David Boehme, Stephanie Brink, Michela Taufer, and Tom Scogland. “RAJA Performance Suite: Performance Portability Analysis with Caliper and Thicket”. SC-W 2024: Workshops of ACM/IEEE International Conference for High Performance Computing, Networking, Storage, and Analysis. Performance, Portability & Productivity in HPC. 2024.\n",
+    "\n",
     "Instruction Roofline Models were introduced by Ding et. al to better characterize GPU workloads by looking at instruction intensity. We walk through the creation of instruction roofline models for the Appications group of the RAJAPerf kernels below. \n",
     "\n",
     "More references on methodology for roofline models:\n",
@@ -1191,7 +1199,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Processing action 3444/3445: 100%|██████████| 3445/3445 [00:42<00:00, 80.24it/s] \n"
+      "Processing action 3444/3445: 100%|██████████| 3445/3445 [00:42<00:00, 81.94it/s] \n"
      ]
     }
    ],
@@ -1816,7 +1824,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 14,
    "id": "909bbd4e",
    "metadata": {},
    "outputs": [],
@@ -1991,7 +1999,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 15,
    "id": "c208ebcb",
    "metadata": {},
    "outputs": [
@@ -2012,7 +2020,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 16,
    "id": "7a2b50dd",
    "metadata": {},
    "outputs": [
@@ -2033,7 +2041,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 17,
    "id": "3917697f",
    "metadata": {},
    "outputs": [
@@ -2054,7 +2062,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 18,
    "id": "02d2e19e",
    "metadata": {},
    "outputs": [
@@ -2072,6 +2080,14 @@
    "source": [
     "roofline(LABELS=pruned_th.dataframe[\"name\"].tolist(), flag=\"HBM\", data_df=agg_df)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63636ddf",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {