Skip to content

Commit

Permalink
docs: Re-render NCU notebook (#228)
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelmckinsey1 authored Nov 13, 2024
1 parent 8a6927d commit ef65aec
Showing 1 changed file with 75 additions and 59 deletions.
134 changes: 75 additions & 59 deletions docs/nsight_compute.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
"\n",
"Nsight Compute (NCU) is a performance profiler for NVIDIA GPUs. NCU report files do not have a calltree, but with the NVTX Caliper service we can forward Caliper annotations to NCU. By profiling the same executable with a calltree profiler like Caliper, we can map the NCU data to the calltree profile and create a Thicket object. \n",
"\n",
"In Section 6, we reproduce some of the analysis and visualizations from the paper:\n",
"\n",
"Olga Pearce, Jason Burmark, Rich Hornung, Befikir Bogale, Ian Lumsden, Michael McKinsey, Dewi Yokelson, David Boehme, Stephanie Brink, Michela Taufer, and Tom Scogland. “RAJA Performance Suite: Performance Portability Analysis with Caliper and Thicket”. SC-W 2024: Workshops of ACM/IEEE International Conference for High Performance Computing, Networking, Storage, and Analysis. Performance, Portability & Productivity in HPC. 2024.\n",
"\n",
"***\n",
"\n",
"## 1. Import Necessary Packages\n",
Expand Down Expand Up @@ -163,8 +167,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"(1/2) Reading Files: 100%|██████████| 2/2 [00:00<00:00, 13.01it/s]\n",
"(2/2) Creating Thicket: 100%|██████████| 1/1 [00:00<00:00, 5.59it/s]\n"
"(1/2) Reading Files: 100%|██████████| 2/2 [00:00<00:00, 4.57it/s]\n",
"(2/2) Creating Thicket: 100%|██████████| 1/1 [00:00<00:00, 4.75it/s]\n"
]
},
{
Expand Down Expand Up @@ -459,8 +463,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Processing action 600/601: 100%|██████████| 601/601 [00:15<00:00, 38.67it/s] \n",
"Processing action 600/601: 100%|██████████| 601/601 [00:01<00:00, 406.66it/s]\n"
"Processing action 600/601: 100%|██████████| 601/601 [00:15<00:00, 39.09it/s] \n",
"Processing action 600/601: 100%|██████████| 601/601 [00:01<00:00, 375.43it/s]\n"
]
},
{
Expand Down Expand Up @@ -490,8 +494,8 @@
" <th>time (gpu)</th>\n",
" <th>name</th>\n",
" <th>gpu__time_duration.sum</th>\n",
" <th>smsp__maximum_warps_avg_per_active_cycle</th>\n",
" <th>sm__throughput.avg.pct_of_peak_sustained_elapsed</th>\n",
" <th>smsp__maximum_warps_avg_per_active_cycle</th>\n",
" </tr>\n",
" <tr>\n",
" <th>node</th>\n",
Expand Down Expand Up @@ -619,8 +623,8 @@
" <td>0.000051</td>\n",
" <td>void rajaperf::algorithm::memcpy&lt;128ul&gt;(double...</td>\n",
" <td>43232.0</td>\n",
" <td>16.0</td>\n",
" <td>6.521123</td>\n",
" <td>16.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>528105777</th>\n",
Expand All @@ -629,8 +633,8 @@
" <td>0.000031</td>\n",
" <td>void rajaperf::algorithm::memcpy&lt;128ul&gt;(double...</td>\n",
" <td>22880.0</td>\n",
" <td>16.0</td>\n",
" <td>6.294607</td>\n",
" <td>16.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">{'name': 'Algorithm_MEMSET', 'type': 'function'}</th>\n",
Expand Down Expand Up @@ -703,8 +707,8 @@
" <td>0.000033</td>\n",
" <td>void rajaperf::algorithm::memset&lt;128ul&gt;(double...</td>\n",
" <td>31648.0</td>\n",
" <td>16.0</td>\n",
" <td>7.531866</td>\n",
" <td>16.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>528105777</th>\n",
Expand All @@ -713,8 +717,8 @@
" <td>0.000020</td>\n",
" <td>void rajaperf::algorithm::memset&lt;128ul&gt;(double...</td>\n",
" <td>18016.0</td>\n",
" <td>16.0</td>\n",
" <td>6.692635</td>\n",
" <td>16.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand Down Expand Up @@ -813,51 +817,51 @@
"{'name': 'void rajaperf::algorithm::memset<128u... 457195964 31648.0 \n",
" 528105777 18016.0 \n",
"\n",
" smsp__maximum_warps_avg_per_active_cycle \\\n",
"node profile \n",
"{'name': 'RAJAPerf', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'Algorithm', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'Algorithm_MEMCPY', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964 16.0 \n",
" 528105777 16.0 \n",
"{'name': 'Algorithm_MEMSET', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'void rajaperf::algorithm::memset<128u... 457195964 16.0 \n",
" 528105777 16.0 \n",
" sm__throughput.avg.pct_of_peak_sustained_elapsed \\\n",
"node profile \n",
"{'name': 'RAJAPerf', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'Algorithm', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'Algorithm_MEMCPY', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964 6.521123 \n",
" 528105777 6.294607 \n",
"{'name': 'Algorithm_MEMSET', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'void rajaperf::algorithm::memset<128u... 457195964 7.531866 \n",
" 528105777 6.692635 \n",
"\n",
" sm__throughput.avg.pct_of_peak_sustained_elapsed \n",
"node profile \n",
"{'name': 'RAJAPerf', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'Algorithm', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'Algorithm_MEMCPY', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964 6.521123 \n",
" 528105777 6.294607 \n",
"{'name': 'Algorithm_MEMSET', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'void rajaperf::algorithm::memset<128u... 457195964 7.531866 \n",
" 528105777 6.692635 "
" smsp__maximum_warps_avg_per_active_cycle \n",
"node profile \n",
"{'name': 'RAJAPerf', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'Algorithm', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'Algorithm_MEMCPY', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964 16.0 \n",
" 528105777 16.0 \n",
"{'name': 'Algorithm_MEMSET', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n",
" 528105777 NaN \n",
"{'name': 'void rajaperf::algorithm::memset<128u... 457195964 16.0 \n",
" 528105777 16.0 "
]
},
"execution_count": 4,
Expand Down Expand Up @@ -1148,6 +1152,10 @@
"\n",
"We can make roofline plots using the metrics we have collected using Nsight Compute. The Roofline sets an upper bound on performance of a kernel depending on its operational intensity. We will use roofline plots to understand the performance of RAJAPerf kernels.\n",
"\n",
"In this section, we reproduce some of the analysis and visualizations from the paper:\n",
"\n",
"Olga Pearce, Jason Burmark, Rich Hornung, Befikir Bogale, Ian Lumsden, Michael McKinsey, Dewi Yokelson, David Boehme, Stephanie Brink, Michela Taufer, and Tom Scogland. “RAJA Performance Suite: Performance Portability Analysis with Caliper and Thicket”. SC-W 2024: Workshops of ACM/IEEE International Conference for High Performance Computing, Networking, Storage, and Analysis. Performance, Portability & Productivity in HPC. 2024.\n",
"\n",
"Instruction Roofline Models were introduced by Ding et. al to better characterize GPU workloads by looking at instruction intensity. We walk through the creation of instruction roofline models for the Appications group of the RAJAPerf kernels below. \n",
"\n",
"More references on methodology for roofline models:\n",
Expand Down Expand Up @@ -1191,7 +1199,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Processing action 3444/3445: 100%|██████████| 3445/3445 [00:42<00:00, 80.24it/s] \n"
"Processing action 3444/3445: 100%|██████████| 3445/3445 [00:42<00:00, 81.94it/s] \n"
]
}
],
Expand Down Expand Up @@ -1816,7 +1824,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 14,
"id": "909bbd4e",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -1991,7 +1999,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 15,
"id": "c208ebcb",
"metadata": {},
"outputs": [
Expand All @@ -2012,7 +2020,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 16,
"id": "7a2b50dd",
"metadata": {},
"outputs": [
Expand All @@ -2033,7 +2041,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 17,
"id": "3917697f",
"metadata": {},
"outputs": [
Expand All @@ -2054,7 +2062,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 18,
"id": "02d2e19e",
"metadata": {},
"outputs": [
Expand All @@ -2072,6 +2080,14 @@
"source": [
"roofline(LABELS=pruned_th.dataframe[\"name\"].tolist(), flag=\"HBM\", data_df=agg_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63636ddf",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit ef65aec

Please sign in to comment.