diff --git a/docs/nsight_compute.ipynb b/docs/nsight_compute.ipynb index a8ec3e97..cc861537 100644 --- a/docs/nsight_compute.ipynb +++ b/docs/nsight_compute.ipynb @@ -18,6 +18,10 @@ "\n", "Nsight Compute (NCU) is a performance profiler for NVIDIA GPUs. NCU report files do not have a calltree, but with the NVTX Caliper service we can forward Caliper annotations to NCU. By profiling the same executable with a calltree profiler like Caliper, we can map the NCU data to the calltree profile and create a Thicket object. \n", "\n", + "In Section 6, we reproduce some of the analysis and visualizations from the paper:\n", + "\n", + "Olga Pearce, Jason Burmark, Rich Hornung, Befikir Bogale, Ian Lumsden, Michael McKinsey, Dewi Yokelson, David Boehme, Stephanie Brink, Michela Taufer, and Tom Scogland. “RAJA Performance Suite: Performance Portability Analysis with Caliper and Thicket”. SC-W 2024: Workshops of ACM/IEEE International Conference for High Performance Computing, Networking, Storage, and Analysis. Performance, Portability & Productivity in HPC. 2024.\n", + "\n", "***\n", "\n", "## 1. 
Import Necessary Packages\n", @@ -163,8 +167,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "(1/2) Reading Files: 100%|██████████| 2/2 [00:00<00:00, 13.01it/s]\n", - "(2/2) Creating Thicket: 100%|██████████| 1/1 [00:00<00:00, 5.59it/s]\n" + "(1/2) Reading Files: 100%|██████████| 2/2 [00:00<00:00, 4.57it/s]\n", + "(2/2) Creating Thicket: 100%|██████████| 1/1 [00:00<00:00, 4.75it/s]\n" ] }, { @@ -459,8 +463,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing action 600/601: 100%|██████████| 601/601 [00:15<00:00, 38.67it/s] \n", - "Processing action 600/601: 100%|██████████| 601/601 [00:01<00:00, 406.66it/s]\n" + "Processing action 600/601: 100%|██████████| 601/601 [00:15<00:00, 39.09it/s] \n", + "Processing action 600/601: 100%|██████████| 601/601 [00:01<00:00, 375.43it/s]\n" ] }, { @@ -490,8 +494,8 @@ " time (gpu)\n", " name\n", " gpu__time_duration.sum\n", - " smsp__maximum_warps_avg_per_active_cycle\n", " sm__throughput.avg.pct_of_peak_sustained_elapsed\n", + " smsp__maximum_warps_avg_per_active_cycle\n", " \n", " \n", " node\n", @@ -619,8 +623,8 @@ " 0.000051\n", " void rajaperf::algorithm::memcpy<128ul>(double...\n", " 43232.0\n", - " 16.0\n", " 6.521123\n", + " 16.0\n", " \n", " \n", " 528105777\n", @@ -629,8 +633,8 @@ " 0.000031\n", " void rajaperf::algorithm::memcpy<128ul>(double...\n", " 22880.0\n", - " 16.0\n", " 6.294607\n", + " 16.0\n", " \n", " \n", " {'name': 'Algorithm_MEMSET', 'type': 'function'}\n", @@ -703,8 +707,8 @@ " 0.000033\n", " void rajaperf::algorithm::memset<128ul>(double...\n", " 31648.0\n", - " 16.0\n", " 7.531866\n", + " 16.0\n", " \n", " \n", " 528105777\n", @@ -713,8 +717,8 @@ " 0.000020\n", " void rajaperf::algorithm::memset<128ul>(double...\n", " 18016.0\n", - " 16.0\n", " 6.692635\n", + " 16.0\n", " \n", " \n", "\n", @@ -813,51 +817,51 @@ "{'name': 'void rajaperf::algorithm::memset<128u... 
457195964 31648.0 \n", " 528105777 18016.0 \n", "\n", - " smsp__maximum_warps_avg_per_active_cycle \\\n", - "node profile \n", - "{'name': 'RAJAPerf', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'Algorithm', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'Algorithm_MEMCPY', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964 16.0 \n", - " 528105777 16.0 \n", - "{'name': 'Algorithm_MEMSET', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'void rajaperf::algorithm::memset<128u... 457195964 16.0 \n", - " 528105777 16.0 \n", + " sm__throughput.avg.pct_of_peak_sustained_elapsed \\\n", + "node profile \n", + "{'name': 'RAJAPerf', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'Algorithm', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'Algorithm_MEMCPY', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964 6.521123 \n", + " 528105777 6.294607 \n", + "{'name': 'Algorithm_MEMSET', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'cudaDeviceSynchronize', 'type': 'func... 
457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'void rajaperf::algorithm::memset<128u... 457195964 7.531866 \n", + " 528105777 6.692635 \n", "\n", - " sm__throughput.avg.pct_of_peak_sustained_elapsed \n", - "node profile \n", - "{'name': 'RAJAPerf', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'Algorithm', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'Algorithm_MEMCPY', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'void rajaperf::algorithm::memcpy<128u... 457195964 6.521123 \n", - " 528105777 6.294607 \n", - "{'name': 'Algorithm_MEMSET', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n", - " 528105777 NaN \n", - "{'name': 'void rajaperf::algorithm::memset<128u... 457195964 7.531866 \n", - " 528105777 6.692635 " + " smsp__maximum_warps_avg_per_active_cycle \n", + "node profile \n", + "{'name': 'RAJAPerf', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'Algorithm', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'Algorithm_MEMCPY', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'void rajaperf::algorithm::memcpy<128u... 
457195964 16.0 \n", + " 528105777 16.0 \n", + "{'name': 'Algorithm_MEMSET', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'cudaDeviceSynchronize', 'type': 'func... 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'cudaLaunchKernel', 'type': 'function'} 457195964 NaN \n", + " 528105777 NaN \n", + "{'name': 'void rajaperf::algorithm::memset<128u... 457195964 16.0 \n", + " 528105777 16.0 " ] }, "execution_count": 4, @@ -1148,6 +1152,10 @@ "\n", "We can make roofline plots using the metrics we have collected using Nsight Compute. The Roofline sets an upper bound on performance of a kernel depending on its operational intensity. We will use roofline plots to understand the performance of RAJAPerf kernels.\n", "\n", + "In this section, we reproduce some of the analysis and visualizations from the paper:\n", + "\n", + "Olga Pearce, Jason Burmark, Rich Hornung, Befikir Bogale, Ian Lumsden, Michael McKinsey, Dewi Yokelson, David Boehme, Stephanie Brink, Michela Taufer, and Tom Scogland. “RAJA Performance Suite: Performance Portability Analysis with Caliper and Thicket”. SC-W 2024: Workshops of ACM/IEEE International Conference for High Performance Computing, Networking, Storage, and Analysis. Performance, Portability & Productivity in HPC. 2024.\n", + "\n", "Instruction Roofline Models were introduced by Ding et al. to better characterize GPU workloads by looking at instruction intensity. We walk through the creation of instruction roofline models for the Applications group of the RAJAPerf kernels below. 
\n", "\n", "More references on methodology for roofline models:\n", @@ -1191,7 +1199,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing action 3444/3445: 100%|██████████| 3445/3445 [00:42<00:00, 80.24it/s] \n" + "Processing action 3444/3445: 100%|██████████| 3445/3445 [00:42<00:00, 81.94it/s] \n" ] } ], @@ -1816,7 +1824,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "id": "909bbd4e", "metadata": {}, "outputs": [], @@ -1991,7 +1999,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "id": "c208ebcb", "metadata": {}, "outputs": [ @@ -2012,7 +2020,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "id": "7a2b50dd", "metadata": {}, "outputs": [ @@ -2033,7 +2041,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 17, "id": "3917697f", "metadata": {}, "outputs": [ @@ -2054,7 +2062,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "id": "02d2e19e", "metadata": {}, "outputs": [ @@ -2072,6 +2080,14 @@ "source": [ "roofline(LABELS=pruned_th.dataframe[\"name\"].tolist(), flag=\"HBM\", data_df=agg_df)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63636ddf", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {