diff --git a/README.md b/README.md index c36437c..abd1334 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,12 @@ Validation capacities for physics compares OACC and original Fortran on. Automatic benchmarking are as follow (legends after table) -| Experimentation | Resolutions | Layout | Setup | -| ----------------------------- | ------------|-----------| ---------------------------------------- | -| Held-Suarez | C180-L72 | 1x1 | Discover @ node-to-node (exclusive GPU) | -| | C180-L91 | 1x1 | Discover @ node-to-node (exclusive GPU) | -| | C180-L137 | 1x1 | Discover @ node-to-node (exclusive GPU) | -| Aquaplanet | C180-L72 | 1x1 | Discover @ node-to-node (exclusive GPU) | +| Experimentation | Resolutions | Layout | CPU/GPU | +| ----------------------------- | ----------- | ------ | --------------------------------- | +| Held-Suarez | C180-L72 | 4x4 | 96/8 Node-to-node (sharing GPU) | +| | C180-L137 | 4x4 | 96/8 Node-to-node (sharing GPU) | +| | C360-L72 | 4x4 | 96/8 Node-to-node (sharing GPU) | +| Aquaplanet | C180-L72 | 1x1 | 6/6 Node-to-node (exclusive GPU) | Legend: diff --git a/geosongpu_ci/actions/slurm.py b/geosongpu_ci/actions/slurm.py index b6b0c6a..75794c0 100644 --- a/geosongpu_ci/actions/slurm.py +++ b/geosongpu_ci/actions/slurm.py @@ -79,3 +79,14 @@ def slurm_96CPUs_8GPUs(cls, output: Optional[str] = None) -> "SlurmConfiguration mem_per_gpu="40G", output=output or cls.output, ) + + @classmethod + def slurm_96CPUs(cls, output: Optional[str] = None) -> "SlurmConfiguration": + """2 nodes configuration on Discover with Rome Epyc""" + return cls( + nodes=2, + ntasks=96, + ntasks_per_node=48, + sockets_per_node=2, + output=output or cls.output, + ) diff --git a/geosongpu_ci/pipeline/gtfv3_config.py b/geosongpu_ci/pipeline/gtfv3_config.py index 3b7c092..020319e 100644 --- a/geosongpu_ci/pipeline/gtfv3_config.py +++ b/geosongpu_ci/pipeline/gtfv3_config.py @@ -19,6 +19,8 @@ def sh(self) -> str: f"export PACE_FLOAT_PRECISION={self.PACE_FLOAT_PRECISION}\n" f"export PACE_LOGLEVEL={self.PACE_LOGLEVEL}\n" f"export GTFV3_BACKEND={self.GTFV3_BACKEND}\n" + f"export PER_DEVICE_PROCESS=12\n" # default for Discover + f"export PYTHONOPTIMIZE=1\n" ) @classmethod diff --git a/geosongpu_ci/pipeline/held_suarez.py b/geosongpu_ci/pipeline/held_suarez.py index 96a307c..e08d6c0 100644 --- a/geosongpu_ci/pipeline/held_suarez.py +++ b/geosongpu_ci/pipeline/held_suarez.py @@ -46,48 +46,11 @@ def __init__( def _make_gpu_wrapper_script( self, experiment_directory: str, - hardware_sampling: bool = False, ) -> None: - script_name = "gpu-wrapper-slurm" - pre_execution = [] - post_execution = [] - if hardware_sampling: - script_name += "-hws" - pre_execution.append("if [ $SLURM_LOCALID -eq 0 ]; then") - pre_execution.append(" geosongpu_hws server &") - pre_execution.append(" sleep 20") - pre_execution.append(" geosongpu_hws client start") - pre_execution.append("fi") - - post_execution.append( - "if [ $SLURM_LOCALID -eq 0 ]; then", - ) - post_execution.append( - " geosongpu_hws client dump", - ) - post_execution.append( - " geosongpu_hws client stop", - ) - post_execution.append( - "fi", - ) - - cuda_device_setup = [ - "export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID", - 'echo "Node: $SLURM_NODEID | Rank: $SLURM_PROCID,' - ' pinned to GPU: $CUDA_VISIBLE_DEVICES"', - ] - execution = ["$*"] self.gpu_wrapper = ShellScript( - script_name, + "gpu-wrapper-slurm-mps", working_directory=experiment_directory, - ).write( - modules=[], - shell_commands=cuda_device_setup - + pre_execution - + execution - + post_execution, - ) + ).from_template(template_name="gpu-wrapper-slurm-mps.sh") def _copy_executable_script( self, @@ -114,13 +77,29 @@ def _make_srun_script( slurm_config: SlurmConfiguration, gtfv3_config: GTFV3Config, prolog_scripts: PrologScripts, + hardware_sampler_on: bool = False, + mps_on: bool = False, + local_redirect_log: bool = False, ) -> ShellScript: + # Executing command with the SLURM setup srun_cmd = slurm_config.srun_bash( wrapper=prolog_scripts.gpu_wrapper.path, executable_name=executable_name, ) - srun_script_script = ShellScript( - f"srun_{slurm_config.ntasks}tasks_{gtfv3_config.backend_sanitized()}", + # Options + options = f"""{'export HARDWARE_SAMPLING=1' if hardware_sampler_on else 'unset HARDWARE_SAMPLING' } +{'export MPS_ON=1' if mps_on else 'unset MPS_ON' } +{f'export LOCAL_REDIRECT_LOG=1' if local_redirect_log else 'unset LOCAL_REDIRECT_LOG' } + """ + + if "dace" in gtfv3_config.GTFV3_BACKEND: + backend = f"{gtfv3_config.backend_sanitized()}.{gtfv3_config.FV3_DACEMODE}" + else: + backend = f"{gtfv3_config.backend_sanitized()}" + srun_script_name = f"srun_{slurm_config.ntasks}tasks_{backend}" + + srun_script = ShellScript( + srun_script_name, working_directory=experiment_directory, ).write( env_to_source=[ @@ -132,13 +111,14 @@ def _make_srun_script( f"source {prolog_scripts.copy_executable.path}", "", f"{gtfv3_config.sh()}", - "export PYTHONOPTIMIZE=1", f"export CUPY_CACHE_DIR={experiment_directory}/.cupy", "", + f"{options}", + "", f"{srun_cmd}", ], ) - return srun_script_script + return srun_script VALIDATION_RESOLUTION = "C180-L72" @@ -169,7 +149,7 @@ def _setup_1ts_1node_gtfv3(self, experiment_directory: str) -> ShellScript: def _setup_1day_1node_gtfv3(self, experiment_directory: str) -> ShellScript: return ShellScript( - name="_setup_config_1day_1node_gtfv3", + name="setup_config_1day_1node_gtfv3", working_directory=experiment_directory, ).write( shell_commands=[ @@ -193,6 +173,45 @@ def _setup_1day_1node_fortran(self, experiment_directory: str) -> ShellScript: ], ) + def _setup_1ts_2nodes_gtfv3(self, experiment_directory: str) -> ShellScript: + return ShellScript( + name="setup_config_1ts_2nodes_gtfv3", + working_directory=experiment_directory, + ).write( + shell_commands=[ + f"cd {experiment_directory}", + "cp -f AgcmSimple.rc.4x24.gtfv3 AgcmSimple.rc", + "cp -f input.nml.4x4 input.nml", + "cp -f CAP.rc.1ts CAP.rc", + ], + ) + + def _setup_1day_2nodes_gtfv3(self, experiment_directory: str) -> ShellScript: + return ShellScript( + name="setup_config_1day_2nodes_gtfv3", + working_directory=experiment_directory, + ).write( + shell_commands=[ + f"cd {experiment_directory}", + "cp -f AgcmSimple.rc.4x24.gtfv3 AgcmSimple.rc", + "cp -f input.nml.4x4 input.nml", + "cp -f CAP.rc.1day CAP.rc", + ], + ) + + def _setup_1day_2nodes_fortran(self, experiment_directory: str) -> ShellScript: + return ShellScript( + name="setup_config_1day_2nodes_fortran", + working_directory=experiment_directory, + ).write( + shell_commands=[ + f"cd {experiment_directory}", + "cp -f AgcmSimple.rc.4x24.fortran AgcmSimple.rc", + "cp -f input.nml.4x4 input.nml", + "cp -f CAP.rc.1day CAP.rc", + ], + ) + def prepare_experiment( self, input_directory: str, @@ -224,6 +243,9 @@ def simulate( gtfv3_config: GTFV3Config, setup_script: ShellScript, setup_only: bool = False, + hardware_sampler_on: bool = False, + mps_on: bool = False, + local_redirect_log: bool = False, ): srun_script = _make_srun_script( executable_name=executable_name, @@ -231,6 +253,9 @@ def simulate( slurm_config=slurm_config, gtfv3_config=gtfv3_config, prolog_scripts=prolog_scripts, + hardware_sampler_on=hardware_sampler_on, + mps_on=mps_on, + local_redirect_log=local_redirect_log, ) setup_script.execute() @@ -310,11 +335,11 @@ def run_action( experiment_directory=experiment_directory, executable_name=self.executable_name, prolog_scripts=prolog_scripts, - slurm_config=SlurmConfiguration.slurm_6CPUs_6GPUs( + slurm_config=SlurmConfiguration.slurm_96CPUs_8GPUs( output="benchmark.cache.dacegpu.%t.out" ), gtfv3_config=GTFV3Config.dace_gpu_32_bit_BAR(), - setup_script=self._setup_1ts_1node_gtfv3(experiment_directory), + setup_script=self._setup_1ts_2nodes_gtfv3(experiment_directory), setup_only=env.setup_only, ) @@ -323,12 +348,13 @@ def run_action( experiment_directory=experiment_directory, # type: ignore executable_name=self.executable_name, prolog_scripts=prolog_scripts, # type: ignore - slurm_config=SlurmConfiguration.slurm_6CPUs_6GPUs( - output="benchmark.1day.dacegpu.%t.out" + slurm_config=SlurmConfiguration.slurm_96CPUs_8GPUs( + output="benchmark.1day.MPS.44.dacegpu.%t.out" ), gtfv3_config=GTFV3Config.dace_gpu_32_bit_BAR(dacemode="Run"), - setup_script=self._setup_1day_1node_gtfv3(experiment_directory), # type: ignore + setup_script=self._setup_1day_2nodes_gtfv3(experiment_directory), # type: ignore setup_only=env.setup_only, + mps_on=True, ) # Run 1 day Fortran @@ -336,12 +362,13 @@ def run_action( experiment_directory=experiment_directory, # type: ignore executable_name=self.executable_name, prolog_scripts=prolog_scripts, # type: ignore - slurm_config=SlurmConfiguration.slurm_72CPUs( - output="benchmark.1day.fortran.%t.out" + slurm_config=SlurmConfiguration.slurm_96CPUs( + output="benchmark.1day.MPS.44.fortran.%t.out" ), gtfv3_config=GTFV3Config.fortran(), - setup_script=self._setup_1day_1node_fortran(experiment_directory), # type: ignore + setup_script=self._setup_1day_2nodes_fortran(experiment_directory), # type: ignore setup_only=env.setup_only, + mps_on=True, ) def check( diff --git a/geosongpu_ci/pipeline/templates/__init__.py b/geosongpu_ci/pipeline/templates/__init__.py new file mode 100644 index 0000000..57a20ff --- /dev/null +++ b/geosongpu_ci/pipeline/templates/__init__.py @@ -0,0 +1,22 @@ +import os +import sys +import site + + +def find_template(name: str) -> str: + # pip install geosongpu-ci + candidate = f"{sys.prefix}/geosongpu/templates/{name}.tpl" + if os.path.isfile(candidate): + return candidate + # pip install --user geosongpu-ci + candidate = f"{site.USER_BASE}/geosongpu/templates/{name}.tpl" + if os.path.isfile(candidate): + return candidate + # pip install -e geosongpu-ci + candidate = os.path.join( + os.path.dirname(__file__), + f"{name}.tpl", + ) + if os.path.isfile(candidate): + return candidate + raise FileNotFoundError(f"Template: could not locate {name}") diff --git a/geosongpu_ci/pipeline/templates/gpu-wrapper-slurm-mps.sh.tpl b/geosongpu_ci/pipeline/templates/gpu-wrapper-slurm-mps.sh.tpl new file mode 100644 index 0000000..e2af09f --- /dev/null +++ b/geosongpu_ci/pipeline/templates/gpu-wrapper-slurm-mps.sh.tpl @@ -0,0 +1,76 @@ +#!/bin/sh + +# We open GPU visibility to full node at first +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Hardware sampling is a python tools that reads at intervals +# various hardware sensors (power, usage, memory load...) +if [ -z ${HARDWARE_SAMPLING} ]; then + echo "Hardware sampling is OFF" +else + echo "Hardware sampling is ON" + # We restrict usage to (world) rank 0 + if [ $SLURM_PROCID -eq 0 ]; then + geosongpu_hws server & + sleep 10 + geosongpu_hws client start + fi + +fi + +if [ -z ${MPS_ON} ]; then + echo "MPS is OFF" + # No MPS, we assume rank==GPU + GPU=$SLURM_LOCALID + export CUDA_VISIBLE_DEVICES=$GPU +else + echo "MPS is ON" + if [ -z ${PER_DEVICE_PROCESS} ]; then + echo "PER_DEVICE_PROCESS needs to be setup on MPS. Exiting." + exit 1 + fi + # All ranks needs to know where to look + export CUDA_MPS_PIPE_DIRECTORY=./nvidia-mps/$SLURM_NODEID + export CUDA_MPS_LOG_DIRECTORY=./nvidia-log/$SLURM_NODEID + # Only 1 rank per node (local rank 0) handles the server chatter + if [ $SLURM_LOCALID -eq 0 ]; then + echo "Turn nvidia-cuda-mps-control on for node $SLURM_NODEID" + mkdir -p nvidia-mps + mkdir -p nvidia-log/$SLURM_NODEID + # sudo nividia -i 0 -c 3 # Per docs, we should insure GPU is in EXCLUSIVE mode but we might be curtail by HPC settings + nvidia-cuda-mps-control -d + fi + # MPS server is socket base, leave time for the filesystem + sleep 10 + # Server should be spun, we restrict this rank to a single GPU + GPU=$((SLURM_LOCALID/PER_DEVICE_PROCESS)) + export CUDA_VISIBLE_DEVICES=$GPU +fi + + +echo "Node: $SLURM_NODEID | Rank: $SLURM_PROCID, pinned to GPU: $CUDA_VISIBLE_DEVICES" + +# Run program with or without log dump in file +if [ -z ${LOCAL_REDIRECT_LOG} ]; then + $* +else + $* > log.redirect_local.$SLURM_PROCID.out 2>&1 +fi + +# Clean up of all tools +if [ -z ${HARDWARE_SAMPLING} ]; then + echo "" +else + if [ $SLURM_PROCID -eq 0 ]; then + geosongpu_hws client dump + geosongpu_hws client stop + fi +fi +if [ -z ${MPS_ON} ]; then + echo "" +else + if [ $SLURM_LOCALID -eq 0 ]; then + echo quit | nvidia-cuda-mps-control + # sudo nividia -i 0 -c 0 # Per docs, we should insure GPU is flipped back to DEFAULT mode but we might be curtail by HPC settings + fi +fi diff --git a/geosongpu_ci/tools/benchmark/geos_log_parser.py b/geosongpu_ci/tools/benchmark/geos_log_parser.py index 33d84c6..20a2bf2 100644 --- a/geosongpu_ci/tools/benchmark/geos_log_parser.py +++ b/geosongpu_ci/tools/benchmark/geos_log_parser.py @@ -68,7 +68,9 @@ def parse_geos_log(filename: str) -> BenchmarkRawData: benchmark.fv_dyncore_timings = _extract_numerics(interface_timings) if "dace" in benchmark.backend: - dycore_timings = _grep(filename, "] Run...", exclude_pattern=True) + dycore_timings = _grep( + filename, "] Run...", exclude_pattern=True, expected=False + ) benchmark.inner_dycore_timings = _extract_numerics(dycore_timings) else: dycore_timings = _grep(filename, "0: fv_dynamics", exclude_pattern=True) @@ -131,8 +133,16 @@ def parse_geos_log(filename: str) -> BenchmarkRawData: # Model throughput gloabl_profiler_entry = "Model Throughput" + global_init_time = _grep( + filename, "--Initialize", start_pattern=gloabl_profiler_entry + ) + benchmark.global_init_time = _extract_numerics(global_init_time)[1] global_run_time = _grep(filename, "--Run", start_pattern=gloabl_profiler_entry) benchmark.global_run_time = _extract_numerics(global_run_time)[1] + global_finalize_time = _grep( + filename, "--Finalize", start_pattern=gloabl_profiler_entry + ) + benchmark.global_finalize_time = _extract_numerics(global_finalize_time)[1] return benchmark diff --git a/geosongpu_ci/tools/benchmark/raw_data.py b/geosongpu_ci/tools/benchmark/raw_data.py index 27096ec..70d6796 100644 --- a/geosongpu_ci/tools/benchmark/raw_data.py +++ b/geosongpu_ci/tools/benchmark/raw_data.py @@ -7,7 +7,9 @@ class BenchmarkRawData: backend: str = "" grid_resolution: Tuple[int, int, int] = (0, 0, 0) # nx / ny / nz node_setup: Tuple[int, int, int] = (0, 0, 0) # NX / NY / Total ranks used + global_init_time: float = 0 # seconds fort the global INITIALIZE global_run_time: float = 0 # seconds fort the global RUN + global_finalize_time: float = 0 # seconds fort the global FINALIZE fv_dyncore_timings: List[float] = field(default_factory=list) # seconds inner_dycore_timings: List[float] = field(default_factory=list) # seconds fv_gridcomp_detailed_profiling: List[Tuple[str, float, str]] = field( diff --git a/geosongpu_ci/tools/hws/analysis.py b/geosongpu_ci/tools/hws/analysis.py new file mode 100644 index 0000000..c202bcc --- /dev/null +++ b/geosongpu_ci/tools/hws/analysis.py @@ -0,0 +1,66 @@ +import numpy as np +from typing import Dict, Any +import geosongpu_ci.tools.hws.constants as cst +import dataclasses + + +@dataclasses.dataclass +class EnergyReport: + CPU_envelop_integrated: float = 0 # kW * sample_count + CPU_envelop_kWh: float = 0 + GPU_envelop_integrated: float = 0 # kW * sample_count + GPU_envelop_kWh: float = 0 + overall_envelop_integrated: float = 0 # kW * sample_count + overall_envelop_kWh: float = 0 + + +def energy_envelop_calculation( + cpu_psu_data: np.ndarray, gpu_psu_data: np.ndarray, verbose: bool = True +) -> EnergyReport: + report = EnergyReport() + # TODO we need the sample rate here too + sample_count = len(cpu_psu_data) + + # Grab integrated power using trapezoide integration on all samples + report.GPU_envelop_integrated = np.trapz(gpu_psu_data / 1000) + report.CPU_envelop_integrated = np.trapz(cpu_psu_data / 1000) + report.overall_envelop_integrated = ( + report.GPU_envelop_integrated + report.CPU_envelop_integrated + ) + + # Average kWh calculation + # TODO: Wrong?! + sampling_time_hours_default = ( + (sample_count - 1) * cst.DEFAULT_SAMPLERATE_IN_S / (60 * 60) + ) + report.overall_envelop_kWh = ( + report.overall_envelop_integrated / sample_count + ) / sampling_time_hours_default + report.CPU_envelop_kWh = ( + report.CPU_envelop_integrated / sample_count + ) / sampling_time_hours_default + report.GPU_envelop_kWh = ( + report.GPU_envelop_integrated / sample_count + ) / sampling_time_hours_default + + if verbose: + print( + f"Number of samples: {sample_count}\n" + f"CPU envelop:{report.CPU_envelop_integrated:.0f} kW.sample_count\n" + f"CPU envelop (@ default sample rate): {report.CPU_envelop_kWh:.2f} kW/h\n" + f"GPU envelop:{report.GPU_envelop_integrated:.0f} kW.sample_count\n" + f"GPU envelop (@ default sample rate): {report.GPU_envelop_kWh:.2f} kW/h\n" + f"Overall envelop: {report.overall_envelop_integrated:.0f} kW.sample_count\n" + f"Overall envelop (@ default sample rate): {report.overall_envelop_kWh:.2f} kW/h" + ) + + return report + + +def load_data( + data_filepath: str, + data_format: str = "npz", +) -> Dict[str, Any]: + if data_format != "npz": + raise NotImplementedError(f"Format {data_format} not implemented for graphing") + return np.load(data_filepath) diff --git a/geosongpu_ci/tools/hws/cli.py b/geosongpu_ci/tools/hws/cli.py index a1b951f..49f7f8c 100644 --- a/geosongpu_ci/tools/hws/cli.py +++ b/geosongpu_ci/tools/hws/cli.py @@ -2,6 +2,8 @@ import geosongpu_ci.tools.hws.server as hws_server import geosongpu_ci.tools.hws.client as hws_client import geosongpu_ci.tools.hws.graph as hws_graph +import geosongpu_ci.tools.hws.constants as cst +from typing import Tuple, Optional @click.group() @@ -28,5 +30,21 @@ def graph(data_filepath: str): hws_graph.cli(data_filepath) +@cli.command() +@click.argument("data_filepath") +@click.option("--data_range", nargs=2, type=float) +def envelop( + data_filepath: str, data_range: Optional[Tuple[float, float]] +): # TODO: this should be saved in a header in the data file + dt = cst.CLIENT_CMDS[cst.CLIENT_CMD_START]["dt"] + if data_range: + range_start, range_stop = data_range + range_start = int(range_start / dt) + range_stop = int(range_stop / dt) + else: + range_start, range_stop = None, None + hws_graph.cli(data_filepath, data_range=slice(range_start, range_stop)) + + if __name__ == "__main__": cli() diff --git a/geosongpu_ci/tools/hws/client.py b/geosongpu_ci/tools/hws/client.py index 24abec0..74f3f98 100755 --- a/geosongpu_ci/tools/hws/client.py +++ b/geosongpu_ci/tools/hws/client.py @@ -4,9 +4,9 @@ def client_main(order: str, dump_name: str): - order = CLIENT_CMDS[order] - order["dump_name"] = dump_name - data = json.dumps(order) + filtered_order = CLIENT_CMDS[order] + filtered_order["dump_name"] = dump_name + data = json.dumps(filtered_order) server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) server.connect(SOCKET_FILENAME) server.send(data.encode("utf8")) diff --git a/geosongpu_ci/tools/hws/constants.py b/geosongpu_ci/tools/hws/constants.py index 598dca2..66f3b08 100755 --- a/geosongpu_ci/tools/hws/constants.py +++ b/geosongpu_ci/tools/hws/constants.py @@ -26,10 +26,12 @@ CLIENT_CMD_DUMP = "dump" CLIENT_CMD_TICK = "tick" +DEFAULT_SAMPLERATE_IN_S = 0.1 + CLIENT_CMDS = { CLIENT_CMD_START: { "action": SERV_ORDER_START, - "dt": 0.01, + "dt": DEFAULT_SAMPLERATE_IN_S, }, CLIENT_CMD_STOP: {"action": SERV_ORDER_STOP}, CLIENT_CMD_DUMP: { diff --git a/geosongpu_ci/tools/hws/graph.py b/geosongpu_ci/tools/hws/graph.py index 0add21b..ee12d01 100644 --- a/geosongpu_ci/tools/hws/graph.py +++ b/geosongpu_ci/tools/hws/graph.py @@ -1,74 +1,72 @@ -from matplotlib import pyplot +import plotly.graph_objects as go +from plotly.subplots import make_subplots import numpy as np from geosongpu_ci.tools.hws.constants import ( HWS_HW_GPU, HWS_HARDWARE_SPECS, ) -from typing import Dict, Any +from geosongpu_ci.tools.hws.analysis import load_data, energy_envelop_calculation COLOR_VRAM = "C4" -def energy_envelop_calculation(cpu_psu_data: np.ndarray, gpu_psu_data: np.ndarray): - gpu_kW_envelop = np.trapz(gpu_psu_data / 1000) - cpu_kW_envelop = np.trapz(cpu_psu_data / 1000) - return gpu_kW_envelop, cpu_kW_envelop - -def load_data( - data_filepath: str, - data_format: str = "npz", -) -> Dict[str, Any]: - if data_format != "npz": - raise NotImplementedError(f"Format {data_format} not implemented for graphing") - return np.load(data_filepath) - def cli( data_filepath: str, data_format: str = "npz", - dynamic_gpu_load: bool = True, + data_range: slice = slice(None), ): d = load_data(data_filepath, data_format) - n = len(d["cpu_psu"]) - s = slice(0, n) - print(n, s) - yd = np.arange(len(d["cpu_psu"][s])) + sample_count = len(d["cpu_psu"][data_range]) + yd = np.arange(sample_count) - fig, ax1 = pyplot.subplots(figsize=(8, 8)) - ax2 = pyplot.twinx() - - ax1.plot(yd, d["gpu_psu"][s], label="GPU PSU(W)", linewidth=0.5) - ax1.plot(yd, d["gpu_exe_utl"][s], label="GPU Utilization(%)", linewidth=0.5) - ax2.plot(yd, d["gpu_mem"][s], label="GPU VRAM(Mb)", color=COLOR_VRAM, linewidth=0.5) - ax1.plot(yd, d["cpu_psu"][s], label="CPU PSU(W - extrapolated)", linewidth=0.5) - ax1.plot(yd, d["cpu_exe_utl"][s], label="CPU Utilization(%)", linewidth=0.5) - - ax1.legend( - loc="upper center", - bbox_to_anchor=(0.5, 1.15), - ncol=2, - fancybox=True, - shadow=True, + fig = make_subplots(specs=[[{"secondary_y": True}]]) + # Add traces + fig.add_trace( + go.Scatter(y=d["gpu_psu"][data_range], x=yd, name="GPU PSU(W)"), + secondary_y=False, ) - ax1.set_ylabel("W/%", fontsize=10) - ax1.set_ylim(0, HWS_HARDWARE_SPECS[HWS_HW_GPU]["PSU_TDP"]) - - ax2.legend( - loc="upper center", - bbox_to_anchor=(0.5, 1.05), - ncol=3, - fancybox=True, - shadow=True, + fig.add_trace( + go.Scatter(y=d["gpu_exe_utl"][data_range], x=yd, name="GPU Utilization(%)"), + secondary_y=False, + ) + fig.add_trace( + go.Scatter(y=d["cpu_psu"][data_range], x=yd, name="CPU PSU(W - extrapolated)"), + secondary_y=False, + ) + fig.add_trace( + go.Scatter(y=d["cpu_exe_utl"][data_range], x=yd, name="CPU Utilization(%)"), + secondary_y=False, ) - ax2.set_ylabel("Mb", color=COLOR_VRAM, fontsize=10) - ax2.tick_params(axis="y", labelcolor="C4") - ax2.set_ylim(0, HWS_HARDWARE_SPECS[HWS_HW_GPU]["MAX_VRAM"]) - fig.savefig(data_filepath.replace(f".{data_format}", "")) + fig.add_trace( + go.Scatter(y=d["gpu_mem"][data_range], x=yd, name="GPU VRAM (Mb)"), + secondary_y=True, + ) - gpu_kW_envelop, cpu_kW_envelop = energy_envelop_calculation( - d["cpu_psu"][s], d["gpu_psu"][s] + # Labels + fig.update_layout( + title_text="Hardware sensors", + legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), ) - print( - f"Overall CPU W usage:{cpu_kW_envelop:.0f} kW\n", - f"Overall GPU W usage:{gpu_kW_envelop:.0f} kW", + fig.update_xaxes(title_text="Sample #") + fig.update_yaxes( + title_text="W or %", + secondary_y=False, + range=[0, HWS_HARDWARE_SPECS[HWS_HW_GPU]["PSU_TDP"]], ) + fig.update_yaxes( + title_text="Mb", + secondary_y=True, + range=[0, HWS_HARDWARE_SPECS[HWS_HW_GPU]["MAX_VRAM"]], + ) + + fig.write_image(data_filepath.replace(f".{data_format}", ".png")) + + energy_envelop_calculation(d["cpu_psu"][data_range], d["gpu_psu"][data_range]) + + +# Useful for debug +if __name__ == "__main__": + import sys + + cli(sys.argv[1]) diff --git a/geosongpu_ci/utils/shell.py b/geosongpu_ci/utils/shell.py index 2f9cfb3..21a8285 100644 --- a/geosongpu_ci/utils/shell.py +++ b/geosongpu_ci/utils/shell.py @@ -4,6 +4,7 @@ import stat from geosongpu_ci.utils.progress import Progress from time import sleep +from geosongpu_ci.pipeline.templates import find_template class ShellScript: @@ -21,6 +22,19 @@ def path(self) -> str: def name(self) -> str: return self._name + def from_template(self, template_name: str): + # Read template + template_file = find_template(template_name) + with open(template_file, "r") as f: + tpl = f.read() + + # Write file + with open(self.path, "w") as f: + f.write(tpl) + + self._make_executable() + return self + def write( self, shell_commands: List[str], diff --git a/setup.cfg b/setup.cfg index a06f8cd..db9514c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ [flake8] # Recommend matching the black line length (default 88), # rather than using the flake8 default of 79: -max-line-length = 88 +max-line-length = 110 extend-ignore = # See https://github.com/PyCQA/pycodestyle/issues/373 E203, \ No newline at end of file diff --git a/setup.py b/setup.py index b772948..881e448 100644 --- a/setup.py +++ b/setup.py @@ -23,12 +23,15 @@ "pdoc", "pynvml", "psutil", - "matplotlib", "plotly", - "kaleido" + "kaleido", ], data_files=[ ("./geosongpu/experiments", ["./experiments/experiments.yaml"]), + ( + "./geosongpu/templates", + ["./geosongpu_ci/pipeline/templates/gpu-wrapper-slurm-mps.sh.tpl"], + ), ], entry_points={ "console_scripts": [