Merge pull request #25 from GEOS-ESM/feature/hardware_sampelr
Hardware Sampler & MPS
FlorianDeconinck committed Aug 17, 2023
2 parents a0094dc + b221f62 commit ce52517
Showing 16 changed files with 370 additions and 119 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -20,12 +20,12 @@ Validation capacities for physics compare OACC and the original Fortran.

Automatic benchmarks are as follows (legend after the table)

| Experimentation | Resolutions | Layout | Setup |
| ----------------------------- | ------------|-----------| ---------------------------------------- |
| Held-Suarez | C180-L72 | 1x1 | Discover @ node-to-node (exclusive GPU) |
| | C180-L91 | 1x1 | Discover @ node-to-node (exclusive GPU) |
| | C180-L137 | 1x1 | Discover @ node-to-node (exclusive GPU) |
| Aquaplanet | C180-L72 | 1x1 | Discover @ node-to-node (exclusive GPU) |
| Experimentation | Resolutions | Layout | CPU/GPU |
| ----------------------------- | ----------- | ------ | --------------------------------- |
| Held-Suarez | C180-L72 | 4x4 | 96/8 Node-to-node (sharing GPU) |
| | C180-L137 | 4x4 | 96/8 Node-to-node (sharing GPU) |
| | C360-L72 | 4x4 | 96/8 Node-to-node (sharing GPU) |
| Aquaplanet | C180-L72 | 1x1 | 6/6 Node-to-node (exclusive GPU) |

Legend:

11 changes: 11 additions & 0 deletions geosongpu_ci/actions/slurm.py
@@ -79,3 +79,14 @@ def slurm_96CPUs_8GPUs(cls, output: Optional[str] = None) -> "SlurmConfiguration
mem_per_gpu="40G",
output=output or cls.output,
)

@classmethod
    def slurm_96CPUs(cls, output: Optional[str] = None) -> "SlurmConfiguration":
        """2-node configuration on Discover with Rome EPYC"""
return cls(
nodes=2,
ntasks=96,
ntasks_per_node=48,
sockets_per_node=2,
output=output or cls.output,
)
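For context, a minimal usage sketch of the new 2-node CPU configuration, mirroring the Fortran benchmark call added to run_action later in this diff; the wrapper path and executable name below are illustrative, and srun_bash is assumed to behave as in _make_srun_script:

# Sketch only: consuming SlurmConfiguration.slurm_96CPUs (illustrative paths and names).
from geosongpu_ci.actions.slurm import SlurmConfiguration

slurm_config = SlurmConfiguration.slurm_96CPUs(output="benchmark.1day.MPS.44.fortran.%t.out")
srun_cmd = slurm_config.srun_bash(
    wrapper="./gpu-wrapper-slurm-mps.sh",  # illustrative wrapper path
    executable_name="./GEOSgcm.x",         # illustrative executable name
)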
2 changes: 2 additions & 0 deletions geosongpu_ci/pipeline/gtfv3_config.py
@@ -19,6 +19,8 @@ def sh(self) -> str:
f"export PACE_FLOAT_PRECISION={self.PACE_FLOAT_PRECISION}\n"
f"export PACE_LOGLEVEL={self.PACE_LOGLEVEL}\n"
f"export GTFV3_BACKEND={self.GTFV3_BACKEND}\n"
f"export PER_DEVICE_PROCESS=12\n" # default for Discover
f"export PYTHONOPTIMIZE=1\n"
)

@classmethod
131 changes: 79 additions & 52 deletions geosongpu_ci/pipeline/held_suarez.py
@@ -46,48 +46,11 @@ def __init__(
def _make_gpu_wrapper_script(
self,
experiment_directory: str,
hardware_sampling: bool = False,
) -> None:
script_name = "gpu-wrapper-slurm"
pre_execution = []
post_execution = []
if hardware_sampling:
script_name += "-hws"
pre_execution.append("if [ $SLURM_LOCALID -eq 0 ]; then")
pre_execution.append(" geosongpu_hws server &")
pre_execution.append(" sleep 20")
pre_execution.append(" geosongpu_hws client start")
pre_execution.append("fi")

post_execution.append(
"if [ $SLURM_LOCALID -eq 0 ]; then",
)
post_execution.append(
" geosongpu_hws client dump",
)
post_execution.append(
" geosongpu_hws client stop",
)
post_execution.append(
"fi",
)

cuda_device_setup = [
"export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID",
'echo "Node: $SLURM_NODEID | Rank: $SLURM_PROCID,'
' pinned to GPU: $CUDA_VISIBLE_DEVICES"',
]
execution = ["$*"]
self.gpu_wrapper = ShellScript(
script_name,
"gpu-wrapper-slurm-mps",
working_directory=experiment_directory,
).write(
modules=[],
shell_commands=cuda_device_setup
+ pre_execution
+ execution
+ post_execution,
)
).from_template(template_name="gpu-wrapper-slurm-mps.sh")

def _copy_executable_script(
self,
@@ -114,13 +77,29 @@ def _make_srun_script(
slurm_config: SlurmConfiguration,
gtfv3_config: GTFV3Config,
prolog_scripts: PrologScripts,
hardware_sampler_on: bool = False,
mps_on: bool = False,
local_redirect_log: bool = False,
) -> ShellScript:
# Executing command with the SLURM setup
srun_cmd = slurm_config.srun_bash(
wrapper=prolog_scripts.gpu_wrapper.path,
executable_name=executable_name,
)
srun_script_script = ShellScript(
f"srun_{slurm_config.ntasks}tasks_{gtfv3_config.backend_sanitized()}",
# Options
options = f"""{'export HARDWARE_SAMPLING=1' if hardware_sampler_on else 'unset HARDWARE_SAMPLING' }
{'export MPS_ON=1' if mps_on else 'unset MPS_ON' }
{f'export LOCAL_REDIRECT_LOG=1' if local_redirect_log else 'unset LOCAL_REDIRECT_LOG' }
"""

if "dace" in gtfv3_config.GTFV3_BACKEND:
backend = f"{gtfv3_config.backend_sanitized()}.{gtfv3_config.FV3_DACEMODE}"
else:
backend = f"{gtfv3_config.backend_sanitized()}"
srun_script_name = f"srun_{slurm_config.ntasks}tasks_{backend}"

srun_script = ShellScript(
srun_script_name,
working_directory=experiment_directory,
).write(
env_to_source=[
Expand All @@ -132,13 +111,14 @@ def _make_srun_script(
f"source {prolog_scripts.copy_executable.path}",
"",
f"{gtfv3_config.sh()}",
"export PYTHONOPTIMIZE=1",
f"export CUPY_CACHE_DIR={experiment_directory}/.cupy",
"",
f"{options}",
"",
f"{srun_cmd}",
],
)
return srun_script_script
return srun_script
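To make the new toggles concrete, a small sketch (not the repo's code) of the environment block the options f-string above produces for one flag combination; the wrapper template added below (gpu-wrapper-slurm-mps.sh.tpl) reads these variables at run time:

# Sketch: reproducing the options block for hardware sampling + MPS, no local log redirect.
hardware_sampler_on, mps_on, local_redirect_log = True, True, False
options = f"""{'export HARDWARE_SAMPLING=1' if hardware_sampler_on else 'unset HARDWARE_SAMPLING'}
{'export MPS_ON=1' if mps_on else 'unset MPS_ON'}
{'export LOCAL_REDIRECT_LOG=1' if local_redirect_log else 'unset LOCAL_REDIRECT_LOG'}
"""
print(options)
# export HARDWARE_SAMPLING=1
# export MPS_ON=1
# unset LOCAL_REDIRECT_LOG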


VALIDATION_RESOLUTION = "C180-L72"
@@ -169,7 +149,7 @@ def _setup_1ts_1node_gtfv3(self, experiment_directory: str) -> ShellScript:

def _setup_1day_1node_gtfv3(self, experiment_directory: str) -> ShellScript:
return ShellScript(
name="_setup_config_1day_1node_gtfv3",
name="setup_config_1day_1node_gtfv3",
working_directory=experiment_directory,
).write(
shell_commands=[
@@ -193,6 +173,45 @@ def _setup_1day_1node_fortran(self, experiment_directory: str) -> ShellScript:
],
)

def _setup_1ts_2nodes_gtfv3(self, experiment_directory: str) -> ShellScript:
return ShellScript(
name="setup_config_1ts_2nodes_gtfv3",
working_directory=experiment_directory,
).write(
shell_commands=[
f"cd {experiment_directory}",
"cp -f AgcmSimple.rc.4x24.gtfv3 AgcmSimple.rc",
"cp -f input.nml.4x4 input.nml",
"cp -f CAP.rc.1ts CAP.rc",
],
)

def _setup_1day_2nodes_gtfv3(self, experiment_directory: str) -> ShellScript:
return ShellScript(
name="setup_config_1day_2nodes_gtfv3",
working_directory=experiment_directory,
).write(
shell_commands=[
f"cd {experiment_directory}",
"cp -f AgcmSimple.rc.4x24.gtfv3 AgcmSimple.rc",
"cp -f input.nml.4x4 input.nml",
"cp -f CAP.rc.1day CAP.rc",
],
)

def _setup_1day_2nodes_fortran(self, experiment_directory: str) -> ShellScript:
return ShellScript(
name="setup_config_1day_2nodes_fortran",
working_directory=experiment_directory,
).write(
shell_commands=[
f"cd {experiment_directory}",
"cp -f AgcmSimple.rc.4x24.fortran AgcmSimple.rc",
"cp -f input.nml.4x4 input.nml",
"cp -f CAP.rc.1day CAP.rc",
],
)

def prepare_experiment(
self,
input_directory: str,
@@ -224,13 +243,19 @@ def simulate(
gtfv3_config: GTFV3Config,
setup_script: ShellScript,
setup_only: bool = False,
hardware_sampler_on: bool = False,
mps_on: bool = False,
local_redirect_log: bool = False,
):
srun_script = _make_srun_script(
executable_name=executable_name,
experiment_directory=experiment_directory,
slurm_config=slurm_config,
gtfv3_config=gtfv3_config,
prolog_scripts=prolog_scripts,
hardware_sampler_on=hardware_sampler_on,
mps_on=mps_on,
local_redirect_log=local_redirect_log,
)

setup_script.execute()
@@ -310,11 +335,11 @@ def run_action(
experiment_directory=experiment_directory,
executable_name=self.executable_name,
prolog_scripts=prolog_scripts,
slurm_config=SlurmConfiguration.slurm_6CPUs_6GPUs(
slurm_config=SlurmConfiguration.slurm_96CPUs_8GPUs(
output="benchmark.cache.dacegpu.%t.out"
),
gtfv3_config=GTFV3Config.dace_gpu_32_bit_BAR(),
setup_script=self._setup_1ts_1node_gtfv3(experiment_directory),
setup_script=self._setup_1ts_2nodes_gtfv3(experiment_directory),
setup_only=env.setup_only,
)

@@ -323,25 +348,27 @@
experiment_directory=experiment_directory, # type: ignore
executable_name=self.executable_name,
prolog_scripts=prolog_scripts, # type: ignore
slurm_config=SlurmConfiguration.slurm_6CPUs_6GPUs(
output="benchmark.1day.dacegpu.%t.out"
slurm_config=SlurmConfiguration.slurm_96CPUs_8GPUs(
output="benchmark.1day.MPS.44.dacegpu.%t.out"
),
gtfv3_config=GTFV3Config.dace_gpu_32_bit_BAR(dacemode="Run"),
setup_script=self._setup_1day_1node_gtfv3(experiment_directory), # type: ignore
setup_script=self._setup_1day_2nodes_gtfv3(experiment_directory), # type: ignore
setup_only=env.setup_only,
mps_on=True,
)

# Run 1 day Fortran
self.simulate(
experiment_directory=experiment_directory, # type: ignore
executable_name=self.executable_name,
prolog_scripts=prolog_scripts, # type: ignore
slurm_config=SlurmConfiguration.slurm_72CPUs(
output="benchmark.1day.fortran.%t.out"
slurm_config=SlurmConfiguration.slurm_96CPUs(
output="benchmark.1day.MPS.44.fortran.%t.out"
),
gtfv3_config=GTFV3Config.fortran(),
setup_script=self._setup_1day_1node_fortran(experiment_directory), # type: ignore
setup_script=self._setup_1day_2nodes_fortran(experiment_directory), # type: ignore
setup_only=env.setup_only,
mps_on=True,
)

def check(
22 changes: 22 additions & 0 deletions geosongpu_ci/pipeline/templates/__init__.py
@@ -0,0 +1,22 @@
import os
import sys
import site


def find_template(name: str) -> str:
# pip install geosongpu-ci
candidate = f"{sys.prefix}/geosongpu/templates/{name}.tpl"
if os.path.isfile(candidate):
return candidate
# pip install --user geosongpu-ci
candidate = f"{site.USER_BASE}/geosongpu/templates/{name}.tpl"
if os.path.isfile(candidate):
return candidate
# pip install -e geosongpu-ci
candidate = os.path.join(
os.path.dirname(__file__),
f"{name}.tpl",
)
if os.path.isfile(candidate):
return candidate
raise FileNotFoundError(f"Template: could not locate {name}")
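A hedged usage sketch for the lookup above; ShellScript.from_template (used in _make_gpu_wrapper_script earlier in this diff) is assumed to resolve template names through find_template:

# Sketch only: locating the MPS wrapper template shipped with this PR.
from geosongpu_ci.pipeline.templates import find_template

path = find_template("gpu-wrapper-slurm-mps.sh")
print(path)  # resolves to the site-packages, user-site, or in-tree .tpl copy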
76 changes: 76 additions & 0 deletions geosongpu_ci/pipeline/templates/gpu-wrapper-slurm-mps.sh.tpl
@@ -0,0 +1,76 @@
#!/bin/sh

# We open GPU visibility to full node at first
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Hardware sampling is a Python tool that reads various hardware
# sensors (power, usage, memory load...) at intervals
if [ -z ${HARDWARE_SAMPLING} ]; then
echo "Hardware sampling is OFF"
else
echo "Hardware sampling is ON"
# We restrict usage to (world) rank 0
if [ $SLURM_PROCID -eq 0 ]; then
geosongpu_hws server &
sleep 10
geosongpu_hws client start
fi

fi

if [ -z ${MPS_ON} ]; then
echo "MPS is OFF"
# No MPS, we assume rank==GPU
GPU=$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=$GPU
else
echo "MPS is ON"
if [ -z ${PER_DEVICE_PROCESS} ]; then
echo "PER_DEVICE_PROCESS needs to be setup on MPS. Exiting."
exit 1
fi
    # All ranks need to know where to look
export CUDA_MPS_PIPE_DIRECTORY=./nvidia-mps/$SLURM_NODEID
export CUDA_MPS_LOG_DIRECTORY=./nvidia-log/$SLURM_NODEID
# Only 1 rank per node (local rank 0) handles the server chatter
if [ $SLURM_LOCALID -eq 0 ]; then
echo "Turn nvidia-cuda-mps-control on for node $SLURM_NODEID"
mkdir -p nvidia-mps
mkdir -p nvidia-log/$SLURM_NODEID
        # sudo nvidia-smi -i 0 -c 3 # Per docs, we should ensure the GPU is in EXCLUSIVE mode, but we might be curtailed by HPC settings
nvidia-cuda-mps-control -d
fi
    # MPS server is socket-based, leave time for the filesystem
sleep 10
    # Server should be spun up; we restrict this rank to a single GPU
GPU=$((SLURM_LOCALID/PER_DEVICE_PROCESS))
export CUDA_VISIBLE_DEVICES=$GPU
fi


echo "Node: $SLURM_NODEID | Rank: $SLURM_PROCID, pinned to GPU: $CUDA_VISIBLE_DEVICES"

# Run program with or without log dump in file
if [ -z ${LOCAL_REDIRECT_LOG} ]; then
$*
else
$* > log.redirect_local.$SLURM_PROCID.out 2>&1
fi

# Clean up of all tools
if [ -z ${HARDWARE_SAMPLING} ]; then
echo ""
else
if [ $SLURM_PROCID -eq 0 ]; then
geosongpu_hws client dump
geosongpu_hws client stop
fi
fi
if [ -z ${MPS_ON} ]; then
echo ""
else
if [ $SLURM_LOCALID -eq 0 ]; then
echo quit | nvidia-cuda-mps-control
        # sudo nvidia-smi -i 0 -c 0 # Per docs, we should ensure the GPU is flipped back to DEFAULT mode, but we might be curtailed by HPC settings
fi
fi
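To illustrate the pinning rule above (GPU = SLURM_LOCALID / PER_DEVICE_PROCESS): assuming the 2-node, 96-task layout with 48 ranks per node and the Discover default PER_DEVICE_PROCESS=12, integer division maps local ranks 0-11 to GPU 0, 12-23 to GPU 1, 24-35 to GPU 2, and 36-47 to GPU 3. A small Python sketch of the same arithmetic:

# Sketch of the wrapper's pinning rule: GPU = SLURM_LOCALID // PER_DEVICE_PROCESS
PER_DEVICE_PROCESS = 12  # Discover default exported by GTFV3Config in this PR
pinning = {local_rank: local_rank // PER_DEVICE_PROCESS for local_rank in range(48)}
assert set(pinning.values()) == {0, 1, 2, 3}  # four GPUs per node, 12 ranks each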
12 changes: 11 additions & 1 deletion geosongpu_ci/tools/benchmark/geos_log_parser.py
@@ -68,7 +68,9 @@ def parse_geos_log(filename: str) -> BenchmarkRawData:
benchmark.fv_dyncore_timings = _extract_numerics(interface_timings)

if "dace" in benchmark.backend:
dycore_timings = _grep(filename, "] Run...", exclude_pattern=True)
dycore_timings = _grep(
filename, "] Run...", exclude_pattern=True, expected=False
)
benchmark.inner_dycore_timings = _extract_numerics(dycore_timings)
else:
dycore_timings = _grep(filename, "0: fv_dynamics", exclude_pattern=True)
@@ -131,8 +133,16 @@ def parse_geos_log(filename: str) -> BenchmarkRawData:

# Model throughput
gloabl_profiler_entry = "Model Throughput"
global_init_time = _grep(
filename, "--Initialize", start_pattern=gloabl_profiler_entry
)
benchmark.global_init_time = _extract_numerics(global_init_time)[1]
global_run_time = _grep(filename, "--Run", start_pattern=gloabl_profiler_entry)
benchmark.global_run_time = _extract_numerics(global_run_time)[1]
global_finalize_time = _grep(
filename, "--Finalize", start_pattern=gloabl_profiler_entry
)
benchmark.global_finalize_time = _extract_numerics(global_finalize_time)[1]

return benchmark
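For intuition, the new throughput parsing greps the "Model Throughput" section of a GEOS log for the --Initialize, --Run, and --Finalize rows and keeps the second numeric field (index [1]). A hedged stand-in for _extract_numerics, run on a made-up log row (the real GEOS output format may differ):

import re

def second_numeric(line: str) -> float:
    # Illustrative stand-in for _extract_numerics(...)[1]: grab all numbers, keep the second.
    numbers = re.findall(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", line)
    return float(numbers[1])

print(second_numeric("--Run   1234   5678.9"))  # -> 5678.9 (illustrative line, not real GEOS output)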
