From b221f62d274d9241093334a0a02e620f382f27b2 Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Thu, 17 Aug 2023 11:18:01 -0400
Subject: [PATCH] Update README HS: fix setup for MPS & cleanup

---
 README.md                            | 12 ++++++------
 geosongpu_ci/pipeline/held_suarez.py | 14 +++++++-------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index c36437c..abd1334 100644
--- a/README.md
+++ b/README.md
@@ -20,12 +20,12 @@ Validation capacities for physics compares OACC and original Fortran on.
 
 Automatic benchmarking are as follow (legends after table)
 
-| Experimentation               | Resolutions | Layout    | Setup                                    |
-| ----------------------------- | ------------|-----------| ---------------------------------------- |
-| Held-Suarez                   | C180-L72    | 1x1       | Discover @ node-to-node (exclusive GPU)  |
-|                               | C180-L91    | 1x1       | Discover @ node-to-node (exclusive GPU)  |
-|                               | C180-L137   | 1x1       | Discover @ node-to-node (exclusive GPU)  |
-| Aquaplanet                    | C180-L72    | 1x1       | Discover @ node-to-node (exclusive GPU)  |
+| Experimentation               | Resolutions | Layout | CPU/GPU                           |
+| ----------------------------- | ----------- | ------ | --------------------------------- |
+| Held-Suarez                   | C180-L72    | 4x4    | 96/8 Node-to-node (sharing GPU)   |
+|                               | C180-L137   | 4x4    | 96/8 Node-to-node (sharing GPU)   |
+|                               | C360-L72    | 4x4    | 96/8 Node-to-node (sharing GPU)   |
+| Aquaplanet                    | C180-L72    | 1x1    | 6/6 Node-to-node (exclusive GPU)  |
 
 Legend:
 
diff --git a/geosongpu_ci/pipeline/held_suarez.py b/geosongpu_ci/pipeline/held_suarez.py
index b5b91f4..e08d6c0 100644
--- a/geosongpu_ci/pipeline/held_suarez.py
+++ b/geosongpu_ci/pipeline/held_suarez.py
@@ -87,9 +87,9 @@ def _make_srun_script(
         executable_name=executable_name,
     )
     # Options
-    options = f""" {'export HARDWARE_SAMPLING=1' if hardware_sampler_on else 'unset HARDWARE_SAMPLING' }
-    {'export MPS_ON=1' if mps_on else 'unset MPS_ON' }
-    {f'export LOCAL_REDIRECT_LOG=1' if local_redirect_log else 'unset LOCAL_REDIRECT_LOG' }
+    options = f"""{'export HARDWARE_SAMPLING=1' if hardware_sampler_on else 'unset HARDWARE_SAMPLING' }
+{'export MPS_ON=1' if mps_on else 'unset MPS_ON' }
+{f'export LOCAL_REDIRECT_LOG=1' if local_redirect_log else 'unset LOCAL_REDIRECT_LOG' }
     """
 
     if "dace" in gtfv3_config.GTFV3_BACKEND:
@@ -149,7 +149,7 @@ def _setup_1ts_1node_gtfv3(self, experiment_directory: str) -> ShellScript:
 
     def _setup_1day_1node_gtfv3(self, experiment_directory: str) -> ShellScript:
         return ShellScript(
-            name="_setup_config_1day_1node_gtfv3",
+            name="setup_config_1day_1node_gtfv3",
             working_directory=experiment_directory,
         ).write(
             shell_commands=[
@@ -175,7 +175,7 @@ def _setup_1day_1node_fortran(self, experiment_directory: str) -> ShellScript:
 
     def _setup_1ts_2nodes_gtfv3(self, experiment_directory: str) -> ShellScript:
         return ShellScript(
-            name="_setup_config_1ts_2nodes_gtfv3",
+            name="setup_config_1ts_2nodes_gtfv3",
             working_directory=experiment_directory,
         ).write(
             shell_commands=[
@@ -188,7 +188,7 @@ def _setup_1ts_2nodes_gtfv3(self, experiment_directory: str) -> ShellScript:
 
     def _setup_1day_2nodes_gtfv3(self, experiment_directory: str) -> ShellScript:
         return ShellScript(
-            name="_setup_config_1day_2nodes_gtfv3",
+            name="setup_config_1day_2nodes_gtfv3",
             working_directory=experiment_directory,
         ).write(
             shell_commands=[
@@ -335,7 +335,7 @@ def run_action(
             experiment_directory=experiment_directory,
             executable_name=self.executable_name,
             prolog_scripts=prolog_scripts,
-            slurm_config=SlurmConfiguration.slurm_6CPUs_6GPUs(
+            slurm_config=SlurmConfiguration.slurm_96CPUs_8GPUs(
                 output="benchmark.cache.dacegpu.%t.out"
             ),
             gtfv3_config=GTFV3Config.dace_gpu_32_bit_BAR(),
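Aside on the `_make_srun_script` hunk: the change drops the leading whitespace inside the f-string, so the `export`/`unset` statements land at column 0 of the generated srun script instead of being indented. A minimal, stand-alone sketch of that construction follows, assuming a hypothetical helper `make_options` (the real code builds the string inline from the same three flags):

def make_options(hardware_sampler_on: bool, mps_on: bool, local_redirect_log: bool) -> str:
    # Each toggle either exports its flag or unsets it in the emitted script.
    # Continuation lines of the f-string start at column 0 on purpose, so the
    # generated shell lines carry no Python-side indentation.
    return f"""{'export HARDWARE_SAMPLING=1' if hardware_sampler_on else 'unset HARDWARE_SAMPLING'}
{'export MPS_ON=1' if mps_on else 'unset MPS_ON'}
{'export LOCAL_REDIRECT_LOG=1' if local_redirect_log else 'unset LOCAL_REDIRECT_LOG'}
"""


if __name__ == "__main__":
    # Prints, one statement per line:
    #   unset HARDWARE_SAMPLING
    #   export MPS_ON=1
    #   unset LOCAL_REDIRECT_LOG
    print(make_options(hardware_sampler_on=False, mps_on=True, local_redirect_log=False))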