Skip to content

Commit

Permalink
Merge pull request #21 from GEOS-ESM/feature/hs_benchmark
Browse files Browse the repository at this point in the history
Held Suarez benchmark
  • Loading branch information
FlorianDeconinck authored Aug 1, 2023
2 parents f280396 + 1675947 commit bf4a07f
Show file tree
Hide file tree
Showing 28 changed files with 1,129 additions and 456 deletions.
32 changes: 0 additions & 32 deletions .editorconfig

This file was deleted.

10 changes: 0 additions & 10 deletions CMakeLists.txt

This file was deleted.

23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,26 @@
| NCCS Discover GEOS Aquaplanet Validation | [![Discover Nightly GEOS Aquaplanet Validation](https://github.com/GEOS-ESM/geosongpu-ci/actions/workflows/discover_aq_nightly.yml/badge.svg)](https://github.com/GEOS-ESM/geosongpu-ci/actions/workflows/discover_aq_nightly.yml) |

On-premise CI for the GPU ports of GEOS. Includes validation & benchmark workflows.

## Current capabilities

Experiments are listed in `experiments/experiments.yaml`

The package installs a `geosongpu_dispatch` command:

```
Usage: geosongpu_dispatch [OPTIONS] NAME ACTION
Dispatch the _NAME_ experiment (as recorded in experiments.yaml) with the
_ACTION_ (from Validation, Benchmark or All).
Environement variable:
CI_WORKSPACE: dispatch sets all work in this directory.
Options:
--artifact TEXT Artifact directory for results storage
--setup_only Setup the experiment but skip any long running jobs (build,
run...)
--help Show this message and exit.
```
7 changes: 6 additions & 1 deletion experiments/experiments.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,13 @@ geos_hs:
- fvdycore
- env
- cmake
- gtFV3
input:
directory: /discover/nobackup/projects/geosongpu/geos_data/held_suarez/gcm-11.0.4.1/C180-L91
C180-L72: /discover/nobackup/projects/geosongpu/geos_data/held_suarez/gcm-11.0.4.1/C180-L72
C180-L91: /discover/nobackup/projects/geosongpu/geos_data/held_suarez/gcm-11.0.4.1/C180-L91
C180-L137: /discover/nobackup/projects/geosongpu/geos_data/held_suarez/gcm-11.0.4.1/C180-L137
C360-L72: /discover/nobackup/projects/geosongpu/geos_data/held_suarez/gcm-11.0.4.1/C360-L72
C360-L91: /discover/nobackup/projects/geosongpu/geos_data/held_suarez/gcm-11.0.4.1/C360-L91
tasks:
- GEOS
- HeldSuarez
Expand Down
39 changes: 19 additions & 20 deletions geosongpu_ci/actions/git.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Dict, Any
from geosongpu_ci.utils.shell import shell_script
from geosongpu_ci.utils.shell import ShellScript
from geosongpu_ci.actions.pipeline import PipelineAction


Expand All @@ -12,34 +12,33 @@ def git_prelude(
do_mepo: bool = True,
) -> Dict[str, Any]:
git_config = config["repository"]
modules = []

shell_script(
name="setup_repository",
modules=["other/mepo"],
shell_commands=[
f"git clone {git_config['url']} {override_repo_name}",
f"cd {override_repo_name}",
f"git checkout {git_config['tag_or_hash']}",
],
)
# Basic git commands to clone/checkout the repository
git_commands = [
f"git clone {git_config['url']} {override_repo_name}",
f"cd {override_repo_name}",
f"git checkout {git_config['tag_or_hash']}",
]

# Write metadata file
# Add the mepo commands to be triggered in the repository
if do_mepo:
if "mepo" in git_config.keys() and "develop" in git_config["mepo"].keys():
develop_comp_command = "mepo develop"
for comp in git_config["mepo"]["develop"]:
develop_comp_command += f" {comp}"
else:
develop_comp_command = ""
mepo_status = shell_script(
name="get_mepo_status",
modules=["other/mepo"],
shell_commands=[
f"cd {override_repo_name}",
modules.append("other/mepo")
git_commands.extend(
[
"mepo clone",
develop_comp_command,
"mepo status",
],
temporary=True,
]
)
metadata["mepo_status"] = mepo_status

# Setup script
ShellScript(f"checkout_repository_{experiment_name}").write(
modules=modules,
shell_commands=git_commands,
).execute()
2 changes: 1 addition & 1 deletion geosongpu_ci/actions/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@

class PipelineAction(enum.Enum):
All = 0
Validation = 1
Validation = 1
Benchmark = 2
65 changes: 65 additions & 0 deletions geosongpu_ci/actions/slurm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,70 @@
import subprocess
from time import sleep
from dataclasses import dataclass


@dataclass
class SlurmConfiguration:
    """Slurm job options with common defaults for our project.

    This class includes the common configuration we use to run validation
    and benchmarking. Every field maps to one option of the `srun` command
    line assembled by `srun_bash`.
    """

    account: str = "j1013"  # srun -A
    constraint: str = "rome"  # srun -C
    qos: str = "4n_a100"  # srun --qos
    partition: str = "gpu_a100"  # srun --partition
    nodes: int = 1  # srun --nodes
    ntasks: int = 1  # srun --ntasks
    ntasks_per_node: int = 1  # srun --ntasks-per-node
    sockets_per_node: int = 1  # srun --sockets-per-node
    gpus_per_node: int = 0  # srun --gpus-per-node; 0 omits all GPU options
    mem_per_gpu: str = "40G"  # srun --mem-per-gpu; only used with GPUs
    time: str = "02:00:00"  # srun --time
    output: str = "log.%t.out"  # srun --output

    def srun_bash(self, wrapper: str, executable_name: str) -> str:
        """Build the `srun` command line for this configuration.

        Args:
            wrapper: command placed right before the executable on the
                command line.
            executable_name: executable to launch, placed last.

        Returns:
            A single-line `srun` command as a string.
        """
        # GPU options are only emitted when GPUs are actually requested.
        if self.gpus_per_node != 0:
            gpu_line = (
                f"--gpus-per-node={self.gpus_per_node} "
                f"--mem-per-gpu={self.mem_per_gpu}"
            )
        else:
            gpu_line = ""
        return (
            # Use the configured account/constraint rather than hard-coded
            # values so that overriding those fields takes effect.
            f"srun -A {self.account} -C {self.constraint} "
            f" --qos={self.qos} --partition={self.partition} "
            f" --nodes={self.nodes} --ntasks={self.ntasks} "
            f" --ntasks-per-node={self.ntasks_per_node} "
            f" --sockets-per-node={self.sockets_per_node} "
            f" {gpu_line} "
            f" --time={self.time} "
            f" --output={self.output} "
            f" {wrapper} {executable_name}"
        )

    @classmethod
    def one_half_nodes_GPU(cls) -> "SlurmConfiguration":
        """GPU configuration on Discover with A100 & Rome Epyc.

        Requests 2 nodes and 6 tasks (3 per node), with 3 GPUs per node
        and 40G of memory per GPU.
        NOTE(review): "one_half" presumably means one and a half nodes'
        worth of resources spread over 2 nodes - confirm.
        """
        return cls(
            nodes=2,
            ntasks=6,
            ntasks_per_node=3,
            sockets_per_node=2,
            gpus_per_node=3,
            mem_per_gpu="40G",
        )

    @classmethod
    def one_half_Nodes_CPU(cls) -> "SlurmConfiguration":
        """CPU-only configuration on Discover with Rome Epyc.

        Requests 2 nodes and 72 tasks with at most 48 tasks per node; no
        GPU is requested.
        NOTE(review): "one_half" presumably means one and a half nodes'
        worth of tasks (72 / 48) - confirm.
        """
        return cls(
            nodes=2,
            ntasks=72,
            ntasks_per_node=48,
            sockets_per_node=2,
        )


def wait_for_sbatch(job_id: str):
Expand Down
32 changes: 20 additions & 12 deletions geosongpu_ci/dispatch.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,25 @@
import click
from geosongpu_ci.pipeline.task import dispatch
from geosongpu_ci.actions.pipeline import PipelineAction


def main():
"""
Expected:
arg[1]: experiment name as listed in the experiments.yaml
arg[2]: experiment action from PipelineAction
arg[3]: artifact directory for LT storage
"""
import sys
@click.command()
@click.argument("name")
@click.argument("action")
@click.option("--artifact", default=".", help="Artifact directory for results storage")
@click.option(
"--setup_only",
is_flag=True,
help="Setup the experiment but skip any long running jobs (build, run...)",
)
def cli_dispatch(name: str, action: str, artifact: str, setup_only: bool):
"""Dispatch the _NAME_ experiment (as recorded in experiments.yaml)
with the _ACTION_ (from Validation, Benchmark or All).
experiment_name = sys.argv[1]
experiment_action = PipelineAction[sys.argv[2]]
artifact_directory = sys.argv[3]
dispatch(experiment_name, experiment_action, artifact_directory)
Environement variable:\n
CI_WORKSPACE: dispatch sets all work in this directory."""
dispatch(name, PipelineAction[action], artifact, setup_only)


if __name__ == "__main__":
cli_dispatch()
31 changes: 13 additions & 18 deletions geosongpu_ci/pipeline/aquaplanet.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from geosongpu_ci.pipeline.task import TaskBase
from geosongpu_ci.utils.environment import Environment
from geosongpu_ci.utils.registry import Registry
from geosongpu_ci.actions.pipeline import PipelineAction
from geosongpu_ci.actions.slurm import wait_for_sbatch
from geosongpu_ci.pipeline.geos import copy_input_from_project
from geosongpu_ci.utils.shell import shell_script
from geosongpu_ci.utils.shell import ShellScript
from typing import Dict, Any


Expand All @@ -21,13 +20,10 @@ class Aquaplanet(TaskBase):
def run_action(
self,
config: Dict[str, Any],
experiment_name: str,
action: PipelineAction,
env: Environment,
metadata: Dict[str, Any],
):
geos_install_path = env.get("GEOS_INSTALL")
geos = f"{geos_install_path}/.."
geos = env.get("GEOS_BASE_DIRECTORY")
layout = "1x1"

experiment_dir = copy_input_from_project(config, geos, layout)
Expand All @@ -42,25 +38,24 @@ def run_action(
new_text=f"setenv EXPDIR {experiment_dir}",
)

run_script_gpu_name = "run_script_gpu.sh"
sbatch_result = shell_script(
name=run_script_gpu_name.replace(".sh", ""),
env_to_source=[],
shell_commands=[
f"cd {experiment_dir}",
f"export CUPY_CACHE_DIR={experiment_dir}/.cupy",
"sbatch gcm_run.j",
],
sbatch_result = (
ShellScript("run_script_gpu")
.write(
shell_commands=[
f"cd {experiment_dir}",
f"export CUPY_CACHE_DIR={experiment_dir}/.cupy",
"sbatch gcm_run.j",
]
)
.execute()
)

job_id = sbatch_result.split(" ")[-1].strip().replace("\n", "")
wait_for_sbatch(job_id)

def check(
self,
config: Dict[str, Any],
experiment_name: str,
action: PipelineAction,
artifact_base_directory: str,
env: Environment,
) -> bool:
# TODO
Expand Down
20 changes: 2 additions & 18 deletions geosongpu_ci/pipeline/ci_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,17 @@
from geosongpu_ci.pipeline.task import TaskBase
from geosongpu_ci.utils.registry import Registry
from geosongpu_ci.utils.environment import Environment
from geosongpu_ci.actions.pipeline import PipelineAction
import shutil
from os.path import abspath
from os import mkdir
from geosongpu_ci.utils.shell import shell_script
from geosongpu_ci.utils.shell import ShellScript


@Registry.register
class CIClean(TaskBase):
def run_action(
self,
config: Dict[str, Any],
experiment_name: str,
action: PipelineAction,
env: Environment,
metadata: Dict[str, Any],
):
Expand All @@ -28,9 +25,6 @@ def run_action(
def check(
self,
config: Dict[str, Any],
experiment_name: str,
action: PipelineAction,
artifact_base_directory: str,
env: Environment,
) -> bool:
artifact_dir = abspath(f"{env.CI_WORKSPACE}/../")
Expand All @@ -44,25 +38,15 @@ class SlurmCancelJob(TaskBase):
def run_action(
self,
config: Dict[str, Any],
experiment_name: str,
action: PipelineAction,
env: Environment,
metadata: Dict[str, Any],
):
# Build GEOS
shell_script(
name="cancel_slurm_jobs",
modules=[],
env_to_source=[],
shell_commands=["scancel -u gmao_ci"],
)
ShellScript("cancel_slurm_jobs").write(["scancel -u gmao_ci"]).execute()

def check(
self,
config: Dict[str, Any],
experiment_name: str,
action: PipelineAction,
artifact_base_directory: str,
env: Environment,
) -> bool:
return True
Loading

0 comments on commit bf4a07f

Please sign in to comment.