Skip to content

Commit

Permalink
Merge branch 'feature/project'
Browse files Browse the repository at this point in the history
  • Loading branch information
FlorianDeconinck committed Feb 16, 2024
2 parents e703d6d + bcd318d commit 98ea2cf
Show file tree
Hide file tree
Showing 10 changed files with 433 additions and 1 deletion.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ __pycache__
*.egg-info
build/
tmp/
git-token

# Temporary directories for WIP
tmp_*/

# Docs are auto-build. See .github/workflows/gh_pages_doc.yml
docs
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "tcn"
version = "2023.0.0"
version = "2024.0.0"
authors = [
{ name = "NASA Advanced Software and Technology Group", email = "[email protected]" },
]
Expand Down Expand Up @@ -41,6 +41,7 @@ dependencies = [
"netcdf4==1.6.3",
"f90nml",
"GitPython",
"pandas",
]

[tool.setuptools]
Expand Down
181 changes: 181 additions & 0 deletions results/mpi_gpu_rdma/24W7.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
{
"latency": {
"message_size": [
1,
2,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
32768,
65536,
131072,
262144,
524288,
1048576,
2097152,
4194304
],
"sles12": [
19.32,
19.34,
19.32,
19.28,
19.25,
19.53,
19.66,
19.74,
19.89,
19.93,
14.79,
11.12,
11.13,
11.21,
11.34,
11.54,
12.04,
13,
14.27,
17.01,
22.44,
33.64,
59.6
],
"sles15": [
3.21,
3.26,
3.27,
3.28,
3.28,
3.61,
3.72,
3.78,
3.83,
3.99,
13.32,
13.41,
13.78,
13.89,
14.05,
14.12,
14.61,
15.56,
16.96,
19.52,
25.22,
36.2,
57.97
]
},
"bandwith_d_d": {
"message_size": [
1,
2,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
32768,
65536,
131072,
262144,
524288,
1048576,
2097152,
4194304
],
"discover": [
0.8,
1.61,
3.26,
6.61,
12.79,
25.27,
46.96,
93.44,
185.84,
344.61,
698.7,
1287.32,
2531.08,
5334.86,
6660.63,
7549.35,
7921.56,
8141.28,
8280.57,
8231.82,
8238.02,
8254.08,
8262.24
],
"discover_host": [
2.48,
4.93,
9.77,
19.63,
39.21,
78.52,
156.75,
303.82,
558.15,
1100.59,
2554.42,
4710.63,
6232.18,
10533.33,
15514.43,
20568.02,
22083.17,
22433.31,
22922.42,
23234.75,
23118,
23169.48,
27117.58
],
"perlmutter": [
0.76,
1.01,
2.01,
3.96,
8.08,
16.16,
32.13,
20.7,
469.29,
941.65,
1881.96,
3583.05,
6990.1,
14424.4,
15612.7,
16250.48,
19572.69,
21353.37,
22171.34,
22567.9,
22894.79,
22867.91,
22921.46
]
}
}
Binary file added results/mpi_gpu_rdma/bandwith.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added results/mpi_gpu_rdma/latency.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
74 changes: 74 additions & 0 deletions results/mpi_gpu_rdma/plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pathlib
import pandas as pd
import plotly.express as px
import json
from tcn.plots.colors import COLORS_RETRO_HIGH_CONTRAST # type:ignore

# Absolute path of the directory that contains this script.
THIS_DIR = pathlib.Path(__file__).parent.resolve()
# One level above the script directory; the __main__ entry point appends a
# result sub-directory name (e.g. "mpi_gpu_rdma") to this path.
RESULT_DIR = THIS_DIR / "../"


def _line_plot(data, section, series, title, value_label, legend_label):
    """Return a log-x line figure for ``data[section]``, one trace per name in *series*.

    Args:
        data: Parsed JSON document.
        section: Top-level key of *data* holding the lists to plot.
        series: Names of the lists under ``data[section]`` drawn as traces.
        title: Figure title.
        value_label: Axis label for the plotted values.
        legend_label: Legend title for the traces.
    """
    columns = ["message_size", *series]
    # One input row per JSON list, then transpose so each list becomes a column.
    df = pd.DataFrame([data[section][name] for name in columns]).T
    df.columns = columns  # type:ignore
    return px.line(
        df,
        x="message_size",
        y=list(series),
        log_x=True,
        color_discrete_sequence=list(COLORS_RETRO_HIGH_CONTRAST.values()),
        title=title,
        template="simple_white",
        labels={
            "message_size": "Message size (B)",
            "value": value_label,
            "variable": legend_label,
        },
    )


def summary(project_dir: str, data_name: str) -> None:
    """Render the latency and bandwidth plots for one results file.

    Reads ``{project_dir}/{data_name}`` as JSON and writes ``latency.png`` and
    ``bandwith.png`` into *project_dir*.

    Args:
        project_dir: Directory holding the JSON file; also receives the images.
        data_name: File name of the JSON results inside *project_dir*.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        KeyError: If an expected section or series is missing from the JSON.
    """
    with open(f"{project_dir}/{data_name}", encoding="utf-8") as f:
        data = json.load(f)

    latency_fig = _line_plot(
        data,
        section="latency",
        series=["sles12", "sles15"],
        title="Latency on Discover (lower is better)",
        # Was "Bandwidth (B/s)", copy-pasted from the bandwidth plot below.
        # NOTE(review): the JSON does not record the latency unit; add it to
        # this label once confirmed.
        value_label="Latency",
        legend_label="OS",
    )
    latency_fig.write_image(f"{project_dir}/latency.png")

    # The "bandwith" spelling is kept on purpose: it matches the JSON key and
    # the image file name already committed next to the data.
    bandwidth_fig = _line_plot(
        data,
        section="bandwith_d_d",
        series=["discover", "discover_host", "perlmutter"],
        title="Peak bandwith (higher is better)",
        value_label="Bandwidth (B/s)",
        legend_label="Machine",
    )
    bandwidth_fig.write_image(f"{project_dir}/bandwith.png")


if __name__ == "__main__":
    # Script entry point: regenerate the plots from 24W7.json in the
    # mpi_gpu_rdma result directory.
    target_dir = RESULT_DIR / "mpi_gpu_rdma"
    summary(project_dir=str(target_dir), data_name="24W7.json")
109 changes: 109 additions & 0 deletions results/project/24W7/SMT 2024-2026 - Backlog.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
ID Title URL Milestone (NASA) Assignees Status Priority Category xTeam Type Repository Linked pull requests Task Readiness
P0 Planning Mx Backlog Project Sink Ok - Planned
P1 User feedback survey Mx Backlog Project SI/GMAO Sink Ok - Planned
P2 Milestone feedback Mx Backlog Project SI/GMAO Sink Ok - Planned
MW0 Pace split https://github.com/GEOS-ESM/pace/issues/41 M1 FlorianDeconinck Done P0 Middleware NOAA Task GEOS-ESM/pace Ok - Planned
MW1 Versioned release M1 Backlog P1 Middleware NOAA Task Ok - Planned
MW2 MPP/FMS API Backlog Middleware NOAA Parent Ok - Unplanned
MW2.1 Non-square layout M1 Backlog P2 Middleware NOAA Subtask Ok - Planned
MW2.2 All grid capacities Backlog Middleware NOAA Subtask To Subtasks
MW3 CPU optimization M2 Backlog Middleware Parent To Subtasks
MW3.1 Partial expansion M2 Backlog Middleware LLNL Subtask Investigate
MW3.2 Zero-copy on CPU from Fortran align M2 Backlog Middleware Subtask Ok - Planned
MW4 Backend compilation refactor Backlog Middleware Parent To Subtasks
MW4.1 Extend cube-sphere naming to stencil backends Backlog Middleware Subtask Ok - Unplanned
MW4.2 Distributed compilation on many nodes Backlog Middleware Subtask Ok - Unplanned
MW4.3 Better key-ing for the cube-sphere cache Backlog Middleware Subtask Ok - Unplanned
MW4.4 Metadata extension Backlog Middleware Subtask Ok - Unplanned
MW5 GT4Py.cartesian: global fields Backlog Middleware Task Ok - Unplanned
MW6 GT4Py.cartesian: K indirect write Backlog Middleware Task Ok - Unplanned
MW7 GT4Py.cartesian: return/end of loop Backlog Middleware Task Ok - Unplanned
MW8 Fortran <> Python interface generator Backlog Middleware Task To Subtasks
MW8.1 Translate-test-like architecture for timestep level testing Backlog Middleware Subtask Ok - Unplanned
MW8.2 Scientific fortran-python difference testing Backlog Middleware Subtask Ok - Unplanned
MW9 DaCe opt: kernel merge investigation Backlog Middleware Parent To Subtasks
MW9.1 DaCe opt: corners (dace.map and/or stree merge) Backlog Middleware LLNL Subtask Ok - Unplanned
MW10 DaCe opt: auto-tuning Backlog Middleware Parent Ok - Unplanned
MW10.1 Brute force auto-tuning Backlog Middleware Subtask To Subtasks
MW10.2 ML auto-tuning (Daisy) Backlog Middleware Subtask Ok - Unplanned
MW11 User configuration failure feedback Backlog Middleware Task To Subtasks
MW12 JAX backend Backlog Middleware NOAA Parent To Subtasks
MW12.1 `jax` backend: a working jax under-the-hood Backlog Middleware NOAA Subtask Ok - Unplanned
MW12.2 `jax:ad` backend: a differentiating stencil backend Backlog Middleware NOAA Subtask Ok - Unplanned
MW13 Production check Backlog Middleware Parent To Subtasks
MW13.1 Low data on GPU (bad bandwidth) Backlog Middleware Subtask Ok - Unplanned
MW13.2 MPI not CUDA aware Backlog Middleware Subtask Ok - Unplanned
MW13.3 VRAM exceed or low Backlog Middleware Subtask Ok - Unplanned
MW13.4 No GPU or GPU inaccessible Backlog Middleware Subtask Ok - Unplanned
MW14 Initialization performance Backlog Middleware Task To Subtasks
MW15 Halo exchange full program integration Backlog Middleware Task Ok - Unplanned
MW16 GT4Py.cartesian: data dimension parallelism Backlog Middleware Task To Subtasks
MW17 AMD GPU performance Backlog Middleware Task To Subtasks
MW18 GT4Py.cartesian: DaCe v0.15.x M1 Backlog P1 Middleware GT4Py Task Ok - Planned
MW19 Instrumentation Backlog Middleware Parent To Subtasks
MW19.1 DaCe instrumentation: kernel timings Backlog Middleware Subtask Ok - Unplanned
MW20 GPU-CPU node optimal usage Backlog Middleware NOAA Parent To Subtasks
MW20.1 CPU I/O on GPU runs - explore Backlog Middleware Subtask Investigate
S0 Support to middleware users Mx Backlog Support Task To Subtasks
S1 CI/CD Mx Backlog Support Parent To Subtasks
S1.1 Workflow to be seen in github action directly Backlog Support Subtask Ok - Unplanned
S1.2 Make GEOS cachable to speed up CI Backlog Support Subtask Ok - Unplanned
G0 DyCore GEOS-FP on Discover M1 Backlog GEOS SI/GMAO Task To Subtasks
G1 Dynamical core Grid Component M1 Backlog GEOS GMAO Parent To Subtasks
G1.1 Discover config of GEOS-FP M1 Backlog GEOS Subtask Ok - Planned
G1.2 Mixed precision M1 Backlog GEOS Subtask Ok - Planned
G1.3 SubGridZ M1 Backlog GEOS Subtask Ok - Planned
G1.4 Tracer normalization M1 Backlog GEOS Subtask Ok - Planned
G1.5 Wind update M1 Backlog GEOS Subtask Ok - Planned
G2 Moist physics Grid Component M2 Backlog GEOS Parent To Subtasks
G2.1 Compare to OACC port M2 Backlog GEOS Subtask Ok - Planned
G2.2 UW convection port M2 Backlog GEOS Subtask Ok - Planned
G2.3 GF convection port M2 Backlog GEOS Subtask Ok - Planned
G2.4 Microphysics port M2 Backlog GEOS Subtask Ok - Planned
G2.5 Merge all RUN components M2 Backlog GEOS SI Subtask Ok - Planned
G3 Radiation physics Grid Component M2 Backlog GEOS Parent To Subtasks
G3.1 RRTMGP port M2 Backlog GEOS Subtask Investigate
G4 Land surface Grid Component M3 Backlog GEOS Task To Subtasks
G5 Benchmark of GPU-ready GEOS-FP Mx Backlog GEOS Sink Ok - Planned
G5.1 Define operational and HPC metrics M1 Backlog GEOS Subtask Ok - Planned
G5.2 Define an archiving mechanism and baselining M1 Backlog GEOS Subtask Ok - Planned
G5.3 Benchmark GPU-Dynamical Core GEOS-FP M1 Backlog GEOS Subtask Ok - Planned
G5.4 Benchmark GPU-Moist GEOS-FP M2 Backlog GEOS Subtask Ok - Planned
G5.5 Benchmark GPU-Radiation GEOS-FP M2 Backlog GEOS Subtask Ok - Planned
G5.6 Benchmark GPU-Land Surface GEOS-FP M3 Backlog GEOS Subtask Ok - Planned
G6 Validation of the GPU accelerated GEOS-FP Mx Backlog GEOS Task To Subtasks
G7 Multi-grid support for Component level CPU-GPU switch M1 Backlog GEOS Task Investigate
G8 Document and auto-generate VRAM guidelines Backlog GEOS Task To Subtasks
G9 Pipeline recurring GEOS simulation M1 Backlog GEOS Parent To Subtasks
G9.1 Fast scientific validation M1 Backlog GEOS Subtask Ok - Planned
G9.2 Full scientific validation M1 Backlog GEOS Subtask Ok - Planned
G9.3 Per port timestep-level validation Mx Backlog GEOS Parent To Subtasks
D1 DyCore allowed namelist options M1 Backlog P0 Documentation Task Ok - Planned
D2 Training: a step-by-step porting of a subroutine Backlog P2 Documentation Task Ok - Unplanned
D3 Training: how to start writing a model using the DSL Backlog P2 Documentation Task Ok - Unplanned
D4 Docs - GEOS GPU: quickstart Mx Backlog Documentation Sink Ok - Planned
D5 Docs - GEOS GPU: usage & limitation Backlog P1 Documentation Task Ok - Unplanned
D6 Project Mx Backlog Documentation Sink Ok - Planned
D7 Docs - State of software stack supporting GEOS & GPU Mx Backlog Documentation Sink Ok - Planned
D8 Docs - Developing in/around GEOS Mx Backlog Documentation Sink Ok - Planned
D9 Docs - Team goals and means M1 Backlog Documentation Task Ok - Planned
O0 Conference Mx Backlog Outreach Parent To Subtasks
O0.1 PASC 24 M2 Backlog Outreach Subtask Ok - Planned
O0.2 AMS 25 or AGU 24 M2 Backlog Outreach Subtask Ok - Planned
O0.3 SC24 M2 Backlog Outreach Subtask Ok - Planned
O0.4 Paper Mx Backlog Outreach Subtask Ok - Planned
MW21 Better Translate test Backlog Middleware NOAA Parent To Subtasks
MW21.1 Export precise metadata with NetCDF data and/or have an API to query Backlog Middleware Task Investigate
MW4.5 Move orchestration in backend name. Backlog Middleware Subtask Ok - Unplanned
MW4.6 Use DaCe hash system to remove need for FV3_DACEMODE Backlog Middleware Subtask Ok - Unplanned
MW22 DaCe opt: do away with dynamic memlets M1 Backlog Middleware Task Ok - Planned
G1.6 PyFV3: allow N tracers to be advected M1 Backlog GEOS Subtask Ok - Planned
G1.7 PyFV3 `fv_mapz` is substantially different for GEOS M1 Backlog GEOS Subtask Ok - Planned
G1.8 PyFV3: tracer advection is substantially different for GEOS M1 Backlog GEOS Subtask Investigate
G10 Update GEOS-Dycore Mx Backlog GEOS Sink Ok - Planned
MW23 Exposes "boilerplate" NDSL packages Backlog Middleware NOAA Parent To Subtasks
G11 Central `@GMAO_SHARED/geos_shared/*` refactor and porting strategy. M1 Backlog P0 GEOS Task Ok - Planned
MW24 Complete unit tests setup M1 Backlog Middleware NOAA Parent Ok - Planned
MW24.2 Use gt:gpu backend as a base for GPU unit testing M1 Backlog Middleware Subtask Ok - Planned
MW24.1 Find a strategy to run GPU unit tests M1 Backlog Middleware NOAA Subtask Ok - Planned
MW24.3 Orchestration unit tests M1 Backlog Middleware Subtask Ok - Planned
Loading

0 comments on commit 98ea2cf

Please sign in to comment.