From 6d78659bf016e9ef7c62b59937f12191e112049b Mon Sep 17 00:00:00 2001 From: Purnendu Chakraborty Date: Thu, 26 Jan 2023 20:47:18 -0500 Subject: [PATCH 01/57] Initialize GeosDycoreWrapper with bdt (timestep) --- .../fv3core/initialization/geos_wrapper.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 2bab75db..ea61ed7a 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -16,7 +16,7 @@ class GeosDycoreWrapper: Takes numpy arrays as inputs, returns a dictionary of numpy arrays as outputs """ - def __init__(self, namelist: f90nml.Namelist, comm: pace.util.Comm, backend: str): + def __init__(self, namelist: f90nml.Namelist, bdt: float, comm: pace.util.Comm, backend: str): # Make a custom performance collector for the GEOS wrapper self.perf_collector = PerformanceCollector("GEOS wrapper", comm) @@ -69,17 +69,18 @@ def __init__(self, namelist: f90nml.Namelist, comm: pace.util.Comm, backend: str quantity_factory=quantity_factory ) - self.dycore_state.bdt = float(namelist["dt_atmos"]) - if "fv_core_nml" in namelist.keys(): - self.dycore_state.bdt = ( - float(namelist["dt_atmos"]) / namelist["fv_core_nml"]["k_split"] - ) - elif "dycore_config" in namelist.keys(): - self.dycore_state.bdt = ( - float(namelist["dt_atmos"]) / namelist["dycore_config"]["k_split"] - ) - else: - raise KeyError("Cannot find k_split in namelist") + # self.dycore_state.bdt = float(namelist["dt_atmos"]) + # if "fv_core_nml" in namelist.keys(): + # self.dycore_state.bdt = ( + # float(namelist["dt_atmos"]) / namelist["fv_core_nml"]["k_split"] + # ) + # elif "dycore_config" in namelist.keys(): + # self.dycore_state.bdt = ( + # float(namelist["dt_atmos"]) / namelist["dycore_config"]["k_split"] + # ) + # else: + # raise KeyError("Cannot find k_split in namelist") + 
self.dycore_state.bdt = bdt damping_coefficients = pace.util.grid.DampingCoefficients.new_from_metric_terms( metric_terms @@ -92,7 +93,7 @@ def __init__(self, namelist: f90nml.Namelist, comm: pace.util.Comm, backend: str quantity_factory=quantity_factory, damping_coefficients=damping_coefficients, config=self.dycore_config, - timestep=timedelta(seconds=self.dycore_config.dt_atmos), + timestep=timedelta(seconds=self.dycore_state.bdt), phis=self.dycore_state.phis, state=self.dycore_state, ) @@ -128,7 +129,7 @@ def __call__( diss_estd: np.ndarray, ) -> Dict[str, np.ndarray]: - with self.perf_collector.timestep_timer.clock("move_to_pace"): + with self.perf_collector.timestep_timer.clock("numpy-to-dycore"): self.dycore_state = self._put_fortran_data_in_dycore( u, v, @@ -156,21 +157,20 @@ def __call__( diss_estd, ) - with self.perf_collector.timestep_timer.clock("dycore"): + with self.perf_collector.timestep_timer.clock("DynamicalCore"): self.dynamical_core.step_dynamics( state=self.dycore_state, timer=self.perf_collector.timestep_timer ) - with self.perf_collector.timestep_timer.clock("move_to_fortran"): + with self.perf_collector.timestep_timer.clock("dycore-to-numpy"): self.output_dict = self._prep_outputs_for_geos() - # Collect performance of the timestep and write - # a json file for rank 0 + # Collect performance of the timestep and write a json file for rank 0 self.perf_collector.collect_performance() self.perf_collector.write_out_rank_0( backend=self.backend, - is_orchestrated=False, # could be infered from config - dt_atmos=self.dycore_config.dt_atmos, + is_orchestrated=False, # could be inferred from config + dt_atmos=self.dycore_state.bdt, sim_status="Ongoing", ) From 0a3e8572691887cc63f74eff01de28b84b1c717a Mon Sep 17 00:00:00 2001 From: Purnendu Chakraborty Date: Thu, 26 Jan 2023 20:48:55 -0500 Subject: [PATCH 02/57] Use GEOS version of constants --- util/pace/util/constants.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff 
--git a/util/pace/util/constants.py b/util/pace/util/constants.py index a7fe5a53..8eb5143b 100644 --- a/util/pace/util/constants.py +++ b/util/pace/util/constants.py @@ -35,8 +35,21 @@ # The FV3GFS model ships with two sets of constants, one used in the GFS physics # package and the other used for the Dycore. Their difference are small but significant # Our Fortran executable on GCE has GFS_PHYS=True -GFS_PHYS = True -if GFS_PHYS: +CONST_VERSION = 'GEOS' +if CONST_VERSION == 'GEOS': + RADIUS = 6.371e6 + PI = 3.14159265358979323846 + OMEGA = 2.0*PI/86164.0 + GRAV = 9.80665 + RGRAV = 1.0 / GRAV + RDGAS = 8314.47/28.965 + RVGAS = 8314.47/18.015 + HLV = 2.4665E6 + HLF = 3.3370E5 + KAPPA = RDGAS/(3.5*RDGAS) + CP_AIR = RDGAS/KAPPA + TFREEZE = 273.15 +elif CONST_VERSION == 'GFS': RADIUS = 6.3712e6 # Radius of the Earth [m] PI = 3.1415926535897931 OMEGA = 7.2921e-5 # Rotation of the earth From 0a8d7052da93567c7e10b357b5e64295db719da5 Mon Sep 17 00:00:00 2001 From: Purnendu Chakraborty Date: Fri, 27 Jan 2023 13:02:13 -0500 Subject: [PATCH 03/57] 1. Add qcld to the list of tracers beings advected 2. Made GEOS specific changes to thresholds in saturation adjustment --- fv3core/pace/fv3core/stencils/fv_dynamics.py | 3 ++- fv3core/pace/fv3core/stencils/saturation_adjustment.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fv3core/pace/fv3core/stencils/fv_dynamics.py b/fv3core/pace/fv3core/stencils/fv_dynamics.py index 8b4efad1..de146fa0 100644 --- a/fv3core/pace/fv3core/stencils/fv_dynamics.py +++ b/fv3core/pace/fv3core/stencils/fv_dynamics.py @@ -33,7 +33,7 @@ # ncnst = Atm(mytile)%ncnst # pnats = Atm(mytile)%flagstruct%pnats # here we hard-coded it because 8 is the only supported value, refactor this later! 
-NQ = 8 # state.nq_tot - spec.namelist.dnats +NQ = 9 # state.nq_tot - spec.namelist.dnats def pt_adjust( @@ -555,6 +555,7 @@ def _compute(self, state: DycoreState, timer: pace.util.Timer): state.w, self._cappa, state.q_con, + # Since NQ=9, we shouldn't need to pass qcld explicitly state.qcld, state.pkz, state.pk, diff --git a/fv3core/pace/fv3core/stencils/saturation_adjustment.py b/fv3core/pace/fv3core/stencils/saturation_adjustment.py index f8eeaa96..7ffe45b5 100644 --- a/fv3core/pace/fv3core/stencils/saturation_adjustment.py +++ b/fv3core/pace/fv3core/stencils/saturation_adjustment.py @@ -901,14 +901,14 @@ def satadjust( # icloud_f = 0: bug - fixed # icloud_f = 1: old fvgfs gfdl) mp implementation # icloud_f = 2: binary cloud scheme (0 / 1) - if rh > 0.75 and qpz > 1.0e-8: + if rh > 0.75 and qpz > 1.0e-6: dq = hvar * qpz q_plus = qpz + dq q_minus = qpz - dq if icloud_f == 2: # TODO untested if qpz > qstar: qa = 1.0 - elif (qstar < q_plus) and (q_cond > 1.0e-8): + elif (qstar < q_plus) and (q_cond > 1.0e-6): qa = min(1.0, ((q_plus - qstar) / dq) ** 2) else: qa = 0.0 @@ -924,7 +924,7 @@ def satadjust( else: qa = 0.0 # impose minimum cloudiness if substantial q_cond exist - if q_cond > 1.0e-8: + if q_cond > 1.0e-6: qa = max(cld_min, qa) qa = min(1, qa) else: From 3b73d71e75562572b162a0caa2cfe1688f24965e Mon Sep 17 00:00:00 2001 From: Purnendu Chakraborty Date: Fri, 27 Jan 2023 17:37:48 -0500 Subject: [PATCH 04/57] Accumulate diss_est --- fv3core/pace/fv3core/stencils/d_sw.py | 37 +++++++++++++++++++-------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/fv3core/pace/fv3core/stencils/d_sw.py b/fv3core/pace/fv3core/stencils/d_sw.py index 21f25151..8be6dcd5 100644 --- a/fv3core/pace/fv3core/stencils/d_sw.py +++ b/fv3core/pace/fv3core/stencils/d_sw.py @@ -94,8 +94,6 @@ def heat_diss( ke_bg (in): """ with computation(PARALLEL), interval(...): - heat_source = 0.0 - diss_est = 0.0 if damp_w > 1e-5: dd8 = ke_bg * abs(dt) dw = (fx2 - fx2[1, 0, 0] + fy2 - 
fy2[0, 1, 0]) * rarea @@ -503,7 +501,6 @@ def heat_source_from_vorticity_damping( rdx: FloatFieldIJ, rdy: FloatFieldIJ, heat_source: FloatField, - heat_source_total: FloatField, dissipation_estimate: FloatField, kinetic_energy_fraction_to_damp: FloatFieldK, ): @@ -526,8 +523,7 @@ def heat_source_from_vorticity_damping( rdy (in): 1 / dy heat_source (inout): heat source from vorticity damping implied by energy conservation - heat_source_total (inout): accumulated heat source - dissipation_estimate (out): dissipation estimate, only calculated if + dissipation_estimate (inout): dissipation estimate, only calculated if calculate_dissipation_estimate is 1. Used for stochastic kinetic energy backscatter (skeb) routine. kinetic_energy_fraction_to_damp (in): the fraction of kinetic energy @@ -572,11 +568,21 @@ def heat_source_from_vorticity_damping( if __INLINED((d_con > dcon_threshold) or do_stochastic_ke_backscatter): with horizontal(region[local_is : local_ie + 1, local_js : local_je + 1]): - heat_source_total = heat_source_total + heat_source if __INLINED(do_stochastic_ke_backscatter): dissipation_estimate -= dampterm +def accumulate_heat_source_and_dissipation_estimate( + heat_source: FloatField, + heat_source_total: FloatField, + diss_est: FloatField, + diss_est_total: FloatField, +): + with computation(PARALLEL), interval(...): + heat_source_total += heat_source + diss_est_total += diss_est + + # TODO(eddied): Had to split this into a separate stencil to get this to validate # with GTC, suspect a merging issue... 
def update_u_and_v( @@ -763,6 +769,7 @@ def make_quantity(): return quantity_factory.zeros([X_DIM, Y_DIM, Z_DIM], units="unknown") self._tmp_heat_s = make_quantity() + self._tmp_diss_e = make_quantity() self._vort_x_delta = make_quantity() self._vort_y_delta = make_quantity() self._dt_kinetic_energy_on_cell_corners = make_quantity() @@ -912,6 +919,12 @@ def make_quantity(): }, ) ) + self._accumulate_heat_source_and_dissipation_estimate_stencil = ( + stencil_factory.from_dims_halo( + func=accumulate_heat_source_and_dissipation_estimate, + compute_dims=[X_DIM, Y_DIM, Z_DIM], + ) + ) self._compute_vorticity_stencil = stencil_factory.from_dims_halo( compute_vorticity, compute_dims=[X_DIM, Y_DIM, Z_DIM], @@ -1046,7 +1059,7 @@ def __call__( w, self.grid_data.rarea, self._tmp_heat_s, - diss_est, + self._tmp_diss_e, self._tmp_dw, self._column_namelist["damp_w"], self._column_namelist["ke_bg"], @@ -1225,11 +1238,15 @@ def __call__( self.grid_data.rdx, self.grid_data.rdy, self._tmp_heat_s, - heat_source, - diss_est, + self._tmp_diss_e, self._column_namelist["d_con"], ) - + self._accumulate_heat_source_and_dissipation_estimate_stencil( + self._tmp_heat_s, + heat_source, + self._tmp_diss_e, + diss_est + ) self._update_u_and_v_stencil( self._tmp_ut, self._tmp_vt, From a68d1602cc20e7a8841d602bc75e920fe2332a1d Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 24 Feb 2023 17:11:04 +0000 Subject: [PATCH 05/57] Allow GEOS_WRAPPER to process device data --- .../fv3core/initialization/geos_wrapper.py | 388 +++++++++++------- 1 file changed, 238 insertions(+), 150 deletions(-) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 00b6a1b7..5c01b256 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -1,3 +1,4 @@ +import enum import os from datetime import timedelta from typing import Dict @@ -9,6 +10,17 @@ from pace import fv3core 
from pace.driver.performance.collector import PerformanceCollector from pace.dsl.dace import DaceConfig, orchestrate +from pace.dsl.gt4py_utils import is_gpu_backend + + +@enum.unique +class MemorySpace(enum.Enum): + HOST = 0 + DEVICE = 1 + + +def assign_no_copy(A, B): + A = B class GeosDycoreWrapper: @@ -23,6 +35,7 @@ def __init__( bdt: int, comm: pace.util.Comm, backend: str, + fortran_mem_space: MemorySpace = MemorySpace.HOST, ): # Look for an override to run on a single node gtfv3_single_rank_override = int(os.getenv("GTFV3_SINGLE_RANK_OVERRIDE", -1)) @@ -112,6 +125,11 @@ def __init__( state=self.dycore_state, ) + self._fortran_mem_space = fortran_mem_space + self._pace_mem_space = ( + MemorySpace.DEVICE if is_gpu_backend(backend) else MemorySpace.HOST + ) + self.output_dict: Dict[str, np.ndarray] = {} self._allocate_output_dir() @@ -300,162 +318,232 @@ def _prep_outputs_for_geos(self) -> Dict[str, np.ndarray]: iec = self._grid_indexing.iec + 1 jec = self._grid_indexing.jec + 1 - pace.util.utils.safe_assign_array( - output_dict["u"], self.dycore_state.u.data[:-1, :, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["v"], self.dycore_state.v.data[:, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["w"], self.dycore_state.w.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["ua"], self.dycore_state.ua.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["va"], self.dycore_state.va.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["uc"], self.dycore_state.uc.data[:, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["vc"], self.dycore_state.vc.data[:-1, :, :-1] - ) + if self._fortran_mem_space != self._pace_mem_space: + pace.util.utils.safe_assign_array( + output_dict["u"], self.dycore_state.u.data[:-1, :, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["v"], self.dycore_state.v.data[:, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + 
output_dict["w"], self.dycore_state.w.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["ua"], self.dycore_state.ua.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["va"], self.dycore_state.va.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["uc"], self.dycore_state.uc.data[:, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["vc"], self.dycore_state.vc.data[:-1, :, :-1] + ) - pace.util.utils.safe_assign_array( - output_dict["delz"], self.dycore_state.delz.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["pt"], self.dycore_state.pt.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["delp"], self.dycore_state.delp.data[:-1, :-1, :-1] - ) + pace.util.utils.safe_assign_array( + output_dict["delz"], self.dycore_state.delz.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["pt"], self.dycore_state.pt.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["delp"], self.dycore_state.delp.data[:-1, :-1, :-1] + ) - pace.util.utils.safe_assign_array( - output_dict["mfxd"], - self.dycore_state.mfxd.data[isc : iec + 1, jsc:jec, :-1], - ) - pace.util.utils.safe_assign_array( - output_dict["mfyd"], - self.dycore_state.mfyd.data[isc:iec, jsc : jec + 1, :-1], - ) - pace.util.utils.safe_assign_array( - output_dict["cxd"], self.dycore_state.cxd.data[isc : iec + 1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["cyd"], self.dycore_state.cyd.data[:-1, jsc : jec + 1, :-1] - ) + pace.util.utils.safe_assign_array( + output_dict["mfxd"], + self.dycore_state.mfxd.data[isc : iec + 1, jsc:jec, :-1], + ) + pace.util.utils.safe_assign_array( + output_dict["mfyd"], + self.dycore_state.mfyd.data[isc:iec, jsc : jec + 1, :-1], + ) + pace.util.utils.safe_assign_array( + output_dict["cxd"], self.dycore_state.cxd.data[isc : iec + 1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + 
output_dict["cyd"], self.dycore_state.cyd.data[:-1, jsc : jec + 1, :-1] + ) - pace.util.utils.safe_assign_array( - output_dict["ps"], self.dycore_state.ps.data[:-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["pe"], - self.dycore_state.pe.data[isc - 1 : iec + 1, jsc - 1 : jec + 1, :], - ) - pace.util.utils.safe_assign_array( - output_dict["pk"], self.dycore_state.pk.data[isc:iec, jsc:jec, :] - ) - pace.util.utils.safe_assign_array( - output_dict["peln"], self.dycore_state.peln.data[isc:iec, jsc:jec, :] - ) - pace.util.utils.safe_assign_array( - output_dict["pkz"], self.dycore_state.pkz.data[isc:iec, jsc:jec, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["phis"], self.dycore_state.phis.data[:-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["q_con"], self.dycore_state.q_con.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["omga"], self.dycore_state.omga.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["diss_estd"], self.dycore_state.diss_estd.data[:-1, :-1, :-1] - ) + pace.util.utils.safe_assign_array( + output_dict["ps"], self.dycore_state.ps.data[:-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["pe"], + self.dycore_state.pe.data[isc - 1 : iec + 1, jsc - 1 : jec + 1, :], + ) + pace.util.utils.safe_assign_array( + output_dict["pk"], self.dycore_state.pk.data[isc:iec, jsc:jec, :] + ) + pace.util.utils.safe_assign_array( + output_dict["peln"], self.dycore_state.peln.data[isc:iec, jsc:jec, :] + ) + pace.util.utils.safe_assign_array( + output_dict["pkz"], self.dycore_state.pkz.data[isc:iec, jsc:jec, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["phis"], self.dycore_state.phis.data[:-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["q_con"], self.dycore_state.q_con.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["omga"], self.dycore_state.omga.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + 
output_dict["diss_estd"], + self.dycore_state.diss_estd.data[:-1, :-1, :-1], + ) - pace.util.utils.safe_assign_array( - output_dict["qvapor"], self.dycore_state.qvapor.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["qliquid"], self.dycore_state.qliquid.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["qice"], self.dycore_state.qice.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["qrain"], self.dycore_state.qrain.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["qsnow"], self.dycore_state.qsnow.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["qgraupel"], self.dycore_state.qgraupel.data[:-1, :-1, :-1] - ) - pace.util.utils.safe_assign_array( - output_dict["qcld"], self.dycore_state.qcld.data[:-1, :-1, :-1] - ) + pace.util.utils.safe_assign_array( + output_dict["qvapor"], self.dycore_state.qvapor.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["qliquid"], self.dycore_state.qliquid.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["qice"], self.dycore_state.qice.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["qrain"], self.dycore_state.qrain.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["qsnow"], self.dycore_state.qsnow.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["qgraupel"], self.dycore_state.qgraupel.data[:-1, :-1, :-1] + ) + pace.util.utils.safe_assign_array( + output_dict["qcld"], self.dycore_state.qcld.data[:-1, :-1, :-1] + ) + else: + output_dict["u"] = self.dycore_state.u.data[:-1, :, :-1] + output_dict["v"] = self.dycore_state.v.data[:, :-1, :-1] + output_dict["w"] = self.dycore_state.w.data[:-1, :-1, :-1] + output_dict["ua"] = self.dycore_state.ua.data[:-1, :-1, :-1] + output_dict["va"] = self.dycore_state.va.data[:-1, :-1, :-1] + output_dict["uc"] = self.dycore_state.uc.data[:, :-1, :-1] + 
output_dict["vc"] = self.dycore_state.vc.data[:-1, :, :-1] + output_dict["delz"] = self.dycore_state.delz.data[:-1, :-1, :-1] + output_dict["pt"] = self.dycore_state.pt.data[:-1, :-1, :-1] + output_dict["delp"] = self.dycore_state.delp.data[:-1, :-1, :-1] + output_dict["mfxd"] = self.dycore_state.mfxd.data[ + isc : iec + 1, jsc:jec, :-1 + ] + output_dict["mfyd"] = self.dycore_state.mfyd.data[ + isc:iec, jsc : jec + 1, :-1 + ] + output_dict["cxd"] = self.dycore_state.cxd.data[isc : iec + 1, :-1, :-1] + output_dict["cyd"] = self.dycore_state.cyd.data[:-1, jsc : jec + 1, :-1] + output_dict["ps"] = self.dycore_state.ps.data[:-1, :-1] + output_dict["pe"] = self.dycore_state.pe.data[ + isc - 1 : iec + 1, jsc - 1 : jec + 1, : + ] + output_dict["pk"] = self.dycore_state.pk.data[isc:iec, jsc:jec, :] + output_dict["peln"] = self.dycore_state.peln.data[isc:iec, jsc:jec, :] + output_dict["pkz"] = self.dycore_state.pkz.data[isc:iec, jsc:jec, :-1] + output_dict["phis"] = self.dycore_state.phis.data[:-1, :-1] + output_dict["q_con"] = self.dycore_state.q_con.data[:-1, :-1, :-1] + output_dict["omga"] = self.dycore_state.omga.data[:-1, :-1, :-1] + output_dict["diss_estd"] = self.dycore_state.diss_estd.data[:-1, :-1, :-1] + output_dict["qvapor"] = self.dycore_state.qvapor.data[:-1, :-1, :-1] + output_dict["qliquid"] = self.dycore_state.qliquid.data[:-1, :-1, :-1] + output_dict["qice"] = self.dycore_state.qice.data[:-1, :-1, :-1] + output_dict["qrain"] = self.dycore_state.qrain.data[:-1, :-1, :-1] + output_dict["qsnow"] = self.dycore_state.qsnow.data[:-1, :-1, :-1] + output_dict["qgraupel"] = self.dycore_state.qgraupel.data[:-1, :-1, :-1] + output_dict["qcld"] = self.dycore_state.qcld.data[:-1, :-1, :-1] return output_dict def _allocate_output_dir(self): + if self._fortran_mem_space != self._pace_mem_space: + nhalo = self._grid_indexing.n_halo + shape_centered = self._grid_indexing.domain_full(add=(0, 0, 0)) + shape_x_interface = self._grid_indexing.domain_full(add=(1, 0, 0)) + 
shape_y_interface = self._grid_indexing.domain_full(add=(0, 1, 0)) + shape_z_interface = self._grid_indexing.domain_full(add=(0, 0, 1)) + shape_2d = shape_centered[:-1] + + self.output_dict["u"] = np.empty((shape_y_interface)) + self.output_dict["v"] = np.empty((shape_x_interface)) + self.output_dict["w"] = np.empty((shape_centered)) + self.output_dict["ua"] = np.empty((shape_centered)) + self.output_dict["va"] = np.empty((shape_centered)) + self.output_dict["uc"] = np.empty((shape_x_interface)) + self.output_dict["vc"] = np.empty((shape_y_interface)) + + self.output_dict["delz"] = np.empty((shape_centered)) + self.output_dict["pt"] = np.empty((shape_centered)) + self.output_dict["delp"] = np.empty((shape_centered)) + + self.output_dict["mfxd"] = np.empty( + (self._grid_indexing.domain_full(add=(1 - 2 * nhalo, -2 * nhalo, 0))) + ) + self.output_dict["mfyd"] = np.empty( + (self._grid_indexing.domain_full(add=(-2 * nhalo, 1 - 2 * nhalo, 0))) + ) + self.output_dict["cxd"] = np.empty( + (self._grid_indexing.domain_full(add=(1 - 2 * nhalo, 0, 0))) + ) + self.output_dict["cyd"] = np.empty( + (self._grid_indexing.domain_full(add=(0, 1 - 2 * nhalo, 0))) + ) - nhalo = self._grid_indexing.n_halo - shape_centered = self._grid_indexing.domain_full(add=(0, 0, 0)) - shape_x_interface = self._grid_indexing.domain_full(add=(1, 0, 0)) - shape_y_interface = self._grid_indexing.domain_full(add=(0, 1, 0)) - shape_z_interface = self._grid_indexing.domain_full(add=(0, 0, 1)) - shape_2d = shape_centered[:-1] - - self.output_dict["u"] = np.empty((shape_y_interface)) - self.output_dict["v"] = np.empty((shape_x_interface)) - self.output_dict["w"] = np.empty((shape_centered)) - self.output_dict["ua"] = np.empty((shape_centered)) - self.output_dict["va"] = np.empty((shape_centered)) - self.output_dict["uc"] = np.empty((shape_x_interface)) - self.output_dict["vc"] = np.empty((shape_y_interface)) - - self.output_dict["delz"] = np.empty((shape_centered)) - self.output_dict["pt"] = 
np.empty((shape_centered)) - self.output_dict["delp"] = np.empty((shape_centered)) - - self.output_dict["mfxd"] = np.empty( - (self._grid_indexing.domain_full(add=(1 - 2 * nhalo, -2 * nhalo, 0))) - ) - self.output_dict["mfyd"] = np.empty( - (self._grid_indexing.domain_full(add=(-2 * nhalo, 1 - 2 * nhalo, 0))) - ) - self.output_dict["cxd"] = np.empty( - (self._grid_indexing.domain_full(add=(1 - 2 * nhalo, 0, 0))) - ) - self.output_dict["cyd"] = np.empty( - (self._grid_indexing.domain_full(add=(0, 1 - 2 * nhalo, 0))) - ) - - self.output_dict["ps"] = np.empty((shape_2d)) - self.output_dict["pe"] = np.empty( - (self._grid_indexing.domain_full(add=(2 - 2 * nhalo, 2 - 2 * nhalo, 1))) - ) - self.output_dict["pk"] = np.empty( - (self._grid_indexing.domain_full(add=(-2 * nhalo, -2 * nhalo, 1))) - ) - self.output_dict["peln"] = np.empty( - (self._grid_indexing.domain_full(add=(-2 * nhalo, -2 * nhalo, 1))) - ) - self.output_dict["pkz"] = np.empty( - (self._grid_indexing.domain_full(add=(-2 * nhalo, -2 * nhalo, 0))) - ) - self.output_dict["phis"] = np.empty((shape_2d)) - self.output_dict["q_con"] = np.empty((shape_centered)) - self.output_dict["omga"] = np.empty((shape_centered)) - self.output_dict["diss_estd"] = np.empty((shape_centered)) - - self.output_dict["qvapor"] = np.empty((shape_centered)) - self.output_dict["qliquid"] = np.empty((shape_centered)) - self.output_dict["qice"] = np.empty((shape_centered)) - self.output_dict["qrain"] = np.empty((shape_centered)) - self.output_dict["qsnow"] = np.empty((shape_centered)) - self.output_dict["qgraupel"] = np.empty((shape_centered)) - self.output_dict["qcld"] = np.empty((shape_centered)) + self.output_dict["ps"] = np.empty((shape_2d)) + self.output_dict["pe"] = np.empty( + (self._grid_indexing.domain_full(add=(2 - 2 * nhalo, 2 - 2 * nhalo, 1))) + ) + self.output_dict["pk"] = np.empty( + (self._grid_indexing.domain_full(add=(-2 * nhalo, -2 * nhalo, 1))) + ) + self.output_dict["peln"] = np.empty( + 
(self._grid_indexing.domain_full(add=(-2 * nhalo, -2 * nhalo, 1))) + ) + self.output_dict["pkz"] = np.empty( + (self._grid_indexing.domain_full(add=(-2 * nhalo, -2 * nhalo, 0))) + ) + self.output_dict["phis"] = np.empty((shape_2d)) + self.output_dict["q_con"] = np.empty((shape_centered)) + self.output_dict["omga"] = np.empty((shape_centered)) + self.output_dict["diss_estd"] = np.empty((shape_centered)) + + self.output_dict["qvapor"] = np.empty((shape_centered)) + self.output_dict["qliquid"] = np.empty((shape_centered)) + self.output_dict["qice"] = np.empty((shape_centered)) + self.output_dict["qrain"] = np.empty((shape_centered)) + self.output_dict["qsnow"] = np.empty((shape_centered)) + self.output_dict["qgraupel"] = np.empty((shape_centered)) + self.output_dict["qcld"] = np.empty((shape_centered)) + else: + self.output_dict["u"] = None + self.output_dict["v"] = None + self.output_dict["w"] = None + self.output_dict["ua"] = None + self.output_dict["va"] = None + self.output_dict["uc"] = None + self.output_dict["vc"] = None + self.output_dict["delz"] = None + self.output_dict["pt"] = None + self.output_dict["delp"] = None + self.output_dict["mfxd"] = None + self.output_dict["mfyd"] = None + self.output_dict["cxd"] = None + self.output_dict["cyd"] = None + self.output_dict["ps"] = None + self.output_dict["pe"] = None + self.output_dict["pk"] = None + self.output_dict["peln"] = None + self.output_dict["pkz"] = None + self.output_dict["phis"] = None + self.output_dict["q_con"] = None + self.output_dict["omga"] = None + self.output_dict["diss_estd"] = None + self.output_dict["qvapor"] = None + self.output_dict["qliquid"] = None + self.output_dict["qice"] = None + self.output_dict["qrain"] = None + self.output_dict["qsnow"] = None + self.output_dict["qgraupel"] = None + self.output_dict["qcld"] = None From 33ba53f5378eaf0d1af627e5fa3e98ba82b4863e Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 28 Feb 2023 17:23:05 +0000 Subject: [PATCH 06/57] Add clear to 
collector for 3rd party use. GEOS pass down timings to caller --- driver/pace/driver/performance/collector.py | 4 ++++ .../fv3core/initialization/geos_wrapper.py | 23 ++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/driver/pace/driver/performance/collector.py b/driver/pace/driver/performance/collector.py index 272e72be..cbc6c62a 100644 --- a/driver/pace/driver/performance/collector.py +++ b/driver/pace/driver/performance/collector.py @@ -66,6 +66,10 @@ def __init__(self, experiment_name: str, comm: pace.util.Comm): self.experiment_name = experiment_name self.comm = comm + def clear(self): + self.times_per_step = [] + self.hits_per_step = [] + def collect_performance(self): """ Take the accumulated timings and flush them into a new entry diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 5c01b256..64fbb6ff 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -1,7 +1,7 @@ import enum import os from datetime import timedelta -from typing import Dict +from typing import Dict, List, Tuple import f90nml import numpy as np @@ -135,7 +135,7 @@ def __init__( def _critical_path(self): """Top-level orchestration function""" - with self.perf_collector.timestep_timer.clock("dycore"): + with self.perf_collector.timestep_timer.clock("step_dynamics"): self.dynamical_core.step_dynamics( state=self.dycore_state, timer=self.perf_collector.timestep_timer, @@ -143,6 +143,7 @@ def _critical_path(self): def __call__( self, + timings: Dict[str, List[float]], u: np.ndarray, v: np.ndarray, w: np.ndarray, @@ -167,7 +168,7 @@ def __call__( cxd: np.ndarray, cyd: np.ndarray, diss_estd: np.ndarray, - ) -> Dict[str, np.ndarray]: + ) -> Tuple[Dict[str, np.ndarray], Dict[str, List[float]]]: with self.perf_collector.timestep_timer.clock("move_to_pace"): self.dycore_state = self._put_fortran_data_in_dycore( @@ -206,14 
+207,14 @@ def __call__( # Collect performance of the timestep and write # a json file for rank 0 self.perf_collector.collect_performance() - self.perf_collector.write_out_rank_0( - backend=self.backend, - is_orchestrated=self._is_orchestrated, - dt_atmos=self.dycore_config.dt_atmos, - sim_status="Ongoing", - ) - - return self.output_dict + for k, v in self.perf_collector.times_per_step[0].items(): + if k not in timings.keys(): + timings[k] = [v] + else: + timings[k].append(v) + self.perf_collector.clear() + + return self.output_dict, timings def _put_fortran_data_in_dycore( self, From 2327cbe6fa39b4b21aa85169ab056a232e013535 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 3 Mar 2023 20:36:19 +0000 Subject: [PATCH 07/57] Make kernel analysis run a copy stencil to compute local bandwith Parametrize tool with backend, output format --- driver/pace/driver/tools.py | 38 +++++++++- dsl/pace/dsl/dace/utils.py | 144 +++++++++++++++++++++++++++--------- 2 files changed, 146 insertions(+), 36 deletions(-) diff --git a/driver/pace/driver/tools.py b/driver/pace/driver/tools.py index c1dc5181..c58eb6fa 100644 --- a/driver/pace/driver/tools.py +++ b/driver/pace/driver/tools.py @@ -27,14 +27,48 @@ type=click.STRING, ) @click.option("--report_detail", is_flag=True, type=click.BOOL, default=False) -def command_line(action: str, sdfg_path: Optional[str], report_detail: Optional[bool]): +@click.option( + "--hardware_bw_in_gb_s", + required=False, + type=click.FLOAT, + default=0.0, +) +@click.option( + "--output_format", + required=False, + type=click.STRING, + default=None, +) +@click.option( + "--backend", + required=False, + type=click.STRING, + default="dace:gpu", +) +def command_line( + action: str, + sdfg_path: Optional[str], + report_detail: Optional[bool], + hardware_bw_in_gb_s: Optional[float], + output_format: Optional[str], + backend: Optional[str], +): """ Run tooling. 
""" if action == ACTION_SDFG_MEMORY_STATIC_ANALYSIS: print(memory_static_analysis_from_path(sdfg_path, detail_report=report_detail)) elif action == ACTION_SDFG_KERNEL_THEORETICAL_TIMING: - print(kernel_theoretical_timing_from_path(sdfg_path)) + print( + kernel_theoretical_timing_from_path( + sdfg_path, + hardware_bw_in_GB_s=( + None if hardware_bw_in_gb_s == 0 else hardware_bw_in_gb_s + ), + backend=backend, + output_format=output_format, + ) + ) if __name__ == "__main__": diff --git a/dsl/pace/dsl/dace/utils.py b/dsl/pace/dsl/dace/utils.py index cb268d3b..68b61331 100644 --- a/dsl/pace/dsl/dace/utils.py +++ b/dsl/pace/dsl/dace/utils.py @@ -1,15 +1,20 @@ import time from dataclasses import dataclass, field -from typing import Dict, List +from typing import Dict, List, Optional import dace +import numpy as np from dace.transformation.helpers import get_parent_map from pace.dsl.dace.dace_config import DaceConfig +from pace.dsl.typing import Float +from pace.util._optional_imports import cupy as cp from pace.util.logging import pace_log +# ---------------------------------------------------------- # Rough timer & log for major operations of DaCe build stack +# ---------------------------------------------------------- class DaCeProgress: """Timer and log to track build progress""" @@ -48,6 +53,9 @@ def _is_ref(sd: dace.sdfg.SDFG, aname: str): return found +# ---------------------------------------------------------- +# Memory analyser from SDFG +# ---------------------------------------------------------- @dataclass class ArrayReport: name: str = "" @@ -175,19 +183,45 @@ def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str ) -# TODO (floriand): in order for the timing analysis to be realistic the reference -# bandwidth of the hardware should be measured with GT4Py & simple in/out copy -# stencils. This allows to both measure the _actual_ deployed hardware and -# size it against the current GT4Py version. 
-# Below we bypass this needed automation by writing the P100 bw on Piz Daint -# measured with the above strategy. -# A better tool would allow this measure with a simple command and allow -# a one command that measure bw & report kernel analysis in one command -_HARDWARE_BW_GB_S = {"P100": 492.0} +# ---------------------------------------------------------- +# Theoritical bandwith from SDFG +# ---------------------------------------------------------- + +from gt4py.cartesian.gtscript import PARALLEL, computation, interval + +from pace.dsl.stencil import CompilationConfig, FrozenStencil, StencilConfig +from pace.dsl.typing import FloatField + + +def copy_defn(q_in: FloatField, q_out: FloatField): + with computation(PARALLEL), interval(...): + q_in = q_out + + +class MaxBandwithBenchmarkProgram: + def __init__(self, size, backend) -> None: + from pace.dsl.dace.orchestration import DaCeOrchestration, orchestrate + + dconfig = DaceConfig(None, backend, orchestration=DaCeOrchestration.BuildAndRun) + c = CompilationConfig(backend=backend) + s = StencilConfig(dace_config=dconfig, compilation_config=c) + self.copy_stencil = FrozenStencil( + func=copy_defn, + origin=(0, 0, 0), + domain=size, + stencil_config=s, + ) + orchestrate(obj=self, config=dconfig) + + def __call__(self, A, B, n: int): + for i in dace.nounroll(range(n)): + self.copy_stencil(A, B) def kernel_theoretical_timing( - sdfg: dace.sdfg.SDFG, hardware="P100", hardware_bw_in_Gb_s=None + sdfg: dace.sdfg.SDFG, + hardware_bw_in_GB_s=None, + backend=None, ) -> Dict[str, float]: """Compute a lower timing bound for kernels with the following hypothesis: @@ -197,6 +231,36 @@ def kernel_theoretical_timing( - Memory pressure is mostly in read/write from global memory, inner scalar & shared memory is not counted towards memory movement. 
""" + if not hardware_bw_in_GB_s: + size = np.array(sdfg.arrays["__g_self__w"].shape) + print( + f"Calculating experimental hardware bandwith on {size}" + f" arrays at {Float} precision..." + ) + bench = MaxBandwithBenchmarkProgram(size, backend) + if backend == "dace:gpu": + A = cp.ones(size, dtype=Float) + B = cp.ones(size, dtype=Float) + else: + A = np.ones(size, dtype=Float) + B = np.ones(size, dtype=Float) + n = 1000 + m = 4 + dt = [] + bench(A, B, n) + # Time + for _ in range(m): + s = time.time() + bench(A, B, n) + dt.append((time.time() - s) / n) + memory_size_in_b = np.prod(size) * np.dtype(Float).itemsize * 8 + bandwidth_in_bytes_s = memory_size_in_b / np.median(dt) + print( + f"Hardware bandwith computed: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s" + ) + else: + bandwidth_in_bytes_s = hardware_bw_in_GB_s * 1024 * 1024 * 1024 + allmaps = [ (me, state) for me, state in sdfg.all_nodes_recursive() @@ -228,19 +292,6 @@ def kernel_theoretical_timing( ] ) - # Compute hardware memory bandwidth in bytes/us - if hardware_bw_in_Gb_s and hardware in _HARDWARE_BW_GB_S.keys(): - raise NotImplementedError("can't specify hardware bandwidth and hardware") - if hardware_bw_in_Gb_s: - bandwidth_in_bytes_s = hardware_bw_in_Gb_s * 1024 * 1024 * 1024 - elif hardware in _HARDWARE_BW_GB_S.keys(): - # Time it has to take (at least): bytes / bandwidth_in_bytes_s - bandwidth_in_bytes_s = _HARDWARE_BW_GB_S[hardware] * 1024 * 1024 * 1024 - else: - print( - f"Timing analysis: hardware {hardware} unknown and no bandwidth given" - ) - in_us = 1000 * 1000 # Theoretical fastest timing @@ -249,8 +300,9 @@ def kernel_theoretical_timing( except TypeError: newresult_in_us = (alldata_in_bytes / bandwidth_in_bytes_s) * in_us + import sympy + if node.label in result: - import sympy newresult_in_us = sympy.Max(result[node.label], newresult_in_us).expand() try: @@ -259,29 +311,53 @@ def kernel_theoretical_timing( pass # Bad expansion - if not isinstance(newresult_in_us, float): + if not 
isinstance(newresult_in_us, sympy.core.numbers.Float): continue - result[node.label] = newresult_in_us + result[node.label] = float(newresult_in_us) return result def report_kernel_theoretical_timing( - timings: Dict[str, float], human_readable: bool = True, csv: bool = False + timings: Dict[str, float], + human_readable: bool = True, + out_format: Optional[str] = None, ) -> str: """Produce a human readable or CSV of the kernel timings""" result_string = f"Maps processed: {len(timings)}.\n" if human_readable: result_string += "Timing in microseconds Map name:\n" result_string += "\n".join(f"{v:.2f}\t{k}," for k, v in sorted(timings.items())) - if csv: - result_string += "#Map name,timing in microseconds\n" - result_string += "\n".join(f"{k},{v}," for k, v in sorted(timings.items())) + if out_format == "csv": + csv_string = "" + csv_string += "#Map name,timing in microseconds\n" + csv_string += "\n".join(f"{k},{v}," for k, v in sorted(timings.items())) + with open("kernel_theoretical_timing.csv", "w") as f: + f.write(csv_string) + elif out_format == "json": + import json + + with open("kernel_theoretical_timing.json", "w") as f: + json.dump(timings, f, indent=2) + return result_string -def kernel_theoretical_timing_from_path(sdfg_path: str) -> str: +def kernel_theoretical_timing_from_path( + sdfg_path: str, + hardware_bw_in_GB_s: Optional[float] = None, + backend: Optional[str] = None, + output_format: Optional[str] = None, +) -> str: """Load an SDFG and report the theoretical kernel timings""" - timings = kernel_theoretical_timing(dace.SDFG.from_file(sdfg_path)) - return report_kernel_theoretical_timing(timings, human_readable=True, csv=False) + timings = kernel_theoretical_timing( + dace.SDFG.from_file(sdfg_path), + hardware_bw_in_GB_s=hardware_bw_in_GB_s, + backend=backend, + ) + return report_kernel_theoretical_timing( + timings, + human_readable=True, + out_format=output_format, + ) From cb4ec5f94bd7d1ed81d638a2dc1f1ded457710c0 Mon Sep 17 00:00:00 2001 From: 
Florian Deconinck Date: Fri, 3 Mar 2023 21:15:05 +0000 Subject: [PATCH 08/57] Move constant on a env var Add saturation adjustement threshold to const --- .../fv3core/stencils/saturation_adjustment.py | 6 +-- util/pace/util/constants.py | 51 +++++++++++++++---- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/fv3core/pace/fv3core/stencils/saturation_adjustment.py b/fv3core/pace/fv3core/stencils/saturation_adjustment.py index 7ffe45b5..f537909f 100644 --- a/fv3core/pace/fv3core/stencils/saturation_adjustment.py +++ b/fv3core/pace/fv3core/stencils/saturation_adjustment.py @@ -901,14 +901,14 @@ def satadjust( # icloud_f = 0: bug - fixed # icloud_f = 1: old fvgfs gfdl) mp implementation # icloud_f = 2: binary cloud scheme (0 / 1) - if rh > 0.75 and qpz > 1.0e-6: + if rh > 0.75 and qpz > constants.SAT_ADJUST_THRESHOLD: dq = hvar * qpz q_plus = qpz + dq q_minus = qpz - dq if icloud_f == 2: # TODO untested if qpz > qstar: qa = 1.0 - elif (qstar < q_plus) and (q_cond > 1.0e-6): + elif (qstar < q_plus) and (q_cond > constants.SAT_ADJUST_THRESHOLD): qa = min(1.0, ((q_plus - qstar) / dq) ** 2) else: qa = 0.0 @@ -924,7 +924,7 @@ def satadjust( else: qa = 0.0 # impose minimum cloudiness if substantial q_cond exist - if q_cond > 1.0e-6: + if q_cond > constants.SAT_ADJUST_THRESHOLD: qa = max(cld_min, qa) qa = min(1, qa) else: diff --git a/util/pace/util/constants.py b/util/pace/util/constants.py index 8eb5143b..9e86c6d7 100644 --- a/util/pace/util/constants.py +++ b/util/pace/util/constants.py @@ -1,3 +1,32 @@ +import os +from enum import Enum, EnumMeta + + +CONST_VERSION = os.environ.get("PACE_CONSTANTS", "GFS") + + +class MetaEnum(EnumMeta): + def __contains__(cls, item): + try: + cls(item) + except ValueError: + return False + return True + + +class BaseEnum(Enum, metaclass=MetaEnum): + pass + + +class ConstantVersions(BaseEnum): + DEFAULT = "" + GEOS = "GEOS" + GFS = "GFS" + + +if CONST_VERSION not in ConstantVersions: + raise NotImplementedError(f"Constant 
{CONST_VERSION} not implemented") + ROOT_RANK = 0 X_DIM = "x" X_INTERFACE_DIM = "x_interface" @@ -35,21 +64,21 @@ # The FV3GFS model ships with two sets of constants, one used in the GFS physics # package and the other used for the Dycore. Their difference are small but significant # Our Fortran executable on GCE has GFS_PHYS=True -CONST_VERSION = 'GEOS' -if CONST_VERSION == 'GEOS': +if CONST_VERSION == ConstantVersions.GEOS: RADIUS = 6.371e6 PI = 3.14159265358979323846 - OMEGA = 2.0*PI/86164.0 + OMEGA = 2.0 * PI / 86164.0 GRAV = 9.80665 RGRAV = 1.0 / GRAV - RDGAS = 8314.47/28.965 - RVGAS = 8314.47/18.015 - HLV = 2.4665E6 - HLF = 3.3370E5 - KAPPA = RDGAS/(3.5*RDGAS) - CP_AIR = RDGAS/KAPPA + RDGAS = 8314.47 / 28.965 + RVGAS = 8314.47 / 18.015 + HLV = 2.4665e6 + HLF = 3.3370e5 + KAPPA = RDGAS / (3.5 * RDGAS) + CP_AIR = RDGAS / KAPPA TFREEZE = 273.15 -elif CONST_VERSION == 'GFS': + SAT_ADJUST_THRESHOLD = 1.0e-6 +elif CONST_VERSION == ConstantVersions.GFS: RADIUS = 6.3712e6 # Radius of the Earth [m] PI = 3.1415926535897931 OMEGA = 7.2921e-5 # Rotation of the earth @@ -62,6 +91,7 @@ CP_AIR = 1004.6 KAPPA = RDGAS / CP_AIR # Specific heat capacity of dry air at TFREEZE = 273.15 + SAT_ADJUST_THRESHOLD = 1.0e-8 else: RADIUS = 6371.0e3 # Radius of the Earth [m] #6371.0e3 PI = 3.14159265358979323846 # 3.14159265358979323846 @@ -75,6 +105,7 @@ KAPPA = 2.0 / 7.0 CP_AIR = RDGAS / KAPPA # Specific heat capacity of dry air at TFREEZE = 273.16 # Freezing temperature of fresh water [K] + SAT_ADJUST_THRESHOLD = 1.0e-8 DZ_MIN = 2.0 CV_AIR = CP_AIR - RDGAS # Heat capacity of dry air at constant volume From 7348922d32eee811aa4060978cc62742ad3334e8 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 3 Mar 2023 21:15:30 +0000 Subject: [PATCH 09/57] lint --- fv3core/pace/fv3core/initialization/geos_wrapper.py | 4 +++- fv3core/pace/fv3core/stencils/d_sw.py | 5 +---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py 
b/fv3core/pace/fv3core/initialization/geos_wrapper.py index ea61ed7a..b832950f 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -16,7 +16,9 @@ class GeosDycoreWrapper: Takes numpy arrays as inputs, returns a dictionary of numpy arrays as outputs """ - def __init__(self, namelist: f90nml.Namelist, bdt: float, comm: pace.util.Comm, backend: str): + def __init__( + self, namelist: f90nml.Namelist, bdt: float, comm: pace.util.Comm, backend: str + ): # Make a custom performance collector for the GEOS wrapper self.perf_collector = PerformanceCollector("GEOS wrapper", comm) diff --git a/fv3core/pace/fv3core/stencils/d_sw.py b/fv3core/pace/fv3core/stencils/d_sw.py index 8be6dcd5..12db67ee 100644 --- a/fv3core/pace/fv3core/stencils/d_sw.py +++ b/fv3core/pace/fv3core/stencils/d_sw.py @@ -1242,10 +1242,7 @@ def __call__( self._column_namelist["d_con"], ) self._accumulate_heat_source_and_dissipation_estimate_stencil( - self._tmp_heat_s, - heat_source, - self._tmp_diss_e, - diss_est + self._tmp_heat_s, heat_source, self._tmp_diss_e, diss_est ) self._update_u_and_v_stencil( self._tmp_ut, From 131a2af86d5f1982e345634c5ef891db7b99f4d7 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 3 Mar 2023 21:36:20 +0000 Subject: [PATCH 10/57] More linting --- driver/pace/driver/grid.py | 2 +- fv3core/pace/fv3core/initialization/geos_wrapper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/pace/driver/grid.py b/driver/pace/driver/grid.py index c4c40c19..4817869c 100644 --- a/driver/pace/driver/grid.py +++ b/driver/pace/driver/grid.py @@ -215,5 +215,5 @@ def _transform_horizontal_grid( grid.data[:, :, 0] = lon_transform[:] grid.data[:, :, 1] = lat_transform[:] - metric_terms._grid.data[:] = grid.data[:] + metric_terms._grid.data[:] = grid.data[:] # type: ignore[attr-defined] metric_terms._init_agrid() diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py 
b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 340d8a5b..6ac49949 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -20,7 +20,7 @@ class GeosDycoreWrapper: def __init__( self, namelist: f90nml.Namelist, - bdt: float, + bdt: int, comm: pace.util.Comm, backend: str, ): From 89825429f6603d9dd4d9fbc3ba8616b93b9d179e Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Mon, 6 Mar 2023 21:01:13 +0000 Subject: [PATCH 11/57] Remove unused if leading to empty code block --- fv3core/pace/fv3core/stencils/d_sw.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fv3core/pace/fv3core/stencils/d_sw.py b/fv3core/pace/fv3core/stencils/d_sw.py index 93ad3f5e..b16b4690 100644 --- a/fv3core/pace/fv3core/stencils/d_sw.py +++ b/fv3core/pace/fv3core/stencils/d_sw.py @@ -566,10 +566,9 @@ def heat_source_from_vorticity_damping( heat_source - kinetic_energy_fraction_to_damp * dampterm ) - if __INLINED((d_con > dcon_threshold) or do_stochastic_ke_backscatter): + if __INLINED(do_stochastic_ke_backscatter): with horizontal(region[local_is : local_ie + 1, local_js : local_je + 1]): - if __INLINED(do_stochastic_ke_backscatter): - dissipation_estimate -= dampterm + dissipation_estimate -= dampterm def accumulate_heat_source_and_dissipation_estimate( From da2f902e00fd49b8f58ba53d36d67131c133b317 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Mon, 6 Mar 2023 21:53:26 +0000 Subject: [PATCH 12/57] Restrict dace to 0.14.1 due to a parsing bug --- requirements_dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index 8318706a..9cfa0de2 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -11,7 +11,7 @@ dask>=2021.10.0 netCDF4 cftime fv3config>=0.9.0 -dace>=0.14.1 +dace=0.14.1 f90nml>=1.1.0 numpy>=1.15 -e external/gt4py From 27fae1c243b85699862ddb9fb58c38d42d22e52b Mon Sep 17 00:00:00 2001 From: Florian 
Deconinck Date: Tue, 7 Mar 2023 18:51:56 +0000 Subject: [PATCH 13/57] Add guard for bdt==0 Fix bad merge for bdt with GEOS_Wrapper --- fv3core/pace/fv3core/initialization/geos_wrapper.py | 8 ++++++++ fv3core/pace/fv3core/stencils/fv_dynamics.py | 2 ++ 2 files changed, 10 insertions(+) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 00351502..515518bc 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -11,6 +11,7 @@ from pace.driver.performance.collector import PerformanceCollector from pace.dsl.dace import DaceConfig, orchestrate from pace.dsl.gt4py_utils import is_gpu_backend +from pace.util.logging import pace_log @enum.unique @@ -108,6 +109,7 @@ def __init__( self.dycore_state = fv3core.DycoreState.init_zeros( quantity_factory=quantity_factory ) + self.dycore_state.bdt = self.dycore_config.dt_atmos damping_coefficients = pace.util.grid.DampingCoefficients.new_from_metric_terms( metric_terms @@ -133,6 +135,12 @@ def __init__( self.output_dict: Dict[str, np.ndarray] = {} self._allocate_output_dir() + pace_log.info( + "GEOS-Wrapper with: \n" + f" dt : {self.dycore_state.bdt}\n" + f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" + ) + def _critical_path(self): """Top-level orchestration function""" with self.perf_collector.timestep_timer.clock("step_dynamics"): diff --git a/fv3core/pace/fv3core/stencils/fv_dynamics.py b/fv3core/pace/fv3core/stencils/fv_dynamics.py index d7adba12..e5f0c269 100644 --- a/fv3core/pace/fv3core/stencils/fv_dynamics.py +++ b/fv3core/pace/fv3core/stencils/fv_dynamics.py @@ -179,6 +179,8 @@ def __init__( method_to_orchestrate="_checkpoint_tracer_advection_out", dace_compiletime_args=["state"], ) + if timestep == timedelta(seconds=0): + raise RuntimeError("Bad dynamical core configuration: bdt is 0") # nested and stretched_grid are options in the Fortran code which we # have 
not implemented, so they are hard-coded here. self.call_checkpointer = checkpointer is not None From 2f8ebac91a891e2fc732ff6b792e080d9fc226b6 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 7 Mar 2023 19:36:11 +0000 Subject: [PATCH 14/57] Remove unused code --- fv3core/pace/fv3core/initialization/geos_wrapper.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 64fbb6ff..cc7211c9 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -19,10 +19,6 @@ class MemorySpace(enum.Enum): DEVICE = 1 -def assign_no_copy(A, B): - A = B - - class GeosDycoreWrapper: """ Provides an interface for the Geos model to access the Pace dycore. From 81d00ced5704e94decd692b37100ce73e36620d7 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 28 Mar 2023 13:43:45 +0000 Subject: [PATCH 15/57] Fix theoretical timings Lint --- dsl/pace/dsl/dace/utils.py | 17 ++++++++--------- fv3core/pace/fv3core/stencils/d_sw.py | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/dsl/pace/dsl/dace/utils.py b/dsl/pace/dsl/dace/utils.py index 68b61331..4ba0247e 100644 --- a/dsl/pace/dsl/dace/utils.py +++ b/dsl/pace/dsl/dace/utils.py @@ -5,9 +5,11 @@ import dace import numpy as np from dace.transformation.helpers import get_parent_map +from gt4py.cartesian.gtscript import PARALLEL, computation, interval from pace.dsl.dace.dace_config import DaceConfig -from pace.dsl.typing import Float +from pace.dsl.stencil import CompilationConfig, FrozenStencil, StencilConfig +from pace.dsl.typing import Float, FloatField from pace.util._optional_imports import cupy as cp from pace.util.logging import pace_log @@ -186,13 +188,6 @@ def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str # ---------------------------------------------------------- # Theoritical bandwith from SDFG
# ---------------------------------------------------------- - -from gt4py.cartesian.gtscript import PARALLEL, computation, interval - -from pace.dsl.stencil import CompilationConfig, FrozenStencil, StencilConfig -from pace.dsl.typing import FloatField - - def copy_defn(q_in: FloatField, q_out: FloatField): with computation(PARALLEL), interval(...): q_in = q_out @@ -260,6 +255,7 @@ def kernel_theoretical_timing( ) else: bandwidth_in_bytes_s = hardware_bw_in_GB_s * 1024 * 1024 * 1024 + print(f"Given hardware bandwith: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s") allmaps = [ (me, state) @@ -311,7 +307,9 @@ def kernel_theoretical_timing( pass # Bad expansion - if not isinstance(newresult_in_us, sympy.core.numbers.Float): + if not isinstance(newresult_in_us, sympy.core.numbers.Float) and not isinstance( + newresult_in_us, float + ): continue result[node.label] = float(newresult_in_us) @@ -351,6 +349,7 @@ def kernel_theoretical_timing_from_path( output_format: Optional[str] = None, ) -> str: """Load an SDFG and report the theoretical kernel timings""" + print(f"Running kernel_theoretical_timing for {sdfg_path}") timings = kernel_theoretical_timing( dace.SDFG.from_file(sdfg_path), hardware_bw_in_GB_s=hardware_bw_in_GB_s, diff --git a/fv3core/pace/fv3core/stencils/d_sw.py b/fv3core/pace/fv3core/stencils/d_sw.py index b16b4690..02ce9887 100644 --- a/fv3core/pace/fv3core/stencils/d_sw.py +++ b/fv3core/pace/fv3core/stencils/d_sw.py @@ -529,7 +529,7 @@ def heat_source_from_vorticity_damping( kinetic_energy_fraction_to_damp (in): the fraction of kinetic energy to explicitly damp and convert into heat. 
""" - from __externals__ import ( + from __externals__ import ( # noqa (see below) d_con, do_stochastic_ke_backscatter, local_ie, From 4891d560d59931611e561e7a9e4144fa73761fa6 Mon Sep 17 00:00:00 2001 From: Purnendu Chakraborty Date: Fri, 7 Apr 2023 12:41:46 -0400 Subject: [PATCH 16/57] Fixed a bug where pkz was being calculated twice, and the second calc was wrong --- fv3core/pace/fv3core/stencils/temperature_adjust.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fv3core/pace/fv3core/stencils/temperature_adjust.py b/fv3core/pace/fv3core/stencils/temperature_adjust.py index 33feb484..0226df38 100644 --- a/fv3core/pace/fv3core/stencils/temperature_adjust.py +++ b/fv3core/pace/fv3core/stencils/temperature_adjust.py @@ -29,7 +29,6 @@ def apply_diffusive_heating( """ with computation(PARALLEL), interval(...): pkz = exp(cappa / (1.0 - cappa) * log(constants.RDG * delp / delz * pt)) - pkz = (constants.RDG * delp / delz * pt) ** (cappa / (1.0 - cappa)) dtmp = heat_source / (constants.CV_AIR * delp) with computation(PARALLEL): with interval(0, 1): From fafbfc761a063a5fc4e7e5af2e1839f151072469 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Mon, 10 Apr 2023 13:11:27 +0000 Subject: [PATCH 17/57] Downgrade DaCe to 0.14.0 pending array aliasing fix --- constraints.txt | 2 +- requirements_dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/constraints.txt b/constraints.txt index 282e5988..dd122d08 100644 --- a/constraints.txt +++ b/constraints.txt @@ -87,7 +87,7 @@ cytoolz==0.11.2 # via # gt4py # gt4py (external/gt4py/setup.cfg) -dace==0.14.1 +dace==0.14.0 # via # -r requirements_dev.txt # pace-dsl diff --git a/requirements_dev.txt b/requirements_dev.txt index 9cfa0de2..a07db35a 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -11,7 +11,7 @@ dask>=2021.10.0 netCDF4 cftime fv3config>=0.9.0 -dace=0.14.1 +dace=0.14.0 f90nml>=1.1.0 numpy>=1.15 -e external/gt4py From 4fc5b4dfb82c5e702b66881e3c39f8f49878a0a1 Mon Sep 17 00:00:00 2001 
From: Florian Deconinck Date: Mon, 10 Apr 2023 17:45:16 +0000 Subject: [PATCH 18/57] Set default cache path for orchestrated DaCe to respect GT_CACHE_* env --- dsl/pace/dsl/dace/build.py | 9 ++------- dsl/pace/dsl/dace/dace_config.py | 4 +++- dsl/pace/dsl/gt4py_utils.py | 9 +++++++++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/dsl/pace/dsl/dace/build.py b/dsl/pace/dsl/dace/build.py index 7d8f3db2..cbdc4404 100644 --- a/dsl/pace/dsl/dace/build.py +++ b/dsl/pace/dsl/dace/build.py @@ -5,6 +5,7 @@ import pace.util from pace.dsl.dace.dace_config import DaceConfig, DaCeOrchestration +from pace.dsl.gt4py_utils import cache_path ################################################ @@ -124,9 +125,6 @@ def get_sdfg_path( ) return sdfg_file_path - # Case of loading a precompiled .so - lookup using GT_CACHE - from gt4py.cartesian import config as gt_config - if config.rank_size > 1: rank = config.my_rank rank_str = f"_{config.target_rank:06d}" @@ -134,10 +132,7 @@ def get_sdfg_path( rank = 0 rank_str = f"_{rank:06d}" - sdfg_dir_path = ( - f"{gt_config.cache_settings['root_path']}" - f"/.gt_cache{rank_str}/dacecache/{daceprog_name}" - ) + sdfg_dir_path = f"{cache_path(rank)}/dacecache/{daceprog_name}" if not os.path.isdir(sdfg_dir_path): raise RuntimeError(f"Precompiled SDFG is missing at {sdfg_dir_path}") diff --git a/dsl/pace/dsl/dace/dace_config.py b/dsl/pace/dsl/dace/dace_config.py index 961bf3ba..5481759d 100644 --- a/dsl/pace/dsl/dace/dace_config.py +++ b/dsl/pace/dsl/dace/dace_config.py @@ -5,7 +5,7 @@ from dace.codegen.compiled_sdfg import CompiledSDFG from dace.frontend.python.parser import DaceProgram -from pace.dsl.gt4py_utils import is_gpu_backend +from pace.dsl.gt4py_utils import cache_path, is_gpu_backend from pace.util._optional_imports import cupy as cp from pace.util.communicator import CubedSphereCommunicator @@ -198,6 +198,8 @@ def __init__( self.target_rank = 0 self.layout = (1, 1) + dace.config.Config.set("default_build_folder", 
value=cache_path(self.my_rank)) + set_distributed_caches(self) if ( diff --git a/dsl/pace/dsl/gt4py_utils.py b/dsl/pace/dsl/gt4py_utils.py index 7b033fee..0c446126 100644 --- a/dsl/pace/dsl/gt4py_utils.py +++ b/dsl/pace/dsl/gt4py_utils.py @@ -2,6 +2,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import gt4py +import gt4py.cartesian.config as gt_config import numpy as np from pace.dsl.typing import DTypes, Field, Float @@ -483,3 +484,11 @@ def split_cartesian_into_storages(var: np.ndarray) -> Sequence[np.ndarray]: asarray(var, type(var))[:, :, cart], ) return var_data + + +def cache_path(rank: int) -> str: + rank_str = f"_{rank:06d}" + return ( + f"{gt_config.cache_settings['root_path']}" + f"/{gt_config.cache_settings['dir_name']}{rank_str}" + ) From 22450279075e746f44a1b6afce7a05b437b1f35d Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 11 Apr 2023 13:36:42 +0000 Subject: [PATCH 19/57] Remove previous per stencil override of default_build_folder --- dsl/pace/dsl/stencil.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/dsl/pace/dsl/stencil.py b/dsl/pace/dsl/stencil.py index 3b0bd781..51832522 100644 --- a/dsl/pace/dsl/stencil.py +++ b/dsl/pace/dsl/stencil.py @@ -16,7 +16,6 @@ cast, ) -import dace import gt4py import numpy as np from gt4py.cartesian import gtscript @@ -322,14 +321,6 @@ def __init__( self._argument_names = tuple(inspect.getfullargspec(func).args) - if "dace" in self.stencil_config.compilation_config.backend: - dace.Config.set( - "default_build_folder", - value="{gt_cache}/dacecache".format( - gt_cache=gt4py.cartesian.config.cache_settings["dir_name"] - ), - ) - assert ( len(self._argument_names) > 0 ), "A stencil with no arguments? 
You may be double decorating" From 4f8fdc3bc5932dd020f0a836f602cf9e9311136f Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 11 Apr 2023 15:03:43 +0000 Subject: [PATCH 20/57] Revert "Set default cache path for orchestrated DaCe to respect GT_CACHE_* env" This reverts commit 4fc5b4dfb82c5e702b66881e3c39f8f49878a0a1. --- dsl/pace/dsl/dace/build.py | 9 +++++++-- dsl/pace/dsl/dace/dace_config.py | 4 +--- dsl/pace/dsl/gt4py_utils.py | 9 --------- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/dsl/pace/dsl/dace/build.py b/dsl/pace/dsl/dace/build.py index cbdc4404..7d8f3db2 100644 --- a/dsl/pace/dsl/dace/build.py +++ b/dsl/pace/dsl/dace/build.py @@ -5,7 +5,6 @@ import pace.util from pace.dsl.dace.dace_config import DaceConfig, DaCeOrchestration -from pace.dsl.gt4py_utils import cache_path ################################################ @@ -125,6 +124,9 @@ def get_sdfg_path( ) return sdfg_file_path + # Case of loading a precompiled .so - lookup using GT_CACHE + from gt4py.cartesian import config as gt_config + if config.rank_size > 1: rank = config.my_rank rank_str = f"_{config.target_rank:06d}" @@ -132,7 +134,10 @@ def get_sdfg_path( rank = 0 rank_str = f"_{rank:06d}" - sdfg_dir_path = f"{cache_path(rank)}/dacecache/{daceprog_name}" + sdfg_dir_path = ( + f"{gt_config.cache_settings['root_path']}" + f"/.gt_cache{rank_str}/dacecache/{daceprog_name}" + ) if not os.path.isdir(sdfg_dir_path): raise RuntimeError(f"Precompiled SDFG is missing at {sdfg_dir_path}") diff --git a/dsl/pace/dsl/dace/dace_config.py b/dsl/pace/dsl/dace/dace_config.py index 5481759d..961bf3ba 100644 --- a/dsl/pace/dsl/dace/dace_config.py +++ b/dsl/pace/dsl/dace/dace_config.py @@ -5,7 +5,7 @@ from dace.codegen.compiled_sdfg import CompiledSDFG from dace.frontend.python.parser import DaceProgram -from pace.dsl.gt4py_utils import cache_path, is_gpu_backend +from pace.dsl.gt4py_utils import is_gpu_backend from pace.util._optional_imports import cupy as cp from 
pace.util.communicator import CubedSphereCommunicator @@ -198,8 +198,6 @@ def __init__( self.target_rank = 0 self.layout = (1, 1) - dace.config.Config.set("default_build_folder", value=cache_path(self.my_rank)) - set_distributed_caches(self) if ( diff --git a/dsl/pace/dsl/gt4py_utils.py b/dsl/pace/dsl/gt4py_utils.py index 0c446126..7b033fee 100644 --- a/dsl/pace/dsl/gt4py_utils.py +++ b/dsl/pace/dsl/gt4py_utils.py @@ -2,7 +2,6 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import gt4py -import gt4py.cartesian.config as gt_config import numpy as np from pace.dsl.typing import DTypes, Field, Float @@ -484,11 +483,3 @@ def split_cartesian_into_storages(var: np.ndarray) -> Sequence[np.ndarray]: asarray(var, type(var))[:, :, cart], ) return var_data - - -def cache_path(rank: int) -> str: - rank_str = f"_{rank:06d}" - return ( - f"{gt_config.cache_settings['root_path']}" - f"/{gt_config.cache_settings['dir_name']}{rank_str}" - ) From 47421a0c8ac33184379e9eb5ed555da4720378d3 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 11 Apr 2023 15:03:47 +0000 Subject: [PATCH 21/57] Revert "Remove previous per stencil override of default_build_folder" This reverts commit 22450279075e746f44a1b6afce7a05b437b1f35d. --- dsl/pace/dsl/stencil.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dsl/pace/dsl/stencil.py b/dsl/pace/dsl/stencil.py index 51832522..3b0bd781 100644 --- a/dsl/pace/dsl/stencil.py +++ b/dsl/pace/dsl/stencil.py @@ -16,6 +16,7 @@ cast, ) +import dace import gt4py import numpy as np from gt4py.cartesian import gtscript @@ -321,6 +322,14 @@ def __init__( self._argument_names = tuple(inspect.getfullargspec(func).args) + if "dace" in self.stencil_config.compilation_config.backend: + dace.Config.set( + "default_build_folder", + value="{gt_cache}/dacecache".format( + gt_cache=gt4py.cartesian.config.cache_settings["dir_name"] + ), + ) + assert ( len(self._argument_names) > 0 ), "A stencil with no arguments? 
You may be double decorating" From d51bc11598cb8fa1dadf64f97ed6f5fcd4cdde68 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 11 Apr 2023 15:07:44 +0000 Subject: [PATCH 22/57] Read cache_root in default dace backend --- dsl/pace/dsl/stencil.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dsl/pace/dsl/stencil.py b/dsl/pace/dsl/stencil.py index 3b0bd781..26454ef8 100644 --- a/dsl/pace/dsl/stencil.py +++ b/dsl/pace/dsl/stencil.py @@ -325,8 +325,9 @@ def __init__( if "dace" in self.stencil_config.compilation_config.backend: dace.Config.set( "default_build_folder", - value="{gt_cache}/dacecache".format( - gt_cache=gt4py.cartesian.config.cache_settings["dir_name"] + value="{gt_root}/{gt_cache}/dacecache".format( + gt_root=gt4py.cartesian.config.cache_settings["root_path"], + gt_cache=gt4py.cartesian.config.cache_settings["dir_name"], ), ) From 6bdd5958a4f68dc76fc4c8174620a15c8f1c8c6b Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 11 Apr 2023 15:09:12 +0000 Subject: [PATCH 23/57] Document faulty behavior with GT_CACHE_DIR_NAME --- doc_primer_orchestration.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc_primer_orchestration.md b/doc_primer_orchestration.md index 16fadd4d..a10baf0c 100644 --- a/doc_primer_orchestration.md +++ b/doc_primer_orchestration.md @@ -104,6 +104,10 @@ _Parsing errors_ DaCe cannot parse _any_ dynamic Python and any code that allocates memory on the fly (think list creation). It will also complain about any arguments it can't memory describe (remember `dace_compiletime_args` ). +_GT_CACHE_DIR_NAME_ + +We do not honor the `GT_CACHE_DIR_NAME` with orchestration. `GT_CACHE_ROOT` is respected. 
+ Conclusion ---------- From 80cbb015559e15079a57bd04feccc9260035be8e Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 13 Apr 2023 18:15:15 +0000 Subject: [PATCH 24/57] Fix bad requirements syntax --- requirements_dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index a07db35a..052bf5c3 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -11,7 +11,7 @@ dask>=2021.10.0 netCDF4 cftime fv3config>=0.9.0 -dace=0.14.0 +dace==0.14.0 f90nml>=1.1.0 numpy>=1.15 -e external/gt4py From 40f24408a29cd28e50c430610e0009947b068611 Mon Sep 17 00:00:00 2001 From: Purnendu Chakraborty Date: Fri, 14 Apr 2023 17:08:24 -0400 Subject: [PATCH 25/57] Check for the string value of CONST_VERSION directly instead of enum --- util/pace/util/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/pace/util/constants.py b/util/pace/util/constants.py index 9e86c6d7..5d4f5f85 100644 --- a/util/pace/util/constants.py +++ b/util/pace/util/constants.py @@ -64,7 +64,7 @@ class ConstantVersions(BaseEnum): # The FV3GFS model ships with two sets of constants, one used in the GFS physics # package and the other used for the Dycore. Their difference are small but significant # Our Fortran executable on GCE has GFS_PHYS=True -if CONST_VERSION == ConstantVersions.GEOS: +if CONST_VERSION == "GEOS": RADIUS = 6.371e6 PI = 3.14159265358979323846 OMEGA = 2.0 * PI / 86164.0 @@ -78,7 +78,7 @@ class ConstantVersions(BaseEnum): CP_AIR = RDGAS / KAPPA TFREEZE = 273.15 SAT_ADJUST_THRESHOLD = 1.0e-6 -elif CONST_VERSION == ConstantVersions.GFS: +elif CONST_VERSION == "GFS": RADIUS = 6.3712e6 # Radius of the Earth [m] PI = 3.1415926535897931 OMEGA = 7.2921e-5 # Rotation of the earth From cae25a9430c040e45f15aee3d4ef232b7d933dd8 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 20 Apr 2023 09:21:41 -0400 Subject: [PATCH 26/57] Protect constant selection more rigorously.
Clean abort on unknown constant given --- util/pace/util/constants.py | 49 ++++++++++++++----------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/util/pace/util/constants.py b/util/pace/util/constants.py index 5d4f5f85..4819cc4d 100644 --- a/util/pace/util/constants.py +++ b/util/pace/util/constants.py @@ -1,31 +1,22 @@ import os -from enum import Enum, EnumMeta +from enum import Enum +from warnings import warn -CONST_VERSION = os.environ.get("PACE_CONSTANTS", "GFS") - - -class MetaEnum(EnumMeta): - def __contains__(cls, item): - try: - cls(item) - except ValueError: - return False - return True - - -class BaseEnum(Enum, metaclass=MetaEnum): - pass - - -class ConstantVersions(BaseEnum): - DEFAULT = "" - GEOS = "GEOS" - GFS = "GFS" +# The FV3GFS model ships with two sets of constants, one used in the GFS physics +# package and the other used for the Dycore. Their difference are small but significant +# In addition the GSFC's GEOS model as its own variables +class ConstantVersions(Enum): + FV3DYCORE = "FV3DYCORE" # NOAA's FV3 dynamical core constants (original port) + GFS = "GFS" # Constant as defined in NOAA GFS + GEOS = "GEOS" # Constant as defined in GEOS v13 -if CONST_VERSION not in ConstantVersions: - raise NotImplementedError(f"Constant {CONST_VERSION} not implemented") +CONST_VERSION_AS_STR = os.environ.get("PACE_CONSTANTS", "FV3DYCORE") +try: + CONST_VERSION = ConstantVersions[CONST_VERSION_AS_STR] +except KeyError as e: + raise RuntimeError(f"Constants {CONST_VERSION_AS_STR} is not implemented, abort.") ROOT_RANK = 0 X_DIM = "x" @@ -60,11 +51,7 @@ class ConstantVersions(BaseEnum): ##################### # Physical constants ##################### - -# The FV3GFS model ships with two sets of constants, one used in the GFS physics -# package and the other used for the Dycore. 
Their difference are small but significant -# Our Fortran executable on GCE has GFS_PHYS=True -if CONST_VERSION == "GEOS": +if CONST_VERSION == ConstantVersions.GEOS: RADIUS = 6.371e6 PI = 3.14159265358979323846 OMEGA = 2.0 * PI / 86164.0 @@ -78,7 +65,7 @@ class ConstantVersions(BaseEnum): CP_AIR = RDGAS / KAPPA TFREEZE = 273.15 SAT_ADJUST_THRESHOLD = 1.0e-6 -elif CONST_VERSION == "GFS": +elif CONST_VERSION == ConstantVersions.GFS: RADIUS = 6.3712e6 # Radius of the Earth [m] PI = 3.1415926535897931 OMEGA = 7.2921e-5 # Rotation of the earth @@ -92,7 +79,7 @@ class ConstantVersions(BaseEnum): KAPPA = RDGAS / CP_AIR # Specific heat capacity of dry air at TFREEZE = 273.15 SAT_ADJUST_THRESHOLD = 1.0e-8 -else: +elif CONST_VERSION == ConstantVersions.FV3DYCORE: RADIUS = 6371.0e3 # Radius of the Earth [m] #6371.0e3 PI = 3.14159265358979323846 # 3.14159265358979323846 OMEGA = 7.292e-5 # Rotation of the earth # 7.292e-5 @@ -106,6 +93,8 @@ class ConstantVersions(BaseEnum): CP_AIR = RDGAS / KAPPA # Specific heat capacity of dry air at TFREEZE = 273.16 # Freezing temperature of fresh water [K] SAT_ADJUST_THRESHOLD = 1.0e-8 +else: + raise RuntimeError("Constant selector failed, bad code.") DZ_MIN = 2.0 CV_AIR = CP_AIR - RDGAS # Heat capacity of dry air at constant volume From 915993e9dfe692e1db3bc7582bb3dcb9fe45535a Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 20 Apr 2023 09:26:15 -0400 Subject: [PATCH 27/57] Log constants selection --- util/pace/util/constants.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/util/pace/util/constants.py b/util/pace/util/constants.py index 4819cc4d..2a72d25c 100644 --- a/util/pace/util/constants.py +++ b/util/pace/util/constants.py @@ -1,7 +1,7 @@ import os from enum import Enum from warnings import warn - +from pace.util.logging import pace_log # The FV3GFS model ships with two sets of constants, one used in the GFS physics # package and the other used for the Dycore. 
Their difference are small but significant @@ -11,10 +11,11 @@ class ConstantVersions(Enum): GFS = "GFS" # Constant as defined in NOAA GFS GEOS = "GEOS" # Constant as defined in GEOS v13 - CONST_VERSION_AS_STR = os.environ.get("PACE_CONSTANTS", "FV3DYCORE") + try: CONST_VERSION = ConstantVersions[CONST_VERSION_AS_STR] + pace_log.info(f"Constant selected: {CONST_VERSION}") except KeyError as e: raise RuntimeError(f"Constants {CONST_VERSION_AS_STR} is not implemented, abort.") From c3e355c4f7558076d5fa75490c9700690b81c2d8 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 20 Apr 2023 12:10:28 -0400 Subject: [PATCH 28/57] Refactor NQ to constants.py --- fv3core/pace/fv3core/stencils/fv_dynamics.py | 21 ++++++++----------- .../translate/translate_init_case.py | 2 +- util/pace/util/constants.py | 20 ++++++++++++++++++ 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/fv3core/pace/fv3core/stencils/fv_dynamics.py b/fv3core/pace/fv3core/stencils/fv_dynamics.py index e5f0c269..8f5f8a7f 100644 --- a/fv3core/pace/fv3core/stencils/fv_dynamics.py +++ b/fv3core/pace/fv3core/stencils/fv_dynamics.py @@ -21,19 +21,12 @@ from pace.fv3core.stencils.neg_adj3 import AdjustNegativeTracerMixingRatio from pace.fv3core.stencils.remapping import LagrangianToEulerian from pace.stencils.c2l_ord import CubedToLatLon -from pace.util import X_DIM, Y_DIM, Z_INTERFACE_DIM, Timer +from pace.util import X_DIM, Y_DIM, Z_INTERFACE_DIM, Timer, constants from pace.util.grid import DampingCoefficients, GridData from pace.util.logging import pace_log from pace.util.mpi import MPI -# nq is actually given by ncnst - pnats, where those are given in atmosphere.F90 by: -# ncnst = Atm(mytile)%ncnst -# pnats = Atm(mytile)%flagstruct%pnats -# here we hard-coded it because 8 is the only supported value, refactor this later! 
-NQ = 9 # state.nq_tot - spec.namelist.dnats - - def pt_to_potential_density_pt( pkz: FloatField, dp_initial: FloatField, q_con: FloatField, pt: FloatField ): @@ -209,7 +202,7 @@ def __init__( ) self.tracers = {} - for name in utils.tracer_variables[0:NQ]: + for name in utils.tracer_variables[0:constants.NQ]: self.tracers[name] = state.__dict__[name] temporaries = fvdyn_temporaries(quantity_factory) @@ -284,7 +277,7 @@ def __init__( ) self._cappa = self.acoustic_dynamics.cappa - if not (not self.config.inline_q and NQ != 0): + if not (not self.config.inline_q and constants.NQ != 0): raise NotImplementedError("tracer_2d not implemented, turn on z_tracer") self._adjust_tracer_mixing_ratio = AdjustNegativeTracerMixingRatio( stencil_factory, @@ -298,7 +291,7 @@ def __init__( quantity_factory=quantity_factory, config=config.remapping, area_64=grid_data.area_64, - nq=NQ, + nq=constants.NQ, pfull=self._pfull, tracers=self.tracers, checkpointer=checkpointer, @@ -548,6 +541,11 @@ def _compute(self, state: DycoreState, timer: pace.util.Timer): log_on_rank_0("Remapping") with timer.clock("Remapping"): self._checkpoint_remapping_in(state) + + # TODO: When NQ=9, we shouldn't need to pass qcld explicitly + # since it's in self.tracers. 
It should not be an issue since + # we don't have self.tracers & qcld computation at the same time + # When NQ=8, we do need qcld passed explicitely self._lagrangian_to_eulerian_obj( self.tracers, state.pt, @@ -559,7 +557,6 @@ def _compute(self, state: DycoreState, timer: pace.util.Timer): state.w, self._cappa, state.q_con, - # Since NQ=9, we shouldn't need to pass qcld explicitly state.qcld, state.pkz, state.pk, diff --git a/fv3core/tests/savepoint/translate/translate_init_case.py b/fv3core/tests/savepoint/translate/translate_init_case.py index 9716aca3..5fe14d73 100644 --- a/fv3core/tests/savepoint/translate/translate_init_case.py +++ b/fv3core/tests/savepoint/translate/translate_init_case.py @@ -184,7 +184,7 @@ def outputs_from_state(self, state: dict): def compute_parallel(self, inputs, communicator): state = {} - full_shape = (*self.grid.domain_shape_full(add=(1, 1, 1)), fv_dynamics.NQ) + full_shape = (*self.grid.domain_shape_full(add=(1, 1, 1)), pace.util.constants.NQ) for variable, properties in self.outputs.items(): dims = properties["dims"] state[variable] = fv3util.Quantity( diff --git a/util/pace/util/constants.py b/util/pace/util/constants.py index 2a72d25c..7a481b95 100644 --- a/util/pace/util/constants.py +++ b/util/pace/util/constants.py @@ -19,6 +19,10 @@ class ConstantVersions(Enum): except KeyError as e: raise RuntimeError(f"Constants {CONST_VERSION_AS_STR} is not implemented, abort.") +##################### +# Common constants +##################### + ROOT_RANK = 0 X_DIM = "x" X_INTERFACE_DIM = "x_interface" @@ -49,6 +53,22 @@ class ConstantVersions(Enum): BOUNDARY_TYPES = EDGE_BOUNDARY_TYPES + CORNER_BOUNDARY_TYPES N_HALO_DEFAULT = 3 +####################### +# Tracers configuration +####################### + +# nq is actually given by ncnst - pnats, where those are given in atmosphere.F90 by: +# ncnst = Atm(mytile)%ncnst +# pnats = Atm(mytile)%flagstruct%pnats +# here we hard-coded it because 8 is the only supported value, refactor this later! 
+if CONST_VERSION == ConstantVersions.GEOS: + # 'qlcd' is exchanged in GEOS + NQ = 9 +elif ( CONST_VERSION == ConstantVersions.GFS or CONST_VERSION == ConstantVersions.FV3DYCORE ): + NQ = 8 +else: + raise RuntimeError("Constant selector failed, bad code.") + ##################### # Physical constants ##################### From cde11e817e9b3bc2b756daa86bbb5f0918e923dc Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 2 May 2023 11:33:16 -0400 Subject: [PATCH 29/57] Replace all logger with pace_log Introduce PACE_LOGLEVEL to control log level from outside --- README.md | 6 ++++ driver/pace/driver/run.py | 7 +++-- dsl/pace/dsl/dace/utils.py | 3 +- .../fv3core/initialization/geos_wrapper.py | 8 ++--- util/pace/util/communicator.py | 3 -- util/pace/util/local_comm.py | 7 ++--- util/pace/util/logging.py | 8 +++-- util/pace/util/monitor/netcdf_monitor.py | 9 ++---- util/pace/util/monitor/zarr_monitor.py | 9 ++---- util/pace/util/mpi.py | 30 +++++++++---------- 10 files changed, 45 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 31980394..7753fa73 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,12 @@ mpirun -n 6 --oversubscribe python3 -m pace.driver.run driver/examples/configs/b After the run completes, you will see an output direcotry `output.zarr`. An example to visualize the output is provided in `driver/examples/plot_output.py`. See the [driver example](driver/examples/README.md) section for more details. +### Environment variable configuration + +- `PACE_CONSTANTS`: Pace is bundled with various constants (see _util/pace/util/constants.py_). +- `PACE_FLOAT_PRECISION`: default precision of the field & scalars in the numerics. Default to 64. +- `PACE_LOGLEVEL`: logging level to display (DEBUG, INFO, WARNING, ERROR, CRITICAL). Default to INFO. 
+ ## Quickstart - Docker ### Build diff --git a/driver/pace/driver/run.py b/driver/pace/driver/run.py index 2d9160cd..7979d6ba 100644 --- a/driver/pace/driver/run.py +++ b/driver/pace/driver/run.py @@ -7,6 +7,7 @@ import yaml from pace.util.mpi import MPI +from pace.util import pace_log from .driver import Driver, DriverConfig @@ -76,11 +77,13 @@ def command_line(config_path: str, log_rank: Optional[int], log_level: str): CONFIG_PATH is the path to a DriverConfig yaml file. """ configure_logging(log_rank=log_rank, log_level=log_level) - logger.info("loading DriverConfig from yaml") + pace_log.info("loading DriverConfig from yaml") with open(config_path, "r") as f: config = yaml.safe_load(f) driver_config = DriverConfig.from_dict(config) - logging.info(f"DriverConfig loaded: {yaml.dump(dataclasses.asdict(driver_config))}") + pace_log.info( + f"DriverConfig loaded: {yaml.dump(dataclasses.asdict(driver_config))}" + ) main(driver_config=driver_config) diff --git a/dsl/pace/dsl/dace/utils.py b/dsl/pace/dsl/dace/utils.py index 4ba0247e..5c9f63ec 100644 --- a/dsl/pace/dsl/dace/utils.py +++ b/dsl/pace/dsl/dace/utils.py @@ -27,7 +27,7 @@ def __init__(self, config: DaceConfig, label: str): @classmethod def log(cls, prefix: str, message: str): - pace_log.info(f"{prefix} {message}") + pace_log.debug(f"{prefix} {message}") @classmethod def default_prefix(cls, config: DaceConfig) -> str: @@ -299,7 +299,6 @@ def kernel_theoretical_timing( import sympy if node.label in result: - newresult_in_us = sympy.Max(result[node.label], newresult_in_us).expand() try: newresult_in_us = float(newresult_in_us) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 4fc34052..2835e77e 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -132,9 +132,12 @@ def __init__( self._allocate_output_dir() pace_log.info( - "GEOS-Wrapper with: \n" + "Pace GEOS wrapper 
initialized: \n" f" dt : {self.dycore_state.bdt}\n" f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" + f" backend: {backend}\n" + f" orchestration: {self._is_orchestrated}\n" + f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})" ) def _critical_path(self): @@ -173,7 +176,6 @@ def __call__( cyd: np.ndarray, diss_estd: np.ndarray, ) -> Tuple[Dict[str, np.ndarray], Dict[str, List[float]]]: - with self.perf_collector.timestep_timer.clock("numpy-to-dycore"): self.dycore_state = self._put_fortran_data_in_dycore( u, @@ -246,7 +248,6 @@ def _put_fortran_data_in_dycore( cyd: np.ndarray, diss_estd: np.ndarray, ) -> fv3core.DycoreState: - isc = self._grid_indexing.isc jsc = self._grid_indexing.jsc iec = self._grid_indexing.iec + 1 @@ -315,7 +316,6 @@ def _put_fortran_data_in_dycore( return state def _prep_outputs_for_geos(self) -> Dict[str, np.ndarray]: - output_dict = self.output_dict isc = self._grid_indexing.isc jsc = self._grid_indexing.jsc diff --git a/util/pace/util/communicator.py b/util/pace/util/communicator.py index 0611fa98..938469bd 100644 --- a/util/pace/util/communicator.py +++ b/util/pace/util/communicator.py @@ -1,5 +1,4 @@ import abc -import logging from typing import List, Mapping, Optional, Sequence, Tuple, Union, cast import numpy as np @@ -15,8 +14,6 @@ from .utils import device_synchronize -logger = logging.getLogger("pace.util") - try: import cupy except ImportError: diff --git a/util/pace/util/local_comm.py b/util/pace/util/local_comm.py index a289296a..32fd0fb4 100644 --- a/util/pace/util/local_comm.py +++ b/util/pace/util/local_comm.py @@ -1,14 +1,11 @@ import copy -import logging from typing import Any from .comm import Comm +from .logging import pace_log from .utils import ensure_contiguous, safe_assign_array -logger = logging.getLogger("pace.util") - - class ConcurrencyError(Exception): """Exception to denote that a rank cannot proceed because it is waiting on a call from another rank.""" @@ -104,7 +101,7 @@ def 
bcast(self, value, root=0): "the bcast source" ) value = self._get_buffer("bcast", value) - logger.debug(f"bcast {value} to rank {self.rank}") + pace_log.debug(f"bcast {value} to rank {self.rank}") return value def Barrier(self): diff --git a/util/pace/util/logging.py b/util/pace/util/logging.py index 81b5a7b1..c0e9d0d7 100644 --- a/util/pace/util/logging.py +++ b/util/pace/util/logging.py @@ -1,15 +1,19 @@ import logging +import os import sys from mpi4py import MPI +LOGLEVEL = os.environ.get("PACE_LOGLEVEL", "INFO").upper() + + def _pace_logger(): name_log = logging.getLogger(__name__) - name_log.setLevel(logging.DEBUG) + name_log.setLevel(LOGLEVEL) handler = logging.StreamHandler(sys.stdout) - handler.setLevel(logging.DEBUG) + handler.setLevel(LOGLEVEL) formatter = logging.Formatter( fmt=( f"%(asctime)s|%(levelname)s|rank {MPI.COMM_WORLD.Get_rank()}|" diff --git a/util/pace/util/monitor/netcdf_monitor.py b/util/pace/util/monitor/netcdf_monitor.py index 18687216..76e5ef31 100644 --- a/util/pace/util/monitor/netcdf_monitor.py +++ b/util/pace/util/monitor/netcdf_monitor.py @@ -10,13 +10,11 @@ from .. 
import _xarray as xr from ..filesystem import get_fs +from ..logging import pace_log from ..quantity import Quantity from .convert import to_numpy -logger = logging.getLogger(__name__) - - class _TimeChunkedVariable: def __init__(self, initial: Quantity, time_chunk_size: int): self._data = np.zeros( @@ -46,7 +44,6 @@ def data(self) -> Quantity: class _ChunkedNetCDFWriter: - FILENAME_FORMAT = "state_{chunk:04d}_tile{tile}.nc" def __init__( @@ -62,7 +59,7 @@ def __init__( self._time_units: Optional[str] = None def append(self, state): - logger.debug("appending at time %d", self._i_time) + pace_log.debug("appending at time %d", self._i_time) state = {**state} # copy so we don't mutate the input time = state.pop("time", None) if self._chunked is None: @@ -75,7 +72,7 @@ def append(self, state): self._chunked[name].append(quantity) self._times.append(time) if (self._i_time + 1) % self._time_chunk_size == 0: - logger.debug("flushing on append at time %d", self._i_time) + pace_log.debug("flushing on append at time %d", self._i_time) self.flush() self._i_time += 1 diff --git a/util/pace/util/monitor/zarr_monitor.py b/util/pace/util/monitor/zarr_monitor.py index 12186799..5d7729b9 100644 --- a/util/pace/util/monitor/zarr_monitor.py +++ b/util/pace/util/monitor/zarr_monitor.py @@ -1,4 +1,3 @@ -import logging from datetime import datetime, timedelta from typing import List, Tuple, Union @@ -7,12 +6,11 @@ from .. import _xarray as xr from .. 
import constants, utils from .._optional_imports import cupy, zarr +from ..logging import pace_log from ..partitioner import Partitioner, subtile_slice from .convert import to_numpy -logger = logging.getLogger("pace.util") - __all__ = ["ZarrMonitor"] @@ -238,7 +236,7 @@ def append(self, quantity): ) from_slice = _get_from_slice(target_slice) - logger.debug( + pace_log.debug( f"assigning data from subtile slice {from_slice} to " f"target slice {target_slice}" ) @@ -310,7 +308,7 @@ def append(self, quantity): ) from_slice = _get_from_slice(target_slice) - logger.debug( + pace_log.debug( f"assigning data from subtile slice {from_slice} to " f"target slice {target_slice}" ) @@ -332,7 +330,6 @@ def append(self, quantity): class _ZarrTimeWriter(_ZarrVariableWriter): - _TIME_CHUNK_SIZE = 1024 def __init__(self, *args, **kwargs): diff --git a/util/pace/util/mpi.py b/util/pace/util/mpi.py index d03b7937..5acc2b00 100644 --- a/util/pace/util/mpi.py +++ b/util/pace/util/mpi.py @@ -2,16 +2,14 @@ from mpi4py import MPI except ImportError: MPI = None -import logging from typing import List, Optional, TypeVar, cast from .comm import Comm, Request +from .logging import pace_log T = TypeVar("T") -logger = logging.getLogger(__name__) - class MPIComm(Comm): def __init__(self): @@ -26,54 +24,56 @@ def Get_size(self) -> int: return self._comm.Get_size() def bcast(self, value: Optional[T], root=0) -> T: - logger.debug("bcast from root %s on rank %s", root, self._comm.Get_rank()) + pace_log.debug("bcast from root %s on rank %s", root, self._comm.Get_rank()) return self._comm.bcast(value, root=root) def barrier(self): - logger.debug("barrier on rank %s", self._comm.Get_rank()) + pace_log.debug("barrier on rank %s", self._comm.Get_rank()) self._comm.barrier() def Barrier(self): pass def Scatter(self, sendbuf, recvbuf, root=0, **kwargs): - logger.debug("Scatter on rank %s with root %s", self._comm.Get_rank(), root) + pace_log.debug("Scatter on rank %s with root %s", self._comm.Get_rank(), 
root) self._comm.Scatter(sendbuf, recvbuf, root=root, **kwargs) def Gather(self, sendbuf, recvbuf, root=0, **kwargs): - logger.debug("Gather on rank %s with root %s", self._comm.Get_rank(), root) + pace_log.debug("Gather on rank %s with root %s", self._comm.Get_rank(), root) self._comm.Gather(sendbuf, recvbuf, root=root, **kwargs) def allgather(self, sendobj: T) -> List[T]: - logger.debug("allgather on rank %s", self._comm.Get_rank()) + pace_log.debug("allgather on rank %s", self._comm.Get_rank()) return self._comm.allgather(sendobj) def Send(self, sendbuf, dest, tag: int = 0, **kwargs): - logger.debug("Send on rank %s with dest %s", self._comm.Get_rank(), dest) + pace_log.debug("Send on rank %s with dest %s", self._comm.Get_rank(), dest) self._comm.Send(sendbuf, dest, tag=tag, **kwargs) def sendrecv(self, sendbuf, dest, **kwargs): - logger.debug("sendrecv on rank %s with dest %s", self._comm.Get_rank(), dest) + pace_log.debug("sendrecv on rank %s with dest %s", self._comm.Get_rank(), dest) return self._comm.sendrecv(sendbuf, dest, **kwargs) def Isend(self, sendbuf, dest, tag: int = 0, **kwargs) -> Request: - logger.debug("Isend on rank %s with dest %s", self._comm.Get_rank(), dest) + pace_log.debug("Isend on rank %s with dest %s", self._comm.Get_rank(), dest) return self._comm.Isend(sendbuf, dest, tag=tag, **kwargs) def Recv(self, recvbuf, source, tag: int = 0, **kwargs): - logger.debug("Recv on rank %s with source %s", self._comm.Get_rank(), source) + pace_log.debug("Recv on rank %s with source %s", self._comm.Get_rank(), source) self._comm.Recv(recvbuf, source, tag=tag, **kwargs) def Irecv(self, recvbuf, source, tag: int = 0, **kwargs) -> Request: - logger.debug("Irecv on rank %s with source %s", self._comm.Get_rank(), source) + pace_log.debug("Irecv on rank %s with source %s", self._comm.Get_rank(), source) return self._comm.Irecv(recvbuf, source, tag=tag, **kwargs) def Split(self, color, key) -> "Comm": - logger.debug( + pace_log.debug( "Split on rank %s with 
color %s, key %s", self._comm.Get_rank(), color, key ) return self._comm.Split(color, key) def allreduce(self, sendobj: T, op=None) -> T: - logger.debug("allreduce on rank %s with operator %s", self._comm.Get_rank(), op) + pace_log.debug( + "allreduce on rank %s with operator %s", self._comm.Get_rank(), op + ) return self._comm.allreduce(sendobj, op) From 9e6bbb6ecc0bc8574f9c2b79d1f5c663ddb70af8 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 2 May 2023 11:34:56 -0400 Subject: [PATCH 30/57] Code guidelines clean up --- fv3core/pace/fv3core/stencils/fv_dynamics.py | 8 ++++---- .../savepoint/translate/translate_init_case.py | 8 ++++---- util/pace/util/constants.py | 14 +++++++++----- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/fv3core/pace/fv3core/stencils/fv_dynamics.py b/fv3core/pace/fv3core/stencils/fv_dynamics.py index 8f5f8a7f..80b78e12 100644 --- a/fv3core/pace/fv3core/stencils/fv_dynamics.py +++ b/fv3core/pace/fv3core/stencils/fv_dynamics.py @@ -7,7 +7,6 @@ import pace.dsl.gt4py_utils as utils import pace.fv3core.stencils.moist_cv as moist_cv import pace.util -import pace.util.constants as constants from pace.dsl.dace.orchestration import dace_inhibitor, orchestrate from pace.dsl.dace.wrapped_halo_exchange import WrappedHaloUpdater from pace.dsl.stencil import StencilFactory @@ -202,7 +201,7 @@ def __init__( ) self.tracers = {} - for name in utils.tracer_variables[0:constants.NQ]: + for name in utils.tracer_variables[0 : constants.NQ]: self.tracers[name] = state.__dict__[name] temporaries = fvdyn_temporaries(quantity_factory) @@ -541,10 +540,11 @@ def _compute(self, state: DycoreState, timer: pace.util.Timer): log_on_rank_0("Remapping") with timer.clock("Remapping"): self._checkpoint_remapping_in(state) - + # TODO: When NQ=9, we shouldn't need to pass qcld explicitly # since it's in self.tracers. 
It should not be an issue since - # we don't have self.tracers & qcld computation at the same time + # we don't have self.tracers & qcld computation at + # the same time. # When NQ=8, we do need qcld passed explicitely self._lagrangian_to_eulerian_obj( self.tracers, diff --git a/fv3core/tests/savepoint/translate/translate_init_case.py b/fv3core/tests/savepoint/translate/translate_init_case.py index 5fe14d73..4a9fafc4 100644 --- a/fv3core/tests/savepoint/translate/translate_init_case.py +++ b/fv3core/tests/savepoint/translate/translate_init_case.py @@ -7,7 +7,6 @@ import pace.dsl.gt4py_utils as utils import pace.fv3core.initialization.baroclinic as baroclinic_init import pace.fv3core.initialization.baroclinic_jablonowski_williamson as jablo_init -import pace.fv3core.stencils.fv_dynamics as fv_dynamics import pace.util import pace.util as fv3util from pace.fv3core.testing import TranslateDycoreFortranData2Py @@ -17,7 +16,6 @@ class TranslateInitCase(ParallelTranslateBaseSlicing): - outputs: Dict[str, Any] = { "u": { "name": "x_wind", @@ -175,7 +173,6 @@ def outputs_from_state(self, state: dict): state[name][tracer] = state[name][tracer].data arrays[name] = state[name] elif len(self.outputs[name]["dims"]) > 0: - arrays[name] = state[name].data else: outputs[name] = state[name] # scalar @@ -184,7 +181,10 @@ def outputs_from_state(self, state: dict): def compute_parallel(self, inputs, communicator): state = {} - full_shape = (*self.grid.domain_shape_full(add=(1, 1, 1)), pace.util.constants.NQ) + full_shape = ( + *self.grid.domain_shape_full(add=(1, 1, 1)), + pace.util.constants.NQ, + ) for variable, properties in self.outputs.items(): dims = properties["dims"] state[variable] = fv3util.Quantity( diff --git a/util/pace/util/constants.py b/util/pace/util/constants.py index 7a481b95..ef57ed18 100644 --- a/util/pace/util/constants.py +++ b/util/pace/util/constants.py @@ -1,15 +1,17 @@ import os from enum import Enum -from warnings import warn + from pace.util.logging import 
pace_log + # The FV3GFS model ships with two sets of constants, one used in the GFS physics # package and the other used for the Dycore. Their difference are small but significant # In addition the GSFC's GEOS model as its own variables class ConstantVersions(Enum): FV3DYCORE = "FV3DYCORE" # NOAA's FV3 dynamical core constants (original port) - GFS = "GFS" # Constant as defined in NOAA GFS - GEOS = "GEOS" # Constant as defined in GEOS v13 + GFS = "GFS" # Constant as defined in NOAA GFS + GEOS = "GEOS" # Constant as defined in GEOS v13 + CONST_VERSION_AS_STR = os.environ.get("PACE_CONSTANTS", "FV3DYCORE") @@ -63,8 +65,10 @@ class ConstantVersions(Enum): # here we hard-coded it because 8 is the only supported value, refactor this later! if CONST_VERSION == ConstantVersions.GEOS: # 'qlcd' is exchanged in GEOS - NQ = 9 -elif ( CONST_VERSION == ConstantVersions.GFS or CONST_VERSION == ConstantVersions.FV3DYCORE ): + NQ = 9 +elif ( + CONST_VERSION == ConstantVersions.GFS or CONST_VERSION == ConstantVersions.FV3DYCORE +): NQ = 8 else: raise RuntimeError("Constant selector failed, bad code.") From 7e449cd8bbee4dfe7e614130beb56da598e07201 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 21 Jun 2023 12:28:17 -0400 Subject: [PATCH 31/57] Devops/GitHub actions on (#15) * Linting on PR * Run main unit test * Update python to available 3.8.12 * Remove cd to pace * Lint: git submodule recursive * Typo * Add openmpi to the image * Linting * Fix unit tests (remove dxa, dya rely on halo ex) * typo * Change name of jobs --- .github/workflows/lint.yml | 27 ++++++++++ .github/workflows/main_unit_tests.yml | 27 ++++++++++ driver/pace/driver/run.py | 2 +- tests/main/fv3core/test_init_from_geos.py | 60 +++++++++++++---------- tests/main/test_grid_init.py | 2 - util/pace/util/monitor/netcdf_monitor.py | 1 - 6 files changed, 88 insertions(+), 31 deletions(-) create mode 100644 .github/workflows/lint.yml create mode 100644 .github/workflows/main_unit_tests.yml diff --git 
a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..0cc08080 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,27 @@ +name: "Lint" +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - name: Checkout Pace repository + uses: actions/checkout@v3.5.2 + with: + submodules: 'recursive' + - name: Step Python 3.8.12 + uses: actions/setup-python@v4.6.0 + with: + python-version: '3.8.12' + - name: Install OpenMPI for gt4py + run: | + sudo apt-get install libopenmpi-dev + - name: Install Python packages + run: | + python -m pip install --upgrade pip + pip install -r requirements_dev.txt -r requirements_lint.txt + - name: Run lint via pre-commit + run: | + pre-commit run --all-files diff --git a/.github/workflows/main_unit_tests.yml b/.github/workflows/main_unit_tests.yml new file mode 100644 index 00000000..5dbf4a1f --- /dev/null +++ b/.github/workflows/main_unit_tests.yml @@ -0,0 +1,27 @@ +name: "Main unit tests" +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] + +jobs: + main_unit_tests: + runs-on: ubuntu-latest + steps: + - name: Checkout Pace repository + uses: actions/checkout@v3.5.2 + with: + submodules: 'recursive' + - name: Step Python 3.8.12 + uses: actions/setup-python@v4.6.0 + with: + python-version: '3.8.12' + - name: Install OpenMPI for gt4py + run: | + sudo apt-get install libopenmpi-dev + - name: Install Python packages + run: | + python -m pip install --upgrade pip + pip install -r requirements_dev.txt + - name: Run all main tests + run: | + pytest -x tests/main diff --git a/driver/pace/driver/run.py b/driver/pace/driver/run.py index 7979d6ba..c8532ebd 100644 --- a/driver/pace/driver/run.py +++ b/driver/pace/driver/run.py @@ -6,8 +6,8 @@ import click import yaml -from pace.util.mpi import MPI from pace.util import pace_log +from pace.util.mpi import MPI from 
.driver import Driver, DriverConfig diff --git a/tests/main/fv3core/test_init_from_geos.py b/tests/main/fv3core/test_init_from_geos.py index 252fe7d2..16efe9e9 100644 --- a/tests/main/fv3core/test_init_from_geos.py +++ b/tests/main/fv3core/test_init_from_geos.py @@ -7,7 +7,6 @@ def test_geos_wrapper(): - namelist_dict = { "stencil_config": { "compilation_config": { @@ -82,7 +81,12 @@ def test_geos_wrapper(): comm = NullComm(rank=0, total_ranks=6, fill_value=0.0) backend = "numpy" - wrapper = fv3core.GeosDycoreWrapper(namelist, comm, backend) + wrapper = fv3core.GeosDycoreWrapper( + namelist=namelist, + comm=comm, + backend=backend, + bdt=namelist_dict["dt_atmos"], + ) nhalo = 3 shape_centered = ( namelist["nx_tile"] + 2 * nhalo, @@ -191,31 +195,33 @@ def test_geos_wrapper(): ) diss_estd = np.ones(shape_centered) - output_dict = wrapper( - u, - v, - w, - delz, - pt, - delp, - q, - ps, - pe, - pk, - peln, - pkz, - phis, - q_con, - omga, - ua, - va, - uc, - vc, - mfxd, - mfyd, - cxd, - cyd, - diss_estd, + timings = {} + output_dict, timings = wrapper( + timings=timings, + u=u, + v=v, + w=w, + delz=delz, + pt=pt, + delp=delp, + q=q, + ps=ps, + pe=pe, + pk=pk, + peln=peln, + pkz=pkz, + phis=phis, + q_con=q_con, + omga=omga, + ua=ua, + va=va, + uc=uc, + vc=vc, + mfxd=mfxd, + mfyd=mfyd, + cxd=cxd, + cyd=cyd, + diss_estd=diss_estd, ) assert isinstance(output_dict["u"], np.ndarray) diff --git a/tests/main/test_grid_init.py b/tests/main/test_grid_init.py index 942dcfd3..4c0e5e2b 100644 --- a/tests/main/test_grid_init.py +++ b/tests/main/test_grid_init.py @@ -51,8 +51,6 @@ def test_grid_init_not_decomposition_dependent(rank: int): assert allclose(metric_terms_1by1.area, metric_terms_3by3.area, partitioner, rank) assert allclose(metric_terms_1by1.dx, metric_terms_3by3.dx, partitioner, rank) assert allclose(metric_terms_1by1.dy, metric_terms_3by3.dy, partitioner, rank) - assert allclose(metric_terms_1by1.dxa, metric_terms_3by3.dxa, partitioner, rank) - assert 
allclose(metric_terms_1by1.dya, metric_terms_3by3.dya, partitioner, rank) assert allclose( metric_terms_1by1.cos_sg1, metric_terms_3by3.cos_sg1, partitioner, rank ) diff --git a/util/pace/util/monitor/netcdf_monitor.py b/util/pace/util/monitor/netcdf_monitor.py index 76e5ef31..0b39da60 100644 --- a/util/pace/util/monitor/netcdf_monitor.py +++ b/util/pace/util/monitor/netcdf_monitor.py @@ -1,4 +1,3 @@ -import logging import os from pathlib import Path from typing import Any, Dict, List, Optional, Set From e40d35615636e2e0557403a818f47324f9631cc3 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 21 Jun 2023 14:24:03 -0400 Subject: [PATCH 32/57] Distributed compilation on orchestrated backend for NxN layouts (#14) * Adapt orchestration distribute compile for NxN layout * Remove debug code * Add a more descriptive string base postfix for cache naming Identify the code path for all cases Consistent reload post-compile Create a central space for all caches generation logic No more original layout check required * Add a test on caches relocatability * Verbose todo * Linting on PR * Run main unit test * Update python to available 3.8.12 * Remove cd to pace * Lint: git submodule recursive * Typo * Add openmpi to the image * Linting * Fix unit tests (remove dxa, dya rely on halo ex) * typo * Change name of jobs * Missing enum * Lint imports * Fix unit tests * Deactivate relocability test due to Python crash Logged as issyue 16 * Typo * Raise for 1,X and X,1 layouts which requires a new descriptor --- dsl/pace/dsl/caches/cache_location.py | 46 +++++++ dsl/pace/dsl/caches/codepath.py | 33 +++++ dsl/pace/dsl/dace/build.py | 136 ++++--------------- dsl/pace/dsl/dace/dace_config.py | 111 ++++++++++++++-- dsl/pace/dsl/dace/orchestration.py | 43 ++----- tests/main/dsl/test_caches.py | 179 ++++++++++++++++++++++++++ tests/main/dsl/test_dace_config.py | 68 +++++++++- 7 files changed, 466 insertions(+), 150 deletions(-) create mode 100644 
dsl/pace/dsl/caches/cache_location.py create mode 100644 dsl/pace/dsl/caches/codepath.py create mode 100644 tests/main/dsl/test_caches.py diff --git a/dsl/pace/dsl/caches/cache_location.py b/dsl/pace/dsl/caches/cache_location.py new file mode 100644 index 00000000..ab57a60b --- /dev/null +++ b/dsl/pace/dsl/caches/cache_location.py @@ -0,0 +1,46 @@ +from pace.dsl.caches.codepath import FV3CodePath +from pace.util import CubedSpherePartitioner + + +def identify_code_path( + rank: int, + partitioner: CubedSpherePartitioner, +) -> FV3CodePath: + if partitioner.layout == (1, 1) or partitioner.layout == [1, 1]: + return FV3CodePath.All + elif partitioner.layout[0] == 1 or partitioner.layout[1] == 1: + raise NotImplementedError( + f"Build for layout {partitioner.layout} is not handled" + ) + else: + if partitioner.tile.on_tile_bottom(rank): + if partitioner.tile.on_tile_left(rank): + return FV3CodePath.BottomLeft + if partitioner.tile.on_tile_right(rank): + return FV3CodePath.BottomRight + else: + return FV3CodePath.Bottom + if partitioner.tile.on_tile_top(rank): + if partitioner.tile.on_tile_left(rank): + return FV3CodePath.TopLeft + if partitioner.tile.on_tile_right(rank): + return FV3CodePath.TopRight + else: + return FV3CodePath.Top + else: + if partitioner.tile.on_tile_left(rank): + return FV3CodePath.Left + if partitioner.tile.on_tile_right(rank): + return FV3CodePath.Right + else: + return FV3CodePath.Center + + +def get_cache_fullpath(code_path: FV3CodePath) -> str: + from gt4py.cartesian import config as gt_config + + return f"{gt_config.cache_settings['root_path']}/.gt_cache_{code_path}" + + +def get_cache_directory(code_path: FV3CodePath) -> str: + return f".gt_cache_{code_path}" diff --git a/dsl/pace/dsl/caches/codepath.py b/dsl/pace/dsl/caches/codepath.py new file mode 100644 index 00000000..cb8327b5 --- /dev/null +++ b/dsl/pace/dsl/caches/codepath.py @@ -0,0 +1,33 @@ +import enum + + +class FV3CodePath(enum.Enum): + """Enum listing all possible code path on 
a cube sphere. + For any layout the cube sphere has up to 9 different code path, 10 + when counting the 1,1 layout which aggregates all 9. Those are related to + the positioning of the rank on the tile and which of the edge/corner case + it has to handle. + Since the framework inline code to optimize, we _cannot_ pre-suppose of the code + being kept and/or ejected. This enum serves as the ground truth to map rank to + the proper generated code. + """ + + All = "FV3_A" + BottomLeft = "FV3_BL" + Left = "FV3_L" + TopLeft = "FV3_TL" + Top = "FV3_T" + TopRight = "FV3_TR" + Right = "FV3_R" + BottomRight = "FV3_BR" + Bottom = "FV3_B" + Center = "FV3_C" + + def __str__(self): + return self.value + + def __repr__(self): + return self.value + + def __format__(self, format_spec: str) -> str: + return self.value diff --git a/dsl/pace/dsl/dace/build.py b/dsl/pace/dsl/dace/build.py index 7d8f3db2..b134f569 100644 --- a/dsl/pace/dsl/dace/build.py +++ b/dsl/pace/dsl/dace/build.py @@ -1,9 +1,9 @@ from typing import List, Optional, Tuple -from warnings import warn from dace.sdfg import SDFG import pace.util +from pace.dsl.caches.cache_location import get_cache_directory, get_cache_fullpath from pace.dsl.dace.dace_config import DaceConfig, DaCeOrchestration @@ -11,19 +11,6 @@ # Distributed compilation -def determine_compiling_ranks(config: DaceConfig) -> bool: - is_compiling = False - rank = config.my_rank - size = config.rank_size - - if int(size / 6) == 0: - is_compiling = True - elif rank % int(size / 6) == rank: - is_compiling = True - - return is_compiling - - def unblock_waiting_tiles(comm, sdfg_path: str) -> None: if comm and comm.Get_size() > 1: for tile in range(1, 6): @@ -31,48 +18,6 @@ def unblock_waiting_tiles(comm, sdfg_path: str) -> None: comm.send(sdfg_path, dest=tile * tilesize + comm.Get_rank()) -def get_target_rank(rank: int, partitioner: pace.util.CubedSpherePartitioner): - """From my rank & the current partitioner we determine which - rank we should read from. 
- For all layout >= 3,3 this presumes build has been done on a - 3,3 layout.""" - if partitioner.layout == (1, 1): - return 0 - if partitioner.layout == (2, 2): - if partitioner.tile.on_tile_bottom(rank): - if partitioner.tile.on_tile_left(rank): - return 0 # "00" - if partitioner.tile.on_tile_right(rank): - return 1 # "10" - if partitioner.tile.on_tile_top(rank): - if partitioner.tile.on_tile_left(rank): - return 2 # "01" - if partitioner.tile.on_tile_right(rank): - return 3 # "11" - else: - if partitioner.tile.on_tile_bottom(rank): - if partitioner.tile.on_tile_left(rank): - return 0 # "00" - if partitioner.tile.on_tile_right(rank): - return 2 # "20" - else: - return 1 # "10" - if partitioner.tile.on_tile_top(rank): - if partitioner.tile.on_tile_left(rank): - return 6 # "02" - if partitioner.tile.on_tile_right(rank): - return 8 # "22" - else: - return 7 # "12" - else: - if partitioner.tile.on_tile_left(rank): - return 3 # "01" - if partitioner.tile.on_tile_right(rank): - return 5 # "21" - else: - return 4 # "11" - - def build_info_filepath() -> str: return "build_info.txt" @@ -101,7 +46,10 @@ def write_build_info( def get_sdfg_path( - daceprog_name: str, config: DaceConfig, sdfg_file_path: Optional[str] = None + daceprog_name: str, + config: DaceConfig, + sdfg_file_path: Optional[str] = None, + override_run_only=False, ) -> Optional[str]: """Build an SDFG path from the qualified program name or it's direct path to .sdfg @@ -113,7 +61,7 @@ def get_sdfg_path( # TODO: check DaceConfig for cache.strategy == name # Guarding against bad usage of this function - if config.get_orchestrate() != DaCeOrchestration.Run: + if not override_run_only and config.get_orchestrate() != DaCeOrchestration.Run: return None # Case of a .sdfg file given by the user to be compiled @@ -125,19 +73,8 @@ def get_sdfg_path( return sdfg_file_path # Case of loading a precompiled .so - lookup using GT_CACHE - from gt4py.cartesian import config as gt_config - - if config.rank_size > 1: - rank = 
config.my_rank - rank_str = f"_{config.target_rank:06d}" - else: - rank = 0 - rank_str = f"_{rank:06d}" - - sdfg_dir_path = ( - f"{gt_config.cache_settings['root_path']}" - f"/.gt_cache{rank_str}/dacecache/{daceprog_name}" - ) + cache_fullpath = get_cache_fullpath(config.code_path) + sdfg_dir_path = f"{cache_fullpath}/dacecache/{daceprog_name}" if not os.path.isdir(sdfg_dir_path): raise RuntimeError(f"Precompiled SDFG is missing at {sdfg_dir_path}") @@ -153,23 +90,8 @@ def get_sdfg_path( raise RuntimeError( f"SDFG build for {build_backend}, {config._backend} has been asked" ) - # Check layout - build_layout = ast.literal_eval(build_info_file.readline()) - can_read = True - if config.layout == (1, 1) and config.layout != build_layout: - can_read = False - elif config.layout == (2, 2) and config.layout != build_layout: - can_read = False - elif ( - build_layout != (1, 1) and build_layout != (2, 2) and build_layout != (3, 3) - ): - can_read = False - if not can_read: - warn( - f"SDFG build for layout {build_layout}, " - f"cannot be run with current layout {config.layout}, bad layout?" - ) # Check resolution per tile + build_layout = ast.literal_eval(build_info_file.readline()) build_resolution = ast.literal_eval(build_info_file.readline()) if (config.tile_resolution[0] / config.layout[0]) != ( build_resolution[0] / build_layout[0] @@ -179,7 +101,7 @@ def get_sdfg_path( f"cannot be run with current resolution {config.tile_resolution}" ) - print(f"[DaCe Config] Rank {rank} loading SDFG {sdfg_dir_path}") + print(f"[DaCe Config] Rank {config.my_rank} loading SDFG {sdfg_dir_path}") return sdfg_dir_path @@ -189,33 +111,31 @@ def set_distributed_caches(config: "DaceConfig"): # Execute specific initialization per orchestration state orchestration_mode = config.get_orchestrate() + if orchestration_mode == DaCeOrchestration.Python: + return # Check that we have all the file we need to early out in case # of issues. 
if orchestration_mode == DaCeOrchestration.Run: import os - from gt4py.cartesian import config as gt_config - - # Check our cache exist - if config.rank_size > 1: - rank = config.my_rank - target_rank_str = f"_{config.target_rank:06d}" - else: - rank = 0 - target_rank_str = f"_{rank:06d}" - cache_filepath = ( - f"{gt_config.cache_settings['root_path']}/.gt_cache{target_rank_str}" - ) - if not os.path.exists(cache_filepath): + cache_directory = get_cache_fullpath(config.code_path) + if not os.path.exists(cache_directory): raise RuntimeError( f"{orchestration_mode} error: Could not find caches for rank " - f"{rank} at {cache_filepath}" + f"{config.my_rank} at {cache_directory}" ) - # All, good set this rank cache to the source cache - gt_config.cache_settings["dir_name"] = f".gt_cache{target_rank_str}" - print( - f"[{orchestration_mode}] Rank {rank} " - f"reading cache {gt_config.cache_settings['dir_name']}" - ) + # Set read/write caches to the target rank + from gt4py.cartesian import config as gt_config + + if config.do_compile: + verb = "reading/writing" + else: + verb = "reading" + + gt_config.cache_settings["dir_name"] = get_cache_directory(config.code_path) + pace.util.pace_log.critical( + f"[{orchestration_mode}] Rank {config.my_rank} " + f"{verb} cache {gt_config.cache_settings['dir_name']}" + ) diff --git a/dsl/pace/dsl/dace/dace_config.py b/dsl/pace/dsl/dace/dace_config.py index 961bf3ba..5e78c6bc 100644 --- a/dsl/pace/dsl/dace/dace_config.py +++ b/dsl/pace/dsl/dace/dace_config.py @@ -1,13 +1,15 @@ import enum -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple import dace.config from dace.codegen.compiled_sdfg import CompiledSDFG from dace.frontend.python.parser import DaceProgram +from pace.dsl.caches.cache_location import identify_code_path +from pace.dsl.caches.codepath import FV3CodePath from pace.dsl.gt4py_utils import is_gpu_backend from pace.util._optional_imports import cupy as cp -from pace.util.communicator 
import CubedSphereCommunicator +from pace.util.communicator import CubedSphereCommunicator, CubedSpherePartitioner # This can be turned on to revert compilation for orchestration @@ -16,6 +18,95 @@ DEACTIVATE_DISTRIBUTED_DACE_COMPILE = False +def _is_corner(rank: int, partitioner: CubedSpherePartitioner) -> bool: + if partitioner.tile.on_tile_bottom(rank): + if partitioner.tile.on_tile_left(rank): + return True + if partitioner.tile.on_tile_right(rank): + return True + if partitioner.tile.on_tile_top(rank): + if partitioner.tile.on_tile_left(rank): + return True + if partitioner.tile.on_tile_right(rank): + return True + return False + + +def _smallest_rank_bottom(x: int, y: int, layout: Tuple[int, int]): + return y == 0 and x == 1 + + +def _smallest_rank_top(x: int, y: int, layout: Tuple[int, int]): + return y == layout[1] - 1 and x == 1 + + +def _smallest_rank_left(x: int, y: int, layout: Tuple[int, int]): + return x == 0 and y == 1 + + +def _smallest_rank_right(x: int, y: int, layout: Tuple[int, int]): + return x == layout[0] - 1 and y == 1 + + +def _smallest_rank_middle(x: int, y: int, layout: Tuple[int, int]): + return layout[0] > 1 and layout[1] > 1 and x == 1 and y == 1 + + +def _determine_compiling_ranks( + config: "DaceConfig", + partitioner: CubedSpherePartitioner, +) -> bool: + """ + We try to map every layout to a 3x3 layout which MPI ranks + looks like + 6 7 8 + 3 4 5 + 0 1 2 + Using the partitionner we find mapping of the given layout + to all of those. 
For example on 4x4 layout + 12 13 14 15 + 8 9 10 11 + 4 5 6 7 + 0 1 2 3 + therefore we map + 0 -> 0 + 1 -> 1 + 2 -> NOT COMPILING + 3 -> 2 + 4 -> 3 + 5 -> 4 + 6 -> NOT COMPILING + 7 -> 5 + 8 -> NOT COMPILING + 9 -> NOT COMPILING + 10 -> NOT COMPILING + 11 -> NOT COMPILING + 12 -> 6 + 13 -> 7 + 14 -> NOT COMPILING + 15 -> 8 + """ + + # Tile 0 compiles + if partitioner.tile_index(config.my_rank) != 0: + return False + + # Corners compile + if _is_corner(config.my_rank, partitioner): + return True + + y, x = partitioner.tile.subtile_index(config.my_rank) + + # If edge or center tile, we give way to the smallest rank + return ( + _smallest_rank_left(x, y, config.layout) + or _smallest_rank_bottom(x, y, config.layout) + or _smallest_rank_middle(x, y, config.layout) + or _smallest_rank_right(x, y, config.layout) + or _smallest_rank_top(x, y, config.layout) + ) + + class DaCeOrchestration(enum.Enum): """ Orchestration mode for DaCe @@ -179,24 +270,24 @@ def __init__( self._backend = backend self.tile_resolution = [tile_nx, tile_nx, tile_nz] - from pace.dsl.dace.build import get_target_rank, set_distributed_caches + from pace.dsl.dace.build import set_distributed_caches # Distributed build required info if communicator: self.my_rank = communicator.rank self.rank_size = communicator.comm.Get_size() - if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - self.target_rank = communicator.rank - else: - self.target_rank = get_target_rank( - self.my_rank, communicator.partitioner - ) + self.code_path = identify_code_path(self.my_rank, communicator.partitioner) self.layout = communicator.partitioner.layout + self.do_compile = ( + DEACTIVATE_DISTRIBUTED_DACE_COMPILE + or _determine_compiling_ranks(self, communicator.partitioner) + ) else: self.my_rank = 0 self.rank_size = 1 - self.target_rank = 0 + self.code_path = FV3CodePath.All self.layout = (1, 1) + self.do_compile = True set_distributed_caches(self) diff --git a/dsl/pace/dsl/dace/orchestration.py b/dsl/pace/dsl/dace/orchestration.py 
index 2bd9df5b..1feca341 100644 --- a/dsl/pace/dsl/dace/orchestration.py +++ b/dsl/pace/dsl/dace/orchestration.py @@ -11,12 +11,7 @@ from dace.transformation.auto.auto_optimize import make_transients_persistent from dace.transformation.helpers import get_parent_map -from pace.dsl.dace.build import ( - determine_compiling_ranks, - get_sdfg_path, - unblock_waiting_tiles, - write_build_info, -) +from pace.dsl.dace.build import get_sdfg_path, write_build_info from pace.dsl.dace.dace_config import ( DEACTIVATE_DISTRIBUTED_DACE_COMPILE, DaceConfig, @@ -34,6 +29,7 @@ memory_static_analysis, report_memory_static_analysis, ) +from pace.util import pace_log from pace.util.mpi import MPI @@ -122,7 +118,7 @@ def _build_sdfg( if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: is_compiling = True else: - is_compiling = determine_compiling_ranks(config) + is_compiling = config.do_compile if is_compiling: # Make the transients array persistents if config.is_gpu_backend(): @@ -212,31 +208,16 @@ def _build_sdfg( ) exit(0) elif config.get_orchestrate() == DaCeOrchestration.BuildAndRun: - MPI.COMM_WORLD.Barrier() - if is_compiling: - if not DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - unblock_waiting_tiles(MPI.COMM_WORLD, sdfg.build_folder) - DaCeProgress.log( - DaCeProgress.default_prefix(config), "Build folder exchanged." - ) - csdfg, _ = daceprog.load_precompiled_sdfg( - sdfg.build_folder, *args, **kwargs - ) - config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG( - daceprog, csdfg, args, kwargs - ) - - else: - source_rank = config.target_rank - # wait for compilation to be done + if not is_compiling: DaCeProgress.log( DaCeProgress.default_prefix(config), - "Rank is not compiling. Waiting for build dir...", - ) - sdfg_path = MPI.COMM_WORLD.recv(source=source_rank) - DaCeProgress.log( - DaCeProgress.default_prefix(config), "Build dir received, loading .so." + "Rank is not compiling. 
" + "Waiting for compilation to end on all other ranks...", ) + MPI.COMM_WORLD.Barrier() + + with DaCeProgress(config, "Loading"): + sdfg_path = get_sdfg_path(daceprog.name, config, override_run_only=True) csdfg, _ = daceprog.load_precompiled_sdfg(sdfg_path, *args, **kwargs) config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG( daceprog, csdfg, args, kwargs @@ -267,6 +248,7 @@ def _call_sdfg( config.get_orchestrate() == DaCeOrchestration.Build or config.get_orchestrate() == DaCeOrchestration.BuildAndRun ): + pace_log.info("Building DaCe orchestration") res = _build_sdfg(daceprog, sdfg, config, args, kwargs) elif config.get_orchestrate() == DaCeOrchestration.Run: # We should never hit this, it should be caught by the @@ -302,7 +284,7 @@ def _parse_sdfg( if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: is_compiling = True else: - is_compiling = determine_compiling_ranks(config) + is_compiling = config.do_compile if not is_compiling: # We can not parse the SDFG since we will load the proper # compiled SDFG from the compiling rank @@ -448,7 +430,6 @@ def __get__(self, obj, objtype=None) -> SDFGEnabledCallable: """Return SDFGEnabledCallable wrapping original obj.method from cache. 
Update cache first if need be""" if (id(obj), id(self.func)) not in _LazyComputepathMethod.bound_callables: - _LazyComputepathMethod.bound_callables[ (id(obj), id(self.func)) ] = _LazyComputepathMethod.SDFGEnabledCallable(self, obj) diff --git a/tests/main/dsl/test_caches.py b/tests/main/dsl/test_caches.py new file mode 100644 index 00000000..d5318493 --- /dev/null +++ b/tests/main/dsl/test_caches.py @@ -0,0 +1,179 @@ +import pytest +from gt4py.cartesian.gtscript import PARALLEL, Field, computation, interval +from gt4py.storage import empty, ones + +import pace.dsl +from pace.dsl.dace import orchestrate +from pace.dsl.dace.dace_config import DaceConfig, DaCeOrchestration +from pace.dsl.stencil import CompilationConfig, GridIndexing + + +def _make_storage( + func, + grid_indexing, + stencil_config: pace.dsl.StencilConfig, + *, + dtype=float, + aligned_index=(0, 0, 0), +): + return func( + backend=stencil_config.compilation_config.backend, + shape=grid_indexing.domain, + dtype=dtype, + aligned_index=aligned_index, + ) + + +def _stencil(inp: Field[float], out: Field[float], scalar: float): + with computation(PARALLEL), interval(...): + out = inp + + +def _build_stencil(backend, orchestrated: DaCeOrchestration): + # Make stencil and verify it ran + grid_indexing = GridIndexing( + domain=(5, 5, 5), + n_halo=2, + south_edge=True, + north_edge=True, + west_edge=True, + east_edge=True, + ) + + stencil_config = pace.dsl.StencilConfig( + compilation_config=CompilationConfig(backend=backend, rebuild=True), + dace_config=DaceConfig(None, backend, 5, 5, orchestrated), + ) + + stencil_factory = pace.dsl.StencilFactory(stencil_config, grid_indexing) + + built_stencil = stencil_factory.from_origin_domain( + _stencil, (0, 0, 0), domain=grid_indexing.domain + ) + + return built_stencil, grid_indexing, stencil_config + + +class OrchestratedProgam: + def __init__(self, backend, orchestration): + self.stencil, grid_indexing, stencil_config = _build_stencil( + backend, orchestration + ) 
+ orchestrate(obj=self, config=stencil_config.dace_config) + self.inp = _make_storage(ones, grid_indexing, stencil_config, dtype=float) + self.out = _make_storage(empty, grid_indexing, stencil_config, dtype=float) + + def __call__(self): + self.stencil(self.inp, self.out, self.inp[0, 0, 0]) + + +@pytest.mark.parametrize( + "backend", + [ + pytest.param("dace:cpu"), + ], +) +def test_relocatability_orchestration(backend): + import os + import shutil + + from gt4py.cartesian import config as gt_config + + original_root_directory = gt_config.cache_settings["root_path"] + working_dir = str(os.getcwd()) + + # Compile on default + p0 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun) + p0() + assert os.path.exists( + f"{working_dir}/.gt_cache_FV3_A/dacecache/" + "test_caches_OrchestratedProgam___call__", + ) or os.path.exists( + f"{working_dir}/.gt_cache_FV3_A/dacecache/OrchestratedProgam___call__", + ) + + # Compile in another directory + + custom_path = f"{working_dir}/.my_cache_path" + gt_config.cache_settings["root_path"] = custom_path + p1 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun) + p1() + assert os.path.exists( + f"{custom_path}/.gt_cache_FV3_A/dacecache/" + "test_caches_OrchestratedProgam___call__", + ) or os.path.exists( + f"{working_dir}/.gt_cache_FV3_A/dacecache/OrchestratedProgam___call__", + ) + + # Check relocability by copying the second cache directory, + # changing the path of gt_config.cache_settings and trying to Run on it + relocated_path = f"{working_dir}/.my_relocated_cache_path" + shutil.copytree(custom_path, relocated_path, dirs_exist_ok=True) + gt_config.cache_settings["root_path"] = relocated_path + p2 = OrchestratedProgam(backend, DaCeOrchestration.Run) + p2() + + # Generate a file exists error to check for bad path + bogus_path = "./nope/notatall/nothappening" + gt_config.cache_settings["root_path"] = bogus_path + with pytest.raises(RuntimeError): + OrchestratedProgam(backend, DaCeOrchestration.Run) + + # Restore 
cache settings + gt_config.cache_settings["root_path"] = original_root_directory + + +@pytest.mark.parametrize( + "backend", + [ + pytest.param("gt:cpu_ifirst"), + pytest.param("dace:cpu"), + ], +) +def test_relocatability(backend): + # TODO: test work - but crashes when chained with other + # see https://github.com/GEOS-ESM/pace/issues/16 + pass + # import os + # import shutil + + # working_dir = os.getcwd() + + # # Compile on default + # p0 = OrchestratedProgam(backend, DaCeOrchestration.Python) + # p0() + # assert os.path.exists( + # f"{working_dir}/.gt_cache_000000/py38_1013/gtcpu_ifirst/__main__/_stencil/" + # ) + + # # Compile in another directory + # from gt4py.cartesian import config as gt_config + + # custom_path = f"{working_dir}/.my_cache_path" + # gt_config.cache_settings["root_path"] = custom_path + # p1 = OrchestratedProgam(backend, DaCeOrchestration.Python) + # p1() + # assert os.path.exists( + # f"{custom_path}/.gt_cache_000000/py38_1013/gtcpu_ifirst/__main__/_stencil/" + # ) + + # # Check relocability by copying the second cache directory, + # # changing the path of gt_config.cache_settings and trying to Run on it + # relocated_path = f"{working_dir}/.my_relocated_cache_path" + # shutil.copytree(custom_path, relocated_path, dirs_exist_ok=True) + # gt_config.cache_settings["root_path"] = relocated_path + # p2 = OrchestratedProgam(backend, DaCeOrchestration.Python) + # p2() + # assert os.path.exists( + # f"{relocated_path}/.gt_cache_000000/py38_1013/gtcpu_ifirst/__main__/_stencil/" + # ) + + +if __name__ == "__main__": + # TODO: test can be merged once gt4py also generates in the _FV3_X format + print("\n|> test_relocatability_orchestration('dace:cpu')\n") + test_relocatability_orchestration("dace:cpu") + print("\n|> test_relocatability('gt:cpu_ifirst')\n") + test_relocatability("gt:cpu_ifirst") + print("\n|> test_relocatability('dace:cpu')\n") + test_relocatability("dace:cpu") diff --git a/tests/main/dsl/test_dace_config.py 
b/tests/main/dsl/test_dace_config.py index 78553278..cb3566dd 100644 --- a/tests/main/dsl/test_dace_config.py +++ b/tests/main/dsl/test_dace_config.py @@ -1,11 +1,12 @@ import unittest.mock -from pace.dsl.dace.dace_config import DaceConfig +from pace.dsl.dace.dace_config import DaceConfig, _determine_compiling_ranks from pace.dsl.dace.orchestration import ( DaCeOrchestration, orchestrate, orchestrate_function, ) +from pace.util.communicator import CubedSpherePartitioner, TilePartitioner """ @@ -91,3 +92,68 @@ def foo(self): a = A() a.foo() assert not mock_call_sdfg.called + + +def test_orchestrate_distributed_build(): + dummy_dace_config = DaceConfig( + communicator=None, + backend="gtc:dace", + orchestration=DaCeOrchestration.BuildAndRun, + ) + + def _does_compile(rank, partitioner) -> bool: + dummy_dace_config.layout = partitioner.layout + dummy_dace_config.rank_size = partitioner.layout[0] * partitioner.layout[1] * 6 + dummy_dace_config.my_rank = rank + return _determine_compiling_ranks(dummy_dace_config, partitioner) + + # (1, 1) layout, one rank which compiles + cube_partitioner_11 = CubedSpherePartitioner(TilePartitioner((1, 1))) + assert _does_compile(0, cube_partitioner_11) + assert not _does_compile(1, cube_partitioner_11) # not compiling face + + # (2, 2) layout, 4 ranks, all compiling + cube_partitioner_22 = CubedSpherePartitioner(TilePartitioner((2, 2))) + assert _does_compile(0, cube_partitioner_22) + assert _does_compile(1, cube_partitioner_22) + assert _does_compile(2, cube_partitioner_22) + assert _does_compile(3, cube_partitioner_22) + assert not _does_compile(4, cube_partitioner_22) # not compiling face + + # (3, 3) layout, 9 ranks, all compiling + cube_partitioner_33 = CubedSpherePartitioner(TilePartitioner((3, 3))) + assert _does_compile(0, cube_partitioner_33) + assert _does_compile(1, cube_partitioner_33) + assert _does_compile(2, cube_partitioner_33) + assert _does_compile(3, cube_partitioner_33) + assert _does_compile(4, cube_partitioner_33) 
+ assert _does_compile(5, cube_partitioner_33) + assert _does_compile(6, cube_partitioner_33) + assert _does_compile(7, cube_partitioner_33) + assert _does_compile(8, cube_partitioner_33) + assert not _does_compile(9, cube_partitioner_33) # not compiling face + + # (4, 4) layout, 16 ranks, + # expecting compiling:0, 1, 2, 3, 4, 5, 7, 12, 13, 15 + cube_partitioner_44 = CubedSpherePartitioner(TilePartitioner((4, 4))) + assert _does_compile(0, cube_partitioner_44) + assert _does_compile(1, cube_partitioner_44) + assert _does_compile(4, cube_partitioner_44) + assert _does_compile(5, cube_partitioner_44) + assert _does_compile(7, cube_partitioner_44) + assert _does_compile(12, cube_partitioner_44) + assert _does_compile(13, cube_partitioner_44) + assert _does_compile(15, cube_partitioner_44) + assert not _does_compile(2, cube_partitioner_44) # same code path as 3 + assert not _does_compile(6, cube_partitioner_44) # same code path as 5 + assert not _does_compile(8, cube_partitioner_44) # same code path as 4 + assert not _does_compile(11, cube_partitioner_44) # same code path as 7 + assert not _does_compile(16, cube_partitioner_44) # not compiling face + + # For a few other layouts, we check that we always have 9 compiling ranks + for layout in [(5, 5), (10, 10), (20, 20)]: + partition = CubedSpherePartitioner(TilePartitioner(layout)) + compiling = 0 + for i in range(layout[0] * layout[1] * 6): + compiling += 1 if _does_compile(i, partition) else 0 + assert compiling == 9 From 286ad0003d2d7e92878122995419d91c115c4cc5 Mon Sep 17 00:00:00 2001 From: Purnendu Chakraborty Date: Sun, 25 Jun 2023 19:38:29 -0400 Subject: [PATCH 33/57] Added ak, bk for 137 levels in eta.py --- util/pace/util/grid/eta.py | 291 ++++++++++++++++++++++++++++++++++++- 1 file changed, 290 insertions(+), 1 deletion(-) diff --git a/util/pace/util/grid/eta.py b/util/pace/util/grid/eta.py index 50afaadb..075bc920 100644 --- a/util/pace/util/grid/eta.py +++ b/util/pace/util/grid/eta.py @@ -559,10 +559,299 
@@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ] ) + elif km == 137: + + ak = np.array( + [ + 1.00000000, + 1.82500005, + 3.00000000, + 4.63000011, + 6.82797718, + 9.74696636, + 13.6054239, + 18.6089306, + 24.9857178, + 32.9857101, + 42.8792419, + 54.9554634, + 69.5205765, + 86.8958817, + 107.415741, + 131.425507, + 159.279404, + 191.338562, + 227.968948, + 269.539581, + 316.420746, + 368.982361, + 427.592499, + 492.616028, + 564.413452, + 643.339905, + 729.744141, + 823.967834, + 926.344910, + 1037.20117, + 1156.85364, + 1285.61035, + 1423.77014, + 1571.62292, + 1729.44897, + 1897.51929, + 2076.09595, + 2265.43164, + 2465.77051, + 2677.34814, + 2900.39136, + 3135.11938, + 3381.74365, + 3640.46826, + 3911.49048, + 4194.93066, + 4490.81738, + 4799.14941, + 5119.89502, + 5452.99072, + 5798.34473, + 6156.07422, + 6526.94678, + 6911.87061, + 7311.86914, + 7727.41211, + 8159.35400, + 8608.52539, + 9076.40039, + 9562.68262, + 10065.9785, + 10584.6318, + 11116.6621, + 11660.0674, + 12211.5479, + 12766.8730, + 13324.6689, + 13881.3311, + 14432.1396, + 14975.6152, + 15508.2568, + 16026.1152, + 16527.3223, + 17008.7891, + 17467.6133, + 17901.6211, + 18308.4336, + 18685.7188, + 19031.2891, + 19343.5117, + 19620.0430, + 19859.3906, + 20059.9316, + 20219.6641, + 20337.8633, + 20412.3086, + 20442.0781, + 20425.7188, + 20361.8164, + 20249.5117, + 20087.0859, + 19874.0254, + 19608.5723, + 19290.2266, + 18917.4609, + 18489.7070, + 18006.9258, + 17471.8398, + 16888.6875, + 16262.0469, + 15596.6953, + 14898.4531, + 14173.3242, + 13427.7695, + 12668.2578, + 11901.3398, + 11133.3047, + 10370.1758, + 9617.51562, + 8880.45312, + 8163.37500, + 7470.34375, + 6804.42188, + 6168.53125, + 5564.38281, + 4993.79688, + 4457.37500, + 3955.96094, + 3489.23438, + 3057.26562, + 2659.14062, + 2294.24219, + 1961.50000, + 1659.47656, + 1387.54688, + 1143.25000, + 926.507812, + 734.992188, + 568.062500, + 424.414062, + 302.476562, + 202.484375, + 122.101562, + 
62.7812500, + 22.8359375, + 3.75781298, + 0.00000000, + 0.00000000, + ] + ) + + bk = np.array( + [ + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 7.00000010E-06, + 2.40000008E-05, + 5.90000018E-05, + 1.12000002E-04, + 1.99000002E-04, + 3.39999999E-04, + 5.61999972E-04, + 8.90000025E-04, + 1.35300006E-03, + 1.99200003E-03, + 2.85700010E-03, + 3.97100020E-03, + 5.37799997E-03, + 7.13300006E-03, + 9.26099997E-03, + 1.18060000E-02, + 1.48160001E-02, + 1.83179993E-02, + 2.23549996E-02, + 2.69639995E-02, + 3.21759991E-02, + 3.80260013E-02, + 4.45480011E-02, + 5.17730005E-02, + 5.97280003E-02, + 6.84479997E-02, + 7.79580027E-02, + 8.82859975E-02, + 9.94620025E-02, + 0.111505002, + 0.124448001, + 0.138312995, + 0.153125003, + 0.168909997, + 0.185689002, + 0.203491002, + 0.222332999, + 0.242244005, + 0.263242006, + 0.285353988, + 0.308598012, + 0.332938999, + 0.358253986, + 0.384362996, + 0.411125004, + 0.438391000, + 0.466003001, + 0.493800014, + 0.521619022, + 0.549301028, + 0.576691985, + 0.603648007, + 0.630035996, + 0.655736029, + 0.680643022, + 0.704668999, + 0.727738976, + 0.749796987, + 0.770798028, + 0.790717006, + 0.809535980, + 0.827256024, + 0.843881011, + 0.859431982, + 0.873929024, + 0.887408018, + 0.899900019, + 0.911448002, + 0.922096014, + 
0.931881011, + 0.940859973, + 0.949064016, + 0.956550002, + 0.963352025, + 0.969512999, + 0.975077987, + 0.980072021, + 0.984542012, + 0.988499999, + 0.991984010, + 0.995002985, + 0.997630000, + 1.00000000, + ] + ) + else: raise NotImplementedError( - "Only grids with 72, 79, or 91 vertical levels have been implemented so far" + "Only grids with 72, 79, 91 or 137 vertical levels have been implemented so far" ) + if 0.0 in bk: ks = 0 if km == 91 else np.where(bk == 0)[0][-1] ptop = ak[0] From c1e011cf1693d02491db910f33bab7ae027bfa11 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 6 Jul 2023 13:05:53 -0400 Subject: [PATCH 34/57] Add floating point precision to GEOS bridge init --- dsl/pace/dsl/typing.py | 6 +++++- fv3core/pace/fv3core/initialization/geos_wrapper.py | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dsl/pace/dsl/typing.py b/dsl/pace/dsl/typing.py index 05b255ce..d67dd7b6 100644 --- a/dsl/pace/dsl/typing.py +++ b/dsl/pace/dsl/typing.py @@ -22,11 +22,15 @@ DTypes = Union[bool, np.bool_, int, np.int32, np.int64, float, np.float32, np.float64] +def floating_point_precision() -> int: + return int(os.getenv("PACE_FLOAT_PRECISION", "64")) + + def global_set_floating_point_precision(): """Set the global floating point precision for all reference to Float in the codebase. 
Defaults to 64 bit.""" global Float - precision_in_bit = int(os.getenv("PACE_FLOAT_PRECISION", "64")) + precision_in_bit = floating_point_precision() if precision_in_bit == 64: return np.float64 elif precision_in_bit == 32: diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 2835e77e..9fbb98ab 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -10,6 +10,7 @@ from pace import fv3core from pace.driver.performance.collector import PerformanceCollector from pace.dsl.dace import DaceConfig, orchestrate +from pace.dsl.typing import floating_point_precision from pace.dsl.gt4py_utils import is_gpu_backend from pace.util.logging import pace_log @@ -136,6 +137,7 @@ def __init__( f" dt : {self.dycore_state.bdt}\n" f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" f" backend: {backend}\n" + f" float : {floating_point_precision()}bit" f" orchestration: {self._is_orchestrated}\n" f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})" ) From c58a2a126c83cc0a1014bfa824a7b78f7fd1cd19 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 6 Jul 2023 14:28:24 -0400 Subject: [PATCH 35/57] lint --- .../fv3core/initialization/geos_wrapper.py | 2 +- util/pace/util/grid/eta.py | 64 +++++++++---------- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 9fbb98ab..8ca5c890 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -10,8 +10,8 @@ from pace import fv3core from pace.driver.performance.collector import PerformanceCollector from pace.dsl.dace import DaceConfig, orchestrate -from pace.dsl.typing import floating_point_precision from pace.dsl.gt4py_utils import is_gpu_backend +from pace.dsl.typing import 
floating_point_precision from pace.util.logging import pace_log diff --git a/util/pace/util/grid/eta.py b/util/pace/util/grid/eta.py index 075bc920..dc37aaa2 100644 --- a/util/pace/util/grid/eta.py +++ b/util/pace/util/grid/eta.py @@ -206,7 +206,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 91: - ak = np.array( [ 1.00000000, @@ -402,7 +401,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 72: - ak = np.array( [ 1.00000000, @@ -560,7 +558,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 137: - ak = np.array( [ 1.00000000, @@ -761,35 +758,35 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: 0.00000000, 0.00000000, 0.00000000, - 7.00000010E-06, - 2.40000008E-05, - 5.90000018E-05, - 1.12000002E-04, - 1.99000002E-04, - 3.39999999E-04, - 5.61999972E-04, - 8.90000025E-04, - 1.35300006E-03, - 1.99200003E-03, - 2.85700010E-03, - 3.97100020E-03, - 5.37799997E-03, - 7.13300006E-03, - 9.26099997E-03, - 1.18060000E-02, - 1.48160001E-02, - 1.83179993E-02, - 2.23549996E-02, - 2.69639995E-02, - 3.21759991E-02, - 3.80260013E-02, - 4.45480011E-02, - 5.17730005E-02, - 5.97280003E-02, - 6.84479997E-02, - 7.79580027E-02, - 8.82859975E-02, - 9.94620025E-02, + 7.00000010e-06, + 2.40000008e-05, + 5.90000018e-05, + 1.12000002e-04, + 1.99000002e-04, + 3.39999999e-04, + 5.61999972e-04, + 8.90000025e-04, + 1.35300006e-03, + 1.99200003e-03, + 2.85700010e-03, + 3.97100020e-03, + 5.37799997e-03, + 7.13300006e-03, + 9.26099997e-03, + 1.18060000e-02, + 1.48160001e-02, + 1.83179993e-02, + 2.23549996e-02, + 2.69639995e-02, + 3.21759991e-02, + 3.80260013e-02, + 4.45480011e-02, + 5.17730005e-02, + 5.97280003e-02, + 6.84479997e-02, + 7.79580027e-02, + 8.82859975e-02, + 9.94620025e-02, 0.111505002, 0.124448001, 0.138312995, @@ -849,7 +846,8 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: else: raise 
NotImplementedError( - "Only grids with 72, 79, 91 or 137 vertical levels have been implemented so far" + "Only grids with 72, 79, 91 or 137 vertical levels" + "have been implemented so far" ) if 0.0 in bk: From 8e362a2913360c6998a402c11c7132b4add3f3bb Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 7 Jul 2023 09:50:04 -0400 Subject: [PATCH 36/57] Add device PCI bus id (for MPS debug) --- fv3core/pace/fv3core/initialization/geos_wrapper.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 8ca5c890..87ce0193 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -5,6 +5,7 @@ import f90nml import numpy as np +from pace.util._optional_imports import cupy as cp import pace.util from pace import fv3core @@ -132,6 +133,11 @@ def __init__( self.output_dict: Dict[str, np.ndarray] = {} self._allocate_output_dir() + device_ordinal_info = "" + if is_gpu_backend(): + device_ordinal_info = ( + f" Device PCI bus id: {cp.cuda.Device(0).pci_bus_id}\n" + ) pace_log.info( "Pace GEOS wrapper initialized: \n" f" dt : {self.dycore_state.bdt}\n" @@ -139,7 +145,8 @@ def __init__( f" backend: {backend}\n" f" float : {floating_point_precision()}bit" f" orchestration: {self._is_orchestrated}\n" - f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})" + f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})\n" + f" {device_ordinal_info}" ) def _critical_path(self): From adc5ee501a0b36b8478cf751012fc619ee832b44 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 7 Jul 2023 09:59:24 -0400 Subject: [PATCH 37/57] Typo + lint --- fv3core/pace/fv3core/initialization/geos_wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py 
b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 87ce0193..7f8f05d3 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -5,7 +5,6 @@ import f90nml import numpy as np -from pace.util._optional_imports import cupy as cp import pace.util from pace import fv3core @@ -13,6 +12,7 @@ from pace.dsl.dace import DaceConfig, orchestrate from pace.dsl.gt4py_utils import is_gpu_backend from pace.dsl.typing import floating_point_precision +from pace.util._optional_imports import cupy as cp from pace.util.logging import pace_log @@ -134,7 +134,7 @@ def __init__( self._allocate_output_dir() device_ordinal_info = "" - if is_gpu_backend(): + if is_gpu_backend(backend): device_ordinal_info = ( f" Device PCI bus id: {cp.cuda.Device(0).pci_bus_id}\n" ) From 39ff8ead23a35754b455c89781f1413a087bfc6a Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 3 Aug 2023 15:31:14 -0400 Subject: [PATCH 38/57] Try to detect MPS reading the "log" pipe --- .../fv3core/initialization/geos_wrapper.py | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 7f8f05d3..8143da2b 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -133,20 +133,30 @@ def __init__( self.output_dict: Dict[str, np.ndarray] = {} self._allocate_output_dir() - device_ordinal_info = "" - if is_gpu_backend(backend): - device_ordinal_info = ( - f" Device PCI bus id: {cp.cuda.Device(0).pci_bus_id}\n" - ) + # Feedback information + device_ordinal_info = ( + f" Device PCI bus id: {cp.cuda.Device(0).pci_bus_id}\n" + if is_gpu_backend(backend) + else "N/A" + ) + MPS_pipe_directory = os.getenv("CUDA_MPS_PIPE_DIRECTORY", None) + MPS_is_on = ( + True + if MPS_pipe_directory + and is_gpu_backend(backend) + and 
os.path.exists(f"{MPS_pipe_directory}/log") + else False + ) pace_log.info( "Pace GEOS wrapper initialized: \n" - f" dt : {self.dycore_state.bdt}\n" - f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" - f" backend: {backend}\n" - f" float : {floating_point_precision()}bit" - f" orchestration: {self._is_orchestrated}\n" - f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})\n" + f" dt : {self.dycore_state.bdt}\n" + f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" + f" backend : {backend}\n" + f" float : {floating_point_precision()}bit" + f" orchestration : {self._is_orchestrated}\n" + f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})\n" f" {device_ordinal_info}" + f" Nvidia MPS : {MPS_is_on}" ) def _critical_path(self): From f2d171dc5903560c991932c5a14ccf15536012c8 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 8 Aug 2023 14:15:14 -0400 Subject: [PATCH 39/57] Lint --- fv3core/pace/fv3core/initialization/geos_wrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 8143da2b..de0b944c 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -154,7 +154,8 @@ def __init__( f" backend : {backend}\n" f" float : {floating_point_precision()}bit" f" orchestration : {self._is_orchestrated}\n" - f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})\n" + f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz}" + f"(halo: {sizer.n_halo})\n" f" {device_ordinal_info}" f" Nvidia MPS : {MPS_is_on}" ) From ac70398415a3c7dcff9fa56a9385618a1113f8d9 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 8 Aug 2023 14:16:06 -0400 Subject: [PATCH 40/57] Clean up --- fv3core/pace/fv3core/initialization/geos_wrapper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index de0b944c..f7133543 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -141,11 +141,9 @@ def __init__( ) MPS_pipe_directory = os.getenv("CUDA_MPS_PIPE_DIRECTORY", None) MPS_is_on = ( - True - if MPS_pipe_directory + MPS_pipe_directory and is_gpu_backend(backend) and os.path.exists(f"{MPS_pipe_directory}/log") - else False ) pace_log.info( "Pace GEOS wrapper initialized: \n" From 0a4163f742dff8af21f734802ab6d7bdafaba0ca Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 8 Aug 2023 14:21:51 -0400 Subject: [PATCH 41/57] Log info GEOS bridge (#18) * Add floating point precision to GEOS bridge init * lint * Add device PCI bus id (for MPS debug) * Typo + lint * Try to detect MPS reading the "log" pipe * Lint * Clean up --- dsl/pace/dsl/typing.py | 6 +- .../fv3core/initialization/geos_wrapper.py | 28 ++++++-- util/pace/util/grid/eta.py | 64 +++++++++---------- 3 files changed, 59 insertions(+), 39 deletions(-) diff --git a/dsl/pace/dsl/typing.py b/dsl/pace/dsl/typing.py index 05b255ce..d67dd7b6 100644 --- a/dsl/pace/dsl/typing.py +++ b/dsl/pace/dsl/typing.py @@ -22,11 +22,15 @@ DTypes = Union[bool, np.bool_, int, np.int32, np.int64, float, np.float32, np.float64] +def floating_point_precision() -> int: + return int(os.getenv("PACE_FLOAT_PRECISION", "64")) + + def global_set_floating_point_precision(): """Set the global floating point precision for all reference to Float in the codebase. 
Defaults to 64 bit.""" global Float - precision_in_bit = int(os.getenv("PACE_FLOAT_PRECISION", "64")) + precision_in_bit = floating_point_precision() if precision_in_bit == 64: return np.float64 elif precision_in_bit == 32: diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 2835e77e..f7133543 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -11,6 +11,8 @@ from pace.driver.performance.collector import PerformanceCollector from pace.dsl.dace import DaceConfig, orchestrate from pace.dsl.gt4py_utils import is_gpu_backend +from pace.dsl.typing import floating_point_precision +from pace.util._optional_imports import cupy as cp from pace.util.logging import pace_log @@ -131,13 +133,29 @@ def __init__( self.output_dict: Dict[str, np.ndarray] = {} self._allocate_output_dir() + # Feedback information + device_ordinal_info = ( + f" Device PCI bus id: {cp.cuda.Device(0).pci_bus_id}\n" + if is_gpu_backend(backend) + else "N/A" + ) + MPS_pipe_directory = os.getenv("CUDA_MPS_PIPE_DIRECTORY", None) + MPS_is_on = ( + MPS_pipe_directory + and is_gpu_backend(backend) + and os.path.exists(f"{MPS_pipe_directory}/log") + ) pace_log.info( "Pace GEOS wrapper initialized: \n" - f" dt : {self.dycore_state.bdt}\n" - f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" - f" backend: {backend}\n" - f" orchestration: {self._is_orchestrated}\n" - f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})" + f" dt : {self.dycore_state.bdt}\n" + f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" + f" backend : {backend}\n" + f" float : {floating_point_precision()}bit" + f" orchestration : {self._is_orchestrated}\n" + f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz}" + f"(halo: {sizer.n_halo})\n" + f" {device_ordinal_info}" + f" Nvidia MPS : {MPS_is_on}" ) def _critical_path(self): diff --git 
a/util/pace/util/grid/eta.py b/util/pace/util/grid/eta.py index 075bc920..dc37aaa2 100644 --- a/util/pace/util/grid/eta.py +++ b/util/pace/util/grid/eta.py @@ -206,7 +206,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 91: - ak = np.array( [ 1.00000000, @@ -402,7 +401,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 72: - ak = np.array( [ 1.00000000, @@ -560,7 +558,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 137: - ak = np.array( [ 1.00000000, @@ -761,35 +758,35 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: 0.00000000, 0.00000000, 0.00000000, - 7.00000010E-06, - 2.40000008E-05, - 5.90000018E-05, - 1.12000002E-04, - 1.99000002E-04, - 3.39999999E-04, - 5.61999972E-04, - 8.90000025E-04, - 1.35300006E-03, - 1.99200003E-03, - 2.85700010E-03, - 3.97100020E-03, - 5.37799997E-03, - 7.13300006E-03, - 9.26099997E-03, - 1.18060000E-02, - 1.48160001E-02, - 1.83179993E-02, - 2.23549996E-02, - 2.69639995E-02, - 3.21759991E-02, - 3.80260013E-02, - 4.45480011E-02, - 5.17730005E-02, - 5.97280003E-02, - 6.84479997E-02, - 7.79580027E-02, - 8.82859975E-02, - 9.94620025E-02, + 7.00000010e-06, + 2.40000008e-05, + 5.90000018e-05, + 1.12000002e-04, + 1.99000002e-04, + 3.39999999e-04, + 5.61999972e-04, + 8.90000025e-04, + 1.35300006e-03, + 1.99200003e-03, + 2.85700010e-03, + 3.97100020e-03, + 5.37799997e-03, + 7.13300006e-03, + 9.26099997e-03, + 1.18060000e-02, + 1.48160001e-02, + 1.83179993e-02, + 2.23549996e-02, + 2.69639995e-02, + 3.21759991e-02, + 3.80260013e-02, + 4.45480011e-02, + 5.17730005e-02, + 5.97280003e-02, + 6.84479997e-02, + 7.79580027e-02, + 8.82859975e-02, + 9.94620025e-02, 0.111505002, 0.124448001, 0.138312995, @@ -849,7 +846,8 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: else: raise NotImplementedError( - "Only grids with 72, 79, 91 or 137 vertical levels have 
been implemented so far" + "Only grids with 72, 79, 91 or 137 vertical levels" + "have been implemented so far" ) if 0.0 in bk: From 9d6d2f9d1dd281b873033cf1d818c1359f9c845b Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 15 Aug 2023 16:45:15 -0400 Subject: [PATCH 42/57] Update geos/develop to grab NOAA PR9 results (#21) * Verbose choice of block/grid size * added build script for c5 * updated repo to NOAA * GEOS integration (#9) * Initialize GeosDycoreWrapper with bdt (timestep) * Use GEOS version of constants * 1. Add qcld to the list of tracers beings advected 2. Made GEOS specific changes to thresholds in saturation adjustment * Accumulate diss_est * Allow GEOS_WRAPPER to process device data * Add clear to collector for 3rd party use. GEOS pass down timings to caller * Make kernel analysis run a copy stencil to compute local bandwith Parametrize tool with backend, output format * Move constant on a env var Add saturation adjustement threshold to const * Remove unused if leading to empty code block * Restrict dace to 0.14.1 due to a parsing bug * Add guard for bdt==0 Fix bad merge for bdt with GEOS_Wrapper * Remove unused code * Fix theroritical timings * Fixed a bug where pkz was being calculated twice, and the second calc was wrong * Downgrade DaCe to 0.14.0 pending array aliasing fix * Set default cache path for orchestrated DaCe to respect GT_CACHE_* env * Remove previous per stencil override of default_build_folder * Revert "Set default cache path for orchestrated DaCe to respect GT_CACHE_* env" * Revert "Remove previous per stencil override of default_build_folder" * Read cache_root in default dace backend * Document faulty behavior with GT_CACHE_DIR_NAME * Fix bad requirements syntax * Check for the string value of CONST_VERSION directly instead of enum * Protect constant selection more rigorusly. 
Clean abort on unknown constant given * Log constants selection * Refactor NQ to constants.py * Fix or explain inlined import * Verbose runtime error when bad dt_atmos * Verbose warm up * re-initialize heat_source and diss_est each call, add do_skeb check to accumulation --------- Co-authored-by: Purnendu Chakraborty Co-authored-by: Oliver Elbert --------- Co-authored-by: Rusty Benson <6594772+bensonr@users.noreply.github.com> Co-authored-by: Oliver Elbert Co-authored-by: Purnendu Chakraborty Co-authored-by: Oliver Elbert --- dsl/pace/dsl/dace/dace_config.py | 4 +++ dsl/pace/dsl/dace/utils.py | 8 ++++-- .../{build_gaea.sh => build_gaea_c4.sh} | 2 +- examples/build_scripts/build_gaea_c5.sh | 27 +++++++++++++++++++ fv3core/pace/fv3core/stencils/d_sw.py | 24 +++++++++++------ fv3core/pace/fv3core/stencils/fv_dynamics.py | 9 ++++--- 6 files changed, 60 insertions(+), 14 deletions(-) rename examples/build_scripts/{build_gaea.sh => build_gaea_c4.sh} (91%) create mode 100644 examples/build_scripts/build_gaea_c5.sh diff --git a/dsl/pace/dsl/dace/dace_config.py b/dsl/pace/dsl/dace/dace_config.py index 5e78c6bc..6f2befff 100644 --- a/dsl/pace/dsl/dace/dace_config.py +++ b/dsl/pace/dsl/dace/dace_config.py @@ -213,6 +213,10 @@ def __init__( if cp: cuda_sm = cp.cuda.Device(0).compute_capability dace.config.Config.set("compiler", "cuda", "cuda_arch", value=f"{cuda_sm}") + # Block size/thread count is defaulted to an average value for recent + # hardware (Pascal and upward). The problem of setting an optimized + # block/thread is both hardware and problem dependant. Fine tuners + # available in DaCe should be relied on for futher tuning of this value. 
dace.config.Config.set( "compiler", "cuda", "default_block_size", value="64,8,1" ) diff --git a/dsl/pace/dsl/dace/utils.py b/dsl/pace/dsl/dace/utils.py index 5c9f63ec..40ac3c12 100644 --- a/dsl/pace/dsl/dace/utils.py +++ b/dsl/pace/dsl/dace/utils.py @@ -1,3 +1,4 @@ +import json import time from dataclasses import dataclass, field from typing import Dict, List, Optional @@ -242,6 +243,8 @@ def kernel_theoretical_timing( n = 1000 m = 4 dt = [] + # Warm up run (build, allocation) + # to remove from timing the common runtime bench(A, B, n) # Time for _ in range(m): @@ -296,6 +299,9 @@ def kernel_theoretical_timing( except TypeError: newresult_in_us = (alldata_in_bytes / bandwidth_in_bytes_s) * in_us + # We keep sympy import here because sympy is known to be a problematic + # import and an heavy module which should be avoided if possible. + # TODO: refactor it out by shadow-coding the sympy.Max/Eval functions import sympy if node.label in result: @@ -333,8 +339,6 @@ def report_kernel_theoretical_timing( with open("kernel_theoretical_timing.csv", "w") as f: f.write(csv_string) elif out_format == "json": - import json - with open("kernel_theoretical_timing.json", "w") as f: json.dump(timings, f, indent=2) diff --git a/examples/build_scripts/build_gaea.sh b/examples/build_scripts/build_gaea_c4.sh similarity index 91% rename from examples/build_scripts/build_gaea.sh rename to examples/build_scripts/build_gaea_c4.sh index b2f9d770..def7af3f 100644 --- a/examples/build_scripts/build_gaea.sh +++ b/examples/build_scripts/build_gaea_c4.sh @@ -13,7 +13,7 @@ module load boost/1.72.0 module load python/3.9 # clone Pace and update submodules -git clone --recursive https://github.com/ai2cm/pace +git clone --recursive https://github.com/NOAA-GFDL/pace cd pace # create a conda environment for pace diff --git a/examples/build_scripts/build_gaea_c5.sh b/examples/build_scripts/build_gaea_c5.sh new file mode 100644 index 00000000..94ad5611 --- /dev/null +++ 
b/examples/build_scripts/build_gaea_c5.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# Example bash script to install Pace to run bare-metal on Gaea's c4 cluster + +set -e -x + +# module load necessary system software +module rm PrgEnv-intel +module load PrgEnv-gnu +module rm gcc +module load gcc/12.2.0 +module load boost/1.79.0 +module load python/3.9 + +# clone Pace and update submodules +git clone --recursive https://github.com/NOAA-GFDL/pace +cd pace + +# create a conda environment for pace +conda create -y --name my_name python=3.8 + +# enter the environment and update it +conda activate my_name +pip3 install --upgrade pip setuptools wheel + +# install the Pace dependencies, GT4Py, and Pace +pip3 install -r requirements_dev.txt -c constraints.txt diff --git a/fv3core/pace/fv3core/stencils/d_sw.py b/fv3core/pace/fv3core/stencils/d_sw.py index 02ce9887..14155da4 100644 --- a/fv3core/pace/fv3core/stencils/d_sw.py +++ b/fv3core/pace/fv3core/stencils/d_sw.py @@ -751,6 +751,8 @@ def __init__( orchestrate(obj=self, config=stencil_factory.config.dace_config) self.grid_data = grid_data self._f0 = self.grid_data.fC_agrid + self._d_con = config.d_con + self._do_stochastic_ke_backscatter = config.do_skeb self.grid_indexing = stencil_factory.grid_indexing assert config.grid_type < 3, "ubke and vbke only implemented for grid_type < 3" @@ -927,12 +929,15 @@ def make_quantity(): }, ) ) - self._accumulate_heat_source_and_dissipation_estimate_stencil = ( - stencil_factory.from_dims_halo( - func=accumulate_heat_source_and_dissipation_estimate, - compute_dims=[X_DIM, Y_DIM, Z_DIM], + + if (self._d_con > 1.0e-5) or (self._do_stochastic_ke_backscatter): + self._accumulate_heat_source_and_dissipation_estimate_stencil = ( + stencil_factory.from_dims_halo( + func=accumulate_heat_source_and_dissipation_estimate, + compute_dims=[X_DIM, Y_DIM, Z_DIM], + ) ) - ) + self._compute_vorticity_stencil = stencil_factory.from_dims_halo( compute_vorticity, compute_dims=[X_DIM, Y_DIM, Z_DIM], @@ 
-1246,9 +1251,12 @@ def __call__( self._tmp_diss_e, self._column_namelist["d_con"], ) - self._accumulate_heat_source_and_dissipation_estimate_stencil( - self._tmp_heat_s, heat_source, self._tmp_diss_e, diss_est - ) + + if (self._d_con > 1.0e-5) or (self._do_stochastic_ke_backscatter): + self._accumulate_heat_source_and_dissipation_estimate_stencil( + self._tmp_heat_s, heat_source, self._tmp_diss_e, diss_est + ) + self._update_u_and_v_stencil( self._tmp_ut, self._tmp_vt, diff --git a/fv3core/pace/fv3core/stencils/fv_dynamics.py b/fv3core/pace/fv3core/stencils/fv_dynamics.py index 80b78e12..5f3de73a 100644 --- a/fv3core/pace/fv3core/stencils/fv_dynamics.py +++ b/fv3core/pace/fv3core/stencils/fv_dynamics.py @@ -172,7 +172,10 @@ def __init__( dace_compiletime_args=["state"], ) if timestep == timedelta(seconds=0): - raise RuntimeError("Bad dynamical core configuration: bdt is 0") + raise RuntimeError( + "Bad dynamical core configuration:" + " the atmospheric timestep is 0 seconds!" + ) # nested and stretched_grid are options in the Fortran code which we # have not implemented, so they are hard-coded here. self.call_checkpointer = checkpointer is not None @@ -543,8 +546,8 @@ def _compute(self, state: DycoreState, timer: pace.util.Timer): # TODO: When NQ=9, we shouldn't need to pass qcld explicitly # since it's in self.tracers. It should not be an issue since - # we don't have self.tracers & qcld computation at - # the same time. 
+ # we don't have self.tracers & qcld computation at the same + # time # When NQ=8, we do need qcld passed explicitely self._lagrangian_to_eulerian_obj( self.tracers, From 2031b9e6e9222ce5914a579d9713d37d301f106f Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 23 Aug 2023 11:15:05 -0400 Subject: [PATCH 43/57] [NOAA:Update] Bring back #15 & doubly periodic domain (#25) * Feature/dp driver (#13) * initial commit * adding test config * adding the rest of driver and util code * updating history.md * move u_max to dycore config * uncomment assert * added comment explaining the copy of grid type to dycore config * Turn main unit test & lint on PR, logger clean up [NASA:Update] (#15) * Initialize GeosDycoreWrapper with bdt (timestep) * Use GEOS version of constants * 1. Add qcld to the list of tracers beings advected 2. Made GEOS specific changes to thresholds in saturation adjustment * Accumulate diss_est * Allow GEOS_WRAPPER to process device data * Add clear to collector for 3rd party use. GEOS pass down timings to caller * Make kernel analysis run a copy stencil to compute local bandwith Parametrize tool with backend, output format * Move constant on a env var Add saturation adjustement threshold to const * Restrict dace to 0.14.1 due to a parsing bug * Add guard for bdt==0 * Fix theroritical timings * Fixed a bug where pkz was being calculated twice, and the second calc was wrong * Downgrade DaCe to 0.14.0 pending array aliasing fix * Set default cache path for orchestrated DaCe to respect GT_CACHE_* env * Remove previous per stencil override of default_build_folder * Revert "Set default cache path for orchestrated DaCe to respect GT_CACHE_* env" * Read cache_root in default dace backend * Document faulty behavior with GT_CACHE_DIR_NAME * Check for the string value of CONST_VERSION directly instead of enum * Protect constant selection more rigorusly. 
Clean abort on unknown constant given * Log constants selection * Refactor NQ to constants.py * Introduce PACE_LOGLEVEL to control log level from outside * Code guidelines clean up * Devops/GitHub actions on (#15) * Linting on PR * Run main unit test * Update python to available 3.8.12 * Fix unit tests (remove dxa, dya rely on halo ex) * Update HISTORY.md * Adapt log_level in driver.run * Verbose the PACE_CONSTANTS * Doc log level hierarchical nature --------- Co-authored-by: Purnendu Chakraborty Co-authored-by: Purnendu Chakraborty * Lint --------- Co-authored-by: Oliver Elbert Co-authored-by: Purnendu Chakraborty Co-authored-by: Purnendu Chakraborty --- README.md | 14 ++- .../examples/configs/baroclinic_c12_dp.yaml | 102 ++++++++++++++++++ driver/pace/driver/driver.py | 4 + driver/pace/driver/grid.py | 15 ++- driver/pace/driver/run.py | 49 +-------- fv3core/pace/fv3core/_config.py | 2 + tests/main/driver/test_example_configs.py | 1 + util/HISTORY.md | 3 + util/pace/util/__init__.py | 2 +- util/pace/util/grid/generation.py | 9 ++ util/pace/util/logging.py | 10 ++ util/pace/util/namelist.py | 8 ++ 12 files changed, 170 insertions(+), 49 deletions(-) create mode 100644 driver/examples/configs/baroclinic_c12_dp.yaml diff --git a/README.md b/README.md index 7753fa73..5884cee8 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Pace is an implementation of the FV3GFS / SHiELD atmospheric model developed by Full Sphinx documentation can be found at [https://ai2cm.github.io/pace/](https://ai2cm.github.io/pace/). **WARNING** This repo is under active development - supported features and procedures can change rapidly and without notice. 
+ ## Quickstart - bare metal ### Build @@ -27,10 +28,13 @@ export BOOST_ROOT=BOOST/ROOT/boost_1_79_0 ``` When cloning Pace you will need to update the repository's submodules as well: + ```shell git clone --recursive https://github.com/ai2cm/pace.git ``` + or if you have already cloned the repository: + ``` git submodule update --init --recursive ``` @@ -43,6 +47,7 @@ source venv_name/bin/activate ``` Inside of your pace `venv` or conda environment pip install the Python requirements, GT4Py, and Pace: + ```shell pip3 install -r requirements_dev.txt -c constraints.txt ``` @@ -52,6 +57,7 @@ Shell scripts to install Pace on specific machines such as Gaea can be found in ### Run With the environment activated, you can run an example baroclinic test case with the following command: + ```shell mpirun -n 6 python3 -m pace.driver.run driver/examples/configs/baroclinic_c12.yaml @@ -64,23 +70,30 @@ After the run completes, you will see an output direcotry `output.zarr`. An exam ### Environment variable configuration - `PACE_CONSTANTS`: Pace is bundled with various constants (see _util/pace/util/constants.py_). + - `FV3DYCORE` NOAA's FV3 dynamical core constants (original port) + - `GFS` Constant as defined in NOAA GFS + - `GEOS` Constant as defined in GEOS v13 - `PACE_FLOAT_PRECISION`: default precision of the field & scalars in the numerics. Default to 64. - `PACE_LOGLEVEL`: logging level to display (DEBUG, INFO, WARNING, ERROR, CRITICAL). Default to INFO. ## Quickstart - Docker + ### Build While it is possible to install and build pace bare-metal, we can ensure all system libraries are installed with the correct versions by using a Docker container to test and develop pace. First, you will need to update the git submodules so that any dependencies are cloned and at the correct version: + ```shell git submodule update --init --recursive ``` Then build the `pace` docker image at the top level. 
+ ```shell make build ``` + ### Run ```shell @@ -100,7 +113,6 @@ This git repository is laid out as a mono-repo, containing multiple independent ![Graph of interdependencies of Pace modules, generated from dependences.dot](./dependencies.svg) - ## ML emulation An example of integration of an ML model replacing the microphysics parametrization is available on the `feature/microphysics-emulator` branch. diff --git a/driver/examples/configs/baroclinic_c12_dp.yaml b/driver/examples/configs/baroclinic_c12_dp.yaml new file mode 100644 index 00000000..029767ca --- /dev/null +++ b/driver/examples/configs/baroclinic_c12_dp.yaml @@ -0,0 +1,102 @@ +stencil_config: + compilation_config: + backend: numpy + rebuild: false + validate_args: true + format_source: false + device_sync: false +grid_config: + type: generated + config: + grid_type: 4 + dx_const: 3000.0 + dy_const: 3000.0 + deglat: 10.0 +initialization: + type: baroclinic +performance_config: + collect_performance: true + experiment_name: c12_baroclinic +nx_tile: 12 +nz: 79 +dt_atmos: 225 +minutes: 15 +layout: + - 1 + - 1 +diagnostics_config: + path: output + output_format: netcdf + names: + - u + - v + - ua + - va + - pt + - delp + - qvapor + - qliquid + - qice + - qrain + - qsnow + - qgraupel + z_select: + - level: 65 + names: + - pt +dycore_config: + a_imp: 1.0 + beta: 0. + consv_te: 0. + d2_bg: 0. + d2_bg_k1: 0.2 + d2_bg_k2: 0.1 + d4_bg: 0.15 + d_con: 1.0 + d_ext: 0.0 + dddmp: 0.5 + delt_max: 0.002 + do_sat_adj: true + do_vort_damp: true + fill: true + hord_dp: 6 + hord_mt: 6 + hord_tm: 6 + hord_tr: 8 + hord_vt: 6 + hydrostatic: false + k_split: 1 + ke_bg: 0. + kord_mt: 9 + kord_tm: -9 + kord_tr: 9 + kord_wz: 9 + n_split: 1 + nord: 3 + nwat: 6 + p_fac: 0.05 + rf_cutoff: 3000. + rf_fast: true + tau: 10. + vtdm4: 0.06 + z_tracer: true + do_qa: true + tau_i2s: 1000. + tau_g2v: 1200. + ql_gen: 0.001 + ql_mlt: 0.002 + qs_mlt: 0.000001 + qi_lim: 1.0 + dw_ocean: 0.1 + dw_land: 0.15 + icloud_f: 0 + tau_l2v: 300. 
+ tau_v2l: 90. + fv_sg_adj: 0 + n_sponge: 48 + u_max: 355.0 + +physics_config: + hydrostatic: false + nwat: 6 + do_qa: true diff --git a/driver/pace/driver/driver.py b/driver/pace/driver/driver.py index 07317415..284acaca 100644 --- a/driver/pace/driver/driver.py +++ b/driver/pace/driver/driver.py @@ -273,6 +273,10 @@ def from_dict(cls, kwargs: Dict[str, Any]) -> "DriverConfig": kwargs["grid_config"] = GridInitializerSelector.from_dict( kwargs["grid_config"] ) + grid_type = kwargs["grid_config"].config.grid_type + # Copy grid_type to the DycoreConfig if it's not the default value + if grid_type != 0: + kwargs["dycore_config"].grid_type = grid_type if ( isinstance(kwargs["stencil_config"], dict) diff --git a/driver/pace/driver/grid.py b/driver/pace/driver/grid.py index 4817869c..c184d566 100644 --- a/driver/pace/driver/grid.py +++ b/driver/pace/driver/grid.py @@ -85,12 +85,20 @@ class GeneratedGridConfig(GridInitializer): lon_target: desired center longitude for refined tile (deg) lat_target: desired center latitude for refined tile (deg) restart_path: if given, load vertical grid from restart file + grid_type: type of grid, 0 is a gnomonic cubed-sphere, 4 is doubly-periodic + dx_const: constant x-width of grid cells on a dp-grid + dy_const: constant y-width of grid cells on a dp-grid + deglat: latitude to use for coriolis calculations on a dp-grid """ stretch_factor: Optional[float] = 1.0 lon_target: Optional[float] = 350.0 lat_target: Optional[float] = -90.0 restart_path: Optional[str] = None + grid_type: Optional[int] = 0 + dx_const: Optional[float] = 1000.0 + dy_const: Optional[float] = 1000.0 + deglat: Optional[float] = 15.0 def get_grid( self, @@ -99,7 +107,12 @@ def get_grid( ) -> Tuple[DampingCoefficients, DriverGridData, GridData]: metric_terms = MetricTerms( - quantity_factory=quantity_factory, communicator=communicator + quantity_factory=quantity_factory, + communicator=communicator, + grid_type=self.grid_type, + dx_const=self.dx_const, + 
dy_const=self.dy_const, + deglat=self.deglat, ) if self.stretch_factor != 1: # do horizontal grid transformation _transform_horizontal_grid( diff --git a/driver/pace/driver/run.py b/driver/pace/driver/run.py index c8532ebd..df70eb14 100644 --- a/driver/pace/driver/run.py +++ b/driver/pace/driver/run.py @@ -1,59 +1,15 @@ import dataclasses import gc -import logging from typing import Optional import click import yaml -from pace.util import pace_log -from pace.util.mpi import MPI +from pace.util import AVAILABLE_LOG_LEVELS, pace_log from .driver import Driver, DriverConfig -logger = logging.getLogger(__name__) - - -log_levels = { - "info": logging.INFO, - "debug": logging.DEBUG, - "warning": logging.WARNING, - "error": logging.ERROR, - "critical": logging.CRITICAL, -} - - -def configure_logging(log_rank: Optional[int], log_level: str): - """ - Configure logging for the driver. - - Args: - log_rank: rank to log from, or 'all' to log to all ranks, - forced to 'all' if running without MPI - log_level: log level to use - """ - level = log_levels[log_level.lower()] - if MPI is None: - logging.basicConfig( - level=level, - format="%(asctime)s [%(levelname)s] %(name)s:%(message)s", - handlers=[logging.StreamHandler()], - datefmt="%Y-%m-%d %H:%M:%S", - ) - else: - if log_rank is None or int(log_rank) == MPI.COMM_WORLD.Get_rank(): - logging.basicConfig( - level=level, - format=( - f"%(asctime)s [%(levelname)s] (rank {MPI.COMM_WORLD.Get_rank()}) " - "%(name)s:%(message)s" - ), - handlers=[logging.StreamHandler()], - datefmt="%Y-%m-%d %H:%M:%S", - ) - - @click.command() @click.argument( "CONFIG_PATH", @@ -76,7 +32,8 @@ def command_line(config_path: str, log_rank: Optional[int], log_level: str): CONFIG_PATH is the path to a DriverConfig yaml file. 
""" - configure_logging(log_rank=log_rank, log_level=log_level) + level = AVAILABLE_LOG_LEVELS[log_level.lower()] + pace_log.setLevel(level) pace_log.info("loading DriverConfig from yaml") with open(config_path, "r") as f: config = yaml.safe_load(f) diff --git a/fv3core/pace/fv3core/_config.py b/fv3core/pace/fv3core/_config.py index 17609b7c..51fb609f 100644 --- a/fv3core/pace/fv3core/_config.py +++ b/fv3core/pace/fv3core/_config.py @@ -195,6 +195,7 @@ class DynamicalCoreConfig: do_qa: bool = DEFAULT_BOOL layout: Tuple[int, int] = NamelistDefaults.layout grid_type: int = NamelistDefaults.grid_type + u_max: float = NamelistDefaults.u_max # max windspeed for dp config do_f3d: bool = NamelistDefaults.do_f3d inline_q: bool = NamelistDefaults.inline_q do_skeb: bool = NamelistDefaults.do_skeb # save dissipation estimate @@ -334,6 +335,7 @@ def from_namelist(cls, namelist: Namelist) -> "DynamicalCoreConfig": do_qa=namelist.do_qa, layout=namelist.layout, grid_type=namelist.grid_type, + u_max=namelist.u_max, do_f3d=namelist.do_f3d, inline_q=namelist.inline_q, do_skeb=namelist.do_skeb, diff --git a/tests/main/driver/test_example_configs.py b/tests/main/driver/test_example_configs.py index 14d74ce0..e62276d1 100644 --- a/tests/main/driver/test_example_configs.py +++ b/tests/main/driver/test_example_configs.py @@ -13,6 +13,7 @@ TESTED_CONFIGS: List[str] = [ "baroclinic_c12.yaml", + "baroclinic_c12_dp.yaml", "baroclinic_c12_comm_read.yaml", "baroclinic_c12_comm_write.yaml", "baroclinic_c12_null_comm.yaml", diff --git a/util/HISTORY.md b/util/HISTORY.md index e07ed317..0b0a42b6 100644 --- a/util/HISTORY.md +++ b/util/HISTORY.md @@ -4,7 +4,10 @@ History latest ------ +- Added `dx_const`, `dy_const`, `deglat`, and `u_max` namelist settings for doubly-periodic grids +- Added `dx_const`, `dy_const`, and `deglat` to grid generation code for doubly-periodic grids - Added f32 support to halo exchange data transformation +- Use one single logger, from logging.py v0.10.0 ------- diff 
--git a/util/pace/util/__init__.py b/util/pace/util/__init__.py index 4911f2cf..58a7c2a5 100644 --- a/util/pace/util/__init__.py +++ b/util/pace/util/__init__.py @@ -54,7 +54,7 @@ from .initialization import GridSizer, QuantityFactory, SubtileGridSizer from .io import read_state, write_state from .local_comm import LocalComm -from .logging import pace_log +from .logging import AVAILABLE_LOG_LEVELS, pace_log from .monitor import Monitor, NetCDFMonitor, ZarrMonitor from .mpi import MPIComm from .namelist import Namelist, NamelistDefaults diff --git a/util/pace/util/grid/generation.py b/util/pace/util/grid/generation.py index 7c7ad98c..b78a7059 100644 --- a/util/pace/util/grid/generation.py +++ b/util/pace/util/grid/generation.py @@ -222,6 +222,9 @@ def __init__( quantity_factory: util.QuantityFactory, communicator: util.CubedSphereCommunicator, grid_type: int = 0, + dx_const: float = 1000.0, + dy_const: float = 1000.0, + deglat: float = 15.0, ): assert grid_type < 3 self._grid_type = grid_type @@ -375,6 +378,9 @@ def from_tile_sizing( communicator: util.CubedSphereCommunicator, backend: str, grid_type: int = 0, + dx_const: float = 1000.0, + dy_const: float = 1000.0, + deglat: float = 15.0, ) -> "MetricTerms": sizer = util.SubtileGridSizer.from_tile_params( nx_tile=npx - 1, @@ -393,6 +399,9 @@ def from_tile_sizing( quantity_factory=quantity_factory, communicator=communicator, grid_type=grid_type, + dx_const=dx_const, + dy_const=dy_const, + deglat=deglat, ) @property diff --git a/util/pace/util/logging.py b/util/pace/util/logging.py index c0e9d0d7..1f9142fe 100644 --- a/util/pace/util/logging.py +++ b/util/pace/util/logging.py @@ -7,6 +7,16 @@ LOGLEVEL = os.environ.get("PACE_LOGLEVEL", "INFO").upper() +# Python log levels are hierarchical, therefore setting INFO +# means DEBUG and everything lower will be logged. 
+AVAILABLE_LOG_LEVELS = { + "info": logging.INFO, + "debug": logging.DEBUG, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, +} + def _pace_logger(): name_log = logging.getLogger(__name__) diff --git a/util/pace/util/namelist.py b/util/pace/util/namelist.py index ff082736..0133e3f6 100644 --- a/util/pace/util/namelist.py +++ b/util/pace/util/namelist.py @@ -12,6 +12,10 @@ class NamelistDefaults: layout = (1, 1) grid_type = 0 + dx_const = 1000.0 + dy_const = 1000.0 + deglat = 15.0 + u_max = 350.0 do_f3d = False inline_q = False do_skeb = False # save dissipation estimate @@ -372,6 +376,10 @@ class Namelist: # fvmxl: Any # ldebug: Any grid_type: int = NamelistDefaults.grid_type + dx_const: float = NamelistDefaults.dx_const + dy_const: float = NamelistDefaults.dy_const + deglat: float = NamelistDefaults.deglat + u_max: float = NamelistDefaults.u_max do_f3d: bool = NamelistDefaults.do_f3d inline_q: bool = NamelistDefaults.inline_q do_skeb: bool = NamelistDefaults.do_skeb # save dissipation estimate From 2f9bbe9bb4fcd2ac40340cad5622fe18e1d5bc7f Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 23 Aug 2023 11:40:43 -0400 Subject: [PATCH 44/57] lint --- driver/pace/driver/driver.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/driver/pace/driver/driver.py b/driver/pace/driver/driver.py index 97615c64..284acaca 100644 --- a/driver/pace/driver/driver.py +++ b/driver/pace/driver/driver.py @@ -455,11 +455,7 @@ def exit_instead_of_build(self): stencil_compare_comm=stencil_compare_comm, ) pace_log.info("setting up grid started") - ( - damping_coefficients, - driver_grid_data, - grid_data, - ) = self.config.get_grid( + (damping_coefficients, driver_grid_data, grid_data,) = self.config.get_grid( quantity_factory=self.quantity_factory, communicator=communicator, ) From 8f6ba7cdb80337e963c564a6e08c7c6e62db0c82 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 23 Aug 2023 12:57:27 -0400 Subject: 
[PATCH 45/57] Fix non-deterministic temporaries by using `zeros` everywhere instead of `empty` --- driver/pace/driver/grid.py | 4 +--- driver/pace/driver/initialization.py | 5 +---- fv3core/pace/fv3core/testing/translate_dyncore.py | 4 ++-- .../tests/savepoint/translate/translate_remapping.py | 2 +- stencils/pace/stencils/testing/parallel_translate.py | 3 +-- stencils/pace/stencils/testing/temporaries.py | 7 +++---- util/pace/util/communicator.py | 10 +++++----- util/pace/util/grid/generation.py | 5 +---- util/pace/util/grid/gnomonic.py | 6 +++--- util/pace/util/grid/helper.py | 6 ++---- util/pace/util/halo_data_transformer.py | 4 +--- util/pace/util/initialization/allocator.py | 2 +- 12 files changed, 22 insertions(+), 36 deletions(-) diff --git a/driver/pace/driver/grid.py b/driver/pace/driver/grid.py index c184d566..9fa97a06 100644 --- a/driver/pace/driver/grid.py +++ b/driver/pace/driver/grid.py @@ -105,7 +105,6 @@ def get_grid( quantity_factory: QuantityFactory, communicator: CubedSphereCommunicator, ) -> Tuple[DampingCoefficients, DriverGridData, GridData]: - metric_terms = MetricTerms( quantity_factory=quantity_factory, communicator=communicator, @@ -184,8 +183,7 @@ def get_grid( quantity_factory: QuantityFactory, communicator: CubedSphereCommunicator, ) -> Tuple[DampingCoefficients, DriverGridData, GridData]: - - backend = quantity_factory.empty( + backend = quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown" ).gt4py_backend diff --git a/driver/pace/driver/initialization.py b/driver/pace/driver/initialization.py index 2b6471a8..bd6d96ea 100644 --- a/driver/pace/driver/initialization.py +++ b/driver/pace/driver/initialization.py @@ -154,7 +154,6 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - dycore_state = tc_init.init_tc_state( grid_data=grid_data, quantity_factory=quantity_factory, @@ -323,7 +322,7 @@ def get_driver_state( driver_grid_data: 
pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - backend = quantity_factory.empty( + backend = quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown" ).gt4py_backend @@ -348,7 +347,6 @@ def _initialize_dycore_state( communicator: pace.util.CubedSphereCommunicator, backend: str, ) -> fv3core.DycoreState: - grid = self._get_serialized_grid(communicator=communicator, backend=backend) ser = self._serializer(communicator) @@ -401,7 +399,6 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - return DriverState( dycore_state=self.dycore_state, physics_state=self.physics_state, diff --git a/fv3core/pace/fv3core/testing/translate_dyncore.py b/fv3core/pace/fv3core/testing/translate_dyncore.py index 510e299f..6c7da4b7 100644 --- a/fv3core/pace/fv3core/testing/translate_dyncore.py +++ b/fv3core/pace/fv3core/testing/translate_dyncore.py @@ -140,7 +140,7 @@ def compute_parallel(self, inputs, communicator): grid_data.ptop = inputs["ptop"] self._base.make_storage_data_input_vars(inputs) state = DycoreState.init_zeros(quantity_factory=self.grid.quantity_factory) - wsd: pace.util.Quantity = self.grid.quantity_factory.empty( + wsd: pace.util.Quantity = self.grid.quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown", ) @@ -152,7 +152,7 @@ def compute_parallel(self, inputs, communicator): state[name].data[selection] = value else: setattr(state, name, value) - phis: pace.util.Quantity = self.grid.quantity_factory.empty( + phis: pace.util.Quantity = self.grid.quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="m", ) diff --git a/fv3core/tests/savepoint/translate/translate_remapping.py b/fv3core/tests/savepoint/translate/translate_remapping.py index fc9196b0..9a2e1f84 100644 --- a/fv3core/tests/savepoint/translate/translate_remapping.py +++ b/fv3core/tests/savepoint/translate/translate_remapping.py @@ 
-107,7 +107,7 @@ def compute_from_storage(self, inputs): inputs["wsd"] = wsd_2d inputs["q_cld"] = inputs["tracers"]["qcld"] inputs["last_step"] = bool(inputs["last_step"]) - pfull = self.grid.quantity_factory.empty([Z_DIM], units="Pa") + pfull = self.grid.quantity_factory.zeros([Z_DIM], units="Pa") pfull.data[:] = pfull.np.asarray(inputs.pop("pfull")) l_to_e_obj = LagrangianToEulerian( self.stencil_factory, diff --git a/stencils/pace/stencils/testing/parallel_translate.py b/stencils/pace/stencils/testing/parallel_translate.py index dcc1e64d..f10b9b27 100644 --- a/stencils/pace/stencils/testing/parallel_translate.py +++ b/stencils/pace/stencils/testing/parallel_translate.py @@ -12,7 +12,6 @@ class ParallelTranslate: - max_error = TranslateFortranData2Py.max_error near_zero = TranslateFortranData2Py.near_zero compute_grid_option = False @@ -192,7 +191,7 @@ def state_from_inputs(self, inputs: dict, grid=None) -> dict: for name, properties in self.inputs.items(): standard_name = properties.get("name", name) if len(properties["dims"]) > 0: - state[standard_name] = grid.quantity_factory.empty( + state[standard_name] = grid.quantity_factory.zeros( properties["dims"], properties["units"], dtype=inputs[name].dtype ) input_slice = _serialize_slice( diff --git a/stencils/pace/stencils/testing/temporaries.py b/stencils/pace/stencils/testing/temporaries.py index 581387f6..2dd46663 100644 --- a/stencils/pace/stencils/testing/temporaries.py +++ b/stencils/pace/stencils/testing/temporaries.py @@ -40,10 +40,9 @@ def _assert_same_temporaries(dict1: dict, dict2: dict) -> List[str]: attr2 = dict2[attr] if isinstance(attr1, np.ndarray): try: - np.testing.assert_almost_equal( - attr1, attr2, err_msg=f"{attr} not equal" - ) - except AssertionError: + assert np.allclose(attr1, attr2, equal_nan=True) + except AssertionError as e: + print(e) differences.append(attr) else: sub_differences = _assert_same_temporaries(attr1, attr2) diff --git a/util/pace/util/communicator.py 
b/util/pace/util/communicator.py index 938469bd..d2577d8c 100644 --- a/util/pace/util/communicator.py +++ b/util/pace/util/communicator.py @@ -167,7 +167,7 @@ def _get_gather_recv_quantity( ) -> Quantity: """Initialize a Quantity for use when receiving global data during gather""" recv_quantity = Quantity( - send_metadata.np.empty(global_extent, dtype=send_metadata.dtype), + send_metadata.np.zeros(global_extent, dtype=send_metadata.dtype), dims=send_metadata.dims, units=send_metadata.units, origin=tuple([0 for dim in send_metadata.dims]), @@ -182,7 +182,7 @@ def _get_scatter_recv_quantity( ) -> Quantity: """Initialize a Quantity for use when receiving subtile data during scatter""" recv_quantity = Quantity( - send_metadata.np.empty(shape, dtype=send_metadata.dtype), + send_metadata.np.zeros(shape, dtype=send_metadata.dtype), dims=send_metadata.dims, units=send_metadata.units, gt4py_backend=send_metadata.gt4py_backend, @@ -206,7 +206,7 @@ def gather( result: Optional[Quantity] if self.rank == constants.ROOT_RANK: with array_buffer( - send_quantity.np.empty, + send_quantity.np.zeros, (self.partitioner.total_ranks,) + tuple(send_quantity.extent), dtype=send_quantity.data.dtype, ) as recvbuf: @@ -745,7 +745,7 @@ def _get_gather_recv_quantity( # needs to change the quantity dimensions since we add a "tile" dimension, # unlike for tile scatter/gather which retains the same dimensions recv_quantity = Quantity( - metadata.np.empty(global_extent, dtype=metadata.dtype), + metadata.np.zeros(global_extent, dtype=metadata.dtype), dims=(constants.TILE_DIM,) + metadata.dims, units=metadata.units, origin=(0,) + tuple([0 for dim in metadata.dims]), @@ -767,7 +767,7 @@ def _get_scatter_recv_quantity( # needs to change the quantity dimensions since we remove a "tile" dimension, # unlike for tile scatter/gather which retains the same dimensions recv_quantity = Quantity( - metadata.np.empty(shape, dtype=metadata.dtype), + metadata.np.zeros(shape, dtype=metadata.dtype), 
dims=metadata.dims[1:], units=metadata.units, gt4py_backend=metadata.gt4py_backend, diff --git a/util/pace/util/grid/generation.py b/util/pace/util/grid/generation.py index b78a7059..679b9449 100644 --- a/util/pace/util/grid/generation.py +++ b/util/pace/util/grid/generation.py @@ -75,7 +75,7 @@ def quantity_cast_to_model_float( quantity_factory: util.QuantityFactory, qty_64: util.Quantity ) -> util.Quantity: """Copy & cast from 64-bit float to model precision if need be""" - qty = quantity_factory.empty(qty_64.dims, qty_64.units, dtype=Float) + qty = quantity_factory.zeros(qty_64.dims, qty_64.units, dtype=Float) qty.data[:] = qty_64.data[:] return qty @@ -1530,7 +1530,6 @@ def rdyc(self) -> util.Quantity: ) def _init_dgrid(self): - grid_mirror_ew = self.quantity_factory.zeros( self._grid_dims, "radians", @@ -1751,7 +1750,6 @@ def _compute_dxdy(self): return dx, dy def _compute_dxdy_agrid(self): - dx_agrid_64 = self.quantity_factory.zeros( [util.X_DIM, util.Y_DIM], "m", @@ -2149,7 +2147,6 @@ def _calculate_more_trig_terms(self, cos_sg, sin_sg): ) def _init_cell_trigonometry(self): - cosa_u_64 = self.quantity_factory.zeros( [util.X_INTERFACE_DIM, util.Y_DIM], "", diff --git a/util/pace/util/grid/gnomonic.py b/util/pace/util/grid/gnomonic.py index 705014e4..f26af0f2 100644 --- a/util/pace/util/grid/gnomonic.py +++ b/util/pace/util/grid/gnomonic.py @@ -303,9 +303,9 @@ def _mirror_latlon(lon1, lat1, lon2, lat2, lon0, lat0, np): pdot = p0[0] * nb[0] + p0[1] * nb[1] + p0[2] * nb[2] pp = p0 - np.multiply(2.0, pdot) * nb - lon3 = np.empty((1, 1)) - lat3 = np.empty((1, 1)) - pp3 = np.empty((3, 1, 1)) + lon3 = np.zeros((1, 1)) + lat3 = np.zeros((1, 1)) + pp3 = np.zeros((3, 1, 1)) pp3[:, 0, 0] = pp _cart_to_latlon(1, pp3, lon3, lat3, np) diff --git a/util/pace/util/grid/helper.py b/util/pace/util/grid/helper.py index 673e484d..1b977ad8 100644 --- a/util/pace/util/grid/helper.py +++ b/util/pace/util/grid/helper.py @@ -166,8 +166,8 @@ def from_restart( but no fv_core.res.nc in 
restart data file.""" ) - ak = quantity_factory.empty([Z_INTERFACE_DIM], units="Pa") - bk = quantity_factory.empty([Z_INTERFACE_DIM], units="") + ak = quantity_factory.zeros([Z_INTERFACE_DIM], units="Pa") + bk = quantity_factory.zeros([Z_INTERFACE_DIM], units="") with fs.open(ak_bk_data_file, "rb") as f: ds = xr.open_dataset(f).isel(Time=0).drop_vars("Time") ak.view[:] = ds["ak"].values @@ -322,7 +322,6 @@ def __init__( @classmethod def new_from_metric_terms(cls, metric_terms: MetricTerms): - horizontal_data = HorizontalGridData.new_from_metric_terms(metric_terms) vertical_data = VerticalGridData.new_from_metric_terms(metric_terms) contravariant_data = ContravariantGridData.new_from_metric_terms(metric_terms) @@ -701,7 +700,6 @@ def new_from_grid_variables( es1: pace.util.Quantity, ew2: pace.util.Quantity, ) -> "DriverGridData": - try: vlon1, vlon2, vlon3 = split_quantity_along_last_dim(vlon) vlat1, vlat2, vlat3 = split_quantity_along_last_dim(vlat) diff --git a/util/pace/util/halo_data_transformer.py b/util/pace/util/halo_data_transformer.py index 00a547d6..e97bb97a 100644 --- a/util/pace/util/halo_data_transformer.py +++ b/util/pace/util/halo_data_transformer.py @@ -70,7 +70,7 @@ def _build_flatten_indices( """ # Have to go down to numpy to leverage indices calculation - arr_indices = np.empty(shape, dtype=np.int32, order="C")[slices] + arr_indices = np.zeros(shape, dtype=np.int32, order="C")[slices] # Get offset from first index offset_dims = [] @@ -875,7 +875,6 @@ def _opt_unpack_scalar(self, quantities: List[Quantity]): # Use private stream with self._get_stream(cu_kernel_args.stream): - # Launch kernel blocks = 128 grid_x = (info_x._unpack_buffer_size // blocks) + 1 @@ -942,7 +941,6 @@ def _opt_unpack_vector( # Use private stream with self._get_stream(cu_kernel_args.stream): - # Buffer sizes edge_size = info_x._unpack_buffer_size + info_y._unpack_buffer_size diff --git a/util/pace/util/initialization/allocator.py b/util/pace/util/initialization/allocator.py 
index c865cbbf..1a68495e 100644 --- a/util/pace/util/initialization/allocator.py +++ b/util/pace/util/initialization/allocator.py @@ -102,7 +102,7 @@ def from_array( That numpy array must correspond to the correct shape and extent for the given dims. """ - base = self.empty( + base = self.zeros( dims=dims, units=units, dtype=data.dtype, From 31c484455b4c389fe83b75baaed26445d4486965 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 23 Aug 2023 13:12:52 -0400 Subject: [PATCH 46/57] Missed commit --- stencils/pace/stencils/testing/grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stencils/pace/stencils/testing/grid.py b/stencils/pace/stencils/testing/grid.py index 65ad8870..4cf623b1 100644 --- a/stencils/pace/stencils/testing/grid.py +++ b/stencils/pace/stencils/testing/grid.py @@ -504,7 +504,7 @@ def grid_data(self) -> "GridData": data = getattr(self, name) assert data is not None - quantity = self.quantity_factory.empty(dims=dims, units=units) + quantity = self.quantity_factory.zeros(dims=dims, units=units) if len(quantity.shape) == 3: quantity.data[:] = data[:, :, : quantity.shape[2]] elif len(quantity.shape) == 2: From 08f3033df36b0c730a393e94e14310f2a1875ec3 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 25 Aug 2023 15:45:37 -0400 Subject: [PATCH 47/57] Update dsl/pace/dsl/caches/codepath.py Co-authored-by: Oliver Elbert --- dsl/pace/dsl/caches/codepath.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dsl/pace/dsl/caches/codepath.py b/dsl/pace/dsl/caches/codepath.py index cb8327b5..c80bac48 100644 --- a/dsl/pace/dsl/caches/codepath.py +++ b/dsl/pace/dsl/caches/codepath.py @@ -2,12 +2,11 @@ class FV3CodePath(enum.Enum): - """Enum listing all possible code path on a cube sphere. - For any layout the cube sphere has up to 9 different code path, 10 - when counting the 1,1 layout which aggregates all 9. 
Those are related to - the positioning of the rank on the tile and which of the edge/corner case - it has to handle. - Since the framework inline code to optimize, we _cannot_ pre-suppose of the code + """Enum listing all possible code paths on a cube sphere. + For any layout the cube sphere has up to 9 different code paths depending on + the positioning of the rank on the tile and which of the edge/corner cases + it has to handle, as well as the possibility for all boundary computations in the 1x1 layout case. + Since the framework inlines code to optimize, we _cannot_ pre-suppose which code being kept and/or ejected. This enum serves as the ground truth to map rank to the proper generated code. """ From d63a0f01679c3f22f7d1053bfaed31b49693a543 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 25 Aug 2023 15:52:57 -0400 Subject: [PATCH 48/57] Lint --- dsl/pace/dsl/caches/codepath.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dsl/pace/dsl/caches/codepath.py b/dsl/pace/dsl/caches/codepath.py index c80bac48..8ebf9492 100644 --- a/dsl/pace/dsl/caches/codepath.py +++ b/dsl/pace/dsl/caches/codepath.py @@ -5,7 +5,8 @@ class FV3CodePath(enum.Enum): """Enum listing all possible code paths on a cube sphere. For any layout the cube sphere has up to 9 different code paths depending on the positioning of the rank on the tile and which of the edge/corner cases - it has to handle, as well as the possibility for all boundary computations in the 1x1 layout case. + it has to handle, as well as the possibility for all boundary computations in + the 1x1 layout case. Since the framework inlines code to optimize, we _cannot_ pre-suppose which code being kept and/or ejected. This enum serves as the ground truth to map rank to the proper generated code. 
From 6de1b3cf4733ee86a1e8880fd37c0a4f9cf40d14 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 25 Aug 2023 15:55:49 -0400 Subject: [PATCH 49/57] Restore zero-ing out the fields --- fv3core/pace/fv3core/stencils/d_sw.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fv3core/pace/fv3core/stencils/d_sw.py b/fv3core/pace/fv3core/stencils/d_sw.py index 14155da4..51c9ee6e 100644 --- a/fv3core/pace/fv3core/stencils/d_sw.py +++ b/fv3core/pace/fv3core/stencils/d_sw.py @@ -94,6 +94,8 @@ def heat_diss( ke_bg (in): """ with computation(PARALLEL), interval(...): + heat_source = 0.0 + diss_est = 0.0 if damp_w > 1e-5: dd8 = ke_bg * abs(dt) dw = (fx2 - fx2[1, 0, 0] + fy2 - fy2[0, 1, 0]) * rarea From 33ac533e7298032c86e659e305c3636e9311f3ae Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 25 Aug 2023 15:57:07 -0400 Subject: [PATCH 50/57] Fix formatting in geos logger --- fv3core/pace/fv3core/initialization/geos_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index f7133543..a7a526ee 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -154,7 +154,7 @@ def __init__( f" orchestration : {self._is_orchestrated}\n" f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz}" f"(halo: {sizer.n_halo})\n" - f" {device_ordinal_info}" + f" Device ord: {device_ordinal_info}\n" f" Nvidia MPS : {MPS_is_on}" ) From 79556957337bfce11bfc8b94ae64c779340023b6 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 25 Aug 2023 16:01:55 -0400 Subject: [PATCH 51/57] Clean up --- dsl/pace/dsl/dace/dace_config.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dsl/pace/dsl/dace/dace_config.py b/dsl/pace/dsl/dace/dace_config.py index 6f2befff..1bb0939e 100644 --- a/dsl/pace/dsl/dace/dace_config.py +++ b/dsl/pace/dsl/dace/dace_config.py @@ -1,4 +1,5 @@ import 
enum +import os from typing import Any, Dict, Optional, Tuple import dace.config @@ -162,8 +163,6 @@ def __init__( # Temporary. This is a bit too out of the ordinary for the common user. # We should refactor the architecture to allow for a `gtc:orchestrated:dace:X` # backend that would signify both the `CPU|GPU` split and the orchestration mode - import os - if orchestration is None: fv3_dacemode_env_var = os.getenv("FV3_DACEMODE", "Python") # The below condition guard against defining empty FV3_DACEMODE and @@ -266,8 +265,6 @@ def __init__( # attempt to kill the dace.conf to avoid confusion if dace.config.Config._cfg_filename: try: - import os - os.remove(dace.config.Config._cfg_filename) except OSError: pass @@ -317,7 +314,7 @@ def get_orchestrate(self) -> DaCeOrchestration: return self._orchestrate def get_sync_debug(self) -> bool: - return dace.config.Config.get("compiler", "cuda", "syncdebug") + return dace.config.Config.get_bool("compiler", "cuda", "syncdebug") def as_dict(self) -> Dict[str, Any]: return { From 12527362ba741bec183f99d5abdb4562eb748b43 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 29 Aug 2023 15:42:05 -0400 Subject: [PATCH 52/57] Refactor the test to go around so reload bug --- tests/main/dsl/test_caches.py | 97 ++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 48 deletions(-) diff --git a/tests/main/dsl/test_caches.py b/tests/main/dsl/test_caches.py index d5318493..d5088e4b 100644 --- a/tests/main/dsl/test_caches.py +++ b/tests/main/dsl/test_caches.py @@ -126,54 +126,55 @@ def test_relocatability_orchestration(backend): @pytest.mark.parametrize( "backend", [ - pytest.param("gt:cpu_ifirst"), pytest.param("dace:cpu"), ], ) -def test_relocatability(backend): - # TODO: test work - but crashes when chained with other - # see https://github.com/GEOS-ESM/pace/issues/16 - pass - # import os - # import shutil - - # working_dir = os.getcwd() - - # # Compile on default - # p0 = OrchestratedProgam(backend, 
DaCeOrchestration.Python) - # p0() - # assert os.path.exists( - # f"{working_dir}/.gt_cache_000000/py38_1013/gtcpu_ifirst/__main__/_stencil/" - # ) - - # # Compile in another directory - # from gt4py.cartesian import config as gt_config - - # custom_path = f"{working_dir}/.my_cache_path" - # gt_config.cache_settings["root_path"] = custom_path - # p1 = OrchestratedProgam(backend, DaCeOrchestration.Python) - # p1() - # assert os.path.exists( - # f"{custom_path}/.gt_cache_000000/py38_1013/gtcpu_ifirst/__main__/_stencil/" - # ) - - # # Check relocability by copying the second cache directory, - # # changing the path of gt_config.cache_settings and trying to Run on it - # relocated_path = f"{working_dir}/.my_relocated_cache_path" - # shutil.copytree(custom_path, relocated_path, dirs_exist_ok=True) - # gt_config.cache_settings["root_path"] = relocated_path - # p2 = OrchestratedProgam(backend, DaCeOrchestration.Python) - # p2() - # assert os.path.exists( - # f"{relocated_path}/.gt_cache_000000/py38_1013/gtcpu_ifirst/__main__/_stencil/" - # ) - - -if __name__ == "__main__": - # TODO: test can be merged once gt4py also generates in the _FV3_X format - print("\n|> test_relocatability_orchestration('dace:cpu')\n") - test_relocatability_orchestration("dace:cpu") - print("\n|> test_relocatability('gt:cpu_ifirst')\n") - test_relocatability("gt:cpu_ifirst") - print("\n|> test_relocatability('dace:cpu')\n") - test_relocatability("dace:cpu") +def test_relocatability(backend: str): + import os + import shutil + + import gt4py + from gt4py.cartesian import config as gt_config + + from pace.util.mpi import MPI + + # Restore original dir name + gt4py.cartesian.config.cache_settings["dir_name"] = os.environ.get( + "GT_CACHE_DIR_NAME", f".gt_cache_{MPI.COMM_WORLD.Get_rank():06}" + ) + + backend_sanitized = backend.replace(":", "") + working_dir = os.getcwd() + + # Compile on default + p0 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p0() + assert os.path.exists( + 
f"{working_dir}/.gt_cache_000000/py38_1013/{backend_sanitized}" + "/__main__/_stencil/" + ) + + # Compile in another directory + + custom_path = f"{working_dir}/.my_cache_path" + gt_config.cache_settings["root_path"] = custom_path + p1 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p1() + assert os.path.exists( + f"{custom_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" + "/__main__/_stencil/" + ) + + # Check relocability by copying the second cache directory, + # changing the path of gt_config.cache_settings and trying to Run on it + relocated_path = f"{working_dir}/.my_relocated_cache_path" + shutil.copytree( + f"{working_dir}/.gt_cache_000000", relocated_path, dirs_exist_ok=True + ) + gt_config.cache_settings["root_path"] = relocated_path + p2 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p2() + assert os.path.exists( + f"{relocated_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" + "/__main__/_stencil/" + ) From 8de32bc52aa37f091d7d13c3ac9626d7b62f9a6f Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 29 Aug 2023 16:14:09 -0400 Subject: [PATCH 53/57] Update requirements to include external/dace Include boost into main test --- .github/workflows/main_unit_tests.yml | 4 ++-- requirements_dev.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main_unit_tests.yml b/.github/workflows/main_unit_tests.yml index 5dbf4a1f..5800baa6 100644 --- a/.github/workflows/main_unit_tests.yml +++ b/.github/workflows/main_unit_tests.yml @@ -15,9 +15,9 @@ jobs: uses: actions/setup-python@v4.6.0 with: python-version: '3.8.12' - - name: Install OpenMPI for gt4py + - name: Install OpenMPI & Boost for gt4py run: | - sudo apt-get install libopenmpi-dev + sudo apt-get install libopenmpi-dev libboost1.74-dev - name: Install Python packages run: | python -m pip install --upgrade pip diff --git a/requirements_dev.txt b/requirements_dev.txt index 052bf5c3..59853798 100644 --- a/requirements_dev.txt +++ 
b/requirements_dev.txt @@ -15,7 +15,7 @@ dace==0.14.0 f90nml>=1.1.0 numpy>=1.15 -e external/gt4py --e util[dace] +-e external/dace -e stencils -e dsl -e physics From 6ef8b6081d005f14c13c70eed363f7f5d411094a Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 29 Aug 2023 16:21:37 -0400 Subject: [PATCH 54/57] Typo --- requirements_dev.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index 59853798..ef46c36c 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -11,7 +11,6 @@ dask>=2021.10.0 netCDF4 cftime fv3config>=0.9.0 -dace==0.14.0 f90nml>=1.1.0 numpy>=1.15 -e external/gt4py From 51fca6ed788cee5b0091d443d3ddc5014e231f17 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 29 Aug 2023 16:29:18 -0400 Subject: [PATCH 55/57] Revert wrong branch changes --- requirements_dev.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index ef46c36c..484c4948 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -10,11 +10,12 @@ zarr dask>=2021.10.0 netCDF4 cftime +dace==0.14.0 fv3config>=0.9.0 f90nml>=1.1.0 numpy>=1.15 -e external/gt4py --e external/dace +-e util[dace] -e stencils -e dsl -e physics From 132e2c4d184caab174f19dbf01adf06a72ea95a4 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 30 Aug 2023 09:32:29 -0400 Subject: [PATCH 56/57] Fix utest called from pytest --- tests/main/dsl/test_caches.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/main/dsl/test_caches.py b/tests/main/dsl/test_caches.py index d5088e4b..c1f01303 100644 --- a/tests/main/dsl/test_caches.py +++ b/tests/main/dsl/test_caches.py @@ -144,37 +144,33 @@ def test_relocatability(backend: str): ) backend_sanitized = backend.replace(":", "") - working_dir = os.getcwd() # Compile on default p0 = OrchestratedProgam(backend, DaCeOrchestration.Python) p0() assert os.path.exists( - 
f"{working_dir}/.gt_cache_000000/py38_1013/{backend_sanitized}" - "/__main__/_stencil/" + f"./.gt_cache_000000/py38_1013/{backend_sanitized}/test_caches/_stencil/" ) # Compile in another directory - custom_path = f"{working_dir}/.my_cache_path" + custom_path = "./.my_cache_path" gt_config.cache_settings["root_path"] = custom_path p1 = OrchestratedProgam(backend, DaCeOrchestration.Python) p1() assert os.path.exists( f"{custom_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" - "/__main__/_stencil/" + "/test_caches/_stencil/" ) # Check relocability by copying the second cache directory, # changing the path of gt_config.cache_settings and trying to Run on it - relocated_path = f"{working_dir}/.my_relocated_cache_path" - shutil.copytree( - f"{working_dir}/.gt_cache_000000", relocated_path, dirs_exist_ok=True - ) + relocated_path = "./.my_relocated_cache_path" + shutil.copytree("./.gt_cache_000000", relocated_path, dirs_exist_ok=True) gt_config.cache_settings["root_path"] = relocated_path p2 = OrchestratedProgam(backend, DaCeOrchestration.Python) p2() assert os.path.exists( f"{relocated_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" - "/__main__/_stencil/" + "/test_caches/_stencil/" ) From 689f4b0811e836adad0f330ed59ee19a5f082fdd Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 30 Aug 2023 10:19:56 -0400 Subject: [PATCH 57/57] Update comment --- dsl/pace/dsl/dace/orchestration.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dsl/pace/dsl/dace/orchestration.py b/dsl/pace/dsl/dace/orchestration.py index 1feca341..7858381a 100644 --- a/dsl/pace/dsl/dace/orchestration.py +++ b/dsl/pace/dsl/dace/orchestration.py @@ -194,12 +194,13 @@ def _build_sdfg( ), ) - # Compilation done, either exit or scatter/gather and run + # Compilation done. + # On Build: all ranks sync, then exit. + # On BuildAndRun: all ranks sync, then load the SDFG from + # the expected path (made available by build). 
+ # We use a "FrozenCompiledSDFG" to minimize re-entry cost at call time # DEV NOTE: we explicitly use MPI.COMM_WORLD here because it is # a true multi-machine sync, outside of our own communicator class. - # Also this code is protected in the case of running on one machine by the fact - # that 0 is _always_ a compiling rank & unblock_waiting_tiles is protected - # against scattering when no other ranks are present. if config.get_orchestrate() == DaCeOrchestration.Build: MPI.COMM_WORLD.Barrier() # Protect against early exist which kill SLURM jobs DaCeProgress.log(