From 3afbeefb951c0c6d16ac27605f71c14ffe7d1ea4 Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Wed, 16 Aug 2023 15:25:18 -0400 Subject: [PATCH 1/3] Feature/dp driver (#13) * initial commit * adding test config * adding the rest of driver and util code * updating history.md * move u_max to dycore config * uncomment assert * added comment explaining the copy of grid type to dycore config --- .../examples/configs/baroclinic_c12_dp.yaml | 102 ++++++++++++++++++ driver/pace/driver/driver.py | 4 + driver/pace/driver/grid.py | 15 ++- fv3core/pace/fv3core/_config.py | 2 + tests/main/driver/test_example_configs.py | 1 + util/HISTORY.md | 2 + util/pace/util/grid/generation.py | 9 ++ util/pace/util/namelist.py | 8 ++ 8 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 driver/examples/configs/baroclinic_c12_dp.yaml diff --git a/driver/examples/configs/baroclinic_c12_dp.yaml b/driver/examples/configs/baroclinic_c12_dp.yaml new file mode 100644 index 00000000..029767ca --- /dev/null +++ b/driver/examples/configs/baroclinic_c12_dp.yaml @@ -0,0 +1,102 @@ +stencil_config: + compilation_config: + backend: numpy + rebuild: false + validate_args: true + format_source: false + device_sync: false +grid_config: + type: generated + config: + grid_type: 4 + dx_const: 3000.0 + dy_const: 3000.0 + deglat: 10.0 +initialization: + type: baroclinic +performance_config: + collect_performance: true + experiment_name: c12_baroclinic +nx_tile: 12 +nz: 79 +dt_atmos: 225 +minutes: 15 +layout: + - 1 + - 1 +diagnostics_config: + path: output + output_format: netcdf + names: + - u + - v + - ua + - va + - pt + - delp + - qvapor + - qliquid + - qice + - qrain + - qsnow + - qgraupel + z_select: + - level: 65 + names: + - pt +dycore_config: + a_imp: 1.0 + beta: 0. + consv_te: 0. + d2_bg: 0. + d2_bg_k1: 0.2 + d2_bg_k2: 0.1 + d4_bg: 0.15 + d_con: 1.0 + d_ext: 0.0 + dddmp: 0.5 + delt_max: 0.002 + do_sat_adj: true + do_vort_damp: true + fill: true + hord_dp: 6 + hord_mt: 6 + hord_tm: 6 + hord_tr: 8 + hord_vt: 6 + hydrostatic: false + k_split: 1 + ke_bg: 0. + kord_mt: 9 + kord_tm: -9 + kord_tr: 9 + kord_wz: 9 + n_split: 1 + nord: 3 + nwat: 6 + p_fac: 0.05 + rf_cutoff: 3000. + rf_fast: true + tau: 10. + vtdm4: 0.06 + z_tracer: true + do_qa: true + tau_i2s: 1000. + tau_g2v: 1200. + ql_gen: 0.001 + ql_mlt: 0.002 + qs_mlt: 0.000001 + qi_lim: 1.0 + dw_ocean: 0.1 + dw_land: 0.15 + icloud_f: 0 + tau_l2v: 300. + tau_v2l: 90. 
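+  # Note (annotation, not in the original file): u_max below is new with this
+  # PR's doubly-periodic (dp) grid support; it is forwarded to the dycore
+  # config as DynamicalCoreConfig.u_max (see util/pace/util/namelist.py).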
+ fv_sg_adj: 0 + n_sponge: 48 + u_max: 355.0 + +physics_config: + hydrostatic: false + nwat: 6 + do_qa: true diff --git a/driver/pace/driver/driver.py b/driver/pace/driver/driver.py index 07317415..0ef3263b 100644 --- a/driver/pace/driver/driver.py +++ b/driver/pace/driver/driver.py @@ -273,6 +273,10 @@ def from_dict(cls, kwargs: Dict[str, Any]) -> "DriverConfig": kwargs["grid_config"] = GridInitializerSelector.from_dict( kwargs["grid_config"] ) + grid_type = kwargs["grid_config"].config.grid_type + # Copy grid_type to the DycoreConfig if it's not the default value + if grid_type != 0: + kwargs["dycore_config"].grid_type = grid_type if ( isinstance(kwargs["stencil_config"], dict) diff --git a/driver/pace/driver/grid.py b/driver/pace/driver/grid.py index 4817869c..c184d566 100644 --- a/driver/pace/driver/grid.py +++ b/driver/pace/driver/grid.py @@ -85,12 +85,20 @@ class GeneratedGridConfig(GridInitializer): lon_target: desired center longitude for refined tile (deg) lat_target: desired center latitude for refined tile (deg) restart_path: if given, load vertical grid from restart file + grid_type: type of grid, 0 is a gnomonic cubed-sphere, 4 is doubly-periodic + dx_const: constant x-width of grid cells on a dp-grid + dy_const: constant y-width of grid cells on a dp-grid + deglat: latitude to use for coriolis calculations on a dp-grid """ stretch_factor: Optional[float] = 1.0 lon_target: Optional[float] = 350.0 lat_target: Optional[float] = -90.0 restart_path: Optional[str] = None + grid_type: Optional[int] = 0 + dx_const: Optional[float] = 1000.0 + dy_const: Optional[float] = 1000.0 + deglat: Optional[float] = 15.0 def get_grid( self, @@ -99,7 +107,12 @@ def get_grid( ) -> Tuple[DampingCoefficients, DriverGridData, GridData]: metric_terms = MetricTerms( - quantity_factory=quantity_factory, communicator=communicator + quantity_factory=quantity_factory, + communicator=communicator, + grid_type=self.grid_type, + dx_const=self.dx_const, + dy_const=self.dy_const, + deglat=self.deglat, ) if self.stretch_factor != 1: # do horizontal grid transformation _transform_horizontal_grid( diff --git a/fv3core/pace/fv3core/_config.py b/fv3core/pace/fv3core/_config.py index 17609b7c..51fb609f 100644 --- a/fv3core/pace/fv3core/_config.py +++ b/fv3core/pace/fv3core/_config.py @@ -195,6 +195,7 @@ class DynamicalCoreConfig: do_qa: bool = DEFAULT_BOOL layout: Tuple[int, int] = NamelistDefaults.layout grid_type: int = NamelistDefaults.grid_type + u_max: float = NamelistDefaults.u_max # max windspeed for dp config do_f3d: bool = NamelistDefaults.do_f3d inline_q: bool = NamelistDefaults.inline_q do_skeb: bool = NamelistDefaults.do_skeb # save dissipation estimate @@ -334,6 +335,7 @@ def from_namelist(cls, namelist: Namelist) -> "DynamicalCoreConfig": do_qa=namelist.do_qa, layout=namelist.layout, grid_type=namelist.grid_type, + u_max=namelist.u_max, do_f3d=namelist.do_f3d, inline_q=namelist.inline_q, do_skeb=namelist.do_skeb, diff --git a/tests/main/driver/test_example_configs.py b/tests/main/driver/test_example_configs.py index 14d74ce0..e62276d1 100644 --- a/tests/main/driver/test_example_configs.py +++ b/tests/main/driver/test_example_configs.py @@ -13,6 +13,7 @@ TESTED_CONFIGS: List[str] = [ "baroclinic_c12.yaml", + "baroclinic_c12_dp.yaml", "baroclinic_c12_comm_read.yaml", "baroclinic_c12_comm_write.yaml", "baroclinic_c12_null_comm.yaml", diff --git a/util/HISTORY.md b/util/HISTORY.md index e07ed317..26aac41c 100644 --- a/util/HISTORY.md +++ b/util/HISTORY.md @@ -4,6 +4,8 @@ History latest ------ +- Added 
`dx_const`, `dy_const`, `deglat`, and `u_max` namelist settings for doubly-periodic grids
+- Added `dx_const`, `dy_const`, and `deglat` to grid generation code for doubly-periodic grids
 - Added f32 support to halo exchange data transformation

 v0.10.0
diff --git a/util/pace/util/grid/generation.py b/util/pace/util/grid/generation.py
index 7c7ad98c..b78a7059 100644
--- a/util/pace/util/grid/generation.py
+++ b/util/pace/util/grid/generation.py
@@ -222,6 +222,9 @@ def __init__(
         quantity_factory: util.QuantityFactory,
         communicator: util.CubedSphereCommunicator,
         grid_type: int = 0,
+        dx_const: float = 1000.0,
+        dy_const: float = 1000.0,
+        deglat: float = 15.0,
     ):
         assert grid_type < 3
         self._grid_type = grid_type
@@ -375,6 +378,9 @@ def from_tile_sizing(
         communicator: util.CubedSphereCommunicator,
         backend: str,
         grid_type: int = 0,
+        dx_const: float = 1000.0,
+        dy_const: float = 1000.0,
+        deglat: float = 15.0,
     ) -> "MetricTerms":
         sizer = util.SubtileGridSizer.from_tile_params(
             nx_tile=npx - 1,
@@ -393,6 +399,9 @@ def from_tile_sizing(
             quantity_factory=quantity_factory,
             communicator=communicator,
             grid_type=grid_type,
+            dx_const=dx_const,
+            dy_const=dy_const,
+            deglat=deglat,
         )

     @property
diff --git a/util/pace/util/namelist.py b/util/pace/util/namelist.py
index ff082736..0133e3f6 100644
--- a/util/pace/util/namelist.py
+++ b/util/pace/util/namelist.py
@@ -12,6 +12,10 @@ class NamelistDefaults:
     layout = (1, 1)
     grid_type = 0
+    dx_const = 1000.0
+    dy_const = 1000.0
+    deglat = 15.0
+    u_max = 350.0
     do_f3d = False
     inline_q = False
     do_skeb = False  # save dissipation estimate
@@ -372,6 +376,10 @@ class Namelist:
     # fvmxl: Any
     # ldebug: Any
     grid_type: int = NamelistDefaults.grid_type
+    dx_const: float = NamelistDefaults.dx_const
+    dy_const: float = NamelistDefaults.dy_const
+    deglat: float = NamelistDefaults.deglat
+    u_max: float = NamelistDefaults.u_max
     do_f3d: bool = NamelistDefaults.do_f3d
     inline_q: bool = NamelistDefaults.inline_q
     do_skeb: bool = NamelistDefaults.do_skeb  # save dissipation estimate

From d8ebc398a6b2dfe21e6adfb81f6232214b7b25ae Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Fri, 18 Aug 2023 08:25:24 -0400
Subject: [PATCH 2/3] Turn main unit test & lint on PR, logger clean up
 [NASA:Update] (#15)

* Initialize GeosDycoreWrapper with bdt (timestep)
* Use GEOS version of constants
* 1. Add qcld to the list of tracers being advected
  2. Made GEOS-specific changes to thresholds in saturation adjustment
* Accumulate diss_est
* Allow GEOS_WRAPPER to process device data
* Add clear to collector for 3rd party use. GEOS passes down timings to caller
* Make kernel analysis run a copy stencil to compute local bandwidth
  Parametrize tool with backend, output format
* Move constants onto an env var
  Add saturation adjustment threshold to const
* Restrict dace to 0.14.1 due to a parsing bug
* Add guard for bdt==0
* Fix theoretical timings
* Fixed a bug where pkz was being calculated twice, and the second calc was wrong
* Downgrade DaCe to 0.14.0 pending array aliasing fix
* Set default cache path for orchestrated DaCe to respect GT_CACHE_* env
* Remove previous per stencil override of default_build_folder
* Revert "Set default cache path for orchestrated DaCe to respect GT_CACHE_* env"
* Read cache_root in default dace backend
* Document faulty behavior with GT_CACHE_DIR_NAME
* Check for the string value of CONST_VERSION directly instead of enum
* Protect constant selection more rigorously.
Clean abort on unknown constant given * Log constants selection * Refactor NQ to constants.py * Introduce PACE_LOGLEVEL to control log level from outside * Code guidelines clean up * Devops/GitHub actions on (#15) * Linting on PR * Run main unit test * Update python to available 3.8.12 * Fix unit tests (remove dxa, dya rely on halo ex) * Update HISTORY.md * Adapt log_level in driver.run * Verbose the PACE_CONSTANTS * Doc log level hierarchical nature --------- Co-authored-by: Purnendu Chakraborty Co-authored-by: Purnendu Chakraborty --- .github/workflows/lint.yml | 27 +++++++++ .github/workflows/main_unit_tests.yml | 27 +++++++++ README.md | 20 ++++++- driver/pace/driver/run.py | 54 +++-------------- dsl/pace/dsl/dace/utils.py | 2 +- .../fv3core/initialization/geos_wrapper.py | 8 +-- fv3core/pace/fv3core/stencils/d_sw.py | 6 +- fv3core/pace/fv3core/stencils/fv_dynamics.py | 3 +- tests/main/fv3core/test_init_from_geos.py | 60 ++++++++++--------- util/HISTORY.md | 1 + util/pace/util/__init__.py | 2 +- util/pace/util/communicator.py | 3 - util/pace/util/local_comm.py | 7 +-- util/pace/util/logging.py | 18 +++++- util/pace/util/monitor/netcdf_monitor.py | 10 +--- util/pace/util/monitor/zarr_monitor.py | 9 +-- util/pace/util/mpi.py | 30 +++++----- 17 files changed, 163 insertions(+), 124 deletions(-) create mode 100644 .github/workflows/lint.yml create mode 100644 .github/workflows/main_unit_tests.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..0cc08080 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,27 @@ +name: "Lint" +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - name: Checkout Pace repository + uses: actions/checkout@v3.5.2 + with: + submodules: 'recursive' + - name: Step Python 3.8.12 + uses: actions/setup-python@v4.6.0 + with: + python-version: '3.8.12' + - name: Install OpenMPI for gt4py + run: | + sudo apt-get install libopenmpi-dev + - name: Install Python packages + run: | + python -m pip install --upgrade pip + pip install -r requirements_dev.txt -r requirements_lint.txt + - name: Run lint via pre-commit + run: | + pre-commit run --all-files diff --git a/.github/workflows/main_unit_tests.yml b/.github/workflows/main_unit_tests.yml new file mode 100644 index 00000000..5dbf4a1f --- /dev/null +++ b/.github/workflows/main_unit_tests.yml @@ -0,0 +1,27 @@ +name: "Main unit tests" +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] + +jobs: + main_unit_tests: + runs-on: ubuntu-latest + steps: + - name: Checkout Pace repository + uses: actions/checkout@v3.5.2 + with: + submodules: 'recursive' + - name: Step Python 3.8.12 + uses: actions/setup-python@v4.6.0 + with: + python-version: '3.8.12' + - name: Install OpenMPI for gt4py + run: | + sudo apt-get install libopenmpi-dev + - name: Install Python packages + run: | + python -m pip install --upgrade pip + pip install -r requirements_dev.txt + - name: Run all main tests + run: | + pytest -x tests/main diff --git a/README.md b/README.md index 31980394..5884cee8 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Pace is an implementation of the FV3GFS / SHiELD atmospheric model developed by Full Sphinx documentation can be found at [https://ai2cm.github.io/pace/](https://ai2cm.github.io/pace/). **WARNING** This repo is under active development - supported features and procedures can change rapidly and without notice. 
+
## Quickstart - bare metal

### Build
@@ -27,10 +28,13 @@ export BOOST_ROOT=BOOST/ROOT/boost_1_79_0
 ```

 When cloning Pace you will need to update the repository's submodules as well:
+
 ```shell
 git clone --recursive https://github.com/ai2cm/pace.git
 ```
+
 or if you have already cloned the repository:
+
 ```
 git submodule update --init --recursive
 ```
@@ -43,6 +47,7 @@ source venv_name/bin/activate
 ```

 Inside of your pace `venv` or conda environment pip install the Python requirements, GT4Py, and Pace:
+
 ```shell
 pip3 install -r requirements_dev.txt -c constraints.txt
 ```
@@ -52,6 +57,7 @@ Shell scripts to install Pace on specific machines such as Gaea can be found in `examples/build_scripts/`.

 ### Run

 With the environment activated, you can run an example baroclinic test case with the following command:
+
 ```shell
 mpirun -n 6 python3 -m pace.driver.run driver/examples/configs/baroclinic_c12.yaml

 # or with oversubscribe if you do not have at least 6 cores
@@ -61,20 +67,33 @@ mpirun -n 6 --oversubscribe python3 -m pace.driver.run driver/examples/configs/b
 ```

 After the run completes, you will see an output directory `output.zarr`. An example to visualize the output is provided in `driver/examples/plot_output.py`. See the [driver example](driver/examples/README.md) section for more details.

+### Environment variable configuration
+
+- `PACE_CONSTANTS`: Pace is bundled with various constants (see _util/pace/util/constants.py_).
+  - `FV3DYCORE` NOAA's FV3 dynamical core constants (original port)
+  - `GFS` Constants as defined in NOAA GFS
+  - `GEOS` Constants as defined in GEOS v13
+- `PACE_FLOAT_PRECISION`: default precision of the fields & scalars in the numerics. Defaults to 64.
+- `PACE_LOGLEVEL`: logging level to display (DEBUG, INFO, WARNING, ERROR, CRITICAL). Defaults to INFO.
+
 ## Quickstart - Docker
+
 ### Build

 While it is possible to install and build pace bare-metal, we can ensure all system libraries are installed with the correct versions by using a Docker container to test and develop pace.

 First, you will need to update the git submodules so that any dependencies are cloned and at the correct version:
+
 ```shell
 git submodule update --init --recursive
 ```

 Then build the `pace` docker image at the top level.
+
 ```shell
 make build
 ```
+
 ### Run

 ```shell
 make dev
 mpirun --mca btl_vader_single_copy_mechanism none -n 6 python3 -m pace.driver.run /pace/driver/examples/configs/baroclinic_c12.yaml
 ```
@@ -94,7 +113,6 @@ This git repository is laid out as a mono-repo, containing multiple independent

 ![Graph of interdependencies of Pace modules, generated from dependences.dot](./dependencies.svg)

-
 ## ML emulation

 An example of integration of an ML model replacing the microphysics parametrization is available on the `feature/microphysics-emulator` branch.
diff --git a/driver/pace/driver/run.py b/driver/pace/driver/run.py
index 2d9160cd..e6c39982 100644
--- a/driver/pace/driver/run.py
+++ b/driver/pace/driver/run.py
@@ -1,58 +1,15 @@
 import dataclasses
 import gc
-import logging
 from typing import Optional

 import click
 import yaml

-from pace.util.mpi import MPI
+from pace.util import pace_log, AVAILABLE_LOG_LEVELS

 from .driver import Driver, DriverConfig

-logger = logging.getLogger(__name__)
-
-
-log_levels = {
-    "info": logging.INFO,
-    "debug": logging.DEBUG,
-    "warning": logging.WARNING,
-    "error": logging.ERROR,
-    "critical": logging.CRITICAL,
-}
-
-
-def configure_logging(log_rank: Optional[int], log_level: str):
-    """
-    Configure logging for the driver.
- - Args: - log_rank: rank to log from, or 'all' to log to all ranks, - forced to 'all' if running without MPI - log_level: log level to use - """ - level = log_levels[log_level.lower()] - if MPI is None: - logging.basicConfig( - level=level, - format="%(asctime)s [%(levelname)s] %(name)s:%(message)s", - handlers=[logging.StreamHandler()], - datefmt="%Y-%m-%d %H:%M:%S", - ) - else: - if log_rank is None or int(log_rank) == MPI.COMM_WORLD.Get_rank(): - logging.basicConfig( - level=level, - format=( - f"%(asctime)s [%(levelname)s] (rank {MPI.COMM_WORLD.Get_rank()}) " - "%(name)s:%(message)s" - ), - handlers=[logging.StreamHandler()], - datefmt="%Y-%m-%d %H:%M:%S", - ) - - @click.command() @click.argument( "CONFIG_PATH", @@ -75,12 +32,15 @@ def command_line(config_path: str, log_rank: Optional[int], log_level: str): CONFIG_PATH is the path to a DriverConfig yaml file. """ - configure_logging(log_rank=log_rank, log_level=log_level) - logger.info("loading DriverConfig from yaml") + level = AVAILABLE_LOG_LEVELS[log_level.lower()] + pace_log.setLevel(level) + pace_log.info("loading DriverConfig from yaml") with open(config_path, "r") as f: config = yaml.safe_load(f) driver_config = DriverConfig.from_dict(config) - logging.info(f"DriverConfig loaded: {yaml.dump(dataclasses.asdict(driver_config))}") + pace_log.info( + f"DriverConfig loaded: {yaml.dump(dataclasses.asdict(driver_config))}" + ) main(driver_config=driver_config) diff --git a/dsl/pace/dsl/dace/utils.py b/dsl/pace/dsl/dace/utils.py index 47eb61ad..40ac3c12 100644 --- a/dsl/pace/dsl/dace/utils.py +++ b/dsl/pace/dsl/dace/utils.py @@ -28,7 +28,7 @@ def __init__(self, config: DaceConfig, label: str): @classmethod def log(cls, prefix: str, message: str): - pace_log.info(f"{prefix} {message}") + pace_log.debug(f"{prefix} {message}") @classmethod def default_prefix(cls, config: DaceConfig) -> str: diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 4fc34052..2835e77e 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -132,9 +132,12 @@ def __init__( self._allocate_output_dir() pace_log.info( - "GEOS-Wrapper with: \n" + "Pace GEOS wrapper initialized: \n" f" dt : {self.dycore_state.bdt}\n" f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" + f" backend: {backend}\n" + f" orchestration: {self._is_orchestrated}\n" + f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})" ) def _critical_path(self): @@ -173,7 +176,6 @@ def __call__( cyd: np.ndarray, diss_estd: np.ndarray, ) -> Tuple[Dict[str, np.ndarray], Dict[str, List[float]]]: - with self.perf_collector.timestep_timer.clock("numpy-to-dycore"): self.dycore_state = self._put_fortran_data_in_dycore( u, @@ -246,7 +248,6 @@ def _put_fortran_data_in_dycore( cyd: np.ndarray, diss_estd: np.ndarray, ) -> fv3core.DycoreState: - isc = self._grid_indexing.isc jsc = self._grid_indexing.jsc iec = self._grid_indexing.iec + 1 @@ -315,7 +316,6 @@ def _put_fortran_data_in_dycore( return state def _prep_outputs_for_geos(self) -> Dict[str, np.ndarray]: - output_dict = self.output_dict isc = self._grid_indexing.isc jsc = self._grid_indexing.jsc diff --git a/fv3core/pace/fv3core/stencils/d_sw.py b/fv3core/pace/fv3core/stencils/d_sw.py index cc602f4e..51c9ee6e 100644 --- a/fv3core/pace/fv3core/stencils/d_sw.py +++ b/fv3core/pace/fv3core/stencils/d_sw.py @@ -932,7 +932,7 @@ def make_quantity(): ) ) - if (self._d_con > 1.e-5) or 
(self._do_stochastic_ke_backscatter): + if (self._d_con > 1.0e-5) or (self._do_stochastic_ke_backscatter): self._accumulate_heat_source_and_dissipation_estimate_stencil = ( stencil_factory.from_dims_halo( func=accumulate_heat_source_and_dissipation_estimate, @@ -1254,11 +1254,11 @@ def __call__( self._column_namelist["d_con"], ) - if (self._d_con > 1.e-5) or (self._do_stochastic_ke_backscatter): + if (self._d_con > 1.0e-5) or (self._do_stochastic_ke_backscatter): self._accumulate_heat_source_and_dissipation_estimate_stencil( self._tmp_heat_s, heat_source, self._tmp_diss_e, diss_est ) - + self._update_u_and_v_stencil( self._tmp_ut, self._tmp_vt, diff --git a/fv3core/pace/fv3core/stencils/fv_dynamics.py b/fv3core/pace/fv3core/stencils/fv_dynamics.py index 3299e501..5f3de73a 100644 --- a/fv3core/pace/fv3core/stencils/fv_dynamics.py +++ b/fv3core/pace/fv3core/stencils/fv_dynamics.py @@ -7,7 +7,6 @@ import pace.dsl.gt4py_utils as utils import pace.fv3core.stencils.moist_cv as moist_cv import pace.util -import pace.util.constants as constants from pace.dsl.dace.orchestration import dace_inhibitor, orchestrate from pace.dsl.dace.wrapped_halo_exchange import WrappedHaloUpdater from pace.dsl.stencil import StencilFactory @@ -21,7 +20,7 @@ from pace.fv3core.stencils.neg_adj3 import AdjustNegativeTracerMixingRatio from pace.fv3core.stencils.remapping import LagrangianToEulerian from pace.stencils.c2l_ord import CubedToLatLon -from pace.util import X_DIM, Y_DIM, Z_INTERFACE_DIM, Timer +from pace.util import X_DIM, Y_DIM, Z_INTERFACE_DIM, Timer, constants from pace.util.grid import DampingCoefficients, GridData from pace.util.logging import pace_log from pace.util.mpi import MPI diff --git a/tests/main/fv3core/test_init_from_geos.py b/tests/main/fv3core/test_init_from_geos.py index 252fe7d2..16efe9e9 100644 --- a/tests/main/fv3core/test_init_from_geos.py +++ b/tests/main/fv3core/test_init_from_geos.py @@ -7,7 +7,6 @@ def test_geos_wrapper(): - namelist_dict = { "stencil_config": { "compilation_config": { @@ -82,7 +81,12 @@ def test_geos_wrapper(): comm = NullComm(rank=0, total_ranks=6, fill_value=0.0) backend = "numpy" - wrapper = fv3core.GeosDycoreWrapper(namelist, comm, backend) + wrapper = fv3core.GeosDycoreWrapper( + namelist=namelist, + comm=comm, + backend=backend, + bdt=namelist_dict["dt_atmos"], + ) nhalo = 3 shape_centered = ( namelist["nx_tile"] + 2 * nhalo, @@ -191,31 +195,33 @@ def test_geos_wrapper(): ) diss_estd = np.ones(shape_centered) - output_dict = wrapper( - u, - v, - w, - delz, - pt, - delp, - q, - ps, - pe, - pk, - peln, - pkz, - phis, - q_con, - omga, - ua, - va, - uc, - vc, - mfxd, - mfyd, - cxd, - cyd, - diss_estd, + timings = {} + output_dict, timings = wrapper( + timings=timings, + u=u, + v=v, + w=w, + delz=delz, + pt=pt, + delp=delp, + q=q, + ps=ps, + pe=pe, + pk=pk, + peln=peln, + pkz=pkz, + phis=phis, + q_con=q_con, + omga=omga, + ua=ua, + va=va, + uc=uc, + vc=vc, + mfxd=mfxd, + mfyd=mfyd, + cxd=cxd, + cyd=cyd, + diss_estd=diss_estd, ) assert isinstance(output_dict["u"], np.ndarray) diff --git a/util/HISTORY.md b/util/HISTORY.md index 26aac41c..0b0a42b6 100644 --- a/util/HISTORY.md +++ b/util/HISTORY.md @@ -7,6 +7,7 @@ latest - Added `dx_const`, `dy_const`, `deglat`, and `u_max` namelist settings for doubly-periodic grids - Added `dx_const`, `dy_const`, and `deglat` to grid generation code for doubly-periodic grids - Added f32 support to halo exchange data transformation +- Use one single logger, from logging.py v0.10.0 ------- diff --git a/util/pace/util/__init__.py 
b/util/pace/util/__init__.py
index 4911f2cf..54f684d6 100644
--- a/util/pace/util/__init__.py
+++ b/util/pace/util/__init__.py
@@ -54,7 +54,7 @@
 from .initialization import GridSizer, QuantityFactory, SubtileGridSizer
 from .io import read_state, write_state
 from .local_comm import LocalComm
-from .logging import pace_log
+from .logging import pace_log, AVAILABLE_LOG_LEVELS
 from .monitor import Monitor, NetCDFMonitor, ZarrMonitor
 from .mpi import MPIComm
 from .namelist import Namelist, NamelistDefaults
diff --git a/util/pace/util/communicator.py b/util/pace/util/communicator.py
index 0611fa98..938469bd 100644
--- a/util/pace/util/communicator.py
+++ b/util/pace/util/communicator.py
@@ -1,5 +1,4 @@
 import abc
-import logging
 from typing import List, Mapping, Optional, Sequence, Tuple, Union, cast

 import numpy as np
@@ -15,8 +14,6 @@
 from .utils import device_synchronize

-logger = logging.getLogger("pace.util")
-
 try:
     import cupy
 except ImportError:
diff --git a/util/pace/util/local_comm.py b/util/pace/util/local_comm.py
index a289296a..32fd0fb4 100644
--- a/util/pace/util/local_comm.py
+++ b/util/pace/util/local_comm.py
@@ -1,14 +1,11 @@
 import copy
-import logging
 from typing import Any

 from .comm import Comm
+from .logging import pace_log
 from .utils import ensure_contiguous, safe_assign_array

-logger = logging.getLogger("pace.util")
-
-
 class ConcurrencyError(Exception):
     """Exception to denote that a rank cannot proceed
     because it is waiting on a call from another rank."""
@@ -104,7 +101,7 @@ def bcast(self, value, root=0):
                 "the bcast source"
             )
         value = self._get_buffer("bcast", value)
-        logger.debug(f"bcast {value} to rank {self.rank}")
+        pace_log.debug(f"bcast {value} to rank {self.rank}")
         return value

     def Barrier(self):
diff --git a/util/pace/util/logging.py b/util/pace/util/logging.py
index 81b5a7b1..1f9142fe 100644
--- a/util/pace/util/logging.py
+++ b/util/pace/util/logging.py
@@ -1,15 +1,29 @@
 import logging
+import os
 import sys

 from mpi4py import MPI

+LOGLEVEL = os.environ.get("PACE_LOGLEVEL", "INFO").upper()
+
+# Python log levels are hierarchical: setting the level to INFO means INFO and
+# everything more severe (WARNING, ERROR, CRITICAL) will be logged, while
+# DEBUG messages are filtered out.
+AVAILABLE_LOG_LEVELS = {
+    "info": logging.INFO,
+    "debug": logging.DEBUG,
+    "warning": logging.WARNING,
+    "error": logging.ERROR,
+    "critical": logging.CRITICAL,
+}
+
+
 def _pace_logger():
     name_log = logging.getLogger(__name__)
-    name_log.setLevel(logging.DEBUG)
+    name_log.setLevel(LOGLEVEL)

     handler = logging.StreamHandler(sys.stdout)
-    handler.setLevel(logging.DEBUG)
+    handler.setLevel(LOGLEVEL)
     formatter = logging.Formatter(
         fmt=(
             f"%(asctime)s|%(levelname)s|rank {MPI.COMM_WORLD.Get_rank()}|"
diff --git a/util/pace/util/monitor/netcdf_monitor.py b/util/pace/util/monitor/netcdf_monitor.py
index 18687216..0b39da60 100644
--- a/util/pace/util/monitor/netcdf_monitor.py
+++ b/util/pace/util/monitor/netcdf_monitor.py
@@ -1,4 +1,3 @@
-import logging
 import os
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set

 from .. 
import _xarray as xr from ..filesystem import get_fs +from ..logging import pace_log from ..quantity import Quantity from .convert import to_numpy -logger = logging.getLogger(__name__) - - class _TimeChunkedVariable: def __init__(self, initial: Quantity, time_chunk_size: int): self._data = np.zeros( @@ -46,7 +43,6 @@ def data(self) -> Quantity: class _ChunkedNetCDFWriter: - FILENAME_FORMAT = "state_{chunk:04d}_tile{tile}.nc" def __init__( @@ -62,7 +58,7 @@ def __init__( self._time_units: Optional[str] = None def append(self, state): - logger.debug("appending at time %d", self._i_time) + pace_log.debug("appending at time %d", self._i_time) state = {**state} # copy so we don't mutate the input time = state.pop("time", None) if self._chunked is None: @@ -75,7 +71,7 @@ def append(self, state): self._chunked[name].append(quantity) self._times.append(time) if (self._i_time + 1) % self._time_chunk_size == 0: - logger.debug("flushing on append at time %d", self._i_time) + pace_log.debug("flushing on append at time %d", self._i_time) self.flush() self._i_time += 1 diff --git a/util/pace/util/monitor/zarr_monitor.py b/util/pace/util/monitor/zarr_monitor.py index 12186799..5d7729b9 100644 --- a/util/pace/util/monitor/zarr_monitor.py +++ b/util/pace/util/monitor/zarr_monitor.py @@ -1,4 +1,3 @@ -import logging from datetime import datetime, timedelta from typing import List, Tuple, Union @@ -7,12 +6,11 @@ from .. import _xarray as xr from .. import constants, utils from .._optional_imports import cupy, zarr +from ..logging import pace_log from ..partitioner import Partitioner, subtile_slice from .convert import to_numpy -logger = logging.getLogger("pace.util") - __all__ = ["ZarrMonitor"] @@ -238,7 +236,7 @@ def append(self, quantity): ) from_slice = _get_from_slice(target_slice) - logger.debug( + pace_log.debug( f"assigning data from subtile slice {from_slice} to " f"target slice {target_slice}" ) @@ -310,7 +308,7 @@ def append(self, quantity): ) from_slice = _get_from_slice(target_slice) - logger.debug( + pace_log.debug( f"assigning data from subtile slice {from_slice} to " f"target slice {target_slice}" ) @@ -332,7 +330,6 @@ def append(self, quantity): class _ZarrTimeWriter(_ZarrVariableWriter): - _TIME_CHUNK_SIZE = 1024 def __init__(self, *args, **kwargs): diff --git a/util/pace/util/mpi.py b/util/pace/util/mpi.py index d03b7937..5acc2b00 100644 --- a/util/pace/util/mpi.py +++ b/util/pace/util/mpi.py @@ -2,16 +2,14 @@ from mpi4py import MPI except ImportError: MPI = None -import logging from typing import List, Optional, TypeVar, cast from .comm import Comm, Request +from .logging import pace_log T = TypeVar("T") -logger = logging.getLogger(__name__) - class MPIComm(Comm): def __init__(self): @@ -26,54 +24,56 @@ def Get_size(self) -> int: return self._comm.Get_size() def bcast(self, value: Optional[T], root=0) -> T: - logger.debug("bcast from root %s on rank %s", root, self._comm.Get_rank()) + pace_log.debug("bcast from root %s on rank %s", root, self._comm.Get_rank()) return self._comm.bcast(value, root=root) def barrier(self): - logger.debug("barrier on rank %s", self._comm.Get_rank()) + pace_log.debug("barrier on rank %s", self._comm.Get_rank()) self._comm.barrier() def Barrier(self): pass def Scatter(self, sendbuf, recvbuf, root=0, **kwargs): - logger.debug("Scatter on rank %s with root %s", self._comm.Get_rank(), root) + pace_log.debug("Scatter on rank %s with root %s", self._comm.Get_rank(), root) self._comm.Scatter(sendbuf, recvbuf, root=root, **kwargs) def Gather(self, sendbuf, recvbuf, 
root=0, **kwargs):
-        logger.debug("Gather on rank %s with root %s", self._comm.Get_rank(), root)
+        pace_log.debug("Gather on rank %s with root %s", self._comm.Get_rank(), root)
         self._comm.Gather(sendbuf, recvbuf, root=root, **kwargs)

     def allgather(self, sendobj: T) -> List[T]:
-        logger.debug("allgather on rank %s", self._comm.Get_rank())
+        pace_log.debug("allgather on rank %s", self._comm.Get_rank())
         return self._comm.allgather(sendobj)

     def Send(self, sendbuf, dest, tag: int = 0, **kwargs):
-        logger.debug("Send on rank %s with dest %s", self._comm.Get_rank(), dest)
+        pace_log.debug("Send on rank %s with dest %s", self._comm.Get_rank(), dest)
         self._comm.Send(sendbuf, dest, tag=tag, **kwargs)

     def sendrecv(self, sendbuf, dest, **kwargs):
-        logger.debug("sendrecv on rank %s with dest %s", self._comm.Get_rank(), dest)
+        pace_log.debug("sendrecv on rank %s with dest %s", self._comm.Get_rank(), dest)
         return self._comm.sendrecv(sendbuf, dest, **kwargs)

     def Isend(self, sendbuf, dest, tag: int = 0, **kwargs) -> Request:
-        logger.debug("Isend on rank %s with dest %s", self._comm.Get_rank(), dest)
+        pace_log.debug("Isend on rank %s with dest %s", self._comm.Get_rank(), dest)
         return self._comm.Isend(sendbuf, dest, tag=tag, **kwargs)

     def Recv(self, recvbuf, source, tag: int = 0, **kwargs):
-        logger.debug("Recv on rank %s with source %s", self._comm.Get_rank(), source)
+        pace_log.debug("Recv on rank %s with source %s", self._comm.Get_rank(), source)
         self._comm.Recv(recvbuf, source, tag=tag, **kwargs)

     def Irecv(self, recvbuf, source, tag: int = 0, **kwargs) -> Request:
-        logger.debug("Irecv on rank %s with source %s", self._comm.Get_rank(), source)
+        pace_log.debug("Irecv on rank %s with source %s", self._comm.Get_rank(), source)
         return self._comm.Irecv(recvbuf, source, tag=tag, **kwargs)

     def Split(self, color, key) -> "Comm":
-        logger.debug(
+        pace_log.debug(
             "Split on rank %s with color %s, key %s", self._comm.Get_rank(), color, key
         )
         return self._comm.Split(color, key)

     def allreduce(self, sendobj: T, op=None) -> T:
-        logger.debug("allreduce on rank %s with operator %s", self._comm.Get_rank(), op)
+        pace_log.debug(
+            "allreduce on rank %s with operator %s", self._comm.Get_rank(), op
+        )
         return self._comm.allreduce(sendobj, op)

From b1ef6b56f1f041c9e8f8478cd218f807122d6ea5 Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Mon, 11 Sep 2023 13:20:49 -0400
Subject: [PATCH 3/3] [NASA:Update] Distributed dace cache (rework) (#16)

* Initialize GeosDycoreWrapper with bdt (timestep)
* Use GEOS version of constants
* 1. Add qcld to the list of tracers being advected
  2. Made GEOS-specific changes to thresholds in saturation adjustment
* Accumulate diss_est
* Allow GEOS_WRAPPER to process device data
* Add clear to collector for 3rd party use. GEOS passes down timings to caller
* Make kernel analysis run a copy stencil to compute local bandwidth
  Parametrize tool with backend, output format
* Move constants onto an env var
  Add saturation adjustment threshold to const
* Remove unused if leading to empty code block
* Add guard for bdt==0
* Fix theoretical timings
* Fixed a bug where pkz was being calculated twice, and the second calc was wrong
* Downgrade DaCe to 0.14.0 pending array aliasing fix
* Read cache_root in default dace backend
* Document faulty behavior with GT_CACHE_DIR_NAME
* Fix bad requirements syntax
* Check for the string value of CONST_VERSION directly instead of enum
* Protect constant selection more rigorously.
  Clean abort on unknown constant given
* Log constants selection
* Refactor NQ to constants.py
* Replace all logger with pace_log
  Introduce PACE_LOGLEVEL to control log level from outside
* Code guidelines clean up
* Devops/GitHub actions on (#15)
* Add openmpi to the image
* Fix unit tests (remove dxa, dya rely on halo ex)
* Distributed compilation on orchestrated backend for NxN layouts (#14)
* Adapt orchestration distributed compile for NxN layout
* Add a more descriptive string base postfix for cache naming
  Identify the code path for all cases
  Consistent reload post-compile
  Create a central space for all caches generation logic
  No more original layout check required
* Add a test on caches relocatability
* Deactivate relocatability test due to Python crash
  Logged as issue 16
* Raise for 1,X and X,1 layouts which require a new descriptor
* Added ak, bk for 137 levels in eta.py
* Add floating point precision to GEOS bridge init
* Log info GEOS bridge (#18)
* Add floating point precision to GEOS bridge init
* Update geos/develop to grab NOAA PR9 results (#21)
* Verbose choice of block/grid size
* GEOS integration (#9)
* Initialize GeosDycoreWrapper with bdt (timestep)
* Use GEOS version of constants
* 1. 
Add qcld to the list of tracers being advected
  2. Made GEOS-specific changes to thresholds in saturation adjustment
* Accumulate diss_est
* Allow GEOS_WRAPPER to process device data
* Add clear to collector for 3rd party use. GEOS passes down timings to caller
* Make kernel analysis run a copy stencil to compute local bandwidth
  Parametrize tool with backend, output format
* Move constants onto an env var
  Add saturation adjustment threshold to const
* Restrict dace to 0.14.1 due to a parsing bug
* Add guard for bdt==0
  Fix bad merge for bdt with GEOS_Wrapper
* Remove unused code
* Fix theoretical timings
* Fixed a bug where pkz was being calculated twice, and the second calc was wrong
* Downgrade DaCe to 0.14.0 pending array aliasing fix
* Set default cache path for orchestrated DaCe to respect GT_CACHE_* env
* Remove previous per stencil override of default_build_folder
* Revert "Set default cache path for orchestrated DaCe to respect GT_CACHE_* env"
* Revert "Remove previous per stencil override of default_build_folder"
* Read cache_root in default dace backend
* Document faulty behavior with GT_CACHE_DIR_NAME
* Fix bad requirements syntax
* Check for the string value of CONST_VERSION directly instead of enum
* Protect constant selection more rigorously.
  Clean abort on unknown constant given
* Log constants selection
* Refactor NQ to constants.py
* Fix or explain inlined import
* Verbose runtime error when bad dt_atmos
* Verbose warm up
* re-initialize heat_source and diss_est each call, add do_skeb check to accumulation

---------

Co-authored-by: Purnendu Chakraborty
Co-authored-by: Oliver Elbert

---------

Co-authored-by: Rusty Benson <6594772+bensonr@users.noreply.github.com>
Co-authored-by: Oliver Elbert
Co-authored-by: Purnendu Chakraborty
Co-authored-by: Oliver Elbert

* [NOAA:Update] Bring back #15 & doubly periodic domain (#25)
* Feature/dp driver (#13)
* initial commit
* adding test config
* adding the rest of driver and util code
* updating history.md
* move u_max to dycore config
* uncomment assert
* added comment explaining the copy of grid type to dycore config
* Turn main unit test & lint on PR, logger clean up [NASA:Update] (#15)
* Initialize GeosDycoreWrapper with bdt (timestep)
* Use GEOS version of constants
* 1. Add qcld to the list of tracers being advected
  2. Made GEOS-specific changes to thresholds in saturation adjustment
* Accumulate diss_est
* Allow GEOS_WRAPPER to process device data
* Add clear to collector for 3rd party use. GEOS passes down timings to caller
* Make kernel analysis run a copy stencil to compute local bandwidth
  Parametrize tool with backend, output format
* Move constants onto an env var
  Add saturation adjustment threshold to const
* Restrict dace to 0.14.1 due to a parsing bug
* Add guard for bdt==0
* Fix theoretical timings
* Fixed a bug where pkz was being calculated twice, and the second calc was wrong
* Downgrade DaCe to 0.14.0 pending array aliasing fix
* Set default cache path for orchestrated DaCe to respect GT_CACHE_* env
* Remove previous per stencil override of default_build_folder
* Revert "Set default cache path for orchestrated DaCe to respect GT_CACHE_* env"
* Read cache_root in default dace backend
* Document faulty behavior with GT_CACHE_DIR_NAME
* Check for the string value of CONST_VERSION directly instead of enum
* Protect constant selection more rigorously.
  Clean abort on unknown constant given
* Log constants selection
* Refactor NQ to constants.py
* Introduce PACE_LOGLEVEL to control log level from outside
* Devops/GitHub actions on (#15)
* Update python to available 3.8.12
* Fix unit tests (remove dxa, dya rely on halo ex)
* Update HISTORY.md
* Adapt log_level in driver.run
* Verbose the PACE_CONSTANTS
* Doc log level hierarchical nature

---------

Co-authored-by: Purnendu Chakraborty
Co-authored-by: Purnendu Chakraborty

* Fix non-deterministic temporaries by using `zeros` everywhere instead of `empty`
* Update dsl/pace/dsl/caches/codepath.py

Co-authored-by: Oliver Elbert

* Refactor the test to go around the reload bug

---------

Co-authored-by: Purnendu Chakraborty
Co-authored-by: Purnendu Chakraborty
Co-authored-by: Rusty Benson <6594772+bensonr@users.noreply.github.com>
Co-authored-by: Oliver Elbert
Co-authored-by: Oliver Elbert
---
 .github/workflows/main_unit_tests.yml         |   4 +-
 driver/pace/driver/driver.py                  |   2 +-
 driver/pace/driver/grid.py                    |   4 +-
 driver/pace/driver/initialization.py          |   5 +-
 driver/pace/driver/run.py                     |   2 +-
 dsl/pace/dsl/caches/cache_location.py         |  46 +++
 dsl/pace/dsl/caches/codepath.py               |  33 ++
 dsl/pace/dsl/dace/build.py                    | 136 ++------
 dsl/pace/dsl/dace/dace_config.py              | 118 ++++++-
 dsl/pace/dsl/dace/orchestration.py            |  52 +---
 dsl/pace/dsl/typing.py                        |   6 +-
 .../fv3core/initialization/geos_wrapper.py    |  28 +-
 .../pace/fv3core/testing/translate_dyncore.py |   4 +-
 .../translate/translate_remapping.py          |   2 +-
 requirements_dev.txt                          |   2 +-
 stencils/pace/stencils/testing/grid.py        |   2 +-
 .../stencils/testing/parallel_translate.py    |   3 +-
 stencils/pace/stencils/testing/temporaries.py |   7 +-
 tests/main/dsl/test_caches.py                 | 176 +++++++++++
 tests/main/dsl/test_dace_config.py            |  68 +++-
 tests/main/test_grid_init.py                  |   2 -
 util/pace/util/__init__.py                    |   2 +-
 util/pace/util/communicator.py                |  10 +-
 util/pace/util/grid/eta.py                    | 293 +++++++++++++++++-
 util/pace/util/grid/generation.py             |   5 +-
 util/pace/util/grid/gnomonic.py               |   6 +-
 util/pace/util/grid/helper.py                 |   6 +-
 util/pace/util/halo_data_transformer.py       |   4 +-
 util/pace/util/initialization/allocator.py    |   2 +-
 29 files changed, 817 insertions(+), 213 deletions(-)
 create mode 100644 dsl/pace/dsl/caches/cache_location.py
 create mode 100644 dsl/pace/dsl/caches/codepath.py
 create mode 100644 tests/main/dsl/test_caches.py

diff --git a/.github/workflows/main_unit_tests.yml b/.github/workflows/main_unit_tests.yml
index 
5dbf4a1f..5800baa6 100644 --- a/.github/workflows/main_unit_tests.yml +++ b/.github/workflows/main_unit_tests.yml @@ -15,9 +15,9 @@ jobs: uses: actions/setup-python@v4.6.0 with: python-version: '3.8.12' - - name: Install OpenMPI for gt4py + - name: Install OpenMPI & Boost for gt4py run: | - sudo apt-get install libopenmpi-dev + sudo apt-get install libopenmpi-dev libboost1.74-dev - name: Install Python packages run: | python -m pip install --upgrade pip diff --git a/driver/pace/driver/driver.py b/driver/pace/driver/driver.py index 0ef3263b..284acaca 100644 --- a/driver/pace/driver/driver.py +++ b/driver/pace/driver/driver.py @@ -275,7 +275,7 @@ def from_dict(cls, kwargs: Dict[str, Any]) -> "DriverConfig": ) grid_type = kwargs["grid_config"].config.grid_type # Copy grid_type to the DycoreConfig if it's not the default value - if grid_type != 0: + if grid_type != 0: kwargs["dycore_config"].grid_type = grid_type if ( diff --git a/driver/pace/driver/grid.py b/driver/pace/driver/grid.py index c184d566..9fa97a06 100644 --- a/driver/pace/driver/grid.py +++ b/driver/pace/driver/grid.py @@ -105,7 +105,6 @@ def get_grid( quantity_factory: QuantityFactory, communicator: CubedSphereCommunicator, ) -> Tuple[DampingCoefficients, DriverGridData, GridData]: - metric_terms = MetricTerms( quantity_factory=quantity_factory, communicator=communicator, @@ -184,8 +183,7 @@ def get_grid( quantity_factory: QuantityFactory, communicator: CubedSphereCommunicator, ) -> Tuple[DampingCoefficients, DriverGridData, GridData]: - - backend = quantity_factory.empty( + backend = quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown" ).gt4py_backend diff --git a/driver/pace/driver/initialization.py b/driver/pace/driver/initialization.py index 2b6471a8..bd6d96ea 100644 --- a/driver/pace/driver/initialization.py +++ b/driver/pace/driver/initialization.py @@ -154,7 +154,6 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - dycore_state = tc_init.init_tc_state( grid_data=grid_data, quantity_factory=quantity_factory, @@ -323,7 +322,7 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - backend = quantity_factory.empty( + backend = quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown" ).gt4py_backend @@ -348,7 +347,6 @@ def _initialize_dycore_state( communicator: pace.util.CubedSphereCommunicator, backend: str, ) -> fv3core.DycoreState: - grid = self._get_serialized_grid(communicator=communicator, backend=backend) ser = self._serializer(communicator) @@ -401,7 +399,6 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - return DriverState( dycore_state=self.dycore_state, physics_state=self.physics_state, diff --git a/driver/pace/driver/run.py b/driver/pace/driver/run.py index e6c39982..df70eb14 100644 --- a/driver/pace/driver/run.py +++ b/driver/pace/driver/run.py @@ -5,7 +5,7 @@ import click import yaml -from pace.util import pace_log, AVAILABLE_LOG_LEVELS +from pace.util import AVAILABLE_LOG_LEVELS, pace_log from .driver import Driver, DriverConfig diff --git a/dsl/pace/dsl/caches/cache_location.py b/dsl/pace/dsl/caches/cache_location.py new file mode 100644 index 00000000..ab57a60b --- /dev/null +++ b/dsl/pace/dsl/caches/cache_location.py @@ -0,0 +1,46 @@ +from pace.dsl.caches.codepath import FV3CodePath +from pace.util import CubedSpherePartitioner 
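+# Illustration (derived from identify_code_path below and the rank diagram in
+# dace_config.py): on a 3x3 layout, tile-local ranks resolve to code paths as
+#
+#   6 7 8      TopLeft    Top     TopRight
+#   3 4 5  ->  Left       Center  Right
+#   0 1 2      BottomLeft Bottom  BottomRight
+#
+# identify_code_path is the general implementation of this mapping.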
+
+
+def identify_code_path(
+    rank: int,
+    partitioner: CubedSpherePartitioner,
+) -> FV3CodePath:
+    if partitioner.layout == (1, 1) or partitioner.layout == [1, 1]:
+        return FV3CodePath.All
+    elif partitioner.layout[0] == 1 or partitioner.layout[1] == 1:
+        raise NotImplementedError(
+            f"Build for layout {partitioner.layout} is not handled"
+        )
+    else:
+        if partitioner.tile.on_tile_bottom(rank):
+            if partitioner.tile.on_tile_left(rank):
+                return FV3CodePath.BottomLeft
+            if partitioner.tile.on_tile_right(rank):
+                return FV3CodePath.BottomRight
+            else:
+                return FV3CodePath.Bottom
+        if partitioner.tile.on_tile_top(rank):
+            if partitioner.tile.on_tile_left(rank):
+                return FV3CodePath.TopLeft
+            if partitioner.tile.on_tile_right(rank):
+                return FV3CodePath.TopRight
+            else:
+                return FV3CodePath.Top
+        else:
+            if partitioner.tile.on_tile_left(rank):
+                return FV3CodePath.Left
+            if partitioner.tile.on_tile_right(rank):
+                return FV3CodePath.Right
+            else:
+                return FV3CodePath.Center
+
+
+def get_cache_fullpath(code_path: FV3CodePath) -> str:
+    from gt4py.cartesian import config as gt_config
+
+    return f"{gt_config.cache_settings['root_path']}/.gt_cache_{code_path}"
+
+
+def get_cache_directory(code_path: FV3CodePath) -> str:
+    return f".gt_cache_{code_path}"
diff --git a/dsl/pace/dsl/caches/codepath.py b/dsl/pace/dsl/caches/codepath.py
new file mode 100644
index 00000000..8ebf9492
--- /dev/null
+++ b/dsl/pace/dsl/caches/codepath.py
@@ -0,0 +1,33 @@
+import enum
+
+
+class FV3CodePath(enum.Enum):
+    """Enum listing all possible code paths on a cube sphere.
+    For any layout the cube sphere has up to 9 different code paths depending on
+    the positioning of the rank on the tile and which of the edge/corner cases
+    it has to handle, as well as the possibility for all boundary computations in
+    the 1x1 layout case.
+    Since the framework inlines code to optimize, we _cannot_ presuppose which
+    code is kept and/or ejected. This enum serves as the ground truth to map rank
+    to the proper generated code.
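+    For example, on a 2x2 layout every rank sits on two tile edges at once, so
+    all four ranks take the corner paths (BottomLeft, BottomRight, TopLeft,
+    TopRight); layouts of 3x3 and larger also exercise the edge and Center
+    paths.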
+ """ + + All = "FV3_A" + BottomLeft = "FV3_BL" + Left = "FV3_L" + TopLeft = "FV3_TL" + Top = "FV3_T" + TopRight = "FV3_TR" + Right = "FV3_R" + BottomRight = "FV3_BR" + Bottom = "FV3_B" + Center = "FV3_C" + + def __str__(self): + return self.value + + def __repr__(self): + return self.value + + def __format__(self, format_spec: str) -> str: + return self.value diff --git a/dsl/pace/dsl/dace/build.py b/dsl/pace/dsl/dace/build.py index 7d8f3db2..b134f569 100644 --- a/dsl/pace/dsl/dace/build.py +++ b/dsl/pace/dsl/dace/build.py @@ -1,9 +1,9 @@ from typing import List, Optional, Tuple -from warnings import warn from dace.sdfg import SDFG import pace.util +from pace.dsl.caches.cache_location import get_cache_directory, get_cache_fullpath from pace.dsl.dace.dace_config import DaceConfig, DaCeOrchestration @@ -11,19 +11,6 @@ # Distributed compilation -def determine_compiling_ranks(config: DaceConfig) -> bool: - is_compiling = False - rank = config.my_rank - size = config.rank_size - - if int(size / 6) == 0: - is_compiling = True - elif rank % int(size / 6) == rank: - is_compiling = True - - return is_compiling - - def unblock_waiting_tiles(comm, sdfg_path: str) -> None: if comm and comm.Get_size() > 1: for tile in range(1, 6): @@ -31,48 +18,6 @@ def unblock_waiting_tiles(comm, sdfg_path: str) -> None: comm.send(sdfg_path, dest=tile * tilesize + comm.Get_rank()) -def get_target_rank(rank: int, partitioner: pace.util.CubedSpherePartitioner): - """From my rank & the current partitioner we determine which - rank we should read from. - For all layout >= 3,3 this presumes build has been done on a - 3,3 layout.""" - if partitioner.layout == (1, 1): - return 0 - if partitioner.layout == (2, 2): - if partitioner.tile.on_tile_bottom(rank): - if partitioner.tile.on_tile_left(rank): - return 0 # "00" - if partitioner.tile.on_tile_right(rank): - return 1 # "10" - if partitioner.tile.on_tile_top(rank): - if partitioner.tile.on_tile_left(rank): - return 2 # "01" - if partitioner.tile.on_tile_right(rank): - return 3 # "11" - else: - if partitioner.tile.on_tile_bottom(rank): - if partitioner.tile.on_tile_left(rank): - return 0 # "00" - if partitioner.tile.on_tile_right(rank): - return 2 # "20" - else: - return 1 # "10" - if partitioner.tile.on_tile_top(rank): - if partitioner.tile.on_tile_left(rank): - return 6 # "02" - if partitioner.tile.on_tile_right(rank): - return 8 # "22" - else: - return 7 # "12" - else: - if partitioner.tile.on_tile_left(rank): - return 3 # "01" - if partitioner.tile.on_tile_right(rank): - return 5 # "21" - else: - return 4 # "11" - - def build_info_filepath() -> str: return "build_info.txt" @@ -101,7 +46,10 @@ def write_build_info( def get_sdfg_path( - daceprog_name: str, config: DaceConfig, sdfg_file_path: Optional[str] = None + daceprog_name: str, + config: DaceConfig, + sdfg_file_path: Optional[str] = None, + override_run_only=False, ) -> Optional[str]: """Build an SDFG path from the qualified program name or it's direct path to .sdfg @@ -113,7 +61,7 @@ def get_sdfg_path( # TODO: check DaceConfig for cache.strategy == name # Guarding against bad usage of this function - if config.get_orchestrate() != DaCeOrchestration.Run: + if not override_run_only and config.get_orchestrate() != DaCeOrchestration.Run: return None # Case of a .sdfg file given by the user to be compiled @@ -125,19 +73,8 @@ def get_sdfg_path( return sdfg_file_path # Case of loading a precompiled .so - lookup using GT_CACHE - from gt4py.cartesian import config as gt_config - - if config.rank_size > 1: - rank = 
config.my_rank - rank_str = f"_{config.target_rank:06d}" - else: - rank = 0 - rank_str = f"_{rank:06d}" - - sdfg_dir_path = ( - f"{gt_config.cache_settings['root_path']}" - f"/.gt_cache{rank_str}/dacecache/{daceprog_name}" - ) + cache_fullpath = get_cache_fullpath(config.code_path) + sdfg_dir_path = f"{cache_fullpath}/dacecache/{daceprog_name}" if not os.path.isdir(sdfg_dir_path): raise RuntimeError(f"Precompiled SDFG is missing at {sdfg_dir_path}") @@ -153,23 +90,8 @@ def get_sdfg_path( raise RuntimeError( f"SDFG build for {build_backend}, {config._backend} has been asked" ) - # Check layout - build_layout = ast.literal_eval(build_info_file.readline()) - can_read = True - if config.layout == (1, 1) and config.layout != build_layout: - can_read = False - elif config.layout == (2, 2) and config.layout != build_layout: - can_read = False - elif ( - build_layout != (1, 1) and build_layout != (2, 2) and build_layout != (3, 3) - ): - can_read = False - if not can_read: - warn( - f"SDFG build for layout {build_layout}, " - f"cannot be run with current layout {config.layout}, bad layout?" - ) # Check resolution per tile + build_layout = ast.literal_eval(build_info_file.readline()) build_resolution = ast.literal_eval(build_info_file.readline()) if (config.tile_resolution[0] / config.layout[0]) != ( build_resolution[0] / build_layout[0] @@ -179,7 +101,7 @@ def get_sdfg_path( f"cannot be run with current resolution {config.tile_resolution}" ) - print(f"[DaCe Config] Rank {rank} loading SDFG {sdfg_dir_path}") + print(f"[DaCe Config] Rank {config.my_rank} loading SDFG {sdfg_dir_path}") return sdfg_dir_path @@ -189,33 +111,31 @@ def set_distributed_caches(config: "DaceConfig"): # Execute specific initialization per orchestration state orchestration_mode = config.get_orchestrate() + if orchestration_mode == DaCeOrchestration.Python: + return # Check that we have all the file we need to early out in case # of issues. 
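    # (Illustration, assuming a 1x1 layout: the pre-built cache is then expected
    # at <cache_root>/.gt_cache_FV3_A, with the compiled SDFG for a given program
    # under .gt_cache_FV3_A/dacecache/<program_name>, as resolved by
    # get_cache_fullpath above.)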
    if orchestration_mode == DaCeOrchestration.Run:
        import os

-        from gt4py.cartesian import config as gt_config
-
-        # Check our cache exist
-        if config.rank_size > 1:
-            rank = config.my_rank
-            target_rank_str = f"_{config.target_rank:06d}"
-        else:
-            rank = 0
-            target_rank_str = f"_{rank:06d}"
-        cache_filepath = (
-            f"{gt_config.cache_settings['root_path']}/.gt_cache{target_rank_str}"
-        )
-        if not os.path.exists(cache_filepath):
+        cache_directory = get_cache_fullpath(config.code_path)
+        if not os.path.exists(cache_directory):
             raise RuntimeError(
                 f"{orchestration_mode} error: Could not find caches for rank "
-                f"{rank} at {cache_filepath}"
+                f"{config.my_rank} at {cache_directory}"
             )

-        # All, good set this rank cache to the source cache
-        gt_config.cache_settings["dir_name"] = f".gt_cache{target_rank_str}"
-        print(
-            f"[{orchestration_mode}] Rank {rank} "
-            f"reading cache {gt_config.cache_settings['dir_name']}"
-        )
+        # Set read/write caches to the target rank
+        from gt4py.cartesian import config as gt_config
+
+        if config.do_compile:
+            verb = "reading/writing"
+        else:
+            verb = "reading"
+
+        gt_config.cache_settings["dir_name"] = get_cache_directory(config.code_path)
+        pace.util.pace_log.critical(
+            f"[{orchestration_mode}] Rank {config.my_rank} "
+            f"{verb} cache {gt_config.cache_settings['dir_name']}"
+        )
diff --git a/dsl/pace/dsl/dace/dace_config.py b/dsl/pace/dsl/dace/dace_config.py
index e6f3e7df..1bb0939e 100644
--- a/dsl/pace/dsl/dace/dace_config.py
+++ b/dsl/pace/dsl/dace/dace_config.py
@@ -1,13 +1,16 @@
 import enum
-from typing import Any, Dict, Optional
+import os
+from typing import Any, Dict, Optional, Tuple

 import dace.config
 from dace.codegen.compiled_sdfg import CompiledSDFG
 from dace.frontend.python.parser import DaceProgram

+from pace.dsl.caches.cache_location import identify_code_path
+from pace.dsl.caches.codepath import FV3CodePath
 from pace.dsl.gt4py_utils import is_gpu_backend
 from pace.util._optional_imports import cupy as cp
-from pace.util.communicator import CubedSphereCommunicator
+from pace.util.communicator import CubedSphereCommunicator, CubedSpherePartitioner

 # This can be turned on to revert compilation for orchestration
 # on a rank-by-rank basis (for debug purposes)
 DEACTIVATE_DISTRIBUTED_DACE_COMPILE = False

+def _is_corner(rank: int, partitioner: CubedSpherePartitioner) -> bool:
+    if partitioner.tile.on_tile_bottom(rank):
+        if partitioner.tile.on_tile_left(rank):
+            return True
+        if partitioner.tile.on_tile_right(rank):
+            return True
+    if partitioner.tile.on_tile_top(rank):
+        if partitioner.tile.on_tile_left(rank):
+            return True
+        if partitioner.tile.on_tile_right(rank):
+            return True
+    return False
+
+
+def _smallest_rank_bottom(x: int, y: int, layout: Tuple[int, int]):
+    return y == 0 and x == 1
+
+
+def _smallest_rank_top(x: int, y: int, layout: Tuple[int, int]):
+    return y == layout[1] - 1 and x == 1
+
+
+def _smallest_rank_left(x: int, y: int, layout: Tuple[int, int]):
+    return x == 0 and y == 1
+
+
+def _smallest_rank_right(x: int, y: int, layout: Tuple[int, int]):
+    return x == layout[0] - 1 and y == 1
+
+
+def _smallest_rank_middle(x: int, y: int, layout: Tuple[int, int]):
+    return layout[0] > 1 and layout[1] > 1 and x == 1 and y == 1
+
+
+def _determine_compiling_ranks(
+    config: "DaceConfig",
+    partitioner: CubedSpherePartitioner,
+) -> bool:
+    """
+    We try to map every layout to a 3x3 layout whose MPI ranks
+    look like
+        6 7 8
+        3 4 5
+        0 1 2
+    Using the partitioner we find a mapping of the given layout
+    onto all of those. 
For example on 4x4 layout + 12 13 14 15 + 8 9 10 11 + 4 5 6 7 + 0 1 2 3 + therefore we map + 0 -> 0 + 1 -> 1 + 2 -> NOT COMPILING + 3 -> 2 + 4 -> 3 + 5 -> 4 + 6 -> NOT COMPILING + 7 -> 5 + 8 -> NOT COMPILING + 9 -> NOT COMPILING + 10 -> NOT COMPILING + 11 -> NOT COMPILING + 12 -> 6 + 13 -> 7 + 14 -> NOT COMPILING + 15 -> 8 + """ + + # Tile 0 compiles + if partitioner.tile_index(config.my_rank) != 0: + return False + + # Corners compile + if _is_corner(config.my_rank, partitioner): + return True + + y, x = partitioner.tile.subtile_index(config.my_rank) + + # If edge or center tile, we give way to the smallest rank + return ( + _smallest_rank_left(x, y, config.layout) + or _smallest_rank_bottom(x, y, config.layout) + or _smallest_rank_middle(x, y, config.layout) + or _smallest_rank_right(x, y, config.layout) + or _smallest_rank_top(x, y, config.layout) + ) + + class DaCeOrchestration(enum.Enum): """ Orchestration mode for DaCe @@ -71,8 +163,6 @@ def __init__( # Temporary. This is a bit too out of the ordinary for the common user. # We should refactor the architecture to allow for a `gtc:orchestrated:dace:X` # backend that would signify both the `CPU|GPU` split and the orchestration mode - import os - if orchestration is None: fv3_dacemode_env_var = os.getenv("FV3_DACEMODE", "Python") # The below condition guard against defining empty FV3_DACEMODE and @@ -175,32 +265,30 @@ def __init__( # attempt to kill the dace.conf to avoid confusion if dace.config.Config._cfg_filename: try: - import os - os.remove(dace.config.Config._cfg_filename) except OSError: pass self._backend = backend self.tile_resolution = [tile_nx, tile_nx, tile_nz] - from pace.dsl.dace.build import get_target_rank, set_distributed_caches + from pace.dsl.dace.build import set_distributed_caches # Distributed build required info if communicator: self.my_rank = communicator.rank self.rank_size = communicator.comm.Get_size() - if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - self.target_rank = communicator.rank - else: - self.target_rank = get_target_rank( - self.my_rank, communicator.partitioner - ) + self.code_path = identify_code_path(self.my_rank, communicator.partitioner) self.layout = communicator.partitioner.layout + self.do_compile = ( + DEACTIVATE_DISTRIBUTED_DACE_COMPILE + or _determine_compiling_ranks(self, communicator.partitioner) + ) else: self.my_rank = 0 self.rank_size = 1 - self.target_rank = 0 + self.code_path = FV3CodePath.All self.layout = (1, 1) + self.do_compile = True set_distributed_caches(self) @@ -226,7 +314,7 @@ def get_orchestrate(self) -> DaCeOrchestration: return self._orchestrate def get_sync_debug(self) -> bool: - return dace.config.Config.get("compiler", "cuda", "syncdebug") + return dace.config.Config.get_bool("compiler", "cuda", "syncdebug") def as_dict(self) -> Dict[str, Any]: return { diff --git a/dsl/pace/dsl/dace/orchestration.py b/dsl/pace/dsl/dace/orchestration.py index 2bd9df5b..7858381a 100644 --- a/dsl/pace/dsl/dace/orchestration.py +++ b/dsl/pace/dsl/dace/orchestration.py @@ -11,12 +11,7 @@ from dace.transformation.auto.auto_optimize import make_transients_persistent from dace.transformation.helpers import get_parent_map -from pace.dsl.dace.build import ( - determine_compiling_ranks, - get_sdfg_path, - unblock_waiting_tiles, - write_build_info, -) +from pace.dsl.dace.build import get_sdfg_path, write_build_info from pace.dsl.dace.dace_config import ( DEACTIVATE_DISTRIBUTED_DACE_COMPILE, DaceConfig, @@ -34,6 +29,7 @@ memory_static_analysis, report_memory_static_analysis, ) +from pace.util import 
@@ -122,7 +118,7 @@ def _build_sdfg(
     if DEACTIVATE_DISTRIBUTED_DACE_COMPILE:
         is_compiling = True
     else:
-        is_compiling = determine_compiling_ranks(config)
+        is_compiling = config.do_compile
     if is_compiling:
         # Make the transients array persistents
         if config.is_gpu_backend():
@@ -198,12 +194,13 @@ def _build_sdfg(
             ),
         )
 
-        # Compilation done, either exit or scatter/gather and run
+        # Compilation done.
+        # On Build: all ranks sync, then exit.
+        # On BuildAndRun: all ranks sync, then load the SDFG from
+        # the expected path (made available by build).
+        # We use a "FrozenCompiledSDFG" to minimize re-entry cost at call time.
         # DEV NOTE: we explicitly use MPI.COMM_WORLD here because it is
         # a true multi-machine sync, outside of our own communicator class.
-        # Also this code is protected in the case of running on one machine by the fact
-        # that 0 is _always_ a compiling rank & unblock_waiting_tiles is protected
-        # against scattering when no other ranks are present.
         if config.get_orchestrate() == DaCeOrchestration.Build:
             MPI.COMM_WORLD.Barrier()  # Protect against early exist which kill SLURM jobs
             DaCeProgress.log(
@@ -212,31 +209,16 @@ def _build_sdfg(
             )
             exit(0)
         elif config.get_orchestrate() == DaCeOrchestration.BuildAndRun:
-            MPI.COMM_WORLD.Barrier()
-            if is_compiling:
-                if not DEACTIVATE_DISTRIBUTED_DACE_COMPILE:
-                    unblock_waiting_tiles(MPI.COMM_WORLD, sdfg.build_folder)
-                    DaCeProgress.log(
-                        DaCeProgress.default_prefix(config), "Build folder exchanged."
-                    )
-                csdfg, _ = daceprog.load_precompiled_sdfg(
-                    sdfg.build_folder, *args, **kwargs
-                )
-                config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG(
-                    daceprog, csdfg, args, kwargs
-                )
-
-            else:
-                source_rank = config.target_rank
-                # wait for compilation to be done
+            if not is_compiling:
                 DaCeProgress.log(
                     DaCeProgress.default_prefix(config),
-                    "Rank is not compiling. Waiting for build dir...",
-                )
-                sdfg_path = MPI.COMM_WORLD.recv(source=source_rank)
-                DaCeProgress.log(
-                    DaCeProgress.default_prefix(config), "Build dir received, loading .so."
+                    "Rank is not compiling. "
+                    "Waiting for compilation to end on all other ranks...",
                 )
+            MPI.COMM_WORLD.Barrier()
+
+            with DaCeProgress(config, "Loading"):
+                sdfg_path = get_sdfg_path(daceprog.name, config, override_run_only=True)
                 csdfg, _ = daceprog.load_precompiled_sdfg(sdfg_path, *args, **kwargs)
                 config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG(
                     daceprog, csdfg, args, kwargs
@@ -267,6 +249,7 @@ def _call_sdfg(
         config.get_orchestrate() == DaCeOrchestration.Build
         or config.get_orchestrate() == DaCeOrchestration.BuildAndRun
     ):
+        pace_log.info("Building DaCe orchestration")
         res = _build_sdfg(daceprog, sdfg, config, args, kwargs)
     elif config.get_orchestrate() == DaCeOrchestration.Run:
         # We should never hit this, it should be caught by the
@@ -302,7 +285,7 @@ def _parse_sdfg(
     if DEACTIVATE_DISTRIBUTED_DACE_COMPILE:
         is_compiling = True
     else:
-        is_compiling = determine_compiling_ranks(config)
+        is_compiling = config.do_compile
     if not is_compiling:
         # We can not parse the SDFG since we will load the proper
         # compiled SDFG from the compiling rank
@@ -448,7 +431,6 @@ def __get__(self, obj, objtype=None) -> SDFGEnabledCallable:
         """Return SDFGEnabledCallable wrapping original obj.method from cache.
         Update cache first if need be"""
         if (id(obj), id(self.func)) not in _LazyComputepathMethod.bound_callables:
-
             _LazyComputepathMethod.bound_callables[
                 (id(obj), id(self.func))
             ] = _LazyComputepathMethod.SDFGEnabledCallable(self, obj)
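
# [Editor's note] Net effect of the orchestration change above, condensed:
# the old point-to-point send of build folders is replaced by one world
# barrier plus a deterministic path lookup. A sketch under the names this
# patch uses (the helper function itself is illustrative, not part of the
# patch):
#
#     from pace.dsl.dace.build import get_sdfg_path
#     from pace.util.mpi import MPI
#
#     def _load_after_build(daceprog, config, args, kwargs):
#         # Compiling and non-compiling ranks all meet at the same barrier
#         # once the compiled caches have been written to disk.
#         MPI.COMM_WORLD.Barrier()
#         # Every rank then resolves the cache location itself instead of
#         # receiving it from a designated compiling rank.
#         sdfg_path = get_sdfg_path(daceprog.name, config, override_run_only=True)
#         return daceprog.load_precompiled_sdfg(sdfg_path, *args, **kwargs)
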
diff --git a/dsl/pace/dsl/typing.py b/dsl/pace/dsl/typing.py
index 05b255ce..d67dd7b6 100644
--- a/dsl/pace/dsl/typing.py
+++ b/dsl/pace/dsl/typing.py
@@ -22,11 +22,15 @@
 DTypes = Union[bool, np.bool_, int, np.int32, np.int64, float, np.float32, np.float64]
 
 
+def floating_point_precision() -> int:
+    return int(os.getenv("PACE_FLOAT_PRECISION", "64"))
+
+
 def global_set_floating_point_precision():
     """Set the global floating point precision for all reference
     to Float in the codebase. Defaults to 64 bit."""
     global Float
-    precision_in_bit = int(os.getenv("PACE_FLOAT_PRECISION", "64"))
+    precision_in_bit = floating_point_precision()
     if precision_in_bit == 64:
         return np.float64
     elif precision_in_bit == 32:
diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py
index 2835e77e..a7a526ee 100644
--- a/fv3core/pace/fv3core/initialization/geos_wrapper.py
+++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py
@@ -11,6 +11,8 @@
 from pace.driver.performance.collector import PerformanceCollector
 from pace.dsl.dace import DaceConfig, orchestrate
 from pace.dsl.gt4py_utils import is_gpu_backend
+from pace.dsl.typing import floating_point_precision
+from pace.util._optional_imports import cupy as cp
 from pace.util.logging import pace_log
 
 
@@ -131,13 +133,29 @@ def __init__(
         self.output_dict: Dict[str, np.ndarray] = {}
         self._allocate_output_dir()
 
+        # Feedback information
+        device_ordinal_info = (
+            f" Device PCI bus id: {cp.cuda.Device(0).pci_bus_id}\n"
+            if is_gpu_backend(backend)
+            else "N/A"
+        )
+        MPS_pipe_directory = os.getenv("CUDA_MPS_PIPE_DIRECTORY", None)
+        MPS_is_on = (
+            MPS_pipe_directory
+            and is_gpu_backend(backend)
+            and os.path.exists(f"{MPS_pipe_directory}/log")
+        )
         pace_log.info(
             "Pace GEOS wrapper initialized: \n"
-            f"            dt : {self.dycore_state.bdt}\n"
-            f"        bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n"
-            f"       backend: {backend}\n"
-            f"       orchestration: {self._is_orchestrated}\n"
-            f"       sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})"
+            f"            dt : {self.dycore_state.bdt}\n"
+            f"        bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n"
+            f"       backend : {backend}\n"
+            f"         float : {floating_point_precision()}bit\n"
+            f" orchestration : {self._is_orchestrated}\n"
+            f"         sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} "
+            f"(halo: {sizer.n_halo})\n"
+            f"    Device ord : {device_ordinal_info}\n"
+            f"    Nvidia MPS : {MPS_is_on}"
         )
 
     def _critical_path(self):
diff --git a/fv3core/pace/fv3core/testing/translate_dyncore.py b/fv3core/pace/fv3core/testing/translate_dyncore.py
index 510e299f..6c7da4b7 100644
--- a/fv3core/pace/fv3core/testing/translate_dyncore.py
+++ b/fv3core/pace/fv3core/testing/translate_dyncore.py
@@ -140,7 +140,7 @@ def compute_parallel(self, inputs, communicator):
         grid_data.ptop = inputs["ptop"]
         self._base.make_storage_data_input_vars(inputs)
         state = DycoreState.init_zeros(quantity_factory=self.grid.quantity_factory)
-        wsd: pace.util.Quantity = self.grid.quantity_factory.empty(
+        wsd: pace.util.Quantity = self.grid.quantity_factory.zeros(
             dims=[pace.util.X_DIM, pace.util.Y_DIM],
             units="unknown",
         )
@@ -152,7 +152,7 @@ def compute_parallel(self, inputs, communicator):
                 state[name].data[selection] = value
             else:
                 setattr(state, name, value)
-        phis: pace.util.Quantity = 
self.grid.quantity_factory.empty( + phis: pace.util.Quantity = self.grid.quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="m", ) diff --git a/fv3core/tests/savepoint/translate/translate_remapping.py b/fv3core/tests/savepoint/translate/translate_remapping.py index fc9196b0..9a2e1f84 100644 --- a/fv3core/tests/savepoint/translate/translate_remapping.py +++ b/fv3core/tests/savepoint/translate/translate_remapping.py @@ -107,7 +107,7 @@ def compute_from_storage(self, inputs): inputs["wsd"] = wsd_2d inputs["q_cld"] = inputs["tracers"]["qcld"] inputs["last_step"] = bool(inputs["last_step"]) - pfull = self.grid.quantity_factory.empty([Z_DIM], units="Pa") + pfull = self.grid.quantity_factory.zeros([Z_DIM], units="Pa") pfull.data[:] = pfull.np.asarray(inputs.pop("pfull")) l_to_e_obj = LagrangianToEulerian( self.stencil_factory, diff --git a/requirements_dev.txt b/requirements_dev.txt index 052bf5c3..484c4948 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -10,8 +10,8 @@ zarr dask>=2021.10.0 netCDF4 cftime -fv3config>=0.9.0 dace==0.14.0 +fv3config>=0.9.0 f90nml>=1.1.0 numpy>=1.15 -e external/gt4py diff --git a/stencils/pace/stencils/testing/grid.py b/stencils/pace/stencils/testing/grid.py index 65ad8870..4cf623b1 100644 --- a/stencils/pace/stencils/testing/grid.py +++ b/stencils/pace/stencils/testing/grid.py @@ -504,7 +504,7 @@ def grid_data(self) -> "GridData": data = getattr(self, name) assert data is not None - quantity = self.quantity_factory.empty(dims=dims, units=units) + quantity = self.quantity_factory.zeros(dims=dims, units=units) if len(quantity.shape) == 3: quantity.data[:] = data[:, :, : quantity.shape[2]] elif len(quantity.shape) == 2: diff --git a/stencils/pace/stencils/testing/parallel_translate.py b/stencils/pace/stencils/testing/parallel_translate.py index dcc1e64d..f10b9b27 100644 --- a/stencils/pace/stencils/testing/parallel_translate.py +++ b/stencils/pace/stencils/testing/parallel_translate.py @@ -12,7 +12,6 @@ class ParallelTranslate: - max_error = TranslateFortranData2Py.max_error near_zero = TranslateFortranData2Py.near_zero compute_grid_option = False @@ -192,7 +191,7 @@ def state_from_inputs(self, inputs: dict, grid=None) -> dict: for name, properties in self.inputs.items(): standard_name = properties.get("name", name) if len(properties["dims"]) > 0: - state[standard_name] = grid.quantity_factory.empty( + state[standard_name] = grid.quantity_factory.zeros( properties["dims"], properties["units"], dtype=inputs[name].dtype ) input_slice = _serialize_slice( diff --git a/stencils/pace/stencils/testing/temporaries.py b/stencils/pace/stencils/testing/temporaries.py index 581387f6..2dd46663 100644 --- a/stencils/pace/stencils/testing/temporaries.py +++ b/stencils/pace/stencils/testing/temporaries.py @@ -40,10 +40,9 @@ def _assert_same_temporaries(dict1: dict, dict2: dict) -> List[str]: attr2 = dict2[attr] if isinstance(attr1, np.ndarray): try: - np.testing.assert_almost_equal( - attr1, attr2, err_msg=f"{attr} not equal" - ) - except AssertionError: + assert np.allclose(attr1, attr2, equal_nan=True) + except AssertionError as e: + print(e) differences.append(attr) else: sub_differences = _assert_same_temporaries(attr1, attr2) diff --git a/tests/main/dsl/test_caches.py b/tests/main/dsl/test_caches.py new file mode 100644 index 00000000..c1f01303 --- /dev/null +++ b/tests/main/dsl/test_caches.py @@ -0,0 +1,176 @@ +import pytest +from gt4py.cartesian.gtscript import PARALLEL, Field, computation, interval +from gt4py.storage import empty, ones + 
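
# [Editor's note] Several hunks above swap quantity_factory.empty() and
# np.empty() for zeros(). empty() returns uninitialized memory, so any read
# of never-written halo points was nondeterministic across runs; zeros()
# makes comparisons and serialized output reproducible at a small init cost.
# A quick numpy-only illustration (shapes hypothetical):
#
#     import numpy as np
#
#     a = np.empty((3, 3))  # arbitrary garbage; contents vary run to run
#     b = np.zeros((3, 3))  # deterministic initialization
#     assert (b == 0.0).all()  # always true; no equivalent holds for `a`
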
+import pace.dsl +from pace.dsl.dace import orchestrate +from pace.dsl.dace.dace_config import DaceConfig, DaCeOrchestration +from pace.dsl.stencil import CompilationConfig, GridIndexing + + +def _make_storage( + func, + grid_indexing, + stencil_config: pace.dsl.StencilConfig, + *, + dtype=float, + aligned_index=(0, 0, 0), +): + return func( + backend=stencil_config.compilation_config.backend, + shape=grid_indexing.domain, + dtype=dtype, + aligned_index=aligned_index, + ) + + +def _stencil(inp: Field[float], out: Field[float], scalar: float): + with computation(PARALLEL), interval(...): + out = inp + + +def _build_stencil(backend, orchestrated: DaCeOrchestration): + # Make stencil and verify it ran + grid_indexing = GridIndexing( + domain=(5, 5, 5), + n_halo=2, + south_edge=True, + north_edge=True, + west_edge=True, + east_edge=True, + ) + + stencil_config = pace.dsl.StencilConfig( + compilation_config=CompilationConfig(backend=backend, rebuild=True), + dace_config=DaceConfig(None, backend, 5, 5, orchestrated), + ) + + stencil_factory = pace.dsl.StencilFactory(stencil_config, grid_indexing) + + built_stencil = stencil_factory.from_origin_domain( + _stencil, (0, 0, 0), domain=grid_indexing.domain + ) + + return built_stencil, grid_indexing, stencil_config + + +class OrchestratedProgam: + def __init__(self, backend, orchestration): + self.stencil, grid_indexing, stencil_config = _build_stencil( + backend, orchestration + ) + orchestrate(obj=self, config=stencil_config.dace_config) + self.inp = _make_storage(ones, grid_indexing, stencil_config, dtype=float) + self.out = _make_storage(empty, grid_indexing, stencil_config, dtype=float) + + def __call__(self): + self.stencil(self.inp, self.out, self.inp[0, 0, 0]) + + +@pytest.mark.parametrize( + "backend", + [ + pytest.param("dace:cpu"), + ], +) +def test_relocatability_orchestration(backend): + import os + import shutil + + from gt4py.cartesian import config as gt_config + + original_root_directory = gt_config.cache_settings["root_path"] + working_dir = str(os.getcwd()) + + # Compile on default + p0 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun) + p0() + assert os.path.exists( + f"{working_dir}/.gt_cache_FV3_A/dacecache/" + "test_caches_OrchestratedProgam___call__", + ) or os.path.exists( + f"{working_dir}/.gt_cache_FV3_A/dacecache/OrchestratedProgam___call__", + ) + + # Compile in another directory + + custom_path = f"{working_dir}/.my_cache_path" + gt_config.cache_settings["root_path"] = custom_path + p1 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun) + p1() + assert os.path.exists( + f"{custom_path}/.gt_cache_FV3_A/dacecache/" + "test_caches_OrchestratedProgam___call__", + ) or os.path.exists( + f"{working_dir}/.gt_cache_FV3_A/dacecache/OrchestratedProgam___call__", + ) + + # Check relocability by copying the second cache directory, + # changing the path of gt_config.cache_settings and trying to Run on it + relocated_path = f"{working_dir}/.my_relocated_cache_path" + shutil.copytree(custom_path, relocated_path, dirs_exist_ok=True) + gt_config.cache_settings["root_path"] = relocated_path + p2 = OrchestratedProgam(backend, DaCeOrchestration.Run) + p2() + + # Generate a file exists error to check for bad path + bogus_path = "./nope/notatall/nothappening" + gt_config.cache_settings["root_path"] = bogus_path + with pytest.raises(RuntimeError): + OrchestratedProgam(backend, DaCeOrchestration.Run) + + # Restore cache settings + gt_config.cache_settings["root_path"] = original_root_directory + + +@pytest.mark.parametrize( 
+ "backend", + [ + pytest.param("dace:cpu"), + ], +) +def test_relocatability(backend: str): + import os + import shutil + + import gt4py + from gt4py.cartesian import config as gt_config + + from pace.util.mpi import MPI + + # Restore original dir name + gt4py.cartesian.config.cache_settings["dir_name"] = os.environ.get( + "GT_CACHE_DIR_NAME", f".gt_cache_{MPI.COMM_WORLD.Get_rank():06}" + ) + + backend_sanitized = backend.replace(":", "") + + # Compile on default + p0 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p0() + assert os.path.exists( + f"./.gt_cache_000000/py38_1013/{backend_sanitized}/test_caches/_stencil/" + ) + + # Compile in another directory + + custom_path = "./.my_cache_path" + gt_config.cache_settings["root_path"] = custom_path + p1 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p1() + assert os.path.exists( + f"{custom_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" + "/test_caches/_stencil/" + ) + + # Check relocability by copying the second cache directory, + # changing the path of gt_config.cache_settings and trying to Run on it + relocated_path = "./.my_relocated_cache_path" + shutil.copytree("./.gt_cache_000000", relocated_path, dirs_exist_ok=True) + gt_config.cache_settings["root_path"] = relocated_path + p2 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p2() + assert os.path.exists( + f"{relocated_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" + "/test_caches/_stencil/" + ) diff --git a/tests/main/dsl/test_dace_config.py b/tests/main/dsl/test_dace_config.py index 78553278..cb3566dd 100644 --- a/tests/main/dsl/test_dace_config.py +++ b/tests/main/dsl/test_dace_config.py @@ -1,11 +1,12 @@ import unittest.mock -from pace.dsl.dace.dace_config import DaceConfig +from pace.dsl.dace.dace_config import DaceConfig, _determine_compiling_ranks from pace.dsl.dace.orchestration import ( DaCeOrchestration, orchestrate, orchestrate_function, ) +from pace.util.communicator import CubedSpherePartitioner, TilePartitioner """ @@ -91,3 +92,68 @@ def foo(self): a = A() a.foo() assert not mock_call_sdfg.called + + +def test_orchestrate_distributed_build(): + dummy_dace_config = DaceConfig( + communicator=None, + backend="gtc:dace", + orchestration=DaCeOrchestration.BuildAndRun, + ) + + def _does_compile(rank, partitioner) -> bool: + dummy_dace_config.layout = partitioner.layout + dummy_dace_config.rank_size = partitioner.layout[0] * partitioner.layout[1] * 6 + dummy_dace_config.my_rank = rank + return _determine_compiling_ranks(dummy_dace_config, partitioner) + + # (1, 1) layout, one rank which compiles + cube_partitioner_11 = CubedSpherePartitioner(TilePartitioner((1, 1))) + assert _does_compile(0, cube_partitioner_11) + assert not _does_compile(1, cube_partitioner_11) # not compiling face + + # (2, 2) layout, 4 ranks, all compiling + cube_partitioner_22 = CubedSpherePartitioner(TilePartitioner((2, 2))) + assert _does_compile(0, cube_partitioner_22) + assert _does_compile(1, cube_partitioner_22) + assert _does_compile(2, cube_partitioner_22) + assert _does_compile(3, cube_partitioner_22) + assert not _does_compile(4, cube_partitioner_22) # not compiling face + + # (3, 3) layout, 9 ranks, all compiling + cube_partitioner_33 = CubedSpherePartitioner(TilePartitioner((3, 3))) + assert _does_compile(0, cube_partitioner_33) + assert _does_compile(1, cube_partitioner_33) + assert _does_compile(2, cube_partitioner_33) + assert _does_compile(3, cube_partitioner_33) + assert _does_compile(4, cube_partitioner_33) + assert _does_compile(5, 
cube_partitioner_33) + assert _does_compile(6, cube_partitioner_33) + assert _does_compile(7, cube_partitioner_33) + assert _does_compile(8, cube_partitioner_33) + assert not _does_compile(9, cube_partitioner_33) # not compiling face + + # (4, 4) layout, 16 ranks, + # expecting compiling:0, 1, 2, 3, 4, 5, 7, 12, 13, 15 + cube_partitioner_44 = CubedSpherePartitioner(TilePartitioner((4, 4))) + assert _does_compile(0, cube_partitioner_44) + assert _does_compile(1, cube_partitioner_44) + assert _does_compile(4, cube_partitioner_44) + assert _does_compile(5, cube_partitioner_44) + assert _does_compile(7, cube_partitioner_44) + assert _does_compile(12, cube_partitioner_44) + assert _does_compile(13, cube_partitioner_44) + assert _does_compile(15, cube_partitioner_44) + assert not _does_compile(2, cube_partitioner_44) # same code path as 3 + assert not _does_compile(6, cube_partitioner_44) # same code path as 5 + assert not _does_compile(8, cube_partitioner_44) # same code path as 4 + assert not _does_compile(11, cube_partitioner_44) # same code path as 7 + assert not _does_compile(16, cube_partitioner_44) # not compiling face + + # For a few other layouts, we check that we always have 9 compiling ranks + for layout in [(5, 5), (10, 10), (20, 20)]: + partition = CubedSpherePartitioner(TilePartitioner(layout)) + compiling = 0 + for i in range(layout[0] * layout[1] * 6): + compiling += 1 if _does_compile(i, partition) else 0 + assert compiling == 9 diff --git a/tests/main/test_grid_init.py b/tests/main/test_grid_init.py index 942dcfd3..4c0e5e2b 100644 --- a/tests/main/test_grid_init.py +++ b/tests/main/test_grid_init.py @@ -51,8 +51,6 @@ def test_grid_init_not_decomposition_dependent(rank: int): assert allclose(metric_terms_1by1.area, metric_terms_3by3.area, partitioner, rank) assert allclose(metric_terms_1by1.dx, metric_terms_3by3.dx, partitioner, rank) assert allclose(metric_terms_1by1.dy, metric_terms_3by3.dy, partitioner, rank) - assert allclose(metric_terms_1by1.dxa, metric_terms_3by3.dxa, partitioner, rank) - assert allclose(metric_terms_1by1.dya, metric_terms_3by3.dya, partitioner, rank) assert allclose( metric_terms_1by1.cos_sg1, metric_terms_3by3.cos_sg1, partitioner, rank ) diff --git a/util/pace/util/__init__.py b/util/pace/util/__init__.py index 54f684d6..58a7c2a5 100644 --- a/util/pace/util/__init__.py +++ b/util/pace/util/__init__.py @@ -54,7 +54,7 @@ from .initialization import GridSizer, QuantityFactory, SubtileGridSizer from .io import read_state, write_state from .local_comm import LocalComm -from .logging import pace_log, AVAILABLE_LOG_LEVELS +from .logging import AVAILABLE_LOG_LEVELS, pace_log from .monitor import Monitor, NetCDFMonitor, ZarrMonitor from .mpi import MPIComm from .namelist import Namelist, NamelistDefaults diff --git a/util/pace/util/communicator.py b/util/pace/util/communicator.py index 938469bd..d2577d8c 100644 --- a/util/pace/util/communicator.py +++ b/util/pace/util/communicator.py @@ -167,7 +167,7 @@ def _get_gather_recv_quantity( ) -> Quantity: """Initialize a Quantity for use when receiving global data during gather""" recv_quantity = Quantity( - send_metadata.np.empty(global_extent, dtype=send_metadata.dtype), + send_metadata.np.zeros(global_extent, dtype=send_metadata.dtype), dims=send_metadata.dims, units=send_metadata.units, origin=tuple([0 for dim in send_metadata.dims]), @@ -182,7 +182,7 @@ def _get_scatter_recv_quantity( ) -> Quantity: """Initialize a Quantity for use when receiving subtile data during scatter""" recv_quantity = Quantity( - 
send_metadata.np.empty(shape, dtype=send_metadata.dtype), + send_metadata.np.zeros(shape, dtype=send_metadata.dtype), dims=send_metadata.dims, units=send_metadata.units, gt4py_backend=send_metadata.gt4py_backend, @@ -206,7 +206,7 @@ def gather( result: Optional[Quantity] if self.rank == constants.ROOT_RANK: with array_buffer( - send_quantity.np.empty, + send_quantity.np.zeros, (self.partitioner.total_ranks,) + tuple(send_quantity.extent), dtype=send_quantity.data.dtype, ) as recvbuf: @@ -745,7 +745,7 @@ def _get_gather_recv_quantity( # needs to change the quantity dimensions since we add a "tile" dimension, # unlike for tile scatter/gather which retains the same dimensions recv_quantity = Quantity( - metadata.np.empty(global_extent, dtype=metadata.dtype), + metadata.np.zeros(global_extent, dtype=metadata.dtype), dims=(constants.TILE_DIM,) + metadata.dims, units=metadata.units, origin=(0,) + tuple([0 for dim in metadata.dims]), @@ -767,7 +767,7 @@ def _get_scatter_recv_quantity( # needs to change the quantity dimensions since we remove a "tile" dimension, # unlike for tile scatter/gather which retains the same dimensions recv_quantity = Quantity( - metadata.np.empty(shape, dtype=metadata.dtype), + metadata.np.zeros(shape, dtype=metadata.dtype), dims=metadata.dims[1:], units=metadata.units, gt4py_backend=metadata.gt4py_backend, diff --git a/util/pace/util/grid/eta.py b/util/pace/util/grid/eta.py index 50afaadb..dc37aaa2 100644 --- a/util/pace/util/grid/eta.py +++ b/util/pace/util/grid/eta.py @@ -206,7 +206,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 91: - ak = np.array( [ 1.00000000, @@ -402,7 +401,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 72: - ak = np.array( [ 1.00000000, @@ -559,10 +557,299 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ] ) + elif km == 137: + ak = np.array( + [ + 1.00000000, + 1.82500005, + 3.00000000, + 4.63000011, + 6.82797718, + 9.74696636, + 13.6054239, + 18.6089306, + 24.9857178, + 32.9857101, + 42.8792419, + 54.9554634, + 69.5205765, + 86.8958817, + 107.415741, + 131.425507, + 159.279404, + 191.338562, + 227.968948, + 269.539581, + 316.420746, + 368.982361, + 427.592499, + 492.616028, + 564.413452, + 643.339905, + 729.744141, + 823.967834, + 926.344910, + 1037.20117, + 1156.85364, + 1285.61035, + 1423.77014, + 1571.62292, + 1729.44897, + 1897.51929, + 2076.09595, + 2265.43164, + 2465.77051, + 2677.34814, + 2900.39136, + 3135.11938, + 3381.74365, + 3640.46826, + 3911.49048, + 4194.93066, + 4490.81738, + 4799.14941, + 5119.89502, + 5452.99072, + 5798.34473, + 6156.07422, + 6526.94678, + 6911.87061, + 7311.86914, + 7727.41211, + 8159.35400, + 8608.52539, + 9076.40039, + 9562.68262, + 10065.9785, + 10584.6318, + 11116.6621, + 11660.0674, + 12211.5479, + 12766.8730, + 13324.6689, + 13881.3311, + 14432.1396, + 14975.6152, + 15508.2568, + 16026.1152, + 16527.3223, + 17008.7891, + 17467.6133, + 17901.6211, + 18308.4336, + 18685.7188, + 19031.2891, + 19343.5117, + 19620.0430, + 19859.3906, + 20059.9316, + 20219.6641, + 20337.8633, + 20412.3086, + 20442.0781, + 20425.7188, + 20361.8164, + 20249.5117, + 20087.0859, + 19874.0254, + 19608.5723, + 19290.2266, + 18917.4609, + 18489.7070, + 18006.9258, + 17471.8398, + 16888.6875, + 16262.0469, + 15596.6953, + 14898.4531, + 14173.3242, + 13427.7695, + 12668.2578, + 11901.3398, + 11133.3047, + 10370.1758, + 9617.51562, + 8880.45312, + 8163.37500, + 7470.34375, + 6804.42188, + 6168.53125, + 
5564.38281, + 4993.79688, + 4457.37500, + 3955.96094, + 3489.23438, + 3057.26562, + 2659.14062, + 2294.24219, + 1961.50000, + 1659.47656, + 1387.54688, + 1143.25000, + 926.507812, + 734.992188, + 568.062500, + 424.414062, + 302.476562, + 202.484375, + 122.101562, + 62.7812500, + 22.8359375, + 3.75781298, + 0.00000000, + 0.00000000, + ] + ) + + bk = np.array( + [ + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 7.00000010e-06, + 2.40000008e-05, + 5.90000018e-05, + 1.12000002e-04, + 1.99000002e-04, + 3.39999999e-04, + 5.61999972e-04, + 8.90000025e-04, + 1.35300006e-03, + 1.99200003e-03, + 2.85700010e-03, + 3.97100020e-03, + 5.37799997e-03, + 7.13300006e-03, + 9.26099997e-03, + 1.18060000e-02, + 1.48160001e-02, + 1.83179993e-02, + 2.23549996e-02, + 2.69639995e-02, + 3.21759991e-02, + 3.80260013e-02, + 4.45480011e-02, + 5.17730005e-02, + 5.97280003e-02, + 6.84479997e-02, + 7.79580027e-02, + 8.82859975e-02, + 9.94620025e-02, + 0.111505002, + 0.124448001, + 0.138312995, + 0.153125003, + 0.168909997, + 0.185689002, + 0.203491002, + 0.222332999, + 0.242244005, + 0.263242006, + 0.285353988, + 0.308598012, + 0.332938999, + 0.358253986, + 0.384362996, + 0.411125004, + 0.438391000, + 0.466003001, + 0.493800014, + 0.521619022, + 0.549301028, + 0.576691985, + 0.603648007, + 0.630035996, + 0.655736029, + 0.680643022, + 0.704668999, + 0.727738976, + 0.749796987, + 0.770798028, + 0.790717006, + 0.809535980, + 0.827256024, + 0.843881011, + 0.859431982, + 0.873929024, + 0.887408018, + 0.899900019, + 0.911448002, + 0.922096014, + 0.931881011, + 0.940859973, + 0.949064016, + 0.956550002, + 0.963352025, + 0.969512999, + 0.975077987, + 0.980072021, + 0.984542012, + 0.988499999, + 0.991984010, + 0.995002985, + 0.997630000, + 1.00000000, + ] + ) + else: raise NotImplementedError( - "Only grids with 72, 79, or 91 vertical levels have been implemented so far" + "Only grids with 72, 79, 91 or 137 vertical levels" + "have been implemented so far" ) + if 0.0 in bk: ks = 0 if km == 91 else np.where(bk == 0)[0][-1] ptop = ak[0] diff --git a/util/pace/util/grid/generation.py b/util/pace/util/grid/generation.py index b78a7059..679b9449 100644 --- a/util/pace/util/grid/generation.py +++ b/util/pace/util/grid/generation.py @@ -75,7 +75,7 @@ def quantity_cast_to_model_float( quantity_factory: util.QuantityFactory, qty_64: util.Quantity ) -> util.Quantity: """Copy & cast from 64-bit float to model precision if need be""" - qty = quantity_factory.empty(qty_64.dims, qty_64.units, dtype=Float) + qty = quantity_factory.zeros(qty_64.dims, qty_64.units, dtype=Float) qty.data[:] = qty_64.data[:] return qty @@ -1530,7 +1530,6 @@ def rdyc(self) -> util.Quantity: ) def _init_dgrid(self): - grid_mirror_ew = self.quantity_factory.zeros( self._grid_dims, "radians", @@ -1751,7 +1750,6 @@ def _compute_dxdy(self): return dx, dy def 
_compute_dxdy_agrid(self): - dx_agrid_64 = self.quantity_factory.zeros( [util.X_DIM, util.Y_DIM], "m", @@ -2149,7 +2147,6 @@ def _calculate_more_trig_terms(self, cos_sg, sin_sg): ) def _init_cell_trigonometry(self): - cosa_u_64 = self.quantity_factory.zeros( [util.X_INTERFACE_DIM, util.Y_DIM], "", diff --git a/util/pace/util/grid/gnomonic.py b/util/pace/util/grid/gnomonic.py index 705014e4..f26af0f2 100644 --- a/util/pace/util/grid/gnomonic.py +++ b/util/pace/util/grid/gnomonic.py @@ -303,9 +303,9 @@ def _mirror_latlon(lon1, lat1, lon2, lat2, lon0, lat0, np): pdot = p0[0] * nb[0] + p0[1] * nb[1] + p0[2] * nb[2] pp = p0 - np.multiply(2.0, pdot) * nb - lon3 = np.empty((1, 1)) - lat3 = np.empty((1, 1)) - pp3 = np.empty((3, 1, 1)) + lon3 = np.zeros((1, 1)) + lat3 = np.zeros((1, 1)) + pp3 = np.zeros((3, 1, 1)) pp3[:, 0, 0] = pp _cart_to_latlon(1, pp3, lon3, lat3, np) diff --git a/util/pace/util/grid/helper.py b/util/pace/util/grid/helper.py index 673e484d..1b977ad8 100644 --- a/util/pace/util/grid/helper.py +++ b/util/pace/util/grid/helper.py @@ -166,8 +166,8 @@ def from_restart( but no fv_core.res.nc in restart data file.""" ) - ak = quantity_factory.empty([Z_INTERFACE_DIM], units="Pa") - bk = quantity_factory.empty([Z_INTERFACE_DIM], units="") + ak = quantity_factory.zeros([Z_INTERFACE_DIM], units="Pa") + bk = quantity_factory.zeros([Z_INTERFACE_DIM], units="") with fs.open(ak_bk_data_file, "rb") as f: ds = xr.open_dataset(f).isel(Time=0).drop_vars("Time") ak.view[:] = ds["ak"].values @@ -322,7 +322,6 @@ def __init__( @classmethod def new_from_metric_terms(cls, metric_terms: MetricTerms): - horizontal_data = HorizontalGridData.new_from_metric_terms(metric_terms) vertical_data = VerticalGridData.new_from_metric_terms(metric_terms) contravariant_data = ContravariantGridData.new_from_metric_terms(metric_terms) @@ -701,7 +700,6 @@ def new_from_grid_variables( es1: pace.util.Quantity, ew2: pace.util.Quantity, ) -> "DriverGridData": - try: vlon1, vlon2, vlon3 = split_quantity_along_last_dim(vlon) vlat1, vlat2, vlat3 = split_quantity_along_last_dim(vlat) diff --git a/util/pace/util/halo_data_transformer.py b/util/pace/util/halo_data_transformer.py index 00a547d6..e97bb97a 100644 --- a/util/pace/util/halo_data_transformer.py +++ b/util/pace/util/halo_data_transformer.py @@ -70,7 +70,7 @@ def _build_flatten_indices( """ # Have to go down to numpy to leverage indices calculation - arr_indices = np.empty(shape, dtype=np.int32, order="C")[slices] + arr_indices = np.zeros(shape, dtype=np.int32, order="C")[slices] # Get offset from first index offset_dims = [] @@ -875,7 +875,6 @@ def _opt_unpack_scalar(self, quantities: List[Quantity]): # Use private stream with self._get_stream(cu_kernel_args.stream): - # Launch kernel blocks = 128 grid_x = (info_x._unpack_buffer_size // blocks) + 1 @@ -942,7 +941,6 @@ def _opt_unpack_vector( # Use private stream with self._get_stream(cu_kernel_args.stream): - # Buffer sizes edge_size = info_x._unpack_buffer_size + info_y._unpack_buffer_size diff --git a/util/pace/util/initialization/allocator.py b/util/pace/util/initialization/allocator.py index c865cbbf..1a68495e 100644 --- a/util/pace/util/initialization/allocator.py +++ b/util/pace/util/initialization/allocator.py @@ -102,7 +102,7 @@ def from_array( That numpy array must correspond to the correct shape and extent for the given dims. """ - base = self.empty( + base = self.zeros( dims=dims, units=units, dtype=data.dtype,
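
# [Editor's note] With the allocator change above, every user-facing
# construction path now starts from deterministic memory. A condensed sketch
# of what QuantityFactory.from_array does after this patch -- assuming, as
# the hunk suggests, that the caller's values are copied over the compute
# domain (the body beyond the allocation is paraphrased, not quoted):
#
#     def from_array(self, data, dims, units):
#         base = self.zeros(dims=dims, units=units, dtype=data.dtype)
#         # halos start at zero instead of whatever was in memory;
#         # the caller-provided array fills the compute domain
#         base.view[:] = data
#         return base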