diff --git a/.github/workflows/main_unit_tests.yml b/.github/workflows/main_unit_tests.yml index 5dbf4a1f..5800baa6 100644 --- a/.github/workflows/main_unit_tests.yml +++ b/.github/workflows/main_unit_tests.yml @@ -15,9 +15,9 @@ jobs: uses: actions/setup-python@v4.6.0 with: python-version: '3.8.12' - - name: Install OpenMPI for gt4py + - name: Install OpenMPI & Boost for gt4py run: | - sudo apt-get install libopenmpi-dev + sudo apt-get install libopenmpi-dev libboost1.74-dev - name: Install Python packages run: | python -m pip install --upgrade pip diff --git a/driver/pace/driver/driver.py b/driver/pace/driver/driver.py index 0ef3263b..284acaca 100644 --- a/driver/pace/driver/driver.py +++ b/driver/pace/driver/driver.py @@ -275,7 +275,7 @@ def from_dict(cls, kwargs: Dict[str, Any]) -> "DriverConfig": ) grid_type = kwargs["grid_config"].config.grid_type # Copy grid_type to the DycoreConfig if it's not the default value - if grid_type != 0: + if grid_type != 0: kwargs["dycore_config"].grid_type = grid_type if ( diff --git a/driver/pace/driver/grid.py b/driver/pace/driver/grid.py index c184d566..9fa97a06 100644 --- a/driver/pace/driver/grid.py +++ b/driver/pace/driver/grid.py @@ -105,7 +105,6 @@ def get_grid( quantity_factory: QuantityFactory, communicator: CubedSphereCommunicator, ) -> Tuple[DampingCoefficients, DriverGridData, GridData]: - metric_terms = MetricTerms( quantity_factory=quantity_factory, communicator=communicator, @@ -184,8 +183,7 @@ def get_grid( quantity_factory: QuantityFactory, communicator: CubedSphereCommunicator, ) -> Tuple[DampingCoefficients, DriverGridData, GridData]: - - backend = quantity_factory.empty( + backend = quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown" ).gt4py_backend diff --git a/driver/pace/driver/initialization.py b/driver/pace/driver/initialization.py index 2b6471a8..bd6d96ea 100644 --- a/driver/pace/driver/initialization.py +++ b/driver/pace/driver/initialization.py @@ -154,7 +154,6 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - dycore_state = tc_init.init_tc_state( grid_data=grid_data, quantity_factory=quantity_factory, @@ -323,7 +322,7 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - backend = quantity_factory.empty( + backend = quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown" ).gt4py_backend @@ -348,7 +347,6 @@ def _initialize_dycore_state( communicator: pace.util.CubedSphereCommunicator, backend: str, ) -> fv3core.DycoreState: - grid = self._get_serialized_grid(communicator=communicator, backend=backend) ser = self._serializer(communicator) @@ -401,7 +399,6 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - return DriverState( dycore_state=self.dycore_state, physics_state=self.physics_state, diff --git a/driver/pace/driver/run.py b/driver/pace/driver/run.py index e6c39982..df70eb14 100644 --- a/driver/pace/driver/run.py +++ b/driver/pace/driver/run.py @@ -5,7 +5,7 @@ import click import yaml -from pace.util import pace_log, AVAILABLE_LOG_LEVELS +from pace.util import AVAILABLE_LOG_LEVELS, pace_log from .driver import Driver, DriverConfig diff --git a/dsl/pace/dsl/caches/cache_location.py b/dsl/pace/dsl/caches/cache_location.py new file mode 100644 index 00000000..ab57a60b --- /dev/null +++ b/dsl/pace/dsl/caches/cache_location.py @@ -0,0 +1,46 
@@ +from pace.dsl.caches.codepath import FV3CodePath +from pace.util import CubedSpherePartitioner + + +def identify_code_path( + rank: int, + partitioner: CubedSpherePartitioner, +) -> FV3CodePath: + if partitioner.layout == (1, 1) or partitioner.layout == [1, 1]: + return FV3CodePath.All + elif partitioner.layout[0] == 1 or partitioner.layout[1] == 1: + raise NotImplementedError( + f"Build for layout {partitioner.layout} is not handled" + ) + else: + if partitioner.tile.on_tile_bottom(rank): + if partitioner.tile.on_tile_left(rank): + return FV3CodePath.BottomLeft + if partitioner.tile.on_tile_right(rank): + return FV3CodePath.BottomRight + else: + return FV3CodePath.Bottom + if partitioner.tile.on_tile_top(rank): + if partitioner.tile.on_tile_left(rank): + return FV3CodePath.TopLeft + if partitioner.tile.on_tile_right(rank): + return FV3CodePath.TopRight + else: + return FV3CodePath.Top + else: + if partitioner.tile.on_tile_left(rank): + return FV3CodePath.Left + if partitioner.tile.on_tile_right(rank): + return FV3CodePath.Right + else: + return FV3CodePath.Center + + +def get_cache_fullpath(code_path: FV3CodePath) -> str: + from gt4py.cartesian import config as gt_config + + return f"{gt_config.cache_settings['root_path']}/.gt_cache_{code_path}" + + +def get_cache_directory(code_path: FV3CodePath) -> str: + return f".gt_cache_{code_path}" diff --git a/dsl/pace/dsl/caches/codepath.py b/dsl/pace/dsl/caches/codepath.py new file mode 100644 index 00000000..8ebf9492 --- /dev/null +++ b/dsl/pace/dsl/caches/codepath.py @@ -0,0 +1,33 @@ +import enum + + +class FV3CodePath(enum.Enum): + """Enum listing all possible code paths on a cube sphere. + For any layout the cube sphere has up to 9 different code paths, depending on + the position of the rank on the tile and which of the edge/corner cases + it has to handle, as well as the 1x1 layout case, where a single rank + handles all boundary computations. + Since the framework inlines code to optimize, we _cannot_ presuppose which code + will be kept and/or ejected. This enum serves as the ground truth to map a rank to + the proper generated code.
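+ For example, on a 3x3 layout, with ranks numbered row-major from the + bottom-left of the tile, the ranks map to: + 0 -> BottomLeft, 1 -> Bottom, 2 -> BottomRight, + 3 -> Left, 4 -> Center, 5 -> Right, + 6 -> TopLeft, 7 -> Top, 8 -> TopRight.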
+ """ + + All = "FV3_A" + BottomLeft = "FV3_BL" + Left = "FV3_L" + TopLeft = "FV3_TL" + Top = "FV3_T" + TopRight = "FV3_TR" + Right = "FV3_R" + BottomRight = "FV3_BR" + Bottom = "FV3_B" + Center = "FV3_C" + + def __str__(self): + return self.value + + def __repr__(self): + return self.value + + def __format__(self, format_spec: str) -> str: + return self.value diff --git a/dsl/pace/dsl/dace/build.py b/dsl/pace/dsl/dace/build.py index 7d8f3db2..b134f569 100644 --- a/dsl/pace/dsl/dace/build.py +++ b/dsl/pace/dsl/dace/build.py @@ -1,9 +1,9 @@ from typing import List, Optional, Tuple -from warnings import warn from dace.sdfg import SDFG import pace.util +from pace.dsl.caches.cache_location import get_cache_directory, get_cache_fullpath from pace.dsl.dace.dace_config import DaceConfig, DaCeOrchestration @@ -11,19 +11,6 @@ # Distributed compilation -def determine_compiling_ranks(config: DaceConfig) -> bool: - is_compiling = False - rank = config.my_rank - size = config.rank_size - - if int(size / 6) == 0: - is_compiling = True - elif rank % int(size / 6) == rank: - is_compiling = True - - return is_compiling - - def unblock_waiting_tiles(comm, sdfg_path: str) -> None: if comm and comm.Get_size() > 1: for tile in range(1, 6): @@ -31,48 +18,6 @@ def unblock_waiting_tiles(comm, sdfg_path: str) -> None: comm.send(sdfg_path, dest=tile * tilesize + comm.Get_rank()) -def get_target_rank(rank: int, partitioner: pace.util.CubedSpherePartitioner): - """From my rank & the current partitioner we determine which - rank we should read from. - For all layout >= 3,3 this presumes build has been done on a - 3,3 layout.""" - if partitioner.layout == (1, 1): - return 0 - if partitioner.layout == (2, 2): - if partitioner.tile.on_tile_bottom(rank): - if partitioner.tile.on_tile_left(rank): - return 0 # "00" - if partitioner.tile.on_tile_right(rank): - return 1 # "10" - if partitioner.tile.on_tile_top(rank): - if partitioner.tile.on_tile_left(rank): - return 2 # "01" - if partitioner.tile.on_tile_right(rank): - return 3 # "11" - else: - if partitioner.tile.on_tile_bottom(rank): - if partitioner.tile.on_tile_left(rank): - return 0 # "00" - if partitioner.tile.on_tile_right(rank): - return 2 # "20" - else: - return 1 # "10" - if partitioner.tile.on_tile_top(rank): - if partitioner.tile.on_tile_left(rank): - return 6 # "02" - if partitioner.tile.on_tile_right(rank): - return 8 # "22" - else: - return 7 # "12" - else: - if partitioner.tile.on_tile_left(rank): - return 3 # "01" - if partitioner.tile.on_tile_right(rank): - return 5 # "21" - else: - return 4 # "11" - - def build_info_filepath() -> str: return "build_info.txt" @@ -101,7 +46,10 @@ def write_build_info( def get_sdfg_path( - daceprog_name: str, config: DaceConfig, sdfg_file_path: Optional[str] = None + daceprog_name: str, + config: DaceConfig, + sdfg_file_path: Optional[str] = None, + override_run_only=False, ) -> Optional[str]: """Build an SDFG path from the qualified program name or it's direct path to .sdfg @@ -113,7 +61,7 @@ def get_sdfg_path( # TODO: check DaceConfig for cache.strategy == name # Guarding against bad usage of this function - if config.get_orchestrate() != DaCeOrchestration.Run: + if not override_run_only and config.get_orchestrate() != DaCeOrchestration.Run: return None # Case of a .sdfg file given by the user to be compiled @@ -125,19 +73,8 @@ def get_sdfg_path( return sdfg_file_path # Case of loading a precompiled .so - lookup using GT_CACHE - from gt4py.cartesian import config as gt_config - - if config.rank_size > 1: - rank = 
config.my_rank - rank_str = f"_{config.target_rank:06d}" - else: - rank = 0 - rank_str = f"_{rank:06d}" - - sdfg_dir_path = ( - f"{gt_config.cache_settings['root_path']}" - f"/.gt_cache{rank_str}/dacecache/{daceprog_name}" - ) + cache_fullpath = get_cache_fullpath(config.code_path) + sdfg_dir_path = f"{cache_fullpath}/dacecache/{daceprog_name}" if not os.path.isdir(sdfg_dir_path): raise RuntimeError(f"Precompiled SDFG is missing at {sdfg_dir_path}") @@ -153,23 +90,8 @@ raise RuntimeError( f"SDFG build for {build_backend}, {config._backend} has been asked" ) - # Check layout - build_layout = ast.literal_eval(build_info_file.readline()) - can_read = True - if config.layout == (1, 1) and config.layout != build_layout: - can_read = False - elif config.layout == (2, 2) and config.layout != build_layout: - can_read = False - elif ( - build_layout != (1, 1) and build_layout != (2, 2) and build_layout != (3, 3) - ): - can_read = False - if not can_read: - warn( - f"SDFG build for layout {build_layout}, " - f"cannot be run with current layout {config.layout}, bad layout?" - ) # Check resolution per tile + build_layout = ast.literal_eval(build_info_file.readline()) build_resolution = ast.literal_eval(build_info_file.readline()) if (config.tile_resolution[0] / config.layout[0]) != ( build_resolution[0] / build_layout[0] @@ -179,7 +101,7 @@ f"cannot be run with current resolution {config.tile_resolution}" ) - print(f"[DaCe Config] Rank {rank} loading SDFG {sdfg_dir_path}") + print(f"[DaCe Config] Rank {config.my_rank} loading SDFG {sdfg_dir_path}") return sdfg_dir_path @@ -189,33 +111,31 @@ def set_distributed_caches(config: "DaceConfig"): # Execute specific initialization per orchestration state orchestration_mode = config.get_orchestrate() + if orchestration_mode == DaCeOrchestration.Python: + return # Check that we have all the files we need to early out in case # of issues.
if orchestration_mode == DaCeOrchestration.Run: import os - from gt4py.cartesian import config as gt_config - - # Check our cache exist - if config.rank_size > 1: - rank = config.my_rank - target_rank_str = f"_{config.target_rank:06d}" - else: - rank = 0 - target_rank_str = f"_{rank:06d}" - cache_filepath = ( - f"{gt_config.cache_settings['root_path']}/.gt_cache{target_rank_str}" - ) - if not os.path.exists(cache_filepath): + cache_directory = get_cache_fullpath(config.code_path) + if not os.path.exists(cache_directory): raise RuntimeError( f"{orchestration_mode} error: Could not find caches for rank " - f"{rank} at {cache_filepath}" + f"{config.my_rank} at {cache_directory}" ) - # All, good set this rank cache to the source cache - gt_config.cache_settings["dir_name"] = f".gt_cache{target_rank_str}" - print( - f"[{orchestration_mode}] Rank {rank} " - f"reading cache {gt_config.cache_settings['dir_name']}" - ) + # Set read/write caches to the target code path + from gt4py.cartesian import config as gt_config + + if config.do_compile: + verb = "reading/writing" + else: + verb = "reading" + + gt_config.cache_settings["dir_name"] = get_cache_directory(config.code_path) + pace.util.pace_log.critical( + f"[{orchestration_mode}] Rank {config.my_rank} " + f"{verb} cache {gt_config.cache_settings['dir_name']}" + ) diff --git a/dsl/pace/dsl/dace/dace_config.py b/dsl/pace/dsl/dace/dace_config.py index e6f3e7df..1bb0939e 100644 --- a/dsl/pace/dsl/dace/dace_config.py +++ b/dsl/pace/dsl/dace/dace_config.py @@ -1,13 +1,16 @@ import enum -from typing import Any, Dict, Optional +import os +from typing import Any, Dict, Optional, Tuple import dace.config from dace.codegen.compiled_sdfg import CompiledSDFG from dace.frontend.python.parser import DaceProgram +from pace.dsl.caches.cache_location import identify_code_path +from pace.dsl.caches.codepath import FV3CodePath from pace.dsl.gt4py_utils import is_gpu_backend from pace.util._optional_imports import cupy as cp -from pace.util.communicator import CubedSphereCommunicator +from pace.util.communicator import CubedSphereCommunicator, CubedSpherePartitioner # This can be turned on to revert compilation for orchestration @@ -16,6 +19,95 @@ DEACTIVATE_DISTRIBUTED_DACE_COMPILE = False +def _is_corner(rank: int, partitioner: CubedSpherePartitioner) -> bool: + if partitioner.tile.on_tile_bottom(rank): + if partitioner.tile.on_tile_left(rank): + return True + if partitioner.tile.on_tile_right(rank): + return True + if partitioner.tile.on_tile_top(rank): + if partitioner.tile.on_tile_left(rank): + return True + if partitioner.tile.on_tile_right(rank): + return True + return False + + +def _smallest_rank_bottom(x: int, y: int, layout: Tuple[int, int]): + return y == 0 and x == 1 + + +def _smallest_rank_top(x: int, y: int, layout: Tuple[int, int]): + return y == layout[1] - 1 and x == 1 + + +def _smallest_rank_left(x: int, y: int, layout: Tuple[int, int]): + return x == 0 and y == 1 + + +def _smallest_rank_right(x: int, y: int, layout: Tuple[int, int]): + return x == layout[0] - 1 and y == 1 + + +def _smallest_rank_middle(x: int, y: int, layout: Tuple[int, int]): + return layout[0] > 1 and layout[1] > 1 and x == 1 and y == 1 + + +def _determine_compiling_ranks( + config: "DaceConfig", + partitioner: CubedSpherePartitioner, +) -> bool: + """ + We try to map every layout to a 3x3 layout, whose MPI ranks + look like + 6 7 8 + 3 4 5 + 0 1 2 + Using the partitioner we find the mapping of the given layout + to all of those.
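+ Concretely, on tile 0 the four corner ranks always compile; for each edge + and for the interior, only the smallest such rank compiles, i.e. subtile + positions (1, 0), (0, 1), (1, 1), (layout[0] - 1, 1) and (1, layout[1] - 1).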
For example, on a 4x4 layout + 12 13 14 15 + 8 9 10 11 + 4 5 6 7 + 0 1 2 3 + therefore we map + 0 -> 0 + 1 -> 1 + 2 -> NOT COMPILING + 3 -> 2 + 4 -> 3 + 5 -> 4 + 6 -> NOT COMPILING + 7 -> 5 + 8 -> NOT COMPILING + 9 -> NOT COMPILING + 10 -> NOT COMPILING + 11 -> NOT COMPILING + 12 -> 6 + 13 -> 7 + 14 -> NOT COMPILING + 15 -> 8 + """ + + # Tile 0 compiles + if partitioner.tile_index(config.my_rank) != 0: + return False + + # Corners compile + if _is_corner(config.my_rank, partitioner): + return True + + y, x = partitioner.tile.subtile_index(config.my_rank) + + # If on an edge or in the interior, we give way to the smallest rank + return ( + _smallest_rank_left(x, y, config.layout) + or _smallest_rank_bottom(x, y, config.layout) + or _smallest_rank_middle(x, y, config.layout) + or _smallest_rank_right(x, y, config.layout) + or _smallest_rank_top(x, y, config.layout) + ) + + class DaCeOrchestration(enum.Enum): """ Orchestration mode for DaCe @@ -71,8 +163,6 @@ def __init__( # Temporary. This is a bit too out of the ordinary for the common user. # We should refactor the architecture to allow for a `gtc:orchestrated:dace:X` # backend that would signify both the `CPU|GPU` split and the orchestration mode - import os - if orchestration is None: fv3_dacemode_env_var = os.getenv("FV3_DACEMODE", "Python") # The below condition guards against defining empty FV3_DACEMODE and @@ -175,32 +265,30 @@ def __init__( # attempt to kill the dace.conf to avoid confusion if dace.config.Config._cfg_filename: try: - import os - os.remove(dace.config.Config._cfg_filename) except OSError: pass self._backend = backend self.tile_resolution = [tile_nx, tile_nx, tile_nz] - from pace.dsl.dace.build import get_target_rank, set_distributed_caches + from pace.dsl.dace.build import set_distributed_caches # Distributed build required info if communicator: self.my_rank = communicator.rank self.rank_size = communicator.comm.Get_size() - if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - self.target_rank = communicator.rank - else: - self.target_rank = get_target_rank( - self.my_rank, communicator.partitioner - ) + self.code_path = identify_code_path(self.my_rank, communicator.partitioner) self.layout = communicator.partitioner.layout + self.do_compile = ( + DEACTIVATE_DISTRIBUTED_DACE_COMPILE + or _determine_compiling_ranks(self, communicator.partitioner) + ) else: self.my_rank = 0 self.rank_size = 1 - self.target_rank = 0 + self.code_path = FV3CodePath.All self.layout = (1, 1) + self.do_compile = True set_distributed_caches(self) @@ -226,7 +314,7 @@ def get_orchestrate(self) -> DaCeOrchestration: return self._orchestrate def get_sync_debug(self) -> bool: - return dace.config.Config.get("compiler", "cuda", "syncdebug") + return dace.config.Config.get_bool("compiler", "cuda", "syncdebug") def as_dict(self) -> Dict[str, Any]: return { diff --git a/dsl/pace/dsl/dace/orchestration.py b/dsl/pace/dsl/dace/orchestration.py index 2bd9df5b..7858381a 100644 --- a/dsl/pace/dsl/dace/orchestration.py +++ b/dsl/pace/dsl/dace/orchestration.py @@ -11,12 +11,7 @@ from dace.transformation.auto.auto_optimize import make_transients_persistent from dace.transformation.helpers import get_parent_map -from pace.dsl.dace.build import ( - determine_compiling_ranks, - get_sdfg_path, - unblock_waiting_tiles, - write_build_info, -) +from pace.dsl.dace.build import get_sdfg_path, write_build_info from pace.dsl.dace.dace_config import ( DEACTIVATE_DISTRIBUTED_DACE_COMPILE, DaceConfig, @@ -34,6 +29,7 @@ memory_static_analysis, report_memory_static_analysis, ) +from pace.util import
pace_log from pace.util.mpi import MPI @@ -122,7 +118,7 @@ def _build_sdfg( if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: is_compiling = True else: - is_compiling = determine_compiling_ranks(config) + is_compiling = config.do_compile if is_compiling: # Make the transient arrays persistent if config.is_gpu_backend(): @@ -198,12 +194,13 @@ def _build_sdfg( ), ) - # Compilation done, either exit or scatter/gather and run + # Compilation done. + # On Build: all ranks sync, then exit. + # On BuildAndRun: all ranks sync, then load the SDFG from + # the expected path (made available by build). + # We use a "FrozenCompiledSDFG" to minimize re-entry cost at call time # DEV NOTE: we explicitly use MPI.COMM_WORLD here because it is # a true multi-machine sync, outside of our own communicator class. - # Also this code is protected in the case of running on one machine by the fact - # that 0 is _always_ a compiling rank & unblock_waiting_tiles is protected - # against scattering when no other ranks are present. if config.get_orchestrate() == DaCeOrchestration.Build: MPI.COMM_WORLD.Barrier() # Protect against early exit, which kills SLURM jobs DaCeProgress.log( DaCeProgress.default_prefix(config), ) exit(0) elif config.get_orchestrate() == DaCeOrchestration.BuildAndRun: - MPI.COMM_WORLD.Barrier() - if is_compiling: - if not DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - unblock_waiting_tiles(MPI.COMM_WORLD, sdfg.build_folder) - DaCeProgress.log( - DaCeProgress.default_prefix(config), "Build folder exchanged." - ) - csdfg, _ = daceprog.load_precompiled_sdfg( - sdfg.build_folder, *args, **kwargs - ) - config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG( - daceprog, csdfg, args, kwargs - ) - - else: - source_rank = config.target_rank - # wait for compilation to be done + if not is_compiling: DaCeProgress.log( DaCeProgress.default_prefix(config), - "Rank is not compiling. Waiting for build dir...", - ) - sdfg_path = MPI.COMM_WORLD.recv(source=source_rank) - DaCeProgress.log( - DaCeProgress.default_prefix(config), "Build dir received, loading .so." + "Rank is not compiling. " + "Waiting for compilation to end on all other ranks...", ) + MPI.COMM_WORLD.Barrier() + + with DaCeProgress(config, "Loading"): + sdfg_path = get_sdfg_path(daceprog.name, config, override_run_only=True) + csdfg, _ = daceprog.load_precompiled_sdfg(sdfg_path, *args, **kwargs) + config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG( + daceprog, csdfg, args, kwargs ) @@ -267,6 +249,7 @@ def _call_sdfg( config.get_orchestrate() == DaCeOrchestration.Build or config.get_orchestrate() == DaCeOrchestration.BuildAndRun ): + pace_log.info("Building DaCe orchestration") res = _build_sdfg(daceprog, sdfg, config, args, kwargs) elif config.get_orchestrate() == DaCeOrchestration.Run: # We should never hit this, it should be caught by the @@ -302,7 +285,7 @@ def _parse_sdfg( if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: is_compiling = True else: - is_compiling = determine_compiling_ranks(config) + is_compiling = config.do_compile if not is_compiling: # We cannot parse the SDFG since we will load the proper # compiled SDFG from the compiling rank @@ -448,7 +431,6 @@ def __get__(self, obj, objtype=None) -> SDFGEnabledCallable: """Return SDFGEnabledCallable wrapping original obj.method from cache.
Update cache first if need be""" if (id(obj), id(self.func)) not in _LazyComputepathMethod.bound_callables: - _LazyComputepathMethod.bound_callables[ (id(obj), id(self.func)) ] = _LazyComputepathMethod.SDFGEnabledCallable(self, obj) diff --git a/dsl/pace/dsl/typing.py b/dsl/pace/dsl/typing.py index 05b255ce..d67dd7b6 100644 --- a/dsl/pace/dsl/typing.py +++ b/dsl/pace/dsl/typing.py @@ -22,11 +22,15 @@ DTypes = Union[bool, np.bool_, int, np.int32, np.int64, float, np.float32, np.float64] +def floating_point_precision() -> int: + return int(os.getenv("PACE_FLOAT_PRECISION", "64")) + + def global_set_floating_point_precision(): """Set the global floating point precision for all references to Float in the codebase. Defaults to 64 bit.""" global Float - precision_in_bit = int(os.getenv("PACE_FLOAT_PRECISION", "64")) + precision_in_bit = floating_point_precision() if precision_in_bit == 64: return np.float64 elif precision_in_bit == 32: diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 2835e77e..a7a526ee 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -11,6 +11,8 @@ from pace.driver.performance.collector import PerformanceCollector from pace.dsl.dace import DaceConfig, orchestrate from pace.dsl.gt4py_utils import is_gpu_backend +from pace.dsl.typing import floating_point_precision +from pace.util._optional_imports import cupy as cp from pace.util.logging import pace_log @@ -131,13 +133,29 @@ def __init__( self.output_dict: Dict[str, np.ndarray] = {} self._allocate_output_dir() + # Feedback information + device_ordinal_info = ( + f" Device PCI bus id: {cp.cuda.Device(0).pci_bus_id}\n" + if is_gpu_backend(backend) + else "N/A" + ) + MPS_pipe_directory = os.getenv("CUDA_MPS_PIPE_DIRECTORY", None) + MPS_is_on = ( + MPS_pipe_directory + and is_gpu_backend(backend) + and os.path.exists(f"{MPS_pipe_directory}/log") + ) pace_log.info( "Pace GEOS wrapper initialized: \n" - f" dt : {self.dycore_state.bdt}\n" - f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" - f" backend: {backend}\n" - f" orchestration: {self._is_orchestrated}\n" - f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})" + f" dt : {self.dycore_state.bdt}\n" + f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" + f" backend : {backend}\n" + f" float : {floating_point_precision()}bit\n" + f" orchestration : {self._is_orchestrated}\n" + f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz}" + f" (halo: {sizer.n_halo})\n" + f" Device ord: {device_ordinal_info}\n" + f" Nvidia MPS : {MPS_is_on}" ) def _critical_path(self): diff --git a/fv3core/pace/fv3core/testing/translate_dyncore.py b/fv3core/pace/fv3core/testing/translate_dyncore.py index 510e299f..6c7da4b7 100644 --- a/fv3core/pace/fv3core/testing/translate_dyncore.py +++ b/fv3core/pace/fv3core/testing/translate_dyncore.py @@ -140,7 +140,7 @@ def compute_parallel(self, inputs, communicator): grid_data.ptop = inputs["ptop"] self._base.make_storage_data_input_vars(inputs) state = DycoreState.init_zeros(quantity_factory=self.grid.quantity_factory) - wsd: pace.util.Quantity = self.grid.quantity_factory.empty( + wsd: pace.util.Quantity = self.grid.quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown", ) @@ -152,7 +152,7 @@ state[name].data[selection] = value else: setattr(state, name, value) - phis: pace.util.Quantity =
self.grid.quantity_factory.empty( + phis: pace.util.Quantity = self.grid.quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="m", ) diff --git a/fv3core/tests/savepoint/translate/translate_remapping.py b/fv3core/tests/savepoint/translate/translate_remapping.py index fc9196b0..9a2e1f84 100644 --- a/fv3core/tests/savepoint/translate/translate_remapping.py +++ b/fv3core/tests/savepoint/translate/translate_remapping.py @@ -107,7 +107,7 @@ def compute_from_storage(self, inputs): inputs["wsd"] = wsd_2d inputs["q_cld"] = inputs["tracers"]["qcld"] inputs["last_step"] = bool(inputs["last_step"]) - pfull = self.grid.quantity_factory.empty([Z_DIM], units="Pa") + pfull = self.grid.quantity_factory.zeros([Z_DIM], units="Pa") pfull.data[:] = pfull.np.asarray(inputs.pop("pfull")) l_to_e_obj = LagrangianToEulerian( self.stencil_factory, diff --git a/requirements_dev.txt b/requirements_dev.txt index 052bf5c3..484c4948 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -10,8 +10,8 @@ zarr dask>=2021.10.0 netCDF4 cftime -fv3config>=0.9.0 dace==0.14.0 +fv3config>=0.9.0 f90nml>=1.1.0 numpy>=1.15 -e external/gt4py diff --git a/stencils/pace/stencils/testing/grid.py b/stencils/pace/stencils/testing/grid.py index 65ad8870..4cf623b1 100644 --- a/stencils/pace/stencils/testing/grid.py +++ b/stencils/pace/stencils/testing/grid.py @@ -504,7 +504,7 @@ def grid_data(self) -> "GridData": data = getattr(self, name) assert data is not None - quantity = self.quantity_factory.empty(dims=dims, units=units) + quantity = self.quantity_factory.zeros(dims=dims, units=units) if len(quantity.shape) == 3: quantity.data[:] = data[:, :, : quantity.shape[2]] elif len(quantity.shape) == 2: diff --git a/stencils/pace/stencils/testing/parallel_translate.py b/stencils/pace/stencils/testing/parallel_translate.py index dcc1e64d..f10b9b27 100644 --- a/stencils/pace/stencils/testing/parallel_translate.py +++ b/stencils/pace/stencils/testing/parallel_translate.py @@ -12,7 +12,6 @@ class ParallelTranslate: - max_error = TranslateFortranData2Py.max_error near_zero = TranslateFortranData2Py.near_zero compute_grid_option = False @@ -192,7 +191,7 @@ def state_from_inputs(self, inputs: dict, grid=None) -> dict: for name, properties in self.inputs.items(): standard_name = properties.get("name", name) if len(properties["dims"]) > 0: - state[standard_name] = grid.quantity_factory.empty( + state[standard_name] = grid.quantity_factory.zeros( properties["dims"], properties["units"], dtype=inputs[name].dtype ) input_slice = _serialize_slice( diff --git a/stencils/pace/stencils/testing/temporaries.py b/stencils/pace/stencils/testing/temporaries.py index 581387f6..2dd46663 100644 --- a/stencils/pace/stencils/testing/temporaries.py +++ b/stencils/pace/stencils/testing/temporaries.py @@ -40,10 +40,9 @@ def _assert_same_temporaries(dict1: dict, dict2: dict) -> List[str]: attr2 = dict2[attr] if isinstance(attr1, np.ndarray): try: - np.testing.assert_almost_equal( - attr1, attr2, err_msg=f"{attr} not equal" - ) - except AssertionError: + assert np.allclose(attr1, attr2, equal_nan=True) + except AssertionError as e: + print(e) differences.append(attr) else: sub_differences = _assert_same_temporaries(attr1, attr2) diff --git a/tests/main/dsl/test_caches.py b/tests/main/dsl/test_caches.py new file mode 100644 index 00000000..c1f01303 --- /dev/null +++ b/tests/main/dsl/test_caches.py @@ -0,0 +1,176 @@ +import pytest +from gt4py.cartesian.gtscript import PARALLEL, Field, computation, interval +from gt4py.storage import empty, ones + 
+import pace.dsl +from pace.dsl.dace import orchestrate +from pace.dsl.dace.dace_config import DaceConfig, DaCeOrchestration +from pace.dsl.stencil import CompilationConfig, GridIndexing + + +def _make_storage( + func, + grid_indexing, + stencil_config: pace.dsl.StencilConfig, + *, + dtype=float, + aligned_index=(0, 0, 0), +): + return func( + backend=stencil_config.compilation_config.backend, + shape=grid_indexing.domain, + dtype=dtype, + aligned_index=aligned_index, + ) + + +def _stencil(inp: Field[float], out: Field[float], scalar: float): + with computation(PARALLEL), interval(...): + out = inp + + +def _build_stencil(backend, orchestrated: DaCeOrchestration): + # Build the stencil and its configuration + grid_indexing = GridIndexing( + domain=(5, 5, 5), + n_halo=2, + south_edge=True, + north_edge=True, + west_edge=True, + east_edge=True, + ) + + stencil_config = pace.dsl.StencilConfig( + compilation_config=CompilationConfig(backend=backend, rebuild=True), + dace_config=DaceConfig(None, backend, 5, 5, orchestrated), + ) + + stencil_factory = pace.dsl.StencilFactory(stencil_config, grid_indexing) + + built_stencil = stencil_factory.from_origin_domain( + _stencil, (0, 0, 0), domain=grid_indexing.domain + ) + + return built_stencil, grid_indexing, stencil_config + + +class OrchestratedProgam: + def __init__(self, backend, orchestration): + self.stencil, grid_indexing, stencil_config = _build_stencil( + backend, orchestration + ) + orchestrate(obj=self, config=stencil_config.dace_config) + self.inp = _make_storage(ones, grid_indexing, stencil_config, dtype=float) + self.out = _make_storage(empty, grid_indexing, stencil_config, dtype=float) + + def __call__(self): + self.stencil(self.inp, self.out, self.inp[0, 0, 0]) + + +@pytest.mark.parametrize( + "backend", + [ + pytest.param("dace:cpu"), + ], +) +def test_relocatability_orchestration(backend): + import os + import shutil + + from gt4py.cartesian import config as gt_config + + original_root_directory = gt_config.cache_settings["root_path"] + working_dir = str(os.getcwd()) + + # Compile on default + p0 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun) + p0() + assert os.path.exists( + f"{working_dir}/.gt_cache_FV3_A/dacecache/" + "test_caches_OrchestratedProgam___call__", + ) or os.path.exists( + f"{working_dir}/.gt_cache_FV3_A/dacecache/OrchestratedProgam___call__", + ) + + # Compile in another directory + + custom_path = f"{working_dir}/.my_cache_path" + gt_config.cache_settings["root_path"] = custom_path + p1 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun) + p1() + assert os.path.exists( + f"{custom_path}/.gt_cache_FV3_A/dacecache/" + "test_caches_OrchestratedProgam___call__", + ) or os.path.exists( + f"{custom_path}/.gt_cache_FV3_A/dacecache/OrchestratedProgam___call__", + ) + + # Check relocatability by copying the second cache directory, + # changing the path of gt_config.cache_settings and trying to Run on it + relocated_path = f"{working_dir}/.my_relocated_cache_path" + shutil.copytree(custom_path, relocated_path, dirs_exist_ok=True) + gt_config.cache_settings["root_path"] = relocated_path + p2 = OrchestratedProgam(backend, DaCeOrchestration.Run) + p2() + + # Use a bogus path to check that a missing cache raises an error + bogus_path = "./nope/notatall/nothappening" + gt_config.cache_settings["root_path"] = bogus_path + with pytest.raises(RuntimeError): + OrchestratedProgam(backend, DaCeOrchestration.Run) + + # Restore cache settings + gt_config.cache_settings["root_path"] = original_root_directory + + +@pytest.mark.parametrize(
+ "backend", + [ + pytest.param("dace:cpu"), + ], +) +def test_relocatability(backend: str): + import os + import shutil + + import gt4py + from gt4py.cartesian import config as gt_config + + from pace.util.mpi import MPI + + # Restore original dir name + gt4py.cartesian.config.cache_settings["dir_name"] = os.environ.get( + "GT_CACHE_DIR_NAME", f".gt_cache_{MPI.COMM_WORLD.Get_rank():06}" + ) + + backend_sanitized = backend.replace(":", "") + + # Compile on default + p0 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p0() + assert os.path.exists( + f"./.gt_cache_000000/py38_1013/{backend_sanitized}/test_caches/_stencil/" + ) + + # Compile in another directory + + custom_path = "./.my_cache_path" + gt_config.cache_settings["root_path"] = custom_path + p1 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p1() + assert os.path.exists( + f"{custom_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" + "/test_caches/_stencil/" + ) + + # Check relocability by copying the second cache directory, + # changing the path of gt_config.cache_settings and trying to Run on it + relocated_path = "./.my_relocated_cache_path" + shutil.copytree("./.gt_cache_000000", relocated_path, dirs_exist_ok=True) + gt_config.cache_settings["root_path"] = relocated_path + p2 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p2() + assert os.path.exists( + f"{relocated_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" + "/test_caches/_stencil/" + ) diff --git a/tests/main/dsl/test_dace_config.py b/tests/main/dsl/test_dace_config.py index 78553278..cb3566dd 100644 --- a/tests/main/dsl/test_dace_config.py +++ b/tests/main/dsl/test_dace_config.py @@ -1,11 +1,12 @@ import unittest.mock -from pace.dsl.dace.dace_config import DaceConfig +from pace.dsl.dace.dace_config import DaceConfig, _determine_compiling_ranks from pace.dsl.dace.orchestration import ( DaCeOrchestration, orchestrate, orchestrate_function, ) +from pace.util.communicator import CubedSpherePartitioner, TilePartitioner """ @@ -91,3 +92,68 @@ def foo(self): a = A() a.foo() assert not mock_call_sdfg.called + + +def test_orchestrate_distributed_build(): + dummy_dace_config = DaceConfig( + communicator=None, + backend="gtc:dace", + orchestration=DaCeOrchestration.BuildAndRun, + ) + + def _does_compile(rank, partitioner) -> bool: + dummy_dace_config.layout = partitioner.layout + dummy_dace_config.rank_size = partitioner.layout[0] * partitioner.layout[1] * 6 + dummy_dace_config.my_rank = rank + return _determine_compiling_ranks(dummy_dace_config, partitioner) + + # (1, 1) layout, one rank which compiles + cube_partitioner_11 = CubedSpherePartitioner(TilePartitioner((1, 1))) + assert _does_compile(0, cube_partitioner_11) + assert not _does_compile(1, cube_partitioner_11) # not compiling face + + # (2, 2) layout, 4 ranks, all compiling + cube_partitioner_22 = CubedSpherePartitioner(TilePartitioner((2, 2))) + assert _does_compile(0, cube_partitioner_22) + assert _does_compile(1, cube_partitioner_22) + assert _does_compile(2, cube_partitioner_22) + assert _does_compile(3, cube_partitioner_22) + assert not _does_compile(4, cube_partitioner_22) # not compiling face + + # (3, 3) layout, 9 ranks, all compiling + cube_partitioner_33 = CubedSpherePartitioner(TilePartitioner((3, 3))) + assert _does_compile(0, cube_partitioner_33) + assert _does_compile(1, cube_partitioner_33) + assert _does_compile(2, cube_partitioner_33) + assert _does_compile(3, cube_partitioner_33) + assert _does_compile(4, cube_partitioner_33) + assert _does_compile(5, 
cube_partitioner_33) + assert _does_compile(6, cube_partitioner_33) + assert _does_compile(7, cube_partitioner_33) + assert _does_compile(8, cube_partitioner_33) + assert not _does_compile(9, cube_partitioner_33) # not compiling face + + # (4, 4) layout, 16 ranks, + # expecting compiling ranks: 0, 1, 3, 4, 5, 7, 12, 13, 15 + cube_partitioner_44 = CubedSpherePartitioner(TilePartitioner((4, 4))) + assert _does_compile(0, cube_partitioner_44) + assert _does_compile(1, cube_partitioner_44) + assert _does_compile(3, cube_partitioner_44) + assert _does_compile(4, cube_partitioner_44) + assert _does_compile(5, cube_partitioner_44) + assert _does_compile(7, cube_partitioner_44) + assert _does_compile(12, cube_partitioner_44) + assert _does_compile(13, cube_partitioner_44) + assert _does_compile(15, cube_partitioner_44) + assert not _does_compile(2, cube_partitioner_44) # same code path as 3 + assert not _does_compile(6, cube_partitioner_44) # same code path as 5 + assert not _does_compile(8, cube_partitioner_44) # same code path as 4 + assert not _does_compile(11, cube_partitioner_44) # same code path as 7 + assert not _does_compile(16, cube_partitioner_44) # not compiling face + + # For a few other layouts, we check that we always have 9 compiling ranks + for layout in [(5, 5), (10, 10), (20, 20)]: + partition = CubedSpherePartitioner(TilePartitioner(layout)) + compiling = 0 + for i in range(layout[0] * layout[1] * 6): + compiling += 1 if _does_compile(i, partition) else 0 + assert compiling == 9 diff --git a/tests/main/test_grid_init.py b/tests/main/test_grid_init.py index 942dcfd3..4c0e5e2b 100644 --- a/tests/main/test_grid_init.py +++ b/tests/main/test_grid_init.py @@ -51,8 +51,6 @@ def test_grid_init_not_decomposition_dependent(rank: int): assert allclose(metric_terms_1by1.area, metric_terms_3by3.area, partitioner, rank) assert allclose(metric_terms_1by1.dx, metric_terms_3by3.dx, partitioner, rank) assert allclose(metric_terms_1by1.dy, metric_terms_3by3.dy, partitioner, rank) - assert allclose(metric_terms_1by1.dxa, metric_terms_3by3.dxa, partitioner, rank) - assert allclose(metric_terms_1by1.dya, metric_terms_3by3.dya, partitioner, rank) assert allclose( metric_terms_1by1.cos_sg1, metric_terms_3by3.cos_sg1, partitioner, rank ) diff --git a/util/pace/util/__init__.py b/util/pace/util/__init__.py index 54f684d6..58a7c2a5 100644 --- a/util/pace/util/__init__.py +++ b/util/pace/util/__init__.py @@ -54,7 +54,7 @@ from .initialization import GridSizer, QuantityFactory, SubtileGridSizer from .io import read_state, write_state from .local_comm import LocalComm -from .logging import pace_log, AVAILABLE_LOG_LEVELS +from .logging import AVAILABLE_LOG_LEVELS, pace_log from .monitor import Monitor, NetCDFMonitor, ZarrMonitor from .mpi import MPIComm from .namelist import Namelist, NamelistDefaults diff --git a/util/pace/util/communicator.py b/util/pace/util/communicator.py index 938469bd..d2577d8c 100644 --- a/util/pace/util/communicator.py +++ b/util/pace/util/communicator.py @@ -167,7 +167,7 @@ def _get_gather_recv_quantity( ) -> Quantity: """Initialize a Quantity for use when receiving global data during gather""" recv_quantity = Quantity( - send_metadata.np.empty(global_extent, dtype=send_metadata.dtype), + send_metadata.np.zeros(global_extent, dtype=send_metadata.dtype), dims=send_metadata.dims, units=send_metadata.units, origin=tuple([0 for dim in send_metadata.dims]), @@ -182,7 +182,7 @@ def _get_scatter_recv_quantity( ) -> Quantity: """Initialize a Quantity for use when receiving subtile data during scatter""" recv_quantity = Quantity( -
send_metadata.np.empty(shape, dtype=send_metadata.dtype), + send_metadata.np.zeros(shape, dtype=send_metadata.dtype), dims=send_metadata.dims, units=send_metadata.units, gt4py_backend=send_metadata.gt4py_backend, @@ -206,7 +206,7 @@ def gather( result: Optional[Quantity] if self.rank == constants.ROOT_RANK: with array_buffer( - send_quantity.np.empty, + send_quantity.np.zeros, (self.partitioner.total_ranks,) + tuple(send_quantity.extent), dtype=send_quantity.data.dtype, ) as recvbuf: @@ -745,7 +745,7 @@ def _get_gather_recv_quantity( # needs to change the quantity dimensions since we add a "tile" dimension, # unlike for tile scatter/gather which retains the same dimensions recv_quantity = Quantity( - metadata.np.empty(global_extent, dtype=metadata.dtype), + metadata.np.zeros(global_extent, dtype=metadata.dtype), dims=(constants.TILE_DIM,) + metadata.dims, units=metadata.units, origin=(0,) + tuple([0 for dim in metadata.dims]), @@ -767,7 +767,7 @@ def _get_scatter_recv_quantity( # needs to change the quantity dimensions since we remove a "tile" dimension, # unlike for tile scatter/gather which retains the same dimensions recv_quantity = Quantity( - metadata.np.empty(shape, dtype=metadata.dtype), + metadata.np.zeros(shape, dtype=metadata.dtype), dims=metadata.dims[1:], units=metadata.units, gt4py_backend=metadata.gt4py_backend, diff --git a/util/pace/util/grid/eta.py b/util/pace/util/grid/eta.py index 50afaadb..dc37aaa2 100644 --- a/util/pace/util/grid/eta.py +++ b/util/pace/util/grid/eta.py @@ -206,7 +206,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 91: - ak = np.array( [ 1.00000000, @@ -402,7 +401,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 72: - ak = np.array( [ 1.00000000, @@ -559,10 +557,299 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ] ) + elif km == 137: + ak = np.array( + [ + 1.00000000, + 1.82500005, + 3.00000000, + 4.63000011, + 6.82797718, + 9.74696636, + 13.6054239, + 18.6089306, + 24.9857178, + 32.9857101, + 42.8792419, + 54.9554634, + 69.5205765, + 86.8958817, + 107.415741, + 131.425507, + 159.279404, + 191.338562, + 227.968948, + 269.539581, + 316.420746, + 368.982361, + 427.592499, + 492.616028, + 564.413452, + 643.339905, + 729.744141, + 823.967834, + 926.344910, + 1037.20117, + 1156.85364, + 1285.61035, + 1423.77014, + 1571.62292, + 1729.44897, + 1897.51929, + 2076.09595, + 2265.43164, + 2465.77051, + 2677.34814, + 2900.39136, + 3135.11938, + 3381.74365, + 3640.46826, + 3911.49048, + 4194.93066, + 4490.81738, + 4799.14941, + 5119.89502, + 5452.99072, + 5798.34473, + 6156.07422, + 6526.94678, + 6911.87061, + 7311.86914, + 7727.41211, + 8159.35400, + 8608.52539, + 9076.40039, + 9562.68262, + 10065.9785, + 10584.6318, + 11116.6621, + 11660.0674, + 12211.5479, + 12766.8730, + 13324.6689, + 13881.3311, + 14432.1396, + 14975.6152, + 15508.2568, + 16026.1152, + 16527.3223, + 17008.7891, + 17467.6133, + 17901.6211, + 18308.4336, + 18685.7188, + 19031.2891, + 19343.5117, + 19620.0430, + 19859.3906, + 20059.9316, + 20219.6641, + 20337.8633, + 20412.3086, + 20442.0781, + 20425.7188, + 20361.8164, + 20249.5117, + 20087.0859, + 19874.0254, + 19608.5723, + 19290.2266, + 18917.4609, + 18489.7070, + 18006.9258, + 17471.8398, + 16888.6875, + 16262.0469, + 15596.6953, + 14898.4531, + 14173.3242, + 13427.7695, + 12668.2578, + 11901.3398, + 11133.3047, + 10370.1758, + 9617.51562, + 8880.45312, + 8163.37500, + 7470.34375, + 6804.42188, + 6168.53125, + 
5564.38281, + 4993.79688, + 4457.37500, + 3955.96094, + 3489.23438, + 3057.26562, + 2659.14062, + 2294.24219, + 1961.50000, + 1659.47656, + 1387.54688, + 1143.25000, + 926.507812, + 734.992188, + 568.062500, + 424.414062, + 302.476562, + 202.484375, + 122.101562, + 62.7812500, + 22.8359375, + 3.75781298, + 0.00000000, + 0.00000000, + ] + ) + + bk = np.array( + [ + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 7.00000010e-06, + 2.40000008e-05, + 5.90000018e-05, + 1.12000002e-04, + 1.99000002e-04, + 3.39999999e-04, + 5.61999972e-04, + 8.90000025e-04, + 1.35300006e-03, + 1.99200003e-03, + 2.85700010e-03, + 3.97100020e-03, + 5.37799997e-03, + 7.13300006e-03, + 9.26099997e-03, + 1.18060000e-02, + 1.48160001e-02, + 1.83179993e-02, + 2.23549996e-02, + 2.69639995e-02, + 3.21759991e-02, + 3.80260013e-02, + 4.45480011e-02, + 5.17730005e-02, + 5.97280003e-02, + 6.84479997e-02, + 7.79580027e-02, + 8.82859975e-02, + 9.94620025e-02, + 0.111505002, + 0.124448001, + 0.138312995, + 0.153125003, + 0.168909997, + 0.185689002, + 0.203491002, + 0.222332999, + 0.242244005, + 0.263242006, + 0.285353988, + 0.308598012, + 0.332938999, + 0.358253986, + 0.384362996, + 0.411125004, + 0.438391000, + 0.466003001, + 0.493800014, + 0.521619022, + 0.549301028, + 0.576691985, + 0.603648007, + 0.630035996, + 0.655736029, + 0.680643022, + 0.704668999, + 0.727738976, + 0.749796987, + 0.770798028, + 0.790717006, + 0.809535980, + 0.827256024, + 0.843881011, + 0.859431982, + 0.873929024, + 0.887408018, + 0.899900019, + 0.911448002, + 0.922096014, + 0.931881011, + 0.940859973, + 0.949064016, + 0.956550002, + 0.963352025, + 0.969512999, + 0.975077987, + 0.980072021, + 0.984542012, + 0.988499999, + 0.991984010, + 0.995002985, + 0.997630000, + 1.00000000, + ] + ) + else: raise NotImplementedError( - "Only grids with 72, 79, or 91 vertical levels have been implemented so far" + "Only grids with 72, 79, 91 or 137 vertical levels " + "have been implemented so far" ) + if 0.0 in bk: ks = 0 if km == 91 else np.where(bk == 0)[0][-1] ptop = ak[0] diff --git a/util/pace/util/grid/generation.py b/util/pace/util/grid/generation.py index b78a7059..679b9449 100644 --- a/util/pace/util/grid/generation.py +++ b/util/pace/util/grid/generation.py @@ -75,7 +75,7 @@ def quantity_cast_to_model_float( quantity_factory: util.QuantityFactory, qty_64: util.Quantity ) -> util.Quantity: """Copy & cast from 64-bit float to model precision if need be""" - qty = quantity_factory.empty(qty_64.dims, qty_64.units, dtype=Float) + qty = quantity_factory.zeros(qty_64.dims, qty_64.units, dtype=Float) qty.data[:] = qty_64.data[:] return qty @@ -1530,7 +1530,6 @@ def rdyc(self) -> util.Quantity: ) def _init_dgrid(self): - grid_mirror_ew = self.quantity_factory.zeros( self._grid_dims, "radians", @@ -1751,7 +1750,6 @@ def _compute_dxdy(self): return dx, dy def
_compute_dxdy_agrid(self): - dx_agrid_64 = self.quantity_factory.zeros( [util.X_DIM, util.Y_DIM], "m", @@ -2149,7 +2147,6 @@ def _calculate_more_trig_terms(self, cos_sg, sin_sg): ) def _init_cell_trigonometry(self): - cosa_u_64 = self.quantity_factory.zeros( [util.X_INTERFACE_DIM, util.Y_DIM], "", diff --git a/util/pace/util/grid/gnomonic.py b/util/pace/util/grid/gnomonic.py index 705014e4..f26af0f2 100644 --- a/util/pace/util/grid/gnomonic.py +++ b/util/pace/util/grid/gnomonic.py @@ -303,9 +303,9 @@ def _mirror_latlon(lon1, lat1, lon2, lat2, lon0, lat0, np): pdot = p0[0] * nb[0] + p0[1] * nb[1] + p0[2] * nb[2] pp = p0 - np.multiply(2.0, pdot) * nb - lon3 = np.empty((1, 1)) - lat3 = np.empty((1, 1)) - pp3 = np.empty((3, 1, 1)) + lon3 = np.zeros((1, 1)) + lat3 = np.zeros((1, 1)) + pp3 = np.zeros((3, 1, 1)) pp3[:, 0, 0] = pp _cart_to_latlon(1, pp3, lon3, lat3, np) diff --git a/util/pace/util/grid/helper.py b/util/pace/util/grid/helper.py index 673e484d..1b977ad8 100644 --- a/util/pace/util/grid/helper.py +++ b/util/pace/util/grid/helper.py @@ -166,8 +166,8 @@ def from_restart( but no fv_core.res.nc in restart data file.""" ) - ak = quantity_factory.empty([Z_INTERFACE_DIM], units="Pa") - bk = quantity_factory.empty([Z_INTERFACE_DIM], units="") + ak = quantity_factory.zeros([Z_INTERFACE_DIM], units="Pa") + bk = quantity_factory.zeros([Z_INTERFACE_DIM], units="") with fs.open(ak_bk_data_file, "rb") as f: ds = xr.open_dataset(f).isel(Time=0).drop_vars("Time") ak.view[:] = ds["ak"].values @@ -322,7 +322,6 @@ def __init__( @classmethod def new_from_metric_terms(cls, metric_terms: MetricTerms): - horizontal_data = HorizontalGridData.new_from_metric_terms(metric_terms) vertical_data = VerticalGridData.new_from_metric_terms(metric_terms) contravariant_data = ContravariantGridData.new_from_metric_terms(metric_terms) @@ -701,7 +700,6 @@ def new_from_grid_variables( es1: pace.util.Quantity, ew2: pace.util.Quantity, ) -> "DriverGridData": - try: vlon1, vlon2, vlon3 = split_quantity_along_last_dim(vlon) vlat1, vlat2, vlat3 = split_quantity_along_last_dim(vlat) diff --git a/util/pace/util/halo_data_transformer.py b/util/pace/util/halo_data_transformer.py index 00a547d6..e97bb97a 100644 --- a/util/pace/util/halo_data_transformer.py +++ b/util/pace/util/halo_data_transformer.py @@ -70,7 +70,7 @@ def _build_flatten_indices( """ # Have to go down to numpy to leverage indices calculation - arr_indices = np.empty(shape, dtype=np.int32, order="C")[slices] + arr_indices = np.zeros(shape, dtype=np.int32, order="C")[slices] # Get offset from first index offset_dims = [] @@ -875,7 +875,6 @@ def _opt_unpack_scalar(self, quantities: List[Quantity]): # Use private stream with self._get_stream(cu_kernel_args.stream): - # Launch kernel blocks = 128 grid_x = (info_x._unpack_buffer_size // blocks) + 1 @@ -942,7 +941,6 @@ def _opt_unpack_vector( # Use private stream with self._get_stream(cu_kernel_args.stream): - # Buffer sizes edge_size = info_x._unpack_buffer_size + info_y._unpack_buffer_size diff --git a/util/pace/util/initialization/allocator.py b/util/pace/util/initialization/allocator.py index c865cbbf..1a68495e 100644 --- a/util/pace/util/initialization/allocator.py +++ b/util/pace/util/initialization/allocator.py @@ -102,7 +102,7 @@ def from_array( That numpy array must correspond to the correct shape and extent for the given dims. """ - base = self.empty( + base = self.zeros( dims=dims, units=units, dtype=data.dtype,