diff --git a/.github/workflows/main_unit_tests.yml b/.github/workflows/main_unit_tests.yml index 5dbf4a1f..5800baa6 100644 --- a/.github/workflows/main_unit_tests.yml +++ b/.github/workflows/main_unit_tests.yml @@ -15,9 +15,9 @@ jobs: uses: actions/setup-python@v4.6.0 with: python-version: '3.8.12' - - name: Install OpenMPI for gt4py + - name: Install OpenMPI & Boost for gt4py run: | - sudo apt-get install libopenmpi-dev + sudo apt-get install libopenmpi-dev libboost1.74-dev - name: Install Python packages run: | python -m pip install --upgrade pip diff --git a/driver/pace/driver/driver.py b/driver/pace/driver/driver.py index 0ef3263b..284acaca 100644 --- a/driver/pace/driver/driver.py +++ b/driver/pace/driver/driver.py @@ -275,7 +275,7 @@ def from_dict(cls, kwargs: Dict[str, Any]) -> "DriverConfig": ) grid_type = kwargs["grid_config"].config.grid_type # Copy grid_type to the DycoreConfig if it's not the default value - if grid_type != 0: + if grid_type != 0: kwargs["dycore_config"].grid_type = grid_type if ( diff --git a/driver/pace/driver/grid.py b/driver/pace/driver/grid.py index c184d566..9fa97a06 100644 --- a/driver/pace/driver/grid.py +++ b/driver/pace/driver/grid.py @@ -105,7 +105,6 @@ def get_grid( quantity_factory: QuantityFactory, communicator: CubedSphereCommunicator, ) -> Tuple[DampingCoefficients, DriverGridData, GridData]: - metric_terms = MetricTerms( quantity_factory=quantity_factory, communicator=communicator, @@ -184,8 +183,7 @@ def get_grid( quantity_factory: QuantityFactory, communicator: CubedSphereCommunicator, ) -> Tuple[DampingCoefficients, DriverGridData, GridData]: - - backend = quantity_factory.empty( + backend = quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown" ).gt4py_backend diff --git a/driver/pace/driver/initialization.py b/driver/pace/driver/initialization.py index 2b6471a8..bd6d96ea 100644 --- a/driver/pace/driver/initialization.py +++ b/driver/pace/driver/initialization.py @@ -154,7 +154,6 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - dycore_state = tc_init.init_tc_state( grid_data=grid_data, quantity_factory=quantity_factory, @@ -323,7 +322,7 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - backend = quantity_factory.empty( + backend = quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown" ).gt4py_backend @@ -348,7 +347,6 @@ def _initialize_dycore_state( communicator: pace.util.CubedSphereCommunicator, backend: str, ) -> fv3core.DycoreState: - grid = self._get_serialized_grid(communicator=communicator, backend=backend) ser = self._serializer(communicator) @@ -401,7 +399,6 @@ def get_driver_state( driver_grid_data: pace.util.grid.DriverGridData, grid_data: pace.util.grid.GridData, ) -> DriverState: - return DriverState( dycore_state=self.dycore_state, physics_state=self.physics_state, diff --git a/driver/pace/driver/run.py b/driver/pace/driver/run.py index e6c39982..df70eb14 100644 --- a/driver/pace/driver/run.py +++ b/driver/pace/driver/run.py @@ -5,7 +5,7 @@ import click import yaml -from pace.util import pace_log, AVAILABLE_LOG_LEVELS +from pace.util import AVAILABLE_LOG_LEVELS, pace_log from .driver import Driver, DriverConfig diff --git a/dsl/pace/dsl/caches/cache_location.py b/dsl/pace/dsl/caches/cache_location.py new file mode 100644 index 00000000..ab57a60b --- /dev/null +++ b/dsl/pace/dsl/caches/cache_location.py @@ -0,0 +1,46 
@@ +from pace.dsl.caches.codepath import FV3CodePath +from pace.util import CubedSpherePartitioner + + +def identify_code_path( + rank: int, + partitioner: CubedSpherePartitioner, +) -> FV3CodePath: + if partitioner.layout == (1, 1) or partitioner.layout == [1, 1]: + return FV3CodePath.All + elif partitioner.layout[0] == 1 or partitioner.layout[1] == 1: + raise NotImplementedError( + f"Build for layout {partitioner.layout} is not handled" + ) + else: + if partitioner.tile.on_tile_bottom(rank): + if partitioner.tile.on_tile_left(rank): + return FV3CodePath.BottomLeft + if partitioner.tile.on_tile_right(rank): + return FV3CodePath.BottomRight + else: + return FV3CodePath.Bottom + if partitioner.tile.on_tile_top(rank): + if partitioner.tile.on_tile_left(rank): + return FV3CodePath.TopLeft + if partitioner.tile.on_tile_right(rank): + return FV3CodePath.TopRight + else: + return FV3CodePath.Top + else: + if partitioner.tile.on_tile_left(rank): + return FV3CodePath.Left + if partitioner.tile.on_tile_right(rank): + return FV3CodePath.Right + else: + return FV3CodePath.Center + + +def get_cache_fullpath(code_path: FV3CodePath) -> str: + from gt4py.cartesian import config as gt_config + + return f"{gt_config.cache_settings['root_path']}/.gt_cache_{code_path}" + + +def get_cache_directory(code_path: FV3CodePath) -> str: + return f".gt_cache_{code_path}" diff --git a/dsl/pace/dsl/caches/codepath.py b/dsl/pace/dsl/caches/codepath.py new file mode 100644 index 00000000..8ebf9492 --- /dev/null +++ b/dsl/pace/dsl/caches/codepath.py @@ -0,0 +1,33 @@ +import enum + + +class FV3CodePath(enum.Enum): + """Enum listing all possible code paths on a cube sphere. + For any layout the cube sphere has up to 9 different code paths, depending on + the position of the rank on the tile and which of the edge/corner cases + it has to handle, as well as the 1x1 layout case, where a single rank + handles all boundary computations. + Since the framework inlines code to optimize, we _cannot_ presuppose which code + will be kept and/or ejected. This enum serves as the ground truth to map a rank to + the proper generated code.
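+ For example, on a 3x3 layout, with ranks numbered row-major from the + bottom-left of the tile, the ranks map to: + 0 -> BottomLeft, 1 -> Bottom, 2 -> BottomRight, + 3 -> Left, 4 -> Center, 5 -> Right, + 6 -> TopLeft, 7 -> Top, 8 -> TopRight.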
+ """ + + All = "FV3_A" + BottomLeft = "FV3_BL" + Left = "FV3_L" + TopLeft = "FV3_TL" + Top = "FV3_T" + TopRight = "FV3_TR" + Right = "FV3_R" + BottomRight = "FV3_BR" + Bottom = "FV3_B" + Center = "FV3_C" + + def __str__(self): + return self.value + + def __repr__(self): + return self.value + + def __format__(self, format_spec: str) -> str: + return self.value diff --git a/dsl/pace/dsl/dace/build.py b/dsl/pace/dsl/dace/build.py index 7d8f3db2..b134f569 100644 --- a/dsl/pace/dsl/dace/build.py +++ b/dsl/pace/dsl/dace/build.py @@ -1,9 +1,9 @@ from typing import List, Optional, Tuple -from warnings import warn from dace.sdfg import SDFG import pace.util +from pace.dsl.caches.cache_location import get_cache_directory, get_cache_fullpath from pace.dsl.dace.dace_config import DaceConfig, DaCeOrchestration @@ -11,19 +11,6 @@ # Distributed compilation -def determine_compiling_ranks(config: DaceConfig) -> bool: - is_compiling = False - rank = config.my_rank - size = config.rank_size - - if int(size / 6) == 0: - is_compiling = True - elif rank % int(size / 6) == rank: - is_compiling = True - - return is_compiling - - def unblock_waiting_tiles(comm, sdfg_path: str) -> None: if comm and comm.Get_size() > 1: for tile in range(1, 6): @@ -31,48 +18,6 @@ def unblock_waiting_tiles(comm, sdfg_path: str) -> None: comm.send(sdfg_path, dest=tile * tilesize + comm.Get_rank()) -def get_target_rank(rank: int, partitioner: pace.util.CubedSpherePartitioner): - """From my rank & the current partitioner we determine which - rank we should read from. - For all layout >= 3,3 this presumes build has been done on a - 3,3 layout.""" - if partitioner.layout == (1, 1): - return 0 - if partitioner.layout == (2, 2): - if partitioner.tile.on_tile_bottom(rank): - if partitioner.tile.on_tile_left(rank): - return 0 # "00" - if partitioner.tile.on_tile_right(rank): - return 1 # "10" - if partitioner.tile.on_tile_top(rank): - if partitioner.tile.on_tile_left(rank): - return 2 # "01" - if partitioner.tile.on_tile_right(rank): - return 3 # "11" - else: - if partitioner.tile.on_tile_bottom(rank): - if partitioner.tile.on_tile_left(rank): - return 0 # "00" - if partitioner.tile.on_tile_right(rank): - return 2 # "20" - else: - return 1 # "10" - if partitioner.tile.on_tile_top(rank): - if partitioner.tile.on_tile_left(rank): - return 6 # "02" - if partitioner.tile.on_tile_right(rank): - return 8 # "22" - else: - return 7 # "12" - else: - if partitioner.tile.on_tile_left(rank): - return 3 # "01" - if partitioner.tile.on_tile_right(rank): - return 5 # "21" - else: - return 4 # "11" - - def build_info_filepath() -> str: return "build_info.txt" @@ -101,7 +46,10 @@ def write_build_info( def get_sdfg_path( - daceprog_name: str, config: DaceConfig, sdfg_file_path: Optional[str] = None + daceprog_name: str, + config: DaceConfig, + sdfg_file_path: Optional[str] = None, + override_run_only=False, ) -> Optional[str]: """Build an SDFG path from the qualified program name or it's direct path to .sdfg @@ -113,7 +61,7 @@ def get_sdfg_path( # TODO: check DaceConfig for cache.strategy == name # Guarding against bad usage of this function - if config.get_orchestrate() != DaCeOrchestration.Run: + if not override_run_only and config.get_orchestrate() != DaCeOrchestration.Run: return None # Case of a .sdfg file given by the user to be compiled @@ -125,19 +73,8 @@ def get_sdfg_path( return sdfg_file_path # Case of loading a precompiled .so - lookup using GT_CACHE - from gt4py.cartesian import config as gt_config - - if config.rank_size > 1: - rank = 
config.my_rank - rank_str = f"_{config.target_rank:06d}" - else: - rank = 0 - rank_str = f"_{rank:06d}" - - sdfg_dir_path = ( - f"{gt_config.cache_settings['root_path']}" - f"/.gt_cache{rank_str}/dacecache/{daceprog_name}" - ) + cache_fullpath = get_cache_fullpath(config.code_path) + sdfg_dir_path = f"{cache_fullpath}/dacecache/{daceprog_name}" if not os.path.isdir(sdfg_dir_path): raise RuntimeError(f"Precompiled SDFG is missing at {sdfg_dir_path}") @@ -153,23 +90,8 @@ raise RuntimeError( f"SDFG build for {build_backend}, {config._backend} has been asked" ) - # Check layout - build_layout = ast.literal_eval(build_info_file.readline()) - can_read = True - if config.layout == (1, 1) and config.layout != build_layout: - can_read = False - elif config.layout == (2, 2) and config.layout != build_layout: - can_read = False - elif ( - build_layout != (1, 1) and build_layout != (2, 2) and build_layout != (3, 3) - ): - can_read = False - if not can_read: - warn( - f"SDFG build for layout {build_layout}, " - f"cannot be run with current layout {config.layout}, bad layout?" - ) # Check resolution per tile + build_layout = ast.literal_eval(build_info_file.readline()) build_resolution = ast.literal_eval(build_info_file.readline()) if (config.tile_resolution[0] / config.layout[0]) != ( build_resolution[0] / build_layout[0] @@ -179,7 +101,7 @@ f"cannot be run with current resolution {config.tile_resolution}" ) - print(f"[DaCe Config] Rank {rank} loading SDFG {sdfg_dir_path}") + print(f"[DaCe Config] Rank {config.my_rank} loading SDFG {sdfg_dir_path}") return sdfg_dir_path @@ -189,33 +111,31 @@ def set_distributed_caches(config: "DaceConfig"): # Execute specific initialization per orchestration state orchestration_mode = config.get_orchestrate() + if orchestration_mode == DaCeOrchestration.Python: + return # Check that we have all the files we need to early out in case # of issues.
if orchestration_mode == DaCeOrchestration.Run: import os - from gt4py.cartesian import config as gt_config - - # Check our cache exist - if config.rank_size > 1: - rank = config.my_rank - target_rank_str = f"_{config.target_rank:06d}" - else: - rank = 0 - target_rank_str = f"_{rank:06d}" - cache_filepath = ( - f"{gt_config.cache_settings['root_path']}/.gt_cache{target_rank_str}" - ) - if not os.path.exists(cache_filepath): + cache_directory = get_cache_fullpath(config.code_path) + if not os.path.exists(cache_directory): raise RuntimeError( f"{orchestration_mode} error: Could not find caches for rank " - f"{rank} at {cache_filepath}" + f"{config.my_rank} at {cache_directory}" ) - # All, good set this rank cache to the source cache - gt_config.cache_settings["dir_name"] = f".gt_cache{target_rank_str}" - print( - f"[{orchestration_mode}] Rank {rank} " - f"reading cache {gt_config.cache_settings['dir_name']}" - ) + # Set read/write caches to the target code path + from gt4py.cartesian import config as gt_config + + if config.do_compile: + verb = "reading/writing" + else: + verb = "reading" + + gt_config.cache_settings["dir_name"] = get_cache_directory(config.code_path) + pace.util.pace_log.critical( + f"[{orchestration_mode}] Rank {config.my_rank} " + f"{verb} cache {gt_config.cache_settings['dir_name']}" + ) diff --git a/dsl/pace/dsl/dace/dace_config.py b/dsl/pace/dsl/dace/dace_config.py index e6f3e7df..1bb0939e 100644 --- a/dsl/pace/dsl/dace/dace_config.py +++ b/dsl/pace/dsl/dace/dace_config.py @@ -1,13 +1,16 @@ import enum -from typing import Any, Dict, Optional +import os +from typing import Any, Dict, Optional, Tuple import dace.config from dace.codegen.compiled_sdfg import CompiledSDFG from dace.frontend.python.parser import DaceProgram +from pace.dsl.caches.cache_location import identify_code_path +from pace.dsl.caches.codepath import FV3CodePath from pace.dsl.gt4py_utils import is_gpu_backend from pace.util._optional_imports import cupy as cp -from pace.util.communicator import CubedSphereCommunicator +from pace.util.communicator import CubedSphereCommunicator, CubedSpherePartitioner # This can be turned on to revert compilation for orchestration @@ -16,6 +19,95 @@ DEACTIVATE_DISTRIBUTED_DACE_COMPILE = False +def _is_corner(rank: int, partitioner: CubedSpherePartitioner) -> bool: + if partitioner.tile.on_tile_bottom(rank): + if partitioner.tile.on_tile_left(rank): + return True + if partitioner.tile.on_tile_right(rank): + return True + if partitioner.tile.on_tile_top(rank): + if partitioner.tile.on_tile_left(rank): + return True + if partitioner.tile.on_tile_right(rank): + return True + return False + + +def _smallest_rank_bottom(x: int, y: int, layout: Tuple[int, int]): + return y == 0 and x == 1 + + +def _smallest_rank_top(x: int, y: int, layout: Tuple[int, int]): + return y == layout[1] - 1 and x == 1 + + +def _smallest_rank_left(x: int, y: int, layout: Tuple[int, int]): + return x == 0 and y == 1 + + +def _smallest_rank_right(x: int, y: int, layout: Tuple[int, int]): + return x == layout[0] - 1 and y == 1 + + +def _smallest_rank_middle(x: int, y: int, layout: Tuple[int, int]): + return layout[0] > 1 and layout[1] > 1 and x == 1 and y == 1 + + +def _determine_compiling_ranks( + config: "DaceConfig", + partitioner: CubedSpherePartitioner, +) -> bool: + """ + We try to map every layout to a 3x3 layout, whose MPI ranks + look like + 6 7 8 + 3 4 5 + 0 1 2 + Using the partitioner we find the mapping of the given layout + to all of those.
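+ Concretely, on tile 0 the four corner ranks always compile; for each edge + and for the interior, only the smallest such rank compiles, i.e. subtile + positions (1, 0), (0, 1), (1, 1), (layout[0] - 1, 1) and (1, layout[1] - 1).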
For example, on a 4x4 layout + 12 13 14 15 + 8 9 10 11 + 4 5 6 7 + 0 1 2 3 + therefore we map + 0 -> 0 + 1 -> 1 + 2 -> NOT COMPILING + 3 -> 2 + 4 -> 3 + 5 -> 4 + 6 -> NOT COMPILING + 7 -> 5 + 8 -> NOT COMPILING + 9 -> NOT COMPILING + 10 -> NOT COMPILING + 11 -> NOT COMPILING + 12 -> 6 + 13 -> 7 + 14 -> NOT COMPILING + 15 -> 8 + """ + + # Tile 0 compiles + if partitioner.tile_index(config.my_rank) != 0: + return False + + # Corners compile + if _is_corner(config.my_rank, partitioner): + return True + + y, x = partitioner.tile.subtile_index(config.my_rank) + + # If on an edge or in the interior, we give way to the smallest rank + return ( + _smallest_rank_left(x, y, config.layout) + or _smallest_rank_bottom(x, y, config.layout) + or _smallest_rank_middle(x, y, config.layout) + or _smallest_rank_right(x, y, config.layout) + or _smallest_rank_top(x, y, config.layout) + ) + + class DaCeOrchestration(enum.Enum): """ Orchestration mode for DaCe @@ -71,8 +163,6 @@ def __init__( # Temporary. This is a bit too out of the ordinary for the common user. # We should refactor the architecture to allow for a `gtc:orchestrated:dace:X` # backend that would signify both the `CPU|GPU` split and the orchestration mode - import os - if orchestration is None: fv3_dacemode_env_var = os.getenv("FV3_DACEMODE", "Python") # The below condition guards against defining empty FV3_DACEMODE and @@ -175,32 +265,30 @@ def __init__( # attempt to kill the dace.conf to avoid confusion if dace.config.Config._cfg_filename: try: - import os - os.remove(dace.config.Config._cfg_filename) except OSError: pass self._backend = backend self.tile_resolution = [tile_nx, tile_nx, tile_nz] - from pace.dsl.dace.build import get_target_rank, set_distributed_caches + from pace.dsl.dace.build import set_distributed_caches # Distributed build required info if communicator: self.my_rank = communicator.rank self.rank_size = communicator.comm.Get_size() - if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - self.target_rank = communicator.rank - else: - self.target_rank = get_target_rank( - self.my_rank, communicator.partitioner - ) + self.code_path = identify_code_path(self.my_rank, communicator.partitioner) self.layout = communicator.partitioner.layout + self.do_compile = ( + DEACTIVATE_DISTRIBUTED_DACE_COMPILE + or _determine_compiling_ranks(self, communicator.partitioner) + ) else: self.my_rank = 0 self.rank_size = 1 - self.target_rank = 0 + self.code_path = FV3CodePath.All self.layout = (1, 1) + self.do_compile = True set_distributed_caches(self) @@ -226,7 +314,7 @@ def get_orchestrate(self) -> DaCeOrchestration: return self._orchestrate def get_sync_debug(self) -> bool: - return dace.config.Config.get("compiler", "cuda", "syncdebug") + return dace.config.Config.get_bool("compiler", "cuda", "syncdebug") def as_dict(self) -> Dict[str, Any]: return { diff --git a/dsl/pace/dsl/dace/orchestration.py b/dsl/pace/dsl/dace/orchestration.py index 2bd9df5b..7858381a 100644 --- a/dsl/pace/dsl/dace/orchestration.py +++ b/dsl/pace/dsl/dace/orchestration.py @@ -11,12 +11,7 @@ from dace.transformation.auto.auto_optimize import make_transients_persistent from dace.transformation.helpers import get_parent_map -from pace.dsl.dace.build import ( - determine_compiling_ranks, - get_sdfg_path, - unblock_waiting_tiles, - write_build_info, -) +from pace.dsl.dace.build import get_sdfg_path, write_build_info from pace.dsl.dace.dace_config import ( DEACTIVATE_DISTRIBUTED_DACE_COMPILE, DaceConfig, @@ -34,6 +29,7 @@ memory_static_analysis, report_memory_static_analysis, ) +from pace.util import
pace_log from pace.util.mpi import MPI @@ -122,7 +118,7 @@ def _build_sdfg( if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: is_compiling = True else: - is_compiling = determine_compiling_ranks(config) + is_compiling = config.do_compile if is_compiling: # Make the transient arrays persistent if config.is_gpu_backend(): @@ -198,12 +194,13 @@ def _build_sdfg( ), ) - # Compilation done, either exit or scatter/gather and run + # Compilation done. + # On Build: all ranks sync, then exit. + # On BuildAndRun: all ranks sync, then load the SDFG from + # the expected path (made available by build). + # We use a "FrozenCompiledSDFG" to minimize re-entry cost at call time # DEV NOTE: we explicitly use MPI.COMM_WORLD here because it is # a true multi-machine sync, outside of our own communicator class. - # Also this code is protected in the case of running on one machine by the fact - # that 0 is _always_ a compiling rank & unblock_waiting_tiles is protected - # against scattering when no other ranks are present. if config.get_orchestrate() == DaCeOrchestration.Build: MPI.COMM_WORLD.Barrier() # Protect against early exit, which kills SLURM jobs DaCeProgress.log( DaCeProgress.default_prefix(config), ) exit(0) elif config.get_orchestrate() == DaCeOrchestration.BuildAndRun: - MPI.COMM_WORLD.Barrier() - if is_compiling: - if not DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - unblock_waiting_tiles(MPI.COMM_WORLD, sdfg.build_folder) - DaCeProgress.log( - DaCeProgress.default_prefix(config), "Build folder exchanged." - ) - csdfg, _ = daceprog.load_precompiled_sdfg( - sdfg.build_folder, *args, **kwargs - ) - config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG( - daceprog, csdfg, args, kwargs - ) - - else: - source_rank = config.target_rank - # wait for compilation to be done + if not is_compiling: DaCeProgress.log( DaCeProgress.default_prefix(config), - "Rank is not compiling. Waiting for build dir...", - ) - sdfg_path = MPI.COMM_WORLD.recv(source=source_rank) - DaCeProgress.log( - DaCeProgress.default_prefix(config), "Build dir received, loading .so." + "Rank is not compiling. " + "Waiting for compilation to end on all other ranks...", ) + MPI.COMM_WORLD.Barrier() + + with DaCeProgress(config, "Loading"): + sdfg_path = get_sdfg_path(daceprog.name, config, override_run_only=True) + csdfg, _ = daceprog.load_precompiled_sdfg(sdfg_path, *args, **kwargs) + config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG( + daceprog, csdfg, args, kwargs ) @@ -267,6 +249,7 @@ def _call_sdfg( config.get_orchestrate() == DaCeOrchestration.Build or config.get_orchestrate() == DaCeOrchestration.BuildAndRun ): + pace_log.info("Building DaCe orchestration") res = _build_sdfg(daceprog, sdfg, config, args, kwargs) elif config.get_orchestrate() == DaCeOrchestration.Run: # We should never hit this, it should be caught by the @@ -302,7 +285,7 @@ def _parse_sdfg( if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: is_compiling = True else: - is_compiling = determine_compiling_ranks(config) + is_compiling = config.do_compile if not is_compiling: # We cannot parse the SDFG since we will load the proper # compiled SDFG from the compiling rank @@ -448,7 +431,6 @@ def __get__(self, obj, objtype=None) -> SDFGEnabledCallable: """Return SDFGEnabledCallable wrapping original obj.method from cache.
Update cache first if need be""" if (id(obj), id(self.func)) not in _LazyComputepathMethod.bound_callables: - _LazyComputepathMethod.bound_callables[ (id(obj), id(self.func)) ] = _LazyComputepathMethod.SDFGEnabledCallable(self, obj) diff --git a/dsl/pace/dsl/typing.py b/dsl/pace/dsl/typing.py index 05b255ce..d67dd7b6 100644 --- a/dsl/pace/dsl/typing.py +++ b/dsl/pace/dsl/typing.py @@ -22,11 +22,15 @@ DTypes = Union[bool, np.bool_, int, np.int32, np.int64, float, np.float32, np.float64] +def floating_point_precision() -> int: + return int(os.getenv("PACE_FLOAT_PRECISION", "64")) + + def global_set_floating_point_precision(): """Set the global floating point precision for all references to Float in the codebase. Defaults to 64 bit.""" global Float - precision_in_bit = int(os.getenv("PACE_FLOAT_PRECISION", "64")) + precision_in_bit = floating_point_precision() if precision_in_bit == 64: return np.float64 elif precision_in_bit == 32: diff --git a/fv3core/pace/fv3core/initialization/geos_wrapper.py b/fv3core/pace/fv3core/initialization/geos_wrapper.py index 2835e77e..a7a526ee 100644 --- a/fv3core/pace/fv3core/initialization/geos_wrapper.py +++ b/fv3core/pace/fv3core/initialization/geos_wrapper.py @@ -11,6 +11,8 @@ from pace.driver.performance.collector import PerformanceCollector from pace.dsl.dace import DaceConfig, orchestrate from pace.dsl.gt4py_utils import is_gpu_backend +from pace.dsl.typing import floating_point_precision +from pace.util._optional_imports import cupy as cp from pace.util.logging import pace_log @@ -131,13 +133,29 @@ def __init__( self.output_dict: Dict[str, np.ndarray] = {} self._allocate_output_dir() + # Feedback information + device_ordinal_info = ( + f" Device PCI bus id: {cp.cuda.Device(0).pci_bus_id}\n" + if is_gpu_backend(backend) + else "N/A" + ) + MPS_pipe_directory = os.getenv("CUDA_MPS_PIPE_DIRECTORY", None) + MPS_is_on = ( + MPS_pipe_directory + and is_gpu_backend(backend) + and os.path.exists(f"{MPS_pipe_directory}/log") + ) pace_log.info( "Pace GEOS wrapper initialized: \n" - f" dt : {self.dycore_state.bdt}\n" - f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" - f" backend: {backend}\n" - f" orchestration: {self._is_orchestrated}\n" - f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz} (halo: {sizer.n_halo})" + f" dt : {self.dycore_state.bdt}\n" + f" bridge : {self._fortran_mem_space} > {self._pace_mem_space}\n" + f" backend : {backend}\n" + f" float : {floating_point_precision()}bit\n" + f" orchestration : {self._is_orchestrated}\n" + f" sizer : {sizer.nx}x{sizer.ny}x{sizer.nz}" + f" (halo: {sizer.n_halo})\n" + f" Device ord: {device_ordinal_info}\n" + f" Nvidia MPS : {MPS_is_on}" ) def _critical_path(self): diff --git a/fv3core/pace/fv3core/testing/translate_dyncore.py b/fv3core/pace/fv3core/testing/translate_dyncore.py index 510e299f..6c7da4b7 100644 --- a/fv3core/pace/fv3core/testing/translate_dyncore.py +++ b/fv3core/pace/fv3core/testing/translate_dyncore.py @@ -140,7 +140,7 @@ def compute_parallel(self, inputs, communicator): grid_data.ptop = inputs["ptop"] self._base.make_storage_data_input_vars(inputs) state = DycoreState.init_zeros(quantity_factory=self.grid.quantity_factory) - wsd: pace.util.Quantity = self.grid.quantity_factory.empty( + wsd: pace.util.Quantity = self.grid.quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="unknown", ) @@ -152,7 +152,7 @@ state[name].data[selection] = value else: setattr(state, name, value) - phis: pace.util.Quantity =
self.grid.quantity_factory.empty( + phis: pace.util.Quantity = self.grid.quantity_factory.zeros( dims=[pace.util.X_DIM, pace.util.Y_DIM], units="m", ) diff --git a/fv3core/tests/savepoint/translate/translate_remapping.py b/fv3core/tests/savepoint/translate/translate_remapping.py index fc9196b0..9a2e1f84 100644 --- a/fv3core/tests/savepoint/translate/translate_remapping.py +++ b/fv3core/tests/savepoint/translate/translate_remapping.py @@ -107,7 +107,7 @@ def compute_from_storage(self, inputs): inputs["wsd"] = wsd_2d inputs["q_cld"] = inputs["tracers"]["qcld"] inputs["last_step"] = bool(inputs["last_step"]) - pfull = self.grid.quantity_factory.empty([Z_DIM], units="Pa") + pfull = self.grid.quantity_factory.zeros([Z_DIM], units="Pa") pfull.data[:] = pfull.np.asarray(inputs.pop("pfull")) l_to_e_obj = LagrangianToEulerian( self.stencil_factory, diff --git a/requirements_dev.txt b/requirements_dev.txt index 052bf5c3..484c4948 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -10,8 +10,8 @@ zarr dask>=2021.10.0 netCDF4 cftime -fv3config>=0.9.0 dace==0.14.0 +fv3config>=0.9.0 f90nml>=1.1.0 numpy>=1.15 -e external/gt4py diff --git a/stencils/pace/stencils/testing/grid.py b/stencils/pace/stencils/testing/grid.py index 65ad8870..4cf623b1 100644 --- a/stencils/pace/stencils/testing/grid.py +++ b/stencils/pace/stencils/testing/grid.py @@ -504,7 +504,7 @@ def grid_data(self) -> "GridData": data = getattr(self, name) assert data is not None - quantity = self.quantity_factory.empty(dims=dims, units=units) + quantity = self.quantity_factory.zeros(dims=dims, units=units) if len(quantity.shape) == 3: quantity.data[:] = data[:, :, : quantity.shape[2]] elif len(quantity.shape) == 2: diff --git a/stencils/pace/stencils/testing/parallel_translate.py b/stencils/pace/stencils/testing/parallel_translate.py index dcc1e64d..f10b9b27 100644 --- a/stencils/pace/stencils/testing/parallel_translate.py +++ b/stencils/pace/stencils/testing/parallel_translate.py @@ -12,7 +12,6 @@ class ParallelTranslate: - max_error = TranslateFortranData2Py.max_error near_zero = TranslateFortranData2Py.near_zero compute_grid_option = False @@ -192,7 +191,7 @@ def state_from_inputs(self, inputs: dict, grid=None) -> dict: for name, properties in self.inputs.items(): standard_name = properties.get("name", name) if len(properties["dims"]) > 0: - state[standard_name] = grid.quantity_factory.empty( + state[standard_name] = grid.quantity_factory.zeros( properties["dims"], properties["units"], dtype=inputs[name].dtype ) input_slice = _serialize_slice( diff --git a/stencils/pace/stencils/testing/temporaries.py b/stencils/pace/stencils/testing/temporaries.py index 581387f6..2dd46663 100644 --- a/stencils/pace/stencils/testing/temporaries.py +++ b/stencils/pace/stencils/testing/temporaries.py @@ -40,10 +40,9 @@ def _assert_same_temporaries(dict1: dict, dict2: dict) -> List[str]: attr2 = dict2[attr] if isinstance(attr1, np.ndarray): try: - np.testing.assert_almost_equal( - attr1, attr2, err_msg=f"{attr} not equal" - ) - except AssertionError: + assert np.allclose(attr1, attr2, equal_nan=True) + except AssertionError as e: + print(e) differences.append(attr) else: sub_differences = _assert_same_temporaries(attr1, attr2) diff --git a/tests/main/dsl/test_caches.py b/tests/main/dsl/test_caches.py new file mode 100644 index 00000000..c1f01303 --- /dev/null +++ b/tests/main/dsl/test_caches.py @@ -0,0 +1,176 @@ +import pytest +from gt4py.cartesian.gtscript import PARALLEL, Field, computation, interval +from gt4py.storage import empty, ones + 
+import pace.dsl +from pace.dsl.dace import orchestrate +from pace.dsl.dace.dace_config import DaceConfig, DaCeOrchestration +from pace.dsl.stencil import CompilationConfig, GridIndexing + + +def _make_storage( + func, + grid_indexing, + stencil_config: pace.dsl.StencilConfig, + *, + dtype=float, + aligned_index=(0, 0, 0), +): + return func( + backend=stencil_config.compilation_config.backend, + shape=grid_indexing.domain, + dtype=dtype, + aligned_index=aligned_index, + ) + + +def _stencil(inp: Field[float], out: Field[float], scalar: float): + with computation(PARALLEL), interval(...): + out = inp + + +def _build_stencil(backend, orchestrated: DaCeOrchestration): + # Build the stencil and its configuration + grid_indexing = GridIndexing( + domain=(5, 5, 5), + n_halo=2, + south_edge=True, + north_edge=True, + west_edge=True, + east_edge=True, + ) + + stencil_config = pace.dsl.StencilConfig( + compilation_config=CompilationConfig(backend=backend, rebuild=True), + dace_config=DaceConfig(None, backend, 5, 5, orchestrated), + ) + + stencil_factory = pace.dsl.StencilFactory(stencil_config, grid_indexing) + + built_stencil = stencil_factory.from_origin_domain( + _stencil, (0, 0, 0), domain=grid_indexing.domain + ) + + return built_stencil, grid_indexing, stencil_config + + +class OrchestratedProgam: + def __init__(self, backend, orchestration): + self.stencil, grid_indexing, stencil_config = _build_stencil( + backend, orchestration + ) + orchestrate(obj=self, config=stencil_config.dace_config) + self.inp = _make_storage(ones, grid_indexing, stencil_config, dtype=float) + self.out = _make_storage(empty, grid_indexing, stencil_config, dtype=float) + + def __call__(self): + self.stencil(self.inp, self.out, self.inp[0, 0, 0]) + + +@pytest.mark.parametrize( + "backend", + [ + pytest.param("dace:cpu"), + ], +) +def test_relocatability_orchestration(backend): + import os + import shutil + + from gt4py.cartesian import config as gt_config + + original_root_directory = gt_config.cache_settings["root_path"] + working_dir = str(os.getcwd()) + + # Compile on default + p0 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun) + p0() + assert os.path.exists( + f"{working_dir}/.gt_cache_FV3_A/dacecache/" + "test_caches_OrchestratedProgam___call__", + ) or os.path.exists( + f"{working_dir}/.gt_cache_FV3_A/dacecache/OrchestratedProgam___call__", + ) + + # Compile in another directory + + custom_path = f"{working_dir}/.my_cache_path" + gt_config.cache_settings["root_path"] = custom_path + p1 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun) + p1() + assert os.path.exists( + f"{custom_path}/.gt_cache_FV3_A/dacecache/" + "test_caches_OrchestratedProgam___call__", + ) or os.path.exists( + f"{custom_path}/.gt_cache_FV3_A/dacecache/OrchestratedProgam___call__", + ) + + # Check relocatability by copying the second cache directory, + # changing the path of gt_config.cache_settings and trying to Run on it + relocated_path = f"{working_dir}/.my_relocated_cache_path" + shutil.copytree(custom_path, relocated_path, dirs_exist_ok=True) + gt_config.cache_settings["root_path"] = relocated_path + p2 = OrchestratedProgam(backend, DaCeOrchestration.Run) + p2() + + # Use a bogus path to check that a missing cache raises an error + bogus_path = "./nope/notatall/nothappening" + gt_config.cache_settings["root_path"] = bogus_path + with pytest.raises(RuntimeError): + OrchestratedProgam(backend, DaCeOrchestration.Run) + + # Restore cache settings + gt_config.cache_settings["root_path"] = original_root_directory + + +@pytest.mark.parametrize(
+ "backend", + [ + pytest.param("dace:cpu"), + ], +) +def test_relocatability(backend: str): + import os + import shutil + + import gt4py + from gt4py.cartesian import config as gt_config + + from pace.util.mpi import MPI + + # Restore original dir name + gt4py.cartesian.config.cache_settings["dir_name"] = os.environ.get( + "GT_CACHE_DIR_NAME", f".gt_cache_{MPI.COMM_WORLD.Get_rank():06}" + ) + + backend_sanitized = backend.replace(":", "") + + # Compile on default + p0 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p0() + assert os.path.exists( + f"./.gt_cache_000000/py38_1013/{backend_sanitized}/test_caches/_stencil/" + ) + + # Compile in another directory + + custom_path = "./.my_cache_path" + gt_config.cache_settings["root_path"] = custom_path + p1 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p1() + assert os.path.exists( + f"{custom_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" + "/test_caches/_stencil/" + ) + + # Check relocability by copying the second cache directory, + # changing the path of gt_config.cache_settings and trying to Run on it + relocated_path = "./.my_relocated_cache_path" + shutil.copytree("./.gt_cache_000000", relocated_path, dirs_exist_ok=True) + gt_config.cache_settings["root_path"] = relocated_path + p2 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p2() + assert os.path.exists( + f"{relocated_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" + "/test_caches/_stencil/" + ) diff --git a/tests/main/dsl/test_dace_config.py b/tests/main/dsl/test_dace_config.py index 78553278..cb3566dd 100644 --- a/tests/main/dsl/test_dace_config.py +++ b/tests/main/dsl/test_dace_config.py @@ -1,11 +1,12 @@ import unittest.mock -from pace.dsl.dace.dace_config import DaceConfig +from pace.dsl.dace.dace_config import DaceConfig, _determine_compiling_ranks from pace.dsl.dace.orchestration import ( DaCeOrchestration, orchestrate, orchestrate_function, ) +from pace.util.communicator import CubedSpherePartitioner, TilePartitioner """ @@ -91,3 +92,68 @@ def foo(self): a = A() a.foo() assert not mock_call_sdfg.called + + +def test_orchestrate_distributed_build(): + dummy_dace_config = DaceConfig( + communicator=None, + backend="gtc:dace", + orchestration=DaCeOrchestration.BuildAndRun, + ) + + def _does_compile(rank, partitioner) -> bool: + dummy_dace_config.layout = partitioner.layout + dummy_dace_config.rank_size = partitioner.layout[0] * partitioner.layout[1] * 6 + dummy_dace_config.my_rank = rank + return _determine_compiling_ranks(dummy_dace_config, partitioner) + + # (1, 1) layout, one rank which compiles + cube_partitioner_11 = CubedSpherePartitioner(TilePartitioner((1, 1))) + assert _does_compile(0, cube_partitioner_11) + assert not _does_compile(1, cube_partitioner_11) # not compiling face + + # (2, 2) layout, 4 ranks, all compiling + cube_partitioner_22 = CubedSpherePartitioner(TilePartitioner((2, 2))) + assert _does_compile(0, cube_partitioner_22) + assert _does_compile(1, cube_partitioner_22) + assert _does_compile(2, cube_partitioner_22) + assert _does_compile(3, cube_partitioner_22) + assert not _does_compile(4, cube_partitioner_22) # not compiling face + + # (3, 3) layout, 9 ranks, all compiling + cube_partitioner_33 = CubedSpherePartitioner(TilePartitioner((3, 3))) + assert _does_compile(0, cube_partitioner_33) + assert _does_compile(1, cube_partitioner_33) + assert _does_compile(2, cube_partitioner_33) + assert _does_compile(3, cube_partitioner_33) + assert _does_compile(4, cube_partitioner_33) + assert _does_compile(5, 
cube_partitioner_33) + assert _does_compile(6, cube_partitioner_33) + assert _does_compile(7, cube_partitioner_33) + assert _does_compile(8, cube_partitioner_33) + assert not _does_compile(9, cube_partitioner_33) # not compiling face + + # (4, 4) layout, 16 ranks, + # expecting compiling ranks: 0, 1, 3, 4, 5, 7, 12, 13, 15 + cube_partitioner_44 = CubedSpherePartitioner(TilePartitioner((4, 4))) + assert _does_compile(0, cube_partitioner_44) + assert _does_compile(1, cube_partitioner_44) + assert _does_compile(3, cube_partitioner_44) + assert _does_compile(4, cube_partitioner_44) + assert _does_compile(5, cube_partitioner_44) + assert _does_compile(7, cube_partitioner_44) + assert _does_compile(12, cube_partitioner_44) + assert _does_compile(13, cube_partitioner_44) + assert _does_compile(15, cube_partitioner_44) + assert not _does_compile(2, cube_partitioner_44) # same code path as 3 + assert not _does_compile(6, cube_partitioner_44) # same code path as 5 + assert not _does_compile(8, cube_partitioner_44) # same code path as 4 + assert not _does_compile(11, cube_partitioner_44) # same code path as 7 + assert not _does_compile(16, cube_partitioner_44) # not compiling face + + # For a few other layouts, we check that we always have 9 compiling ranks + for layout in [(5, 5), (10, 10), (20, 20)]: + partition = CubedSpherePartitioner(TilePartitioner(layout)) + compiling = 0 + for i in range(layout[0] * layout[1] * 6): + compiling += 1 if _does_compile(i, partition) else 0 + assert compiling == 9 diff --git a/tests/main/test_grid_init.py b/tests/main/test_grid_init.py index 942dcfd3..4c0e5e2b 100644 --- a/tests/main/test_grid_init.py +++ b/tests/main/test_grid_init.py @@ -51,8 +51,6 @@ def test_grid_init_not_decomposition_dependent(rank: int): assert allclose(metric_terms_1by1.area, metric_terms_3by3.area, partitioner, rank) assert allclose(metric_terms_1by1.dx, metric_terms_3by3.dx, partitioner, rank) assert allclose(metric_terms_1by1.dy, metric_terms_3by3.dy, partitioner, rank) - assert allclose(metric_terms_1by1.dxa, metric_terms_3by3.dxa, partitioner, rank) - assert allclose(metric_terms_1by1.dya, metric_terms_3by3.dya, partitioner, rank) assert allclose( metric_terms_1by1.cos_sg1, metric_terms_3by3.cos_sg1, partitioner, rank ) diff --git a/util/pace/util/__init__.py b/util/pace/util/__init__.py index 54f684d6..58a7c2a5 100644 --- a/util/pace/util/__init__.py +++ b/util/pace/util/__init__.py @@ -54,7 +54,7 @@ from .initialization import GridSizer, QuantityFactory, SubtileGridSizer from .io import read_state, write_state from .local_comm import LocalComm -from .logging import pace_log, AVAILABLE_LOG_LEVELS +from .logging import AVAILABLE_LOG_LEVELS, pace_log from .monitor import Monitor, NetCDFMonitor, ZarrMonitor from .mpi import MPIComm from .namelist import Namelist, NamelistDefaults diff --git a/util/pace/util/communicator.py b/util/pace/util/communicator.py index 938469bd..d2577d8c 100644 --- a/util/pace/util/communicator.py +++ b/util/pace/util/communicator.py @@ -167,7 +167,7 @@ def _get_gather_recv_quantity( ) -> Quantity: """Initialize a Quantity for use when receiving global data during gather""" recv_quantity = Quantity( - send_metadata.np.empty(global_extent, dtype=send_metadata.dtype), + send_metadata.np.zeros(global_extent, dtype=send_metadata.dtype), dims=send_metadata.dims, units=send_metadata.units, origin=tuple([0 for dim in send_metadata.dims]), @@ -182,7 +182,7 @@ def _get_scatter_recv_quantity( ) -> Quantity: """Initialize a Quantity for use when receiving subtile data during scatter""" recv_quantity = Quantity( -
send_metadata.np.empty(shape, dtype=send_metadata.dtype), + send_metadata.np.zeros(shape, dtype=send_metadata.dtype), dims=send_metadata.dims, units=send_metadata.units, gt4py_backend=send_metadata.gt4py_backend, @@ -206,7 +206,7 @@ def gather( result: Optional[Quantity] if self.rank == constants.ROOT_RANK: with array_buffer( - send_quantity.np.empty, + send_quantity.np.zeros, (self.partitioner.total_ranks,) + tuple(send_quantity.extent), dtype=send_quantity.data.dtype, ) as recvbuf: @@ -745,7 +745,7 @@ def _get_gather_recv_quantity( # needs to change the quantity dimensions since we add a "tile" dimension, # unlike for tile scatter/gather which retains the same dimensions recv_quantity = Quantity( - metadata.np.empty(global_extent, dtype=metadata.dtype), + metadata.np.zeros(global_extent, dtype=metadata.dtype), dims=(constants.TILE_DIM,) + metadata.dims, units=metadata.units, origin=(0,) + tuple([0 for dim in metadata.dims]), @@ -767,7 +767,7 @@ def _get_scatter_recv_quantity( # needs to change the quantity dimensions since we remove a "tile" dimension, # unlike for tile scatter/gather which retains the same dimensions recv_quantity = Quantity( - metadata.np.empty(shape, dtype=metadata.dtype), + metadata.np.zeros(shape, dtype=metadata.dtype), dims=metadata.dims[1:], units=metadata.units, gt4py_backend=metadata.gt4py_backend, diff --git a/util/pace/util/grid/eta.py b/util/pace/util/grid/eta.py index 50afaadb..dc37aaa2 100644 --- a/util/pace/util/grid/eta.py +++ b/util/pace/util/grid/eta.py @@ -206,7 +206,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 91: - ak = np.array( [ 1.00000000, @@ -402,7 +401,6 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ) elif km == 72: - ak = np.array( [ 1.00000000, @@ -559,10 +557,299 @@ def set_hybrid_pressure_coefficients(km: int) -> HybridPressureCoefficients: ] ) + elif km == 137: + ak = np.array( + [ + 1.00000000, + 1.82500005, + 3.00000000, + 4.63000011, + 6.82797718, + 9.74696636, + 13.6054239, + 18.6089306, + 24.9857178, + 32.9857101, + 42.8792419, + 54.9554634, + 69.5205765, + 86.8958817, + 107.415741, + 131.425507, + 159.279404, + 191.338562, + 227.968948, + 269.539581, + 316.420746, + 368.982361, + 427.592499, + 492.616028, + 564.413452, + 643.339905, + 729.744141, + 823.967834, + 926.344910, + 1037.20117, + 1156.85364, + 1285.61035, + 1423.77014, + 1571.62292, + 1729.44897, + 1897.51929, + 2076.09595, + 2265.43164, + 2465.77051, + 2677.34814, + 2900.39136, + 3135.11938, + 3381.74365, + 3640.46826, + 3911.49048, + 4194.93066, + 4490.81738, + 4799.14941, + 5119.89502, + 5452.99072, + 5798.34473, + 6156.07422, + 6526.94678, + 6911.87061, + 7311.86914, + 7727.41211, + 8159.35400, + 8608.52539, + 9076.40039, + 9562.68262, + 10065.9785, + 10584.6318, + 11116.6621, + 11660.0674, + 12211.5479, + 12766.8730, + 13324.6689, + 13881.3311, + 14432.1396, + 14975.6152, + 15508.2568, + 16026.1152, + 16527.3223, + 17008.7891, + 17467.6133, + 17901.6211, + 18308.4336, + 18685.7188, + 19031.2891, + 19343.5117, + 19620.0430, + 19859.3906, + 20059.9316, + 20219.6641, + 20337.8633, + 20412.3086, + 20442.0781, + 20425.7188, + 20361.8164, + 20249.5117, + 20087.0859, + 19874.0254, + 19608.5723, + 19290.2266, + 18917.4609, + 18489.7070, + 18006.9258, + 17471.8398, + 16888.6875, + 16262.0469, + 15596.6953, + 14898.4531, + 14173.3242, + 13427.7695, + 12668.2578, + 11901.3398, + 11133.3047, + 10370.1758, + 9617.51562, + 8880.45312, + 8163.37500, + 7470.34375, + 6804.42188, + 6168.53125, + 
5564.38281, + 4993.79688, + 4457.37500, + 3955.96094, + 3489.23438, + 3057.26562, + 2659.14062, + 2294.24219, + 1961.50000, + 1659.47656, + 1387.54688, + 1143.25000, + 926.507812, + 734.992188, + 568.062500, + 424.414062, + 302.476562, + 202.484375, + 122.101562, + 62.7812500, + 22.8359375, + 3.75781298, + 0.00000000, + 0.00000000, + ] + ) + + bk = np.array( + [ + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 0.00000000, + 7.00000010e-06, + 2.40000008e-05, + 5.90000018e-05, + 1.12000002e-04, + 1.99000002e-04, + 3.39999999e-04, + 5.61999972e-04, + 8.90000025e-04, + 1.35300006e-03, + 1.99200003e-03, + 2.85700010e-03, + 3.97100020e-03, + 5.37799997e-03, + 7.13300006e-03, + 9.26099997e-03, + 1.18060000e-02, + 1.48160001e-02, + 1.83179993e-02, + 2.23549996e-02, + 2.69639995e-02, + 3.21759991e-02, + 3.80260013e-02, + 4.45480011e-02, + 5.17730005e-02, + 5.97280003e-02, + 6.84479997e-02, + 7.79580027e-02, + 8.82859975e-02, + 9.94620025e-02, + 0.111505002, + 0.124448001, + 0.138312995, + 0.153125003, + 0.168909997, + 0.185689002, + 0.203491002, + 0.222332999, + 0.242244005, + 0.263242006, + 0.285353988, + 0.308598012, + 0.332938999, + 0.358253986, + 0.384362996, + 0.411125004, + 0.438391000, + 0.466003001, + 0.493800014, + 0.521619022, + 0.549301028, + 0.576691985, + 0.603648007, + 0.630035996, + 0.655736029, + 0.680643022, + 0.704668999, + 0.727738976, + 0.749796987, + 0.770798028, + 0.790717006, + 0.809535980, + 0.827256024, + 0.843881011, + 0.859431982, + 0.873929024, + 0.887408018, + 0.899900019, + 0.911448002, + 0.922096014, + 0.931881011, + 0.940859973, + 0.949064016, + 0.956550002, + 0.963352025, + 0.969512999, + 0.975077987, + 0.980072021, + 0.984542012, + 0.988499999, + 0.991984010, + 0.995002985, + 0.997630000, + 1.00000000, + ] + ) + else: raise NotImplementedError( - "Only grids with 72, 79, or 91 vertical levels have been implemented so far" + "Only grids with 72, 79, 91 or 137 vertical levels " + "have been implemented so far" ) + if 0.0 in bk: ks = 0 if km == 91 else np.where(bk == 0)[0][-1] ptop = ak[0] diff --git a/util/pace/util/grid/generation.py b/util/pace/util/grid/generation.py index b78a7059..679b9449 100644 --- a/util/pace/util/grid/generation.py +++ b/util/pace/util/grid/generation.py @@ -75,7 +75,7 @@ def quantity_cast_to_model_float( quantity_factory: util.QuantityFactory, qty_64: util.Quantity ) -> util.Quantity: """Copy & cast from 64-bit float to model precision if need be""" - qty = quantity_factory.empty(qty_64.dims, qty_64.units, dtype=Float) + qty = quantity_factory.zeros(qty_64.dims, qty_64.units, dtype=Float) qty.data[:] = qty_64.data[:] return qty @@ -1530,7 +1530,6 @@ def rdyc(self) -> util.Quantity: ) def _init_dgrid(self): - grid_mirror_ew = self.quantity_factory.zeros( self._grid_dims, "radians", @@ -1751,7 +1750,6 @@ def _compute_dxdy(self): return dx, dy def
_compute_dxdy_agrid(self): - dx_agrid_64 = self.quantity_factory.zeros( [util.X_DIM, util.Y_DIM], "m", @@ -2149,7 +2147,6 @@ def _calculate_more_trig_terms(self, cos_sg, sin_sg): ) def _init_cell_trigonometry(self): - cosa_u_64 = self.quantity_factory.zeros( [util.X_INTERFACE_DIM, util.Y_DIM], "", diff --git a/util/pace/util/grid/gnomonic.py b/util/pace/util/grid/gnomonic.py index 705014e4..f26af0f2 100644 --- a/util/pace/util/grid/gnomonic.py +++ b/util/pace/util/grid/gnomonic.py @@ -303,9 +303,9 @@ def _mirror_latlon(lon1, lat1, lon2, lat2, lon0, lat0, np): pdot = p0[0] * nb[0] + p0[1] * nb[1] + p0[2] * nb[2] pp = p0 - np.multiply(2.0, pdot) * nb - lon3 = np.empty((1, 1)) - lat3 = np.empty((1, 1)) - pp3 = np.empty((3, 1, 1)) + lon3 = np.zeros((1, 1)) + lat3 = np.zeros((1, 1)) + pp3 = np.zeros((3, 1, 1)) pp3[:, 0, 0] = pp _cart_to_latlon(1, pp3, lon3, lat3, np) diff --git a/util/pace/util/grid/helper.py b/util/pace/util/grid/helper.py index 673e484d..1b977ad8 100644 --- a/util/pace/util/grid/helper.py +++ b/util/pace/util/grid/helper.py @@ -166,8 +166,8 @@ def from_restart( but no fv_core.res.nc in restart data file.""" ) - ak = quantity_factory.empty([Z_INTERFACE_DIM], units="Pa") - bk = quantity_factory.empty([Z_INTERFACE_DIM], units="") + ak = quantity_factory.zeros([Z_INTERFACE_DIM], units="Pa") + bk = quantity_factory.zeros([Z_INTERFACE_DIM], units="") with fs.open(ak_bk_data_file, "rb") as f: ds = xr.open_dataset(f).isel(Time=0).drop_vars("Time") ak.view[:] = ds["ak"].values @@ -322,7 +322,6 @@ def __init__( @classmethod def new_from_metric_terms(cls, metric_terms: MetricTerms): - horizontal_data = HorizontalGridData.new_from_metric_terms(metric_terms) vertical_data = VerticalGridData.new_from_metric_terms(metric_terms) contravariant_data = ContravariantGridData.new_from_metric_terms(metric_terms) @@ -701,7 +700,6 @@ def new_from_grid_variables( es1: pace.util.Quantity, ew2: pace.util.Quantity, ) -> "DriverGridData": - try: vlon1, vlon2, vlon3 = split_quantity_along_last_dim(vlon) vlat1, vlat2, vlat3 = split_quantity_along_last_dim(vlat) diff --git a/util/pace/util/halo_data_transformer.py b/util/pace/util/halo_data_transformer.py index 00a547d6..e97bb97a 100644 --- a/util/pace/util/halo_data_transformer.py +++ b/util/pace/util/halo_data_transformer.py @@ -70,7 +70,7 @@ def _build_flatten_indices( """ # Have to go down to numpy to leverage indices calculation - arr_indices = np.empty(shape, dtype=np.int32, order="C")[slices] + arr_indices = np.zeros(shape, dtype=np.int32, order="C")[slices] # Get offset from first index offset_dims = [] @@ -875,7 +875,6 @@ def _opt_unpack_scalar(self, quantities: List[Quantity]): # Use private stream with self._get_stream(cu_kernel_args.stream): - # Launch kernel blocks = 128 grid_x = (info_x._unpack_buffer_size // blocks) + 1 @@ -942,7 +941,6 @@ def _opt_unpack_vector( # Use private stream with self._get_stream(cu_kernel_args.stream): - # Buffer sizes edge_size = info_x._unpack_buffer_size + info_y._unpack_buffer_size diff --git a/util/pace/util/initialization/allocator.py b/util/pace/util/initialization/allocator.py index c865cbbf..1a68495e 100644 --- a/util/pace/util/initialization/allocator.py +++ b/util/pace/util/initialization/allocator.py @@ -102,7 +102,7 @@ def from_array( That numpy array must correspond to the correct shape and extent for the given dims. """ - base = self.empty( + base = self.zeros( dims=dims, units=units, dtype=data.dtype,