From 96c5ecacb66e6290485c46f93058a1c4f234f631 Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Fri, 3 Mar 2023 15:51:42 -0800 Subject: [PATCH] Replace `drop_incomplete_djf` with `drop_incomplete_seasons` --- tests/test_temporal.py | 57 +++++------ xcdat/temporal.py | 221 +++++++++++++++++++++++++---------------- 2 files changed, 162 insertions(+), 116 deletions(-) diff --git a/tests/test_temporal.py b/tests/test_temporal.py index acc1ec1f..b3e08b02 100644 --- a/tests/test_temporal.py +++ b/tests/test_temporal.py @@ -581,7 +581,7 @@ def test_weighted_seasonal_averages_with_DJF_and_drop_incomplete_seasons(self): result = ds.temporal.group_average( "ts", "season", - season_config={"dec_mode": "DJF", "drop_incomplete_djf": True}, + season_config={"dec_mode": "DJF", "drop_incomplete_seasons": True}, ) expected = ds.copy() # Drop the incomplete DJF seasons @@ -619,7 +619,7 @@ def test_weighted_seasonal_averages_with_DJF_and_drop_incomplete_seasons(self): "freq": "season", "weighted": "True", "dec_mode": "DJF", - "drop_incomplete_djf": "True", + "drop_incomplete_seasons": "True", }, ) @@ -633,7 +633,7 @@ def test_weighted_seasonal_averages_with_DJF_without_dropping_incomplete_seasons result = ds.temporal.group_average( "ts", "season", - season_config={"dec_mode": "DJF", "drop_incomplete_djf": False}, + season_config={"dec_mode": "DJF", "drop_incomplete_seasons": False}, ) expected = ds.copy() expected = expected.drop_dims("time") @@ -670,7 +670,7 @@ def test_weighted_seasonal_averages_with_DJF_without_dropping_incomplete_seasons "freq": "season", "weighted": "True", "dec_mode": "DJF", - "drop_incomplete_djf": "False", + "drop_incomplete_seasons": "False", }, ) @@ -822,7 +822,7 @@ def test_weighted_custom_seasonal_averages(self): assert result.identical(expected) - def test_weighted_seasonal_averages_drops_incomplete_seasons(self): + def test_weighted_custom_seasonal_averages_drops_incomplete_seasons(self): ds = self.ds.copy() ds["time"].values[:] = np.array( [ @@ -835,28 
+835,26 @@ def test_weighted_seasonal_averages_drops_incomplete_seasons(self): dtype="datetime64[ns]", ) - custom_seasons = [ - ["Nov", "Dec", "Jan", "Feb", "Mar"], - ] + custom_seasons = [["Nov", "Dec"], ["Feb", "Mar", "Apr"]] result = ds.temporal.group_average( "ts", "season", season_config={ + "drop_incomplete_seasons": True, "custom_seasons": custom_seasons, - # "drop_incomplete_seasons": True, }, ) expected = ds.copy() expected = expected.drop_dims("time") expected["ts"] = xr.DataArray( name="ts", - data=np.array([[[1.3933333]]]), + data=np.array([[[1.5]]]), coords={ "lat": expected.lat, "lon": expected.lon, "time": xr.DataArray( - data=np.array([cftime.datetime(2001, 1, 1)], dtype=object), + data=np.array([cftime.datetime(2000, 12, 1)], dtype=object), dims=["time"], attrs={ "axis": "T", @@ -872,13 +870,12 @@ def test_weighted_seasonal_averages_drops_incomplete_seasons(self): "operation": "temporal_avg", "mode": "group_average", "freq": "season", - "custom_seasons": ["NovDecJanFebMar"], + "custom_seasons": ["NovDec", "FebMarApr"], "weighted": "True", }, ) - xr.testing.assert_allclose(result, expected) - assert result.ts.attrs == expected.ts.attrs + assert result.identical(expected) def test_weighted_custom_seasonal_averages_with_seasons_spanning_calendar_years( self, @@ -1160,7 +1157,7 @@ def test_weighted_seasonal_climatology_with_DJF(self): result = ds.temporal.climatology( "ts", "season", - season_config={"dec_mode": "DJF", "drop_incomplete_djf": True}, + season_config={"dec_mode": "DJF", "drop_incomplete_seasons": True}, ) expected = ds.copy() @@ -1202,7 +1199,7 @@ def test_weighted_seasonal_climatology_with_DJF(self): "freq": "season", "weighted": "True", "dec_mode": "DJF", - "drop_incomplete_djf": "True", + "drop_incomplete_seasons": "True", }, ) @@ -1215,7 +1212,7 @@ def test_chunked_weighted_seasonal_climatology_with_DJF(self): result = ds.temporal.climatology( "ts", "season", - season_config={"dec_mode": "DJF", "drop_incomplete_djf": True}, + 
season_config={"dec_mode": "DJF", "drop_incomplete_seasons": True}, ) expected = ds.copy() @@ -1257,7 +1254,7 @@ def test_chunked_weighted_seasonal_climatology_with_DJF(self): "freq": "season", "weighted": "True", "dec_mode": "DJF", - "drop_incomplete_djf": "True", + "drop_incomplete_seasons": "True", }, ) @@ -1378,7 +1375,7 @@ def test_weighted_custom_seasonal_climatology(self): assert result.identical(expected) @pytest.mark.xfail - def test_weighted_custom_seasonal_climatology_with_seasons_spanning_calendar_years_and_drop_incomplete_seasons( + def test_weighted_custom_seasonal_climatology_with_seasons_spanning_calendar_years( self, ): ds = self.ds.copy() @@ -1836,7 +1833,7 @@ def test_weighted_seasonal_departures_with_DJF(self): result = ds.temporal.departures( "ts", "season", - season_config={"dec_mode": "DJF", "drop_incomplete_djf": True}, + season_config={"dec_mode": "DJF", "drop_incomplete_seasons": True}, ) expected = ds.copy() expected["ts"] = xr.DataArray( @@ -1853,7 +1850,7 @@ def test_weighted_seasonal_departures_with_DJF(self): "freq": "season", "weighted": "True", "dec_mode": "DJF", - "drop_incomplete_djf": "True", + "drop_incomplete_seasons": "True", }, ) @@ -1871,7 +1868,7 @@ def test_weighted_seasonal_departures_with_DJF_and_keep_weights(self): "season", weighted=True, keep_weights=True, - season_config={"dec_mode": "DJF", "drop_incomplete_djf": True}, + season_config={"dec_mode": "DJF", "drop_incomplete_seasons": True}, ) expected = ds.copy() expected["ts"] = xr.DataArray( @@ -1888,7 +1885,7 @@ def test_weighted_seasonal_departures_with_DJF_and_keep_weights(self): "freq": "season", "weighted": "True", "dec_mode": "DJF", - "drop_incomplete_djf": "True", + "drop_incomplete_seasons": "True", }, ) expected["time_wts"] = xr.DataArray( @@ -1925,7 +1922,7 @@ def test_unweighted_seasonal_departures_with_DJF(self): "ts", "season", weighted=False, - season_config={"dec_mode": "DJF", "drop_incomplete_djf": True}, + season_config={"dec_mode": "DJF", 
"drop_incomplete_seasons": True}, ) expected = ds.copy() expected["ts"] = xr.DataArray( @@ -1942,7 +1939,7 @@ def test_unweighted_seasonal_departures_with_DJF(self): "freq": "season", "weighted": "False", "dec_mode": "DJF", - "drop_incomplete_djf": "True", + "drop_incomplete_seasons": "True", }, ) @@ -3128,7 +3125,7 @@ def test_raises_error_with_incorrect_mode_arg(self): weighted=True, season_config={ "dec_mode": "DJF", - "drop_incomplete_djf": False, + "drop_incomplete_seasons": False, "custom_seasons": None, }, ) @@ -3144,7 +3141,7 @@ def test_raises_error_if_freq_arg_is_not_supported_by_operation(self): weighted=True, season_config={ "dec_mode": "DJF", - "drop_incomplete_djf": False, + "drop_incomplete_seasons": False, "custom_seasons": None, }, ) @@ -3156,7 +3153,7 @@ def test_raises_error_if_freq_arg_is_not_supported_by_operation(self): weighted=True, season_config={ "dec_mode": "DJF", - "drop_incomplete_djf": False, + "drop_incomplete_seasons": False, "custom_seasons": None, }, ) @@ -3168,7 +3165,7 @@ def test_raises_error_if_freq_arg_is_not_supported_by_operation(self): weighted=True, season_config={ "dec_mode": "DJF", - "drop_incomplete_djf": False, + "drop_incomplete_seasons": False, "custom_seasons": None, }, ) @@ -3194,7 +3191,7 @@ def test_raises_error_if_december_mode_is_not_supported(self): weighted=True, season_config={ "dec_mode": "unsupported", - "drop_incomplete_djf": False, + "drop_incomplete_seasons": False, "custom_seasons": None, }, ) diff --git a/xcdat/temporal.py b/xcdat/temporal.py index 8fabf150..172ed354 100644 --- a/xcdat/temporal.py +++ b/xcdat/temporal.py @@ -64,7 +64,7 @@ "SeasonConfigInput", { "dec_mode": Literal["DJF", "JFD"], - "drop_incomplete_djf": bool, + "drop_incomplete_seasons": bool, "custom_seasons": Optional[List[List[str]]], }, total=False, @@ -74,7 +74,7 @@ "SeasonConfigAttr", { "dec_mode": Literal["DJF", "JFD"], - "drop_incomplete_djf": bool, + "drop_incomplete_seasons": bool, "custom_seasons": Optional[Dict[str, 
List[str]]], }, total=False, @@ -82,7 +82,7 @@ DEFAULT_SEASON_CONFIG: SeasonConfigInput = { "dec_mode": "DJF", - "drop_incomplete_djf": False, + "drop_incomplete_seasons": False, "custom_seasons": None, } @@ -251,11 +251,18 @@ def group_average( Xarray labels the season with December as "DJF", but it is actually "JFD". - * "drop_incomplete_djf" (bool, by default False) - If the "dec_mode" is "DJF", this flag drops (True) or keeps - (False) time coordinates that fall under incomplete DJF seasons - Incomplete DJF seasons include the start year Jan/Feb and the - end year Dec. + * "drop_incomplete_seasons" (bool, by default False) + Seasons are considered incomplete if they do not have all of + the required months to form the season. For example, if we have + the time coordinates ["2000-11-16", "2000-12-16", "2001-01-16", + "2001-02-16"] and we want to group seasons by "ND" ("Nov", + "Dec") and "JFM" ("Jan", "Feb", "Mar"). + + * ["2000-11-16", "2000-12-16"] is considered a complete "ND" + season since both "Nov" and "Dec" are present. + * ["2001-01-16", "2001-02-16"] is considered an incomplete "JFM" + season because it only has "Jan" and "Feb". Therefore, these + time coordinates are dropped. Configs for custom seasons: @@ -291,7 +298,7 @@ def group_average( >>> "season", >>> season_config={ >>> "dec_mode": "DJF", - >>> "drop_incomplete_season": True + >>> "drop_incomplete_seasons": True >>> } >>> ) >>> ds_season.ts @@ -327,7 +334,7 @@ def group_average( 'freq': 'season', 'weighted': 'True', 'dec_mode': 'DJF', - 'drop_incomplete_djf': 'False' + 'drop_incomplete_seasons': 'False' } """ self._set_data_var_attrs(data_var) @@ -381,6 +388,21 @@ def climatology( predefined seasons are passed, configs for custom seasons are ignored and vice versa. + General configs: + + * "drop_incomplete_seasons" (bool, by default False) + Seasons are considered incomplete if they do not have all of + the required months to form the season. 
For example, if we have + the time coordinates ["2000-11-16", "2000-12-16", "2001-01-16", + "2001-02-16"] and we want to group seasons by "ND" ("Nov", + "Dec") and "JFM" ("Jan", "Feb", "Mar"). + + * ["2000-11-16", "2000-12-16"] is considered a complete "ND" + season since both "Nov" and "Dec" are present. + * ["2001-01-16", "2001-02-16"] is considered an incomplete "JFM" + season because it only has "Jan" and "Feb". Therefore, these + time coordinates are dropped. + Configs for predefined seasons: * "dec_mode" (Literal["DJF", "JFD"], by default "DJF") @@ -391,12 +413,6 @@ def climatology( Xarray labels the season with December as "DJF", but it is actually "JFD". - * "drop_incomplete_djf" (bool, by default False) - If the "dec_mode" is "DJF", this flag drops (True) or keeps - (False) time coordinates that fall under incomplete DJF seasons - Incomplete DJF seasons include the start year Jan/Feb and the - end year Dec. - Configs for custom seasons: * "custom_seasons" ([List[List[str]]], by default None) @@ -435,7 +451,7 @@ def climatology( >>> "season", >>> season_config={ >>> "dec_mode": "DJF", - >>> "drop_incomplete_season": True + >>> "drop_incomplete_seasons": True >>> } >>> ) >>> ds_season.ts @@ -471,7 +487,7 @@ def climatology( 'freq': 'season', 'weighted': 'True', 'dec_mode': 'DJF', - 'drop_incomplete_djf': 'False' + 'drop_incomplete_seasons': 'False' } """ self._set_data_var_attrs(data_var) @@ -545,6 +561,21 @@ def departures( predefined seasons are passed, configs for custom seasons are ignored and vice versa. + General configs: + + * "drop_incomplete_seasons" (bool, by default False) + Seasons are considered incomplete if they do not have all of + the required months to form the season. For example, if we have + the time coordinates ["2000-11-16", "2000-12-16", "2001-01-16", + "2001-02-16"] and we want to group seasons by "ND" ("Nov", + "Dec") and "JFM" ("Jan", "Feb", "Mar"). 
+ + * ["2000-11-16", "2000-12-16"] is considered a complete "ND" + season since both "Nov" and "Dec" are present. + * ["2001-01-16", "2001-02-16"] is considered an incomplete "JFM" + season because it only has "Jan" and "Feb". Therefore, these + time coordinates are dropped. + Configs for predefined seasons: * "dec_mode" (Literal["DJF", "JFD"], by default "DJF") @@ -555,12 +586,6 @@ def departures( Xarray labels the season with December as "DJF", but it is actually "JFD". - * "drop_incomplete_djf" (bool, by default False) - If the "dec_mode" is "DJF", this flag drops (True) or keeps - (False) time coordinates that fall under incomplete DJF seasons - Incomplete DJF seasons include the start year Jan/Feb and the - end year Dec. - Configs for custom seasons: * "custom_seasons" ([List[List[str]]], by default None) @@ -609,7 +634,7 @@ def departures( 'frequency': 'season', 'weighted': 'True', 'dec_mode': 'DJF', - 'drop_incomplete_djf': 'False' + 'drop_incomplete_seasons': 'False' } """ ds = self._dataset.copy() @@ -829,9 +854,12 @@ def _set_arg_attrs( ) custom_seasons = season_config.get("custom_seasons", None) dec_mode = season_config.get("dec_mode", "DJF") - drop_incomplete_djf = season_config.get("drop_incomplete_djf", False) self._season_config: SeasonConfigAttr = {} + self._season_config["drop_incomplete_seasons"] = season_config.get( + "drop_incomplete_seasons", False + ) + if custom_seasons is None: if dec_mode not in ("DJF", "JFD"): raise ValueError( @@ -840,8 +868,6 @@ def _set_arg_attrs( ) self._season_config["dec_mode"] = dec_mode - if dec_mode == "DJF": - self._season_config["drop_incomplete_djf"] = drop_incomplete_djf else: self._season_config["custom_seasons"] = self._form_seasons(custom_seasons) @@ -909,10 +935,9 @@ def _preprocess_dataset(self, ds: xr.Dataset) -> xr.Dataset: """ if ( self._freq == "season" - and self._season_config.get("dec_mode") == "DJF" - and self._season_config.get("drop_incomplete_djf") is True + and 
self._season_config.get("drop_incomplete_seasons") is True ): - ds = self._drop_incomplete_djf(ds) + ds = self._drop_incomplete_seasons(ds) if ( self._freq == "day" @@ -923,49 +948,63 @@ def _preprocess_dataset(self, ds: xr.Dataset) -> xr.Dataset: return ds - def _drop_incomplete_djf(self, dataset: xr.Dataset) -> xr.Dataset: - """Drops incomplete DJF seasons within a continuous time series. + def _drop_incomplete_seasons(self, ds: xr.Dataset) -> xr.Dataset: + """Drops incomplete seasons within a continuous time series. - This method assumes that the time series is continuous and removes the - leading and trailing incomplete seasons (e.g., the first January and - February of a time series that are not complete, because the December of - the previous year is missing). This method does not account for or - remove missing time steps anywhere else. + Seasons are considered incomplete if they do not have all of the + required months to form the season. For example, if we have the time + coordinates ["2000-11-16", "2000-12-16", "2001-01-16", "2001-02-16"] + and we want to group seasons by "ND" ("Nov", "Dec") and "JFM" ("Jan", + "Feb", "Mar"). + - ["2000-11-16", "2000-12-16"] is considered a complete "ND" season + since both "Nov" and "Dec" are present. + - ["2001-01-16", "2001-02-16"] is considered an incomplete "JFM" + season because it only has "Jan" and "Feb". Therefore, these + time coordinates are dropped. Parameters ---------- - dataset : xr.Dataset - The dataset with some possibly incomplete DJF seasons. + ds : xr.Dataset + The dataset whose time coordinates may belong to incomplete + seasons. Returns ------- - xr.Dataset - The dataset with only complete DJF seasons. + xr.Dataset + The dataset with the time coordinates of incomplete seasons + dropped. """ - # Separate the dataset into two datasets, one with and one without - # the time dimension.
This is necessary because the xarray .where() - # method concatenates the time dimension to non-time dimension data - # vars, which is not a desired behavior. - ds = dataset.copy() - ds_time = ds.get([v for v in ds.data_vars if self.dim in ds[v].dims]) # type: ignore - ds_no_time = ds.get([v for v in ds.data_vars if self.dim not in ds[v].dims]) # type: ignore - - start_year, end_year = ( - ds[self.dim].dt.year.values[0], - ds[self.dim].dt.year.values[-1], - ) - incomplete_seasons = (f"{start_year}-01", f"{start_year}-02", f"{end_year}-12") - - for year_month in incomplete_seasons: - try: - coord_pt = ds.loc[dict(time=year_month)][self.dim][0] - ds_time = ds_time.where(ds_time[self.dim] != coord_pt, drop=True) - except (KeyError, IndexError): - continue - - ds_final = xr.merge((ds_time, ds_no_time)) - - return ds_final + # Algorithm + # Prereq - This needs to be done AFTER time coordinates are labeled + # and BEFORE obsolete columns are dropped because custom seasons can be + # assigned to the time coordinates first. + # 1. Get the count of months per season (pre-defined seasons by xarray + # all have 3), otherwise use custom seasons count + # 2. Label all time coordinates by groups + # 3. Group the time coordinates by group and then get the count + # 4. Drop time coordinates where count != expected count for season + ds_new = ds.copy() + time_coords = ds[self.dim].copy() + + # Transform the time coords into a DataFrame of seasonal datetime + # components based on the grouping mode. + df = self._get_df_dt_components(time_coords, drop_obsolete_cols=False) + + # Add a column for the expected count of months for that season + # For example, "NovDec" is split into ["Nov", "Dec"] which equals an + # expected count of 2 months. + df["expected_months"] = df["season"].str.split(r"(?<=.)(?=[A-Z])").str.len() + # Add a column for the actual count of months for that season.
+ df["actual_months"] = df.groupby(["season"])["year"].transform("count") + + # Get the incomplete seasons and drop the time coordinates that are in + # those incomplete seasons. + indexes_to_drop = df[df["expected_months"] != df["actual_months"]].index + if len(indexes_to_drop) > 0: + coords_to_drop = time_coords.values[indexes_to_drop] + ds_new = ds_new.where(~time_coords.isin(coords_to_drop), drop=True) + + return ds_new def _drop_leap_days(self, ds: xr.Dataset): """Drop leap days from time coordinates. @@ -1175,9 +1214,9 @@ def _label_time_coords(self, time_coords: xr.DataArray) -> xr.DataArray: This methods labels time coordinates for grouping by first extracting specific xarray datetime components from time coordinates and storing them in a pandas DataFrame. After processing (if necessary) is performed - on the DataFrame, it is converted to a numpy array of datetime - objects. This numpy serves as the data source for the final - DataArray of labeled time coordinates. + on the DataFrame, it is converted to a numpy array of datetime objects. + This numpy array serves as the data source for the final DataArray of + labeled time coordinates. Parameters ---------- @@ -1213,7 +1252,9 @@ def _label_time_coords(self, time_coords: xr.DataArray) -> xr.DataArray: >>> Coordinates: >>> * time (time) datetime64[ns] 2000-01-01T00:00:00 ... 2000-04-01T00:00:00 """ - df_dt_components: pd.DataFrame = self._get_df_dt_components(time_coords) + df_dt_components: pd.DataFrame = self._get_df_dt_components( + time_coords, drop_obsolete_cols=True + ) dt_objects = self._convert_df_to_dt(df_dt_components) time_grouped = xr.DataArray( @@ -1227,7 +1268,9 @@ def _label_time_coords(self, time_coords: xr.DataArray) -> xr.DataArray: return time_grouped - def _get_df_dt_components(self, time_coords: xr.DataArray) -> pd.DataFrame: + def _get_df_dt_components( + self, time_coords: xr.DataArray, drop_obsolete_cols: bool + ) -> pd.DataFrame: """Returns a DataFrame of xarray datetime components. 
This method extracts the applicable xarray datetime components from each @@ -1248,6 +1291,12 @@ def _get_df_dt_components(self, time_coords: xr.DataArray) -> pd.DataFrame: ---------- time_coords : xr.DataArray The time coordinates. + drop_obsolete_cols : bool + Drop obsolete columns after processing seasonal DataFrame when + ``self._freq="season"``. Set to False to keep datetime columns + needed for preprocessing the dataset (e.g., removing incomplete + seasons), and set to True to remove obsolete columns when needing + to group time coordinates. Returns ------- @@ -1278,12 +1327,15 @@ def _get_df_dt_components(self, time_coords: xr.DataArray) -> pd.DataFrame: if self._mode in ["climatology", "departures"]: df["year"] = time_coords[f"{self.dim}.year"].values df["month"] = time_coords[f"{self.dim}.month"].values - - if self._mode == "group_average": + elif self._mode == "group_average": df["month"] = time_coords[f"{self.dim}.month"].values df = self._process_season_df(df) + if drop_obsolete_cols: + df = self._drop_obsolete_columns(df) + df = self._map_seasons_to_mid_months(df) + return df def _process_season_df(self, df: pd.DataFrame) -> pd.DataFrame: @@ -1292,13 +1344,13 @@ def _process_season_df(self, df: pd.DataFrame) -> pd.DataFrame: Parameters ---------- - df : pd.DataFrame - A DataFrame of xarray datetime components. + df : pd.DataFrame + A DataFrame of seasonal datetime components. Returns ------- pd.DataFrame - A DataFrame of processed xarray datetime components. + A DataFrame of seasonal datetime components.
""" df_new = df.copy() custom_seasons = self._season_config.get("custom_seasons") @@ -1311,8 +1363,6 @@ def _process_season_df(self, df: pd.DataFrame) -> pd.DataFrame: if dec_mode == "DJF": df_new = self._shift_decembers(df_new) - df_new = self._drop_obsolete_columns(df_new) - df_new = self._map_seasons_to_mid_months(df_new) return df_new def _map_months_to_custom_seasons(self, df: pd.DataFrame) -> pd.DataFrame: @@ -1631,16 +1681,15 @@ def _add_operation_attrs(self, data_var: xr.DataArray) -> xr.DataArray: ) if self._freq == "season": - custom_seasons = self._season_config.get("custom_seasons") + data_var.attrs["drop_incomplete_seasons"] = self._season_config.get( + "drop_incomplete_seasons" + ) - if custom_seasons is None: + custom_seasons = self._season_config.get("custom_seasons") + if custom_seasons is not None: + data_var.attrs["custom_seasons"] = list(custom_seasons.keys()) + else: dec_mode = self._season_config.get("dec_mode") - drop_incomplete_djf = self._season_config.get("drop_incomplete_djf") - data_var.attrs["dec_mode"] = dec_mode - if dec_mode == "DJF": - data_var.attrs["drop_incomplete_djf"] = str(drop_incomplete_djf) - else: - data_var.attrs["custom_seasons"] = list(custom_seasons.keys()) return data_var