Skip to content

Commit

Permalink
breaking: drop rows with null targets when dropna=False (#447)
Browse files (browse the repository at this point in the history)
  • Loading branch information
jmoralez authored Nov 12, 2024
1 parent e9b8c64 commit ca67b98
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 134 deletions.
23 changes: 14 additions & 9 deletions mlforecast/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,16 +402,23 @@ def _transform(
target = target[self._restore_idxs]

# determine rows to keep
target_nulls = np.isnan(target)
if target_nulls.ndim == 2:
# target nulls for each horizon are dropped in MLForecast.fit_models
# we just drop rows here for which all the target values are null
target_nulls = target_nulls.all(axis=1)
if dropna:
feature_nulls = np.full(df.shape[0], False)
for feature_vals in features.values():
feature_nulls |= np.isnan(feature_vals)
target_nulls = np.isnan(target)
if target_nulls.ndim == 2:
# target nulls for each horizon are dropped in MLForecast.fit_models
# we just drop rows here for which all the target values are null
target_nulls = target_nulls.all(axis=1)
keep_rows = ~(feature_nulls | target_nulls)
else:
# we always want to drop rows with nulls in the target
keep_rows = ~target_nulls

self._dropped_series: Optional[np.ndarray] = None
if not keep_rows.all():
# remove rows with nulls
for k, v in features.items():
features[k] = v[keep_rows]
target = target[keep_rows]
Expand All @@ -422,19 +429,17 @@ def _transform(
last_idxs = self._sort_idxs[last_idxs]
last_vals_nan = ~keep_rows[last_idxs]
if last_vals_nan.any():
self._dropped_series: Optional[np.ndarray] = np.where(last_vals_nan)[0]
self._dropped_series = np.where(last_vals_nan)[0]
dropped_ids = reprlib.repr(list(self.uids[self._dropped_series]))
warnings.warn(
"The following series were dropped completely "
f"due to the transformations and features: {dropped_ids}.\n"
"These series won't show up if you use `MLForecast.forecast_fitted_values()`.\n"
"You can set `dropna=False` or use transformations that require less samples to mitigate this"
)
else:
self._dropped_series = None
elif isinstance(df, pd.DataFrame):
# we'll be assigning columns below, so we need to copy
df = df.copy(deep=False)
self._dropped_series = None

# once we've computed the features and target we can slice the series
update_samples = [
Expand Down
63 changes: 38 additions & 25 deletions nbs/core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"outputs": [],
"source": [
"#|hide\n",
"%load_ext autoreload\n",
Expand Down Expand Up @@ -895,16 +886,23 @@
" target = target[self._restore_idxs] \n",
"\n",
" # determine rows to keep\n",
" target_nulls = np.isnan(target)\n",
" if target_nulls.ndim == 2:\n",
" # target nulls for each horizon are dropped in MLForecast.fit_models\n",
" # we just drop rows here for which all the target values are null\n",
" target_nulls = target_nulls.all(axis=1)\n",
" if dropna:\n",
" feature_nulls = np.full(df.shape[0], False)\n",
" for feature_vals in features.values():\n",
" feature_nulls |= np.isnan(feature_vals)\n",
" target_nulls = np.isnan(target)\n",
" if target_nulls.ndim == 2:\n",
" # target nulls for each horizon are dropped in MLForecast.fit_models\n",
" # we just drop rows here for which all the target values are null\n",
" target_nulls = target_nulls.all(axis=1)\n",
" keep_rows = ~(feature_nulls | target_nulls)\n",
" else:\n",
" # we always want to drop rows with nulls in the target\n",
" keep_rows = ~target_nulls\n",
"\n",
" self._dropped_series: Optional[np.ndarray] = None\n",
" if not keep_rows.all():\n",
" # remove rows with nulls\n",
" for k, v in features.items():\n",
" features[k] = v[keep_rows]\n",
" target = target[keep_rows]\n",
Expand All @@ -915,19 +913,17 @@
" last_idxs = self._sort_idxs[last_idxs]\n",
" last_vals_nan = ~keep_rows[last_idxs]\n",
" if last_vals_nan.any():\n",
" self._dropped_series: Optional[np.ndarray] = np.where(last_vals_nan)[0] \n",
" self._dropped_series = np.where(last_vals_nan)[0] \n",
" dropped_ids = reprlib.repr(list(self.uids[self._dropped_series]))\n",
" warnings.warn(\n",
" \"The following series were dropped completely \"\n",
" f\"due to the transformations and features: {dropped_ids}.\\n\"\n",
" \"These series won't show up if you use `MLForecast.forecast_fitted_values()`.\\n\"\n",
" \"You can set `dropna=False` or use transformations that require less samples to mitigate this\"\n",
" )\n",
" else:\n",
" self._dropped_series = None\n",
" elif isinstance(df, pd.DataFrame):\n",
" # we'll be assigning columns below, so we need to copy\n",
" df = df.copy(deep=False)\n",
" self._dropped_series = None\n",
"\n",
" # once we've computed the features and target we can slice the series\n",
" update_samples = [\n",
Expand Down Expand Up @@ -1701,7 +1697,7 @@
"text/markdown": [
"---\n",
"\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L496){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L511){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"## TimeSeries.fit_transform\n",
"\n",
Expand All @@ -1723,7 +1719,7 @@
"text/plain": [
"---\n",
"\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L496){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L511){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"## TimeSeries.fit_transform\n",
"\n",
Expand Down Expand Up @@ -2016,7 +2012,7 @@
"text/markdown": [
"---\n",
"\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L743){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L758){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"## TimeSeries.predict\n",
"\n",
Expand All @@ -2030,7 +2026,7 @@
"text/plain": [
"---\n",
"\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L743){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L758){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"## TimeSeries.predict\n",
"\n",
Expand Down Expand Up @@ -2168,7 +2164,7 @@
"text/markdown": [
"---\n",
"\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L848){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L863){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"## TimeSeries.update\n",
"\n",
Expand All @@ -2181,7 +2177,7 @@
"text/plain": [
"---\n",
"\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L848){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L863){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"## TimeSeries.update\n",
"\n",
Expand Down Expand Up @@ -2575,6 +2571,23 @@
"ts.fit_transform(series, 'unique_id', 'ds', 'y')\n",
"assert ts.keep_last_n is None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# no target nulls when dropna=False\n",
"ts = TimeSeries(\n",
" freq='D',\n",
" lags=[1, 2],\n",
" target_transforms=[Differences([5])],\n",
")\n",
"prep = ts.fit_transform(series, 'unique_id', 'ds', 'y', dropna=False)\n",
"assert not prep['y'].isnull().any()"
]
}
],
"metadata": {
Expand Down
Loading

0 comments on commit ca67b98

Please sign in to comment.