breaking: drop rows with null targets when dropna=False (#447)

Nixtla · Nov 12, 2024 · ca67b98
1 parent e9b8c64
commit ca67b98
Show file tree

Hide file tree

Showing 4 changed files with 162 additions and 134 deletions.
diff --git a/mlforecast/core.py b/mlforecast/core.py
@@ -402,16 +402,23 @@ def _transform(
             target = target[self._restore_idxs]
 
         # determine rows to keep
+        target_nulls = np.isnan(target)
+        if target_nulls.ndim == 2:
+            # target nulls for each horizon are dropped in MLForecast.fit_models
+            # we just drop rows here for which all the target values are null
+            target_nulls = target_nulls.all(axis=1)
         if dropna:
             feature_nulls = np.full(df.shape[0], False)
             for feature_vals in features.values():
                 feature_nulls |= np.isnan(feature_vals)
-            target_nulls = np.isnan(target)
-            if target_nulls.ndim == 2:
-                # target nulls for each horizon are dropped in MLForecast.fit_models
-                # we just drop rows here for which all the target values are null
-                target_nulls = target_nulls.all(axis=1)
             keep_rows = ~(feature_nulls | target_nulls)
+        else:
+            # we always want to drop rows with nulls in the target
+            keep_rows = ~target_nulls
+
+        self._dropped_series: Optional[np.ndarray] = None
+        if not keep_rows.all():
+            # remove rows with nulls
             for k, v in features.items():
                 features[k] = v[keep_rows]
             target = target[keep_rows]
@@ -422,19 +429,17 @@ def _transform(
                 last_idxs = self._sort_idxs[last_idxs]
             last_vals_nan = ~keep_rows[last_idxs]
             if last_vals_nan.any():
-                self._dropped_series: Optional[np.ndarray] = np.where(last_vals_nan)[0]
+                self._dropped_series = np.where(last_vals_nan)[0]
                 dropped_ids = reprlib.repr(list(self.uids[self._dropped_series]))
                 warnings.warn(
                     "The following series were dropped completely "
                     f"due to the transformations and features: {dropped_ids}.\n"
                     "These series won't show up if you use `MLForecast.forecast_fitted_values()`.\n"
                     "You can set `dropna=False` or use transformations that require less samples to mitigate this"
                 )
-            else:
-                self._dropped_series = None
         elif isinstance(df, pd.DataFrame):
+            # we'll be assigning columns below, so we need to copy
             df = df.copy(deep=False)
-            self._dropped_series = None
 
         # once we've computed the features and target we can slice the series
         update_samples = [

diff --git a/nbs/core.ipynb b/nbs/core.ipynb
@@ -20,16 +20,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "#|hide\n",
     "%load_ext autoreload\n",
@@ -895,16 +886,23 @@
     "            target = target[self._restore_idxs]       \n",
     "\n",
     "        # determine rows to keep\n",
+    "        target_nulls = np.isnan(target)\n",
+    "        if target_nulls.ndim == 2:\n",
+    "            # target nulls for each horizon are dropped in MLForecast.fit_models\n",
+    "            # we just drop rows here for which all the target values are null\n",
+    "            target_nulls = target_nulls.all(axis=1)\n",
     "        if dropna:\n",
     "            feature_nulls = np.full(df.shape[0], False)\n",
     "            for feature_vals in features.values():\n",
     "                feature_nulls |= np.isnan(feature_vals)\n",
-    "            target_nulls = np.isnan(target)\n",
-    "            if target_nulls.ndim == 2:\n",
-    "                # target nulls for each horizon are dropped in MLForecast.fit_models\n",
-    "                # we just drop rows here for which all the target values are null\n",
-    "                target_nulls = target_nulls.all(axis=1)\n",
     "            keep_rows = ~(feature_nulls | target_nulls)\n",
+    "        else:\n",
+    "            # we always want to drop rows with nulls in the target\n",
+    "            keep_rows = ~target_nulls\n",
+    "\n",
+    "        self._dropped_series: Optional[np.ndarray] = None\n",
+    "        if not keep_rows.all():\n",
+    "            # remove rows with nulls\n",
     "            for k, v in features.items():\n",
     "                features[k] = v[keep_rows]\n",
     "            target = target[keep_rows]\n",
@@ -915,19 +913,17 @@
     "                last_idxs = self._sort_idxs[last_idxs]\n",
     "            last_vals_nan = ~keep_rows[last_idxs]\n",
     "            if last_vals_nan.any():\n",
-    "                self._dropped_series: Optional[np.ndarray] = np.where(last_vals_nan)[0]                \n",
+    "                self._dropped_series = np.where(last_vals_nan)[0]                \n",
     "                dropped_ids = reprlib.repr(list(self.uids[self._dropped_series]))\n",
     "                warnings.warn(\n",
     "                    \"The following series were dropped completely \"\n",
     "                    f\"due to the transformations and features: {dropped_ids}.\\n\"\n",
     "                    \"These series won't show up if you use `MLForecast.forecast_fitted_values()`.\\n\"\n",
     "                    \"You can set `dropna=False` or use transformations that require less samples to mitigate this\"\n",
     "                )\n",
-    "            else:\n",
-    "                self._dropped_series = None\n",
     "        elif isinstance(df, pd.DataFrame):\n",
+    "            # we'll be assigning columns below, so we need to copy\n",
     "            df = df.copy(deep=False)\n",
-    "            self._dropped_series = None\n",
     "\n",
     "        # once we've computed the features and target we can slice the series\n",
     "        update_samples = [\n",
@@ -1701,7 +1697,7 @@
       "text/markdown": [
        "---\n",
        "\n",
-       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L496){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
+       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L511){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
        "\n",
        "## TimeSeries.fit_transform\n",
        "\n",
@@ -1723,7 +1719,7 @@
       "text/plain": [
        "---\n",
        "\n",
-       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L496){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
+       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L511){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
        "\n",
        "## TimeSeries.fit_transform\n",
        "\n",
@@ -2016,7 +2012,7 @@
       "text/markdown": [
        "---\n",
        "\n",
-       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L743){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
+       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L758){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
        "\n",
        "## TimeSeries.predict\n",
        "\n",
@@ -2030,7 +2026,7 @@
       "text/plain": [
        "---\n",
        "\n",
-       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L743){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
+       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L758){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
        "\n",
        "## TimeSeries.predict\n",
        "\n",
@@ -2168,7 +2164,7 @@
       "text/markdown": [
        "---\n",
        "\n",
-       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L848){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
+       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L863){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
        "\n",
        "## TimeSeries.update\n",
        "\n",
@@ -2181,7 +2177,7 @@
       "text/plain": [
        "---\n",
        "\n",
-       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L848){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
+       "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L863){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
        "\n",
        "## TimeSeries.update\n",
        "\n",
@@ -2575,6 +2571,23 @@
     "ts.fit_transform(series, 'unique_id', 'ds', 'y')\n",
     "assert ts.keep_last_n is None"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "# no target nulls when dropna=False\n",
+    "ts = TimeSeries(\n",
+    "    freq='D',\n",
+    "    lags=[1, 2],\n",
+    "    target_transforms=[Differences([5])],\n",
+    ")\n",
+    "prep = ts.fit_transform(series, 'unique_id', 'ds', 'y', dropna=False)\n",
+    "assert not prep['y'].isnull().any()"
+   ]
   }
  ],
  "metadata": {