From 91c2a59887505563175566b94546bfff3f5af971 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 16 Jul 2024 20:54:51 -0600 Subject: [PATCH] support polars in GlobalSklearnTransformer (#377) --- .github/workflows/build-docs.yaml | 10 +- .github/workflows/ci.yaml | 92 ++--- environment.yml | 37 -- local_environment.yml | 28 -- mlforecast/target_transforms.py | 37 +- .../quick_start_distributed.ipynb | 330 +++++++++--------- nbs/feature_engineering.ipynb | 2 - nbs/target_transforms.ipynb | 56 ++- settings.ini | 10 +- setup.py | 25 +- 10 files changed, 260 insertions(+), 367 deletions(-) delete mode 100644 environment.yml delete mode 100644 local_environment.yml diff --git a/.github/workflows/build-docs.yaml b/.github/workflows/build-docs.yaml index 0d3ba293..9e290368 100644 --- a/.github/workflows/build-docs.yaml +++ b/.github/workflows/build-docs.yaml @@ -6,10 +6,6 @@ on: branches: ["main"] workflow_dispatch: -defaults: - run: - shell: bash - jobs: build-docs: runs-on: ubuntu-latest @@ -24,12 +20,10 @@ jobs: path: docs-scripts - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # 5.1.1 with: - cache: "pip" - python-version: '3.10' - cache-dependency-path: settings.ini + python-version: "3.10" - name: Build docs run: | - pip install -e ".[dev]" + pip install uv && uv pip install --system ".[all]" mkdir nbs/_extensions cp -r docs-scripts/mintlify/ nbs/_extensions/ python docs-scripts/update-quarto.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a5c7ab27..2233c862 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -7,44 +7,18 @@ on: branches: [main] workflow_dispatch: -defaults: - run: - shell: bash -l {0} - concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: - nb-sync: - runs-on: ubuntu-latest - steps: - - name: Clone repo - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - - - name: Set up python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # 5.1.1 - - - name: Install nbdev - run: pip install nbdev - - - name: Check if all notebooks are cleaned - run: | - echo "Check we are starting with clean git checkout" - if [ -n "$(git status -uno -s)" ]; then echo "git status is not clean"; false; fi - echo "Trying to strip out notebooks" - ./action_files/clean_nbs - echo "Check that strip out was unnecessary" - git status -s # display the status to see which nbs need cleaning up - if [ -n "$(git status -uno -s)" ]; then echo -e "!!! 
Detected unstripped out notebooks\n!!!Remember to run nbdev_install_hooks"; false; fi - run-all-tests: runs-on: ubuntu-latest timeout-minutes: 30 strategy: fail-fast: false matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ["3.8", "3.9", "3.10", "3.11"] env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_NIXTLA_TMP }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_NIXTLA_TMP }} @@ -53,49 +27,24 @@ jobs: uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Set up environment - uses: mamba-org/setup-micromamba@f8b8a1e23a26f60a44c853292711bacfd3eac822 # v1.9.0 + uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # 5.1.1 with: - environment-file: environment.yml - create-args: python=${{ matrix.python-version }} - cache-environment: true + python-version: ${{ matrix.python-version }} - name: Install the library - run: pip install ./ + run: pip install uv && uv pip install --system ".[all]" - name: Run all tests - run: nbdev_test --n_workers 0 --do_print --timing --flags 'polars core' + run: nbdev_test --n_workers 0 --do_print --timing --skip_file_re 'electricity' --flags 'polars' - run-macos-tests: - runs-on: macos-13 + run-local-tests: + runs-on: ${{ matrix.os }} timeout-minutes: 30 strategy: fail-fast: false matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] - steps: - - name: Clone repo - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - - - name: Set up environment - uses: mamba-org/setup-micromamba@f8b8a1e23a26f60a44c853292711bacfd3eac822 # v1.9.0 - with: - environment-file: local_environment.yml - create-args: python=${{ matrix.python-version }} - cache-environment: true - - - name: Install the library - run: pip install ./ - - - name: Run local tests - run: nbdev_test --n_workers 0 --do_print --timing --skip_file_glob "*distributed*" --flags 'polars core' - - run-windows-tests: - runs-on: windows-latest - timeout-minutes: 30 - strategy: - fail-fast: false - matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + os: [macos-13, macos-14, windows-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - name: Clone repo uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -106,10 +55,14 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install the library - run: pip install uv && uv pip install ".[dev]" --system + run: pip install uv && uv pip install --system ".[dev]" + + - name: Install OpenMP + if: startsWith(matrix.os, 'macos') + run: brew install libomp - name: Run local tests - run: nbdev_test --n_workers 0 --do_print --timing --skip_file_glob "*distributed*" --flags 'polars core' + run: nbdev_test --n_workers 0 --do_print --timing --skip_file_re "(distributed|electricity)" --flags 'polars' check-deps: runs-on: ubuntu-latest @@ -120,11 +73,10 @@ jobs: - name: Set up python uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # 5.1.1 with: - python-version: '3.10' - cache: 'pip' + python-version: "3.10" - name: Install forecast notebook dependencies - run: pip install . datasetsforecast lightgbm matplotlib nbdev xgboost + run: pip install uv && uv pip install --system . datasetsforecast lightgbm matplotlib nbdev xgboost - name: Run forecast notebook run: nbdev_test --path nbs/forecast.ipynb @@ -136,11 +88,10 @@ jobs: - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # 5.1.1 with: - python-version: '3.10' - cache: 'pip' + python-version: "3.10" - name: Install dependencies - run: pip install . 
pytest pytest-benchmark + run: pip install uv && uv pip install --system . pytest pytest-benchmark - name: Run efficiency tests run: pytest tests/test_pipeline.py --benchmark-group-by=func --benchmark-sort=fullname @@ -152,11 +103,10 @@ jobs: - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # 5.1.1 with: - python-version: '3.10' - cache: 'pip' + python-version: "3.10" - name: Install dependencies - run: pip install . datasetsforecast lightgbm pytest + run: pip install uv && uv pip install --system . datasetsforecast lightgbm pytest - name: Run m4 performance tests run: pytest tests/test_m4.py diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 856e4c04..00000000 --- a/environment.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: mlforecast -channels: - - conda-forge -dependencies: - - coreforecast>=0.0.7 - - dask<2023.1.1 - - fsspec - - gitpython - - holidays<0.21 - - lightgbm - - matplotlib - - nbformat - - numba - - optuna - - pandas - - pip - - prophet - - pyarrow - - pyspark>=3.3 - - s3fs - - scikit-learn - - setuptools<70 - - shap - - statsmodels - - window-ops - - py-xgboost-cpu - - pip: - - datasetsforecast - - duckdb<0.8 - - fugue[ray] - - lightgbm_ray - - nbdev - - polars[numpy]>=0.0.0rc0 - - ray<2.8 - - triad==0.9.1 - - utilsforecast>=0.0.27 - - xgboost_ray diff --git a/local_environment.yml b/local_environment.yml deleted file mode 100644 index cd3dc771..00000000 --- a/local_environment.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: mlforecast -channels: - - conda-forge -dependencies: - - coreforecast>=0.0.7 - - fsspec - - holidays<0.21 - - jinja2 - - lightgbm - - matplotlib - - nbformat - - nomkl - - numba - - optuna - - pandas - - pip - - prophet - - pyarrow - - scikit-learn - - shap - - statsmodels - - window-ops - - py-xgboost-cpu - - pip: - - datasetsforecast - - nbdev - - polars[numpy]>=0.0.0rc0 - - utilsforecast>=0.0.27 diff --git a/mlforecast/target_transforms.py b/mlforecast/target_transforms.py index fc28ea86..5017e3c3 100644 --- a/mlforecast/target_transforms.py +++ b/mlforecast/target_transforms.py @@ -11,7 +11,7 @@ import coreforecast.scalers as core_scalers import numpy as np -import pandas as pd +import utilsforecast.processing as ufp from coreforecast.grouped_array import GroupedArray as CoreGroupedArray from sklearn.base import TransformerMixin, clone from utilsforecast.compat import DataFrame @@ -295,25 +295,26 @@ class GlobalSklearnTransformer(BaseTargetTransform): def __init__(self, transformer: TransformerMixin): self.transformer = transformer - def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame: - df = df.copy(deep=False) + def fit_transform(self, df: DataFrame) -> DataFrame: + df = ufp.copy_if_pandas(df, deep=False) self.transformer_ = clone(self.transformer) - df[self.target_col] = self.transformer_.fit_transform( - df[[self.target_col]].values + transformed = self.transformer_.fit_transform(df[[self.target_col]].to_numpy()) + return ufp.assign_columns(df, self.target_col, transformed[:, 0]) + + def inverse_transform(self, df: DataFrame) -> DataFrame: + df = ufp.copy_if_pandas(df, deep=False) + cols_to_transform = [ + c for c in df.columns if c not in (self.id_col, self.time_col) + ] + transformed = self.transformer_.inverse_transform( + df[cols_to_transform].to_numpy() ) - return df - - def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: - df = df.copy(deep=False) - cols_to_transform = df.columns.drop([self.id_col, self.time_col]) - for col in cols_to_transform: - df[col] = 
self.transformer_.inverse_transform(df[[col]].values)
-        return df
-
-    def update(self, df: pd.DataFrame) -> pd.DataFrame:
-        df = df.copy(deep=False)
-        df[self.target_col] = self.transformer_.transform(df[[self.target_col]].values)
-        return df
+        return ufp.assign_columns(df, cols_to_transform, transformed)
+
+    def update(self, df: DataFrame) -> DataFrame:
+        df = ufp.copy_if_pandas(df, deep=False)
+        transformed = self.transformer_.transform(df[[self.target_col]].to_numpy())
+        return ufp.assign_columns(df, self.target_col, transformed[:, 0])
 
     @staticmethod
     def stack(transforms: Sequence["GlobalSklearnTransformer"]) -> "GlobalSklearnTransformer":  # type: ignore[override]
diff --git a/nbs/docs/getting-started/quick_start_distributed.ipynb b/nbs/docs/getting-started/quick_start_distributed.ipynb
index 05abc366..747f377f 100644
--- a/nbs/docs/getting-started/quick_start_distributed.ipynb
+++ b/nbs/docs/getting-started/quick_start_distributed.ipynb
@@ -125,20 +125,6 @@
    "data": {
     "text/html": [
      "Dask DataFrame Structure: (HTML table markup omitted)\n",
-     "Dask Name: assign, 5 graph layers\n",
+     "Dask Name: assign, 5 expressions
" ], "text/plain": [ "Dask DataFrame Structure:\n", @@ -213,7 +198,8 @@ "... ... ... ... ... ...\n", "id_90 ... ... ... ... ...\n", "id_99 ... ... ... ... ...\n", - "Dask Name: assign, 5 graph layers" + "Dask Name: assign, 5 expressions\n", + "Expr=Assign(frame=MapPartitions(lambda))" ] }, "execution_count": null, @@ -345,31 +331,32 @@ "source": [ "#| hide\n", "# test num_partitions works properly\n", - "num_partitions_test = 4\n", - "test_dd = dd.from_pandas(series, npartitions=num_partitions_test) # In this case we dont have to specify the column\n", - "test_dd['unique_id'] = test_dd['unique_id'].astype(str)\n", - "fcst_np = DistributedMLForecast(\n", - " models=models,\n", - " freq='D',\n", - " target_transforms=[Differences([7])], \n", - " lags=[7],\n", - " lag_transforms={\n", - " 1: [ExpandingMean()],\n", - " 7: [RollingMean(window_size=14)]\n", - " },\n", - " date_features=['dayofweek', 'month'],\n", - " num_threads=1,\n", - " engine=client,\n", - " num_partitions=num_partitions_test\n", - ")\n", - "fcst_np.fit(test_dd)\n", - "test_partition_results_size(fcst_np, num_partitions_test)\n", - "preds_np = fcst_np.predict(7).compute().sort_values(['unique_id', 'ds']).reset_index(drop=True)\n", - "preds = fcst.predict(7).compute().sort_values(['unique_id', 'ds']).reset_index(drop=True)\n", - "pd.testing.assert_frame_equal(\n", - " preds[['unique_id', 'ds']], \n", - " preds_np[['unique_id', 'ds']], \n", - ")" + "if sys.version_info >= (3, 9):\n", + " num_partitions_test = 4\n", + " test_dd = dd.from_pandas(series, npartitions=num_partitions_test) # In this case we dont have to specify the column\n", + " test_dd['unique_id'] = test_dd['unique_id'].astype(str)\n", + " fcst_np = DistributedMLForecast(\n", + " models=models,\n", + " freq='D',\n", + " target_transforms=[Differences([7])], \n", + " lags=[7],\n", + " lag_transforms={\n", + " 1: [ExpandingMean()],\n", + " 7: [RollingMean(window_size=14)]\n", + " },\n", + " date_features=['dayofweek', 'month'],\n", + " num_threads=1,\n", + " engine=client,\n", + " num_partitions=num_partitions_test\n", + " )\n", + " fcst_np.fit(test_dd)\n", + " test_partition_results_size(fcst_np, num_partitions_test)\n", + " preds_np = fcst_np.predict(7).compute().sort_values(['unique_id', 'ds']).reset_index(drop=True)\n", + " preds = fcst.predict(7).compute().sort_values(['unique_id', 'ds']).reset_index(drop=True)\n", + " pd.testing.assert_frame_equal(\n", + " preds[['unique_id', 'ds']], \n", + " preds_np[['unique_id', 'ds']], \n", + " )" ] }, { @@ -425,49 +412,49 @@ " \n", " 0\n", " id_00\n", - " 2002-09-27\n", - " 20.999371\n", - " 21.892795\n", + " 2002-09-27 00:00:00\n", + " 20.80344\n", + " 21.72224\n", " \n", " \n", " 1\n", " id_00\n", - " 2002-09-28\n", - " 84.771692\n", - " 83.002009\n", + " 2002-09-28 00:00:00\n", + " 85.726774\n", + " 84.59995\n", " \n", " \n", " 2\n", " id_00\n", - " 2002-09-29\n", - " 162.389419\n", - " 163.528475\n", + " 2002-09-29 00:00:00\n", + " 163.385159\n", + " 163.511357\n", " \n", " \n", " 3\n", " id_00\n", - " 2002-09-30\n", - " 245.002456\n", - " 245.472042\n", + " 2002-09-30 00:00:00\n", + " 244.434753\n", + " 246.15272\n", " \n", " \n", " 4\n", " id_00\n", - " 2002-10-01\n", - " 317.240952\n", - " 313.948840\n", + " 2002-10-01 00:00:00\n", + " 316.953824\n", + " 314.691167\n", " \n", " \n", "\n", "" ], "text/plain": [ - " unique_id ds DaskXGBForecast DaskLGBMForecast\n", - "0 id_00 2002-09-27 20.999371 21.892795\n", - "1 id_00 2002-09-28 84.771692 83.002009\n", - "2 id_00 2002-09-29 162.389419 163.528475\n", - "3 id_00 
2002-09-30 245.002456 245.472042\n", - "4 id_00 2002-10-01 317.240952 313.948840" + " unique_id ds DaskXGBForecast DaskLGBMForecast\n", + "0 id_00 2002-09-27 00:00:00 20.80344 21.72224\n", + "1 id_00 2002-09-28 00:00:00 85.726774 84.59995\n", + "2 id_00 2002-09-29 00:00:00 163.385159 163.511357\n", + "3 id_00 2002-09-30 00:00:00 244.434753 246.15272\n", + "4 id_00 2002-10-01 00:00:00 316.953824 314.691167" ] }, "execution_count": null, @@ -641,7 +628,7 @@ ")\n", "upd_fcst.fit(partitioned_series)\n", "\n", - "new_df = (series.groupby('unique_id')['ds'].max() + pd.offsets.Day()).reset_index()\n", + "new_df = (series.groupby('unique_id', observed=True)['ds'].max() + pd.offsets.Day()).reset_index()\n", "new_df['y'] = -1.0\n", "upd_fcst.update(new_df)\n", "expected = new_df.rename(columns={'y': 'Lag1Model'})\n", @@ -651,6 +638,7 @@ "pd.testing.assert_frame_equal(\n", " upd_preds.reset_index(drop=True),\n", " expected.reset_index(drop=True),\n", + " check_dtype=False,\n", ")" ] }, @@ -697,8 +685,7 @@ " partitioned_series,\n", " n_windows=3,\n", " h=14,\n", - ")\n", - "cv_res" + ")" ] }, { @@ -738,68 +725,68 @@ " \n", " \n", " \n", - " 0\n", - " id_00\n", - " 2002-08-16\n", - " 22.706938\n", - " 21.967568\n", - " 2002-08-15\n", - " 11.878591\n", + " 45\n", + " id_13\n", + " 2002-08-19 00:00:00\n", + " 254.761097\n", + " 253.220683\n", + " 2002-08-15 00:00:00\n", + " 243.926251\n", " \n", " \n", - " 1\n", - " id_00\n", - " 2002-08-17\n", - " 95.885948\n", - " 98.285482\n", - " 2002-08-15\n", - " 75.108162\n", + " 132\n", + " id_19\n", + " 2002-08-22 00:00:00\n", + " 16.19625\n", + " 16.190252\n", + " 2002-08-15 00:00:00\n", + " 15.072471\n", " \n", " \n", - " 2\n", - " id_00\n", - " 2002-08-18\n", - " 172.546631\n", - " 171.527272\n", - " 2002-08-15\n", - " 175.278407\n", + " 11\n", + " id_20\n", + " 2002-08-27 00:00:00\n", + " 2.241844\n", + " 2.135991\n", + " 2002-08-15 00:00:00\n", + " 2.298182\n", " \n", " \n", - " 3\n", - " id_00\n", - " 2002-08-19\n", - " 238.256594\n", - " 238.375726\n", - " 2002-08-15\n", - " 226.062025\n", + " 80\n", + " id_25\n", + " 2002-08-26 00:00:00\n", + " 253.331793\n", + " 253.219323\n", + " 2002-08-15 00:00:00\n", + " 244.810677\n", " \n", " \n", - " 4\n", - " id_00\n", - " 2002-08-20\n", - " 306.005923\n", - " 305.146636\n", - " 2002-08-15\n", - " 318.433401\n", + " 42\n", + " id_43\n", + " 2002-08-16 00:00:00\n", + " 302.717019\n", + " 301.165652\n", + " 2002-08-15 00:00:00\n", + " 318.044911\n", " \n", " \n", "\n", "" ], "text/plain": [ - " unique_id ds DaskXGBForecast DaskLGBMForecast cutoff \\\n", - "0 id_00 2002-08-16 22.706938 21.967568 2002-08-15 \n", - "1 id_00 2002-08-17 95.885948 98.285482 2002-08-15 \n", - "2 id_00 2002-08-18 172.546631 171.527272 2002-08-15 \n", - "3 id_00 2002-08-19 238.256594 238.375726 2002-08-15 \n", - "4 id_00 2002-08-20 306.005923 305.146636 2002-08-15 \n", + " unique_id ds DaskXGBForecast DaskLGBMForecast \\\n", + "45 id_13 2002-08-19 00:00:00 254.761097 253.220683 \n", + "132 id_19 2002-08-22 00:00:00 16.19625 16.190252 \n", + "11 id_20 2002-08-27 00:00:00 2.241844 2.135991 \n", + "80 id_25 2002-08-26 00:00:00 253.331793 253.219323 \n", + "42 id_43 2002-08-16 00:00:00 302.717019 301.165652 \n", "\n", - " y \n", - "0 11.878591 \n", - "1 75.108162 \n", - "2 175.278407 \n", - "3 226.062025 \n", - "4 318.433401 " + " cutoff y \n", + "45 2002-08-15 00:00:00 243.926251 \n", + "132 2002-08-15 00:00:00 15.072471 \n", + "11 2002-08-15 00:00:00 2.298182 \n", + "80 2002-08-15 00:00:00 244.810677 \n", + "42 2002-08-15 00:00:00 318.044911 
" ] }, "execution_count": null, @@ -863,8 +850,8 @@ " h=14,\n", " refit=False\n", ")\n", - "cv_results_df = cv_res.compute()\n", - "cv_results_no_refit_df = cv_res_no_refit.compute()\n", + "cv_results_df = cv_res.compute().sort_values(['unique_id', 'ds'])\n", + "cv_results_no_refit_df = cv_res_no_refit.compute().sort_values(['unique_id', 'ds'])\n", "# test we recover the same \"metadata\"\n", "models = ['DaskXGBForecast', 'DaskLGBMForecast']\n", "test_eq(\n", @@ -882,7 +869,6 @@ "source": [ "#|hide\n", "non_std_series = partitioned_series.copy()\n", - "non_std_series['ds'] = non_std_series.map_partitions(lambda part: part.groupby('unique_id').cumcount())\n", "non_std_series = non_std_series.rename(columns={'ds': 'time', 'y': 'value', 'unique_id': 'some_id'})\n", "flow_params = dict(\n", " models=[DaskXGBForecast(random_state=0)],\n", @@ -897,7 +883,7 @@ "fcst = DistributedMLForecast(freq='D', **flow_params)\n", "fcst.fit(partitioned_series)\n", "preds = fcst.predict(7).compute()\n", - "fcst2 = DistributedMLForecast(freq=1, **flow_params)\n", + "fcst2 = DistributedMLForecast(freq='D', **flow_params)\n", "fcst2.preprocess(non_std_series, id_col='some_id', time_col='time', target_col='value')\n", "fcst2.models_ = fcst.models_ # distributed training can end up with different fits\n", "non_std_preds = fcst2.predict(7).compute()\n", @@ -1273,7 +1259,15 @@ "execution_count": null, "id": "04005646-d244-48c8-b11c-124a41d9740c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "preds = fa.as_pandas(fcst.predict(10)).sort_values(['unique_id', 'ds']).reset_index(drop=True)\n", "preds2 = fa.as_pandas(fcst2.predict(10)).sort_values(['unique_id', 'ds']).reset_index(drop=True)\n", @@ -1693,38 +1687,38 @@ " \n", " \n", " 0\n", - " id_00\n", + " id_04\n", " 2001-05-15\n", - " 431.677682\n", - " 427.262462\n", + " 215.602509\n", + " 212.205627\n", " \n", " \n", " 1\n", - " id_00\n", + " id_04\n", " 2001-05-16\n", - " 503.673189\n", - " 502.605670\n", + " 286.985718\n", + " 276.514374\n", " \n", " \n", " 2\n", - " id_00\n", + " id_04\n", " 2001-05-17\n", - " 8.150285\n", - " 7.604773\n", + " 370.456116\n", + " 375.606598\n", " \n", " \n", " 3\n", - " id_00\n", + " id_04\n", " 2001-05-18\n", - " 97.620923\n", - " 97.582869\n", + " 484.781769\n", + " 485.743347\n", " \n", " \n", " 4\n", - " id_00\n", + " id_04\n", " 2001-05-19\n", - " 194.568960\n", - " 192.818578\n", + " 563.665894\n", + " 561.609985\n", " \n", " \n", "\n", @@ -1732,11 +1726,11 @@ ], "text/plain": [ " unique_id ds RayLGBMForecast RayXGBForecast\n", - "0 id_00 2001-05-15 431.677682 427.262462\n", - "1 id_00 2001-05-16 503.673189 502.605670\n", - "2 id_00 2001-05-17 8.150285 7.604773\n", - "3 id_00 2001-05-18 97.620923 97.582869\n", - "4 id_00 2001-05-19 194.568960 192.818578" + "0 id_04 2001-05-15 215.602509 212.205627\n", + "1 id_04 2001-05-16 286.985718 276.514374\n", + "2 id_04 2001-05-17 370.456116 375.606598\n", + "3 id_04 2001-05-18 484.781769 485.743347\n", + "4 id_04 2001-05-19 563.665894 561.609985" ] }, "execution_count": null, @@ -1900,48 +1894,48 @@ " \n", " \n", " 0\n", - " id_10\n", - " 2001-05-01\n", - " 24.962461\n", - " 22.998615\n", + " id_01\n", + " 2001-05-03\n", + " 184.426498\n", + " 184.533676\n", " 2001-04-30\n", - " 31.878545\n", + " 175.337772\n", " \n", " \n", " 1\n", - " id_10\n", - " 2001-05-02\n", - " 53.219645\n", - " 54.298105\n", + " id_02\n", + " 2001-05-11\n", + " 277.452911\n", + " 276.489410\n", " 2001-04-30\n", - 
" 48.349363\n", + " 282.129155\n", " \n", " \n", " 2\n", - " id_10\n", - " 2001-05-03\n", - " 78.068732\n", - " 76.111907\n", + " id_02\n", + " 2001-05-12\n", + " 11.065877\n", + " 13.194022\n", " 2001-04-30\n", - " 71.607111\n", + " 16.293327\n", " \n", " \n", " 3\n", - " id_10\n", - " 2001-05-04\n", - " 103.153889\n", - " 104.344135\n", + " id_02\n", + " 2001-05-14\n", + " 103.247375\n", + " 101.106804\n", " 2001-04-30\n", - " 103.482107\n", + " 95.109389\n", " \n", " \n", " 4\n", - " id_10\n", - " 2001-05-05\n", - " 116.708231\n", - " 115.950523\n", + " id_03\n", + " 2001-05-04\n", + " 172.470184\n", + " 170.334702\n", " 2001-04-30\n", - " 124.719690\n", + " 169.325874\n", " \n", " \n", "\n", @@ -1949,11 +1943,11 @@ ], "text/plain": [ " unique_id ds RayLGBMForecast RayXGBForecast cutoff y\n", - "0 id_10 2001-05-01 24.962461 22.998615 2001-04-30 31.878545\n", - "1 id_10 2001-05-02 53.219645 54.298105 2001-04-30 48.349363\n", - "2 id_10 2001-05-03 78.068732 76.111907 2001-04-30 71.607111\n", - "3 id_10 2001-05-04 103.153889 104.344135 2001-04-30 103.482107\n", - "4 id_10 2001-05-05 116.708231 115.950523 2001-04-30 124.719690" + "0 id_01 2001-05-03 184.426498 184.533676 2001-04-30 175.337772\n", + "1 id_02 2001-05-11 277.452911 276.489410 2001-04-30 282.129155\n", + "2 id_02 2001-05-12 11.065877 13.194022 2001-04-30 16.293327\n", + "3 id_02 2001-05-14 103.247375 101.106804 2001-04-30 95.109389\n", + "4 id_03 2001-05-04 172.470184 170.334702 2001-04-30 169.325874" ] }, "execution_count": null, diff --git a/nbs/feature_engineering.ipynb b/nbs/feature_engineering.ipynb index 90aa606e..3d84b2cf 100644 --- a/nbs/feature_engineering.ipynb +++ b/nbs/feature_engineering.ipynb @@ -551,7 +551,6 @@ "outputs": [], "source": [ "#| hide\n", - "#| core\n", "from mlforecast.lag_transforms import ExpandingMean" ] }, @@ -563,7 +562,6 @@ "outputs": [], "source": [ "#| hide\n", - "#| core\n", "transformed_core = transform_exog(\n", " prices,\n", " lags=[1, 2],\n", diff --git a/nbs/target_transforms.ipynb b/nbs/target_transforms.ipynb index 41741e65..589b9c57 100644 --- a/nbs/target_transforms.ipynb +++ b/nbs/target_transforms.ipynb @@ -45,7 +45,7 @@ "\n", "import coreforecast.scalers as core_scalers\n", "import numpy as np\n", - "import pandas as pd\n", + "import utilsforecast.processing as ufp\n", "from coreforecast.grouped_array import GroupedArray as CoreGroupedArray\n", "from sklearn.base import TransformerMixin, clone\n", "from utilsforecast.compat import DataFrame\n", @@ -345,7 +345,6 @@ "outputs": [], "source": [ "#| hide\n", - "#| core\n", "sc = AutoDifferences(1)\n", "ga = GroupedArray(np.arange(10), np.array([0, 10]))\n", "transformed = sc.fit_transform(ga)\n", @@ -406,7 +405,6 @@ "outputs": [], "source": [ "#| hide\n", - "#| core\n", "sc = AutoSeasonalDifferences(season_length=5, max_diffs=1)\n", "ga = GroupedArray(np.arange(5)[np.arange(10) % 5], np.array([0, 10]))\n", "transformed = sc.fit_transform(ga)\n", @@ -462,7 +460,6 @@ "outputs": [], "source": [ "#| hide\n", - "#| core\n", "sc = AutoSeasonalityAndDifferences(max_season_length=5, max_diffs=1)\n", "ga = GroupedArray(np.arange(5)[np.arange(10) % 5], np.array([0, 10]))\n", "transformed = sc.fit_transform(ga)\n", @@ -675,23 +672,26 @@ " def __init__(self, transformer: TransformerMixin):\n", " self.transformer = transformer\n", "\n", - " def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:\n", - " df = df.copy(deep=False)\n", + " def fit_transform(self, df: DataFrame) -> DataFrame:\n", + " df = ufp.copy_if_pandas(df, deep=False)\n", " 
self.transformer_ = clone(self.transformer)\n", - " df[self.target_col] = self.transformer_.fit_transform(df[[self.target_col]].values)\n", - " return df\n", + " transformed = self.transformer_.fit_transform(df[[self.target_col]].to_numpy())\n", + " return ufp.assign_columns(df, self.target_col, transformed[:, 0])\n", "\n", - " def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:\n", - " df = df.copy(deep=False)\n", - " cols_to_transform = df.columns.drop([self.id_col, self.time_col])\n", - " for col in cols_to_transform:\n", - " df[col] = self.transformer_.inverse_transform(df[[col]].values)\n", - " return df \n", + " def inverse_transform(self, df: DataFrame) -> DataFrame:\n", + " df = ufp.copy_if_pandas(df, deep=False)\n", + " cols_to_transform = [\n", + " c for c in df.columns if c not in (self.id_col, self.time_col)\n", + " ]\n", + " transformed = self.transformer_.inverse_transform(\n", + " df[cols_to_transform].to_numpy()\n", + " )\n", + " return ufp.assign_columns(df, cols_to_transform, transformed)\n", "\n", - " def update(self, df: pd.DataFrame) -> pd.DataFrame:\n", - " df = df.copy(deep=False)\n", - " df[self.target_col] = self.transformer_.transform(df[[self.target_col]].values)\n", - " return df\n", + " def update(self, df: DataFrame) -> DataFrame:\n", + " df = ufp.copy_if_pandas(df, deep=False)\n", + " transformed = self.transformer_.transform(df[[self.target_col]].to_numpy())\n", + " return ufp.assign_columns(df, self.target_col, transformed[:, 0])\n", "\n", " @staticmethod\n", " def stack(transforms: Sequence[\"GlobalSklearnTransformer\"]) -> \"GlobalSklearnTransformer\": # type: ignore[override]\n", @@ -736,6 +736,26 @@ ")\n", "np.testing.assert_allclose(prep['y'].values, expected)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8a67713-ac9e-4525-b635-8ac3298262d6", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "#| polars\n", + "series_pl = generate_daily_series(10, engine='polars')\n", + "fcst_pl = MLForecast(\n", + " models=[LinearRegression()],\n", + " freq='1d',\n", + " lags=[1, 2],\n", + " target_transforms=[boxcox_global, single_difference]\n", + ")\n", + "prep_pl = fcst_pl.preprocess(series_pl, dropna=False)\n", + "pd.testing.assert_frame_equal(prep, prep_pl.to_pandas())" + ] } ], "metadata": { diff --git a/settings.ini b/settings.ini index 0cda5e6a..f7f0e9b8 100644 --- a/settings.ini +++ b/settings.ini @@ -15,15 +15,15 @@ language = English custom_sidebar = True license = apache2 status = 3 -requirements = cloudpickle coreforecast>=0.0.10 fsspec numba optuna packaging pandas scikit-learn utilsforecast>=0.1.9 window-ops +requirements = cloudpickle coreforecast>=0.0.11 fsspec numba optuna packaging pandas scikit-learn utilsforecast>=0.1.9 window-ops dask_requirements = fugue dask[complete] lightgbm xgboost -ray_requirements = fugue[ray] lightgbm_ray xgboost_ray +ray_requirements = fugue[ray] lightgbm_ray numpy<2 pandas<2.2 ray<2.8 setuptools<70 xgboost<2 xgboost_ray spark_requirements = fugue pyspark>=3.3 lightgbm xgboost aws_requirements = fsspec[s3] gcp_requirements = fsspec[gcs] azure_requirements = fsspec[adl] polars_requirements = polars[numpy] -dev_requirements = black datasetsforecast matplotlib mypy nbdev pre-commit prophet pyarrow ruff shap statsmodels +dev_requirements = black>=24 datasetsforecast gitpython holidays<0.21 lightgbm matplotlib mypy nbdev pre-commit polars[numpy] pyarrow ruff shap statsmodels xgboost nbs_path = nbs doc_path = _docs recursive = True @@ -35,8 +35,8 @@ title = mlforecast 
tst_flags = polars black_formatting = True readme_nb = index.ipynb -allowed_metadata_keys = -allowed_cell_metadata_keys = +allowed_metadata_keys = +allowed_cell_metadata_keys = jupyter_hooks = True clean_ids = True clear_all = False diff --git a/setup.py b/setup.py index 0d38f493..9692f055 100644 --- a/setup.py +++ b/setup.py @@ -32,21 +32,22 @@ azure_requirements = cfg['azure_requirements'].split() gcp_requirements = cfg['gcp_requirements'].split() polars_requirements = cfg['polars_requirements'].split() -all_extras = [ - dask_requirements, - ray_requirements, - spark_requirements, - aws_requirements, - azure_requirements, - gcp_requirements, - polars_requirements, -] -dev_requirements = sorted(set(sum([cfg['dev_requirements'].split()] + all_extras, []))) +dev_requirements = cfg['dev_requirements'].split() +all_requirements = { + *dask_requirements, + *ray_requirements, + *spark_requirements, + *aws_requirements, + *azure_requirements, + *gcp_requirements, + *polars_requirements, + *dev_requirements, +} min_python = cfg['min_python'] lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) setuptools.setup( - name = 'mlforecast', + name = 'mlforecast', license = lic[0], classifiers = [ 'Development Status :: ' + statuses[int(cfg['status'])], @@ -67,6 +68,7 @@ 'gcp': gcp_requirements, 'polars': polars_requirements, 'dev': dev_requirements, + 'all': all_requirements, }, dependency_links = cfg.get('dep_links','').split(), python_requires = '>=' + cfg['min_python'], @@ -78,4 +80,3 @@ 'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d'] }, **setup_cfg) -
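
Note (reviewer sketch, not part of the patch): the `GlobalSklearnTransformer` hunks above replace pandas-only calls (`df.copy`, `.values`, in-place column assignment) with `ufp.copy_if_pandas`, `.to_numpy()`, and `ufp.assign_columns`, so one code path now serves both pandas and polars frames. A minimal usage sketch of what this enables, assuming the public `MLForecast` API; the `FunctionTransformer(np.log1p, inverse_func=np.expm1)` transformer and the 7-step horizon are illustrative choices, while the polars engine flag, `freq='1d'`, and the `GlobalSklearnTransformer` round-trip mirror the test cell added to `nbs/target_transforms.ipynb`:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer

from mlforecast import MLForecast
from mlforecast.target_transforms import GlobalSklearnTransformer
from mlforecast.utils import generate_daily_series

# polars frame, using the same engine flag as the new test cell
series = generate_daily_series(10, engine='polars')

# one sklearn transformer fit on the pooled target column across all series;
# log1p/expm1 is an arbitrary illustrative choice, not one used in the patch
log_global = GlobalSklearnTransformer(
    FunctionTransformer(func=np.log1p, inverse_func=np.expm1)
)

fcst = MLForecast(
    models=[LinearRegression()],
    freq='1d',  # polars duration string, as in the test cell
    lags=[1, 2],
    target_transforms=[log_global],
)
prep = fcst.preprocess(series)  # stays a polars DataFrame end to end
preds = fcst.fit(series).predict(7)
```

Consistent with the diff, `inverse_transform` reverses the fitted transformer on every column except `id_col`/`time_col` (the list comprehension introduced above), so all model output columns are mapped back to the original scale.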