Skip to content

Commit

Permalink
convert_excel_date, convert_matlab_date for Polars (#1365)
Browse files Browse the repository at this point in the history
Implemented `convert_excel_date` and `convert_matlab_date` for Polars.

------

Co-authored-by: samuel.oranyeli <[email protected]>
Co-authored-by: Eric Ma <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
4 people authored Jul 3, 2024
1 parent 45ad9c0 commit e74f323
Show file tree
Hide file tree
Showing 7 changed files with 194 additions and 51 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## [Unreleased]

- [ENH] Added `convert_excel_date` and `convert_matlab_date` methods for polars - Issue #1352
- [ENH] Added a `complete` method for polars. - Issue #1352 @samukweku
- [ENH] `read_commandline` function now supports polars - Issue #1352
- [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341
Expand Down
86 changes: 47 additions & 39 deletions janitor/functions/convert_date.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,22 @@
import datetime as dt
from typing import Hashable
from typing import Hashable, Union

import pandas as pd
import pandas_flavor as pf
from pandas.api.types import is_numeric_dtype
from pandas.errors import OutOfBoundsDatetime

from janitor.utils import deprecated_alias
from janitor.utils import deprecated_alias, refactored_function


@pf.register_dataframe_method
@deprecated_alias(column="column_name")
@deprecated_alias(column="column_names")
def convert_excel_date(
df: pd.DataFrame, column_name: Hashable
df: pd.DataFrame, column_names: Union[Hashable, list]
) -> pd.DataFrame:
"""Convert Excel's serial date format into Python datetime format.
This method mutates the original DataFrame.
This method does not mutate the original DataFrame.
Implementation is also from
Implementation is based on
[Stack Overflow](https://stackoverflow.com/questions/38454403/convert-excel-style-date-with-pandas).
Examples:
Expand All @@ -38,40 +36,36 @@ def convert_excel_date(
Args:
df: A pandas DataFrame.
column_name: A column name.
Raises:
ValueError: If there are non numeric values in the column.
column_names: A column name, or a list of column names.
Returns:
A pandas DataFrame with corrected dates.
""" # noqa: E501

if not is_numeric_dtype(df[column_name]):
raise ValueError(
"There are non-numeric values in the column. "
"All values must be numeric."
if not isinstance(column_names, list):
column_names = [column_names]
# https://stackoverflow.com/a/65460255/7175713
dictionary = {
column_name: pd.to_datetime(
df[column_name], unit="D", origin="1899-12-30"
)
for column_name in column_names
}

df[column_name] = pd.TimedeltaIndex(
df[column_name], unit="d"
) + dt.datetime(
1899, 12, 30
) # noqa: W503
return df
return df.assign(**dictionary)


@pf.register_dataframe_method
@deprecated_alias(column="column_name")
@deprecated_alias(column="column_names")
def convert_matlab_date(
df: pd.DataFrame, column_name: Hashable
df: pd.DataFrame, column_names: Union[Hashable, list]
) -> pd.DataFrame:
"""Convert Matlab's serial date number into Python datetime format.
Implementation is also from
Implementation is based on
[Stack Overflow](https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python).
This method mutates the original DataFrame.
This method does not mutate the original DataFrame.
Examples:
>>> import pandas as pd
Expand All @@ -84,29 +78,38 @@ def convert_matlab_date(
2 737124.498500
3 737124.000000
>>> df.convert_matlab_date('date')
date
0 2018-03-06 00:00:00.000000
1 2018-03-05 19:34:50.563200
2 2018-03-05 11:57:50.399999
3 2018-03-05 00:00:00.000000
date
0 2018-03-06 00:00:00.000000000
1 2018-03-05 19:34:50.563199671
2 2018-03-05 11:57:50.399998876
3 2018-03-05 00:00:00.000000000
Args:
df: A pandas DataFrame.
column_name: A column name.
column_names: A column name, or a list of column names.
Returns:
A pandas DataFrame with corrected dates.
""" # noqa: E501
days = pd.Series([dt.timedelta(v % 1) for v in df[column_name]])
df[column_name] = (
df[column_name].astype(int).apply(dt.datetime.fromordinal)
+ days
- dt.timedelta(days=366)
)
return df
# https://stackoverflow.com/a/49135037/7175713
if not isinstance(column_names, list):
column_names = [column_names]
dictionary = {
column_name: pd.to_datetime(df[column_name] - 719529, unit="D")
for column_name in column_names
}

return df.assign(**dictionary)


@pf.register_dataframe_method
@pf.register_dataframe_method
@refactored_function(
message=(
"This function will be deprecated in a 1.x release. "
"Please use `pd.to_datetime` instead."
)
)
@deprecated_alias(column="column_name")
def convert_unix_date(df: pd.DataFrame, column_name: Hashable) -> pd.DataFrame:
"""Convert unix epoch time into Python datetime format.
Expand All @@ -116,6 +119,11 @@ def convert_unix_date(df: pd.DataFrame, column_name: Hashable) -> pd.DataFrame:
This method mutates the original DataFrame.
!!!note
This function will be deprecated in a 1.x release.
Please use `pd.to_datetime` instead.
Examples:
>>> import pandas as pd
>>> import janitor
Expand Down
3 changes: 3 additions & 0 deletions janitor/polars/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .clean_names import clean_names, make_clean_names
from .complete import complete
from .dates_to_polars import convert_excel_date, convert_matlab_date
from .pivot_longer import pivot_longer, pivot_longer_spec
from .row_to_names import row_to_names

Expand All @@ -10,4 +11,6 @@
"make_clean_names",
"row_to_names",
"complete",
"convert_excel_date",
"convert_matlab_date",
]
112 changes: 112 additions & 0 deletions janitor/polars/dates_to_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from __future__ import annotations

from janitor.utils import import_message

from .polars_flavor import register_expr_method

try:
import polars as pl
except ImportError:
import_message(
submodule="polars",
package="polars",
conda_channel="conda-forge",
pip_install=True,
)


@register_expr_method
def convert_excel_date(expr: pl.Expr) -> pl.Expr:
    """
    Translate Excel serial day numbers into calendar dates.

    The values of the expression are treated as a number of days and
    added to the origin date 1899-12-30.

    The approach is inspired by
    [Stack Overflow](https://stackoverflow.com/questions/38454403/convert-excel-style-date-with-pandas).

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame({"date": [39690, 39690, 37118]})
        >>> df
        shape: (3, 1)
        ┌───────┐
        │ date  │
        │ ---   │
        │ i64   │
        ╞═══════╡
        │ 39690 │
        │ 39690 │
        │ 37118 │
        └───────┘
        >>> expression = pl.col('date').convert_excel_date().alias('date_')
        >>> df.with_columns(expression)
        shape: (3, 2)
        ┌───────┬────────────┐
        │ date  ┆ date_      │
        │ ---   ┆ ---        │
        │ i64   ┆ date       │
        ╞═══════╪════════════╡
        │ 39690 ┆ 2008-08-30 │
        │ 39690 ┆ 2008-08-30 │
        │ 37118 ┆ 2001-08-15 │
        └───────┴────────────┘

    !!! info "New in version 0.28.0"

    Returns:
        A polars Expression.
    """  # noqa: E501
    # Serial values are a day count; wrap them in a duration first so the
    # duration stays the left-hand operand of the addition.
    day_offset = pl.duration(days=expr)
    # Origin date the day count is added to.
    excel_origin = pl.date(year=1899, month=12, day=30)
    return day_offset + excel_origin


@register_expr_method
def convert_matlab_date(expr: pl.Expr) -> pl.Expr:
    """
    Translate Matlab serial date numbers into datetimes.

    The serial number is shifted by 719529 days, scaled to milliseconds
    and added to the origin 1970-01-01.

    The implementation comes from
    [Stack Overflow](https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python).

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame({"date": [737125.0, 737124.815863, 737124.4985, 737124]})
        >>> df
        shape: (4, 1)
        ┌───────────────┐
        │ date          │
        │ ---           │
        │ f64           │
        ╞═══════════════╡
        │ 737125.0      │
        │ 737124.815863 │
        │ 737124.4985   │
        │ 737124.0      │
        └───────────────┘
        >>> expression = pl.col('date').convert_matlab_date().alias('date_')
        >>> df.with_columns(expression)
        shape: (4, 2)
        ┌───────────────┬─────────────────────────┐
        │ date          ┆ date_                   │
        │ ---           ┆ ---                     │
        │ f64           ┆ datetime[μs]            │
        ╞═══════════════╪═════════════════════════╡
        │ 737125.0      ┆ 2018-03-06 00:00:00     │
        │ 737124.815863 ┆ 2018-03-05 19:34:50.563 │
        │ 737124.4985   ┆ 2018-03-05 11:57:50.399 │
        │ 737124.0      ┆ 2018-03-05 00:00:00     │
        └───────────────┴─────────────────────────┘

    !!! info "New in version 0.28.0"

    Returns:
        A polars Expression.
    """  # noqa: E501
    # https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python
    # 719529 is the serial number that lands on the 1970-01-01 origin used
    # below; 86_400_000 is the number of milliseconds in one day.
    elapsed_ms = (expr - 719529) * 86_400_000
    since_origin = pl.duration(milliseconds=elapsed_ms)
    return since_origin + pl.datetime(year=1970, month=1, day=1)
12 changes: 0 additions & 12 deletions tests/functions/test_convert_excel_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,3 @@ def test_convert_excel_date():
)

assert df["hire_date"].dtype == "M8[ns]"


@pytest.mark.functions
def test_convert_excel_date_with_string_data():
    """A ValueError is expected when the target column is not numeric."""
    # Example workbook that lives under the project's examples directory.
    workbook = Path(pytest.EXAMPLES_DIR) / "notebooks" / "dirty_data.xlsx"
    frame = pd.read_excel(workbook, engine="openpyxl").clean_names()

    with pytest.raises(ValueError):
        frame.convert_excel_date("certification")
11 changes: 11 additions & 0 deletions tests/polars/functions/test_convert_excel_date_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import polars as pl

import janitor.polars # noqa: F401


def test_convert_excel_date():
    """Converting an Excel serial column yields a temporal dtype."""
    frame = pl.DataFrame({"dates": [42580.3333333333]})

    converted = frame.with_columns(
        pl.col("dates").convert_excel_date().alias("dd")
    ).get_column("dd")
    assert converted.dtype.is_temporal() is True
20 changes: 20 additions & 0 deletions tests/polars/functions/test_convert_matlab_date_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import polars as pl

import janitor.polars # noqa: F401


def test_convert_matlab_date():
    """Converting a Matlab serial column yields a temporal dtype."""
    # Mix of whole-day and fractional-day serial numbers.
    serials = [
        733_301.0,
        729_159.0,
        734_471.0,
        737_299.563_296_356_5,
        737_300.000_000_000_0,
    ]
    frame = pl.DataFrame({"dates": serials})

    converted = frame.with_columns(
        pl.col("dates").convert_matlab_date().alias("dd")
    ).get_column("dd")
    assert converted.dtype.is_temporal() is True

0 comments on commit e74f323

Please sign in to comment.