Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] pivot_wider_spec pandas #1427

Merged
merged 7 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion janitor/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,12 @@
from .limit_column_characters import limit_column_characters
from .min_max_scale import min_max_scale
from .move import move
from .pivot import pivot_longer, pivot_longer_spec, pivot_wider
from .pivot import (
pivot_longer,
pivot_longer_spec,
pivot_wider,
pivot_wider_spec,
)
from .process_text import process_text
from .remove_columns import remove_columns
from .remove_empty import remove_empty
Expand Down Expand Up @@ -138,6 +143,7 @@
"pivot_longer",
"pivot_longer_spec",
"pivot_wider",
"pivot_wider_spec",
"process_text",
"remove_columns",
"remove_empty",
Expand Down
157 changes: 156 additions & 1 deletion janitor/functions/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,10 +327,14 @@ def pivot_longer(
Should be either a single column name, or a list/tuple of
column names.
`index` should be a list of tuples if the columns are a MultiIndex.
Column selection is possible using the
[`select`][janitor.functions.select.select] syntax.
column_names: Name(s) of columns to unpivot. Should be either
a single column name or a list/tuple of column names.
`column_names` should be a list of tuples
if the columns are a MultiIndex.
Column selection is possible using the
[`select`][janitor.functions.select.select] syntax.
names_to: Name of new column as a string that will contain
what were previously the column names in `column_names`.
The default is `variable` if no value is provided. It can
Expand Down Expand Up @@ -420,10 +424,13 @@ def pivot_longer_spec(
) -> pd.DataFrame:
"""A declarative interface to pivot a DataFrame from wide to long form,
where you describe how the data will be unpivoted,
using a DataFrame. This gives you, the user,
using a DataFrame.

This gives you, the user,
more control over unpivoting, where you create a “spec”
data frame that describes exactly how data stored
in the column names becomes variables.

It can come in handy for situations where
[`pivot_longer`][janitor.functions.pivot.pivot_longer]
seems inadequate for the transformation.
Expand Down Expand Up @@ -2380,3 +2387,151 @@ def _check_tuples_multiindex(indexer, args, param):
)

return args


def pivot_wider_spec(
    df: pd.DataFrame,
    spec: pd.DataFrame,
    index: list | tuple | str | Pattern = None,
    reset_index: bool = True,
) -> pd.DataFrame:
    """A declarative interface to pivot a DataFrame from long to wide form,
    where you describe how the data will be pivoted,
    using a DataFrame.

    This gives you, the user,
    more control over pivoting, where you create a “spec”
    data frame that describes exactly which value column,
    combined with which labels of the pivoted column(s),
    becomes which column in the output.

    It can come in handy for situations where
    `pd.DataFrame.pivot`
    seems inadequate for the transformation.

    !!! info "New in version 0.31.0"

    Examples:
        >>> import pandas as pd
        >>> from janitor import pivot_wider_spec
        >>> df = pd.DataFrame(
        ...     [
        ...         {"famid": 1, "birth": 1, "age": 1, "ht": 2.8},
        ...         {"famid": 1, "birth": 1, "age": 2, "ht": 3.4},
        ...         {"famid": 1, "birth": 2, "age": 1, "ht": 2.9},
        ...         {"famid": 1, "birth": 2, "age": 2, "ht": 3.8},
        ...         {"famid": 1, "birth": 3, "age": 1, "ht": 2.2},
        ...         {"famid": 1, "birth": 3, "age": 2, "ht": 2.9},
        ...         {"famid": 2, "birth": 1, "age": 1, "ht": 2.0},
        ...         {"famid": 2, "birth": 1, "age": 2, "ht": 3.2},
        ...         {"famid": 2, "birth": 2, "age": 1, "ht": 1.8},
        ...         {"famid": 2, "birth": 2, "age": 2, "ht": 2.8},
        ...         {"famid": 2, "birth": 3, "age": 1, "ht": 1.9},
        ...         {"famid": 2, "birth": 3, "age": 2, "ht": 2.4},
        ...         {"famid": 3, "birth": 1, "age": 1, "ht": 2.2},
        ...         {"famid": 3, "birth": 1, "age": 2, "ht": 3.3},
        ...         {"famid": 3, "birth": 2, "age": 1, "ht": 2.3},
        ...         {"famid": 3, "birth": 2, "age": 2, "ht": 3.4},
        ...         {"famid": 3, "birth": 3, "age": 1, "ht": 2.1},
        ...         {"famid": 3, "birth": 3, "age": 2, "ht": 2.9},
        ...     ]
        ... )
        >>> df
            famid  birth  age   ht
        0       1      1    1  2.8
        1       1      1    2  3.4
        2       1      2    1  2.9
        3       1      2    2  3.8
        4       1      3    1  2.2
        5       1      3    2  2.9
        6       2      1    1  2.0
        7       2      1    2  3.2
        8       2      2    1  1.8
        9       2      2    2  2.8
        10      2      3    1  1.9
        11      2      3    2  2.4
        12      3      1    1  2.2
        13      3      1    2  3.3
        14      3      2    1  2.3
        15      3      2    2  3.4
        16      3      3    1  2.1
        17      3      3    2  2.9
        >>> spec = {".name": ["ht1", "ht2"],
        ...         ".value": ["ht", "ht"],
        ...         "age": [1, 2]}
        >>> spec = pd.DataFrame(spec)
        >>> spec
          .name .value  age
        0   ht1     ht    1
        1   ht2     ht    2
        >>> pivot_wider_spec(df=df,spec=spec, index=['famid','birth'])
           famid  birth  ht1  ht2
        0      1      1  2.8  3.4
        1      1      2  2.9  3.8
        2      1      3  2.2  2.9
        3      2      1  2.0  3.2
        4      2      2  1.8  2.8
        5      2      3  1.9  2.4
        6      3      1  2.2  3.3
        7      3      2  2.3  3.4
        8      3      3  2.1  2.9

    Args:
        df: A pandas DataFrame.
        spec: A specification DataFrame.
            At a minimum, the spec DataFrame
            must have a '.name' and a '.value' column,
            in that order, as its first two columns.
            The '.name' column should contain
            the names of the columns in the output DataFrame.
            The '.value' column should contain the name of the column(s)
            in the source DataFrame that will serve as the values.
            Additional columns in spec will serve as the columns
            to be flipped to wide form.
            Note that these additional columns should already exist
            in the source DataFrame.
        index: Name(s) of columns to use as identifier variables.
            It should be either a single column name, or a list of column names.
            If `index` is not provided, the DataFrame's index is used.
            Column selection is possible using the
            [`select`][janitor.functions.select.select] syntax.
        reset_index: Determines whether to reset the `index`.
            Applicable only if `index` is provided.

    Raises:
        KeyError: If the spec lacks a '.name' or a '.value' column.
        ValueError: If the spec's columns are not unique,
            if '.name' and '.value' are not its first two columns
            (in that order), or if the spec has no additional columns.

    Returns:
        A pandas DataFrame that has been pivoted from long to wide form.
    """  # noqa: E501
    check("spec", spec, [pd.DataFrame])
    check("reset_index", reset_index, [bool])
    if not spec.columns.is_unique:
        raise ValueError("Kindly ensure the spec's columns is unique.")
    if ".name" not in spec.columns:
        raise KeyError(
            "Kindly ensure the spec DataFrame has a `.name` column."
        )
    if ".value" not in spec.columns:
        raise KeyError(
            "Kindly ensure the spec DataFrame has a `.value` column."
        )
    if spec.columns.tolist()[:2] != [".name", ".value"]:
        raise ValueError(
            "The first two columns of the spec DataFrame "
            "should be '.name' and '.value', "
            "with '.name' coming before '.value'."
        )
    if spec.columns.size == 2:
        raise ValueError(
            "Kindly provide the column(s) "
            "to use to make new frame’s columns"
        )
    # Plain lists are the documented list-like form for `DataFrame.pivot`;
    # an Index / ndarray is not reliably accepted for several pivot columns.
    columns = spec.columns[2:].tolist()
    values = spec[".value"].unique().tolist()
    pivot_kwargs = {"columns": columns, "values": values}
    if index is not None:
        index = _select_index([index], df, axis="columns")
        index = df.columns[index].tolist()
        # Only forward `index` when it was supplied, so that pandas falls
        # back to the existing index instead of treating None as a label.
        pivot_kwargs["index"] = index
    df = df.pivot(**pivot_kwargs)
    # Map each ('.value', *pivot columns) combination to its '.name'.
    # Select the column explicitly rather than `.squeeze()`, which would
    # collapse a single-row spec into a scalar.
    names = spec.set_index(spec.columns[1:].tolist())[".name"]
    # Keep only the combinations listed in the spec, in the spec's order,
    # then swap the MultiIndex columns for the requested flat names.
    df = df.reindex(columns=names.index)
    df.columns = df.columns.map(names)
    if reset_index and index:
        return df.reset_index()
    return df
136 changes: 136 additions & 0 deletions tests/functions/test_pivot_wider_spec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import re

import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from janitor import pivot_wider_spec


@pytest.fixture
def df_checks():
    """Long-form frame: three families, three births each, two ages per birth."""
    # Built column-wise; row order matches famid -> birth -> age nesting.
    heights = [
        2.8, 3.4, 2.9, 3.8, 2.2, 2.9,
        2.0, 3.2, 1.8, 2.8, 1.9, 2.4,
        2.2, 3.3, 2.3, 3.4, 2.1, 2.9,
    ]
    return pd.DataFrame(
        {
            "famid": [1] * 6 + [2] * 6 + [3] * 6,
            "birth": [1, 1, 2, 2, 3, 3] * 3,
            "age": [1, 2] * 9,
            "ht": heights,
        }
    )


# Shared spec: `ht` at age 1 becomes `ht1`, `ht` at age 2 becomes `ht2`.
spec = pd.DataFrame(
    {
        ".name": ["ht1", "ht2"],
        ".value": ["ht", "ht"],
        "age": [1, 2],
    }
)


def test_spec_is_a_dataframe(df_checks):
    """A `spec` that is not a DataFrame must be rejected with a TypeError."""
    not_a_frame = {".name": "name"}
    with pytest.raises(TypeError, match="spec should be one of.+"):
        pivot_wider_spec(df_checks, spec=not_a_frame)


def test_spec_columns_has_dot_name(df_checks):
    """A spec without a '.name' column must trigger a KeyError."""
    expected_msg = "Kindly ensure the spec DataFrame has a `.name` column."
    bad_spec = spec.set_axis(labels=[".value", ".blabla", "age"], axis=1)
    with pytest.raises(KeyError, match=expected_msg):
        pivot_wider_spec(df_checks, spec=bad_spec)


def test_spec_columns_has_dot_value(df_checks):
    """A spec without a '.value' column must trigger a KeyError."""
    expected_msg = "Kindly ensure the spec DataFrame has a `.value` column."
    bad_spec = spec.set_axis(labels=[".name", ".blabla", "age"], axis=1)
    with pytest.raises(KeyError, match=expected_msg):
        pivot_wider_spec(df_checks, spec=bad_spec)


def test_spec_columns_name_value_order(df_checks):
    """
    A spec whose two leading columns are not
    '.name' followed by '.value'
    must trigger a ValueError.
    """
    pattern = (
        "The first two columns of the spec DataFrame "
        "should be '.name' and '.value',.+"
    )
    swapped = spec.loc[:, [".value", ".name", "age"]]
    with pytest.raises(ValueError, match=pattern):
        pivot_wider_spec(df_checks, spec=swapped)


def test_spec_columns_len_2(df_checks):
    """
    A spec holding nothing beyond '.name' and '.value'
    must trigger a ValueError.
    """
    literal_msg = (
        "Kindly provide the column(s) "
        "to use to make new frame’s columns"
    )
    # The message contains regex metacharacters, hence the escape.
    pattern = re.escape(literal_msg)
    only_required = spec.loc[:, [".name", ".value"]]
    with pytest.raises(ValueError, match=pattern):
        pivot_wider_spec(df_checks, spec=only_required)


def test_spec_columns_not_unique(df_checks):
    """Duplicate column labels in the spec must trigger a ValueError."""
    expected_msg = "Kindly ensure the spec's columns is unique."
    duplicated = spec.set_axis(labels=[".name", ".name", "age"], axis=1)
    with pytest.raises(ValueError, match=expected_msg):
        pivot_wider_spec(df_checks, spec=duplicated)


def test_pivot_wider_spec(df_checks):
    """
    The spec-driven pivot should equal a plain `DataFrame.pivot`
    whose columns are prefixed with 'ht'.
    """
    id_cols = ["famid", "birth"]
    wide = df_checks.pivot(index=id_cols, columns="age", values="ht")
    expected = wide.add_prefix("ht").rename_axis(columns=None).reset_index()
    actual = pivot_wider_spec(df_checks, spec=spec, index=list(id_cols))
    # Sort both frames the same way so row order cannot affect the comparison.
    sort_keys = expected.columns.tolist()
    assert_frame_equal(
        actual.sort_values(sort_keys, ignore_index=True),
        expected.sort_values(sort_keys, ignore_index=True),
    )
Loading