Skip to content

Commit

Permalink
BUG: to_csv casting datetimes in categorical to int (pandas-dev#44930)
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Dec 22, 2021
1 parent d64df84 commit 079289c
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 9 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,7 @@ Categorical
^^^^^^^^^^^
- Bug in setting dtype-incompatible values into a :class:`Categorical` (or ``Series`` or ``DataFrame`` backed by ``Categorical``) raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.searchsorted` when passing a dtype-incompatible value raising ``KeyError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.astype` casting datetimes and :class:`Timestamp` to int for dtype ``object`` (:issue:`44930`)
- Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`)
Expand Down Expand Up @@ -760,6 +761,7 @@ I/O
- Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`)
- Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
- Bug in :func:`read_csv` not raising a ``ValueError`` when ``\n`` was specified as ``delimiter`` or ``sep`` which conflicts with ``lineterminator`` (:issue:`43528`)
- Bug in :func:`to_csv` converting datetimes in categorical :class:`Series` to integers (:issue:`40754`)
- Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`)
- Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`)
- Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from a nullable integer type (:issue:`44079`)
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@
)
import pandas.core.common as com
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
extract_array,
sanitize_array,
)
Expand Down Expand Up @@ -538,8 +539,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:

else:
# GH8628 (PERF): astype category codes instead of astyping array
try:
if is_datetime64_dtype(self.categories):
new_cats = ensure_wrapped_if_datetimelike(self.categories._values)
else:
new_cats = np.asarray(self.categories)

try:
new_cats = new_cats.astype(dtype=dtype, copy=copy)
fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype))
except (
Expand Down
10 changes: 10 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
soft_convert_objects,
)
from pandas.core.dtypes.common import (
ensure_platform_int,
is_1d_only_ea_dtype,
is_1d_only_ea_obj,
is_dtype_equal,
Expand Down Expand Up @@ -88,6 +89,7 @@
replace_regex,
should_use_regex,
)
from pandas.core.array_algos.take import take_nd
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays import (
Categorical,
Expand Down Expand Up @@ -2087,6 +2089,14 @@ def to_native_types(
**kwargs,
) -> np.ndarray:
"""convert to our native types format"""
if isinstance(values, Categorical):
# GH#40754 Convert categorical datetimes to datetime array
values = take_nd(
values.categories._values,
ensure_platform_int(values._codes),
fill_value=na_rep,
)

values = ensure_wrapped_if_datetimelike(values)

if isinstance(values, (DatetimeArray, TimedeltaArray)):
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/arrays/categorical/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
Categorical,
CategoricalIndex,
Index,
NaT,
Series,
Timestamp,
to_datetime,
)
import pandas._testing as tm

Expand Down Expand Up @@ -176,6 +178,20 @@ def test_astype_category(self, dtype_ordered, cat_ordered):
expected = cat
tm.assert_categorical_equal(result, expected)

def test_astype_object_datetime_categories(self):
# GH#40754
# Regression test: casting a Categorical whose categories are datetimes to
# ``object`` must produce Timestamp objects, not their integer representation.
# The input holds one real date plus NaT so the missing-value slot is
# exercised as well.
cat = Categorical(to_datetime(["2021-03-27", NaT]))
result = cat.astype(object)
# The missing entry is expected to come back as np.nan in the object array.
expected = np.array([Timestamp("2021-03-27 00:00:00"), np.nan], dtype="object")
tm.assert_numpy_array_equal(result, expected)

def test_astype_object_timestamp_categories(self):
# GH#18024
# Regression test: a Categorical built directly from a Timestamp scalar must
# keep that Timestamp when cast to ``object`` (no missing values involved here).
cat = Categorical([Timestamp("2014-01-01")])
result = cat.astype(object)
expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object")
tm.assert_numpy_array_equal(result, expected)

def test_iter_python_types(self):
# GH-19909
cat = Categorical([1, 2])
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/io/formats/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,21 @@ def test_to_csv_date_format(self):
df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec

def test_to_csv_date_format_in_categorical(self):
# GH#40754
# Regression test: datetimes stored in a categorical Series must be written
# by to_csv as formatted dates, not as integers.
# Case 1: tz-naive datetimes (one date plus NaT) converted to category.
ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d"))
ser = ser.astype("category")
# Expected rows: "0" (presumably the header of the unnamed Series -- confirm),
# the date, and an empty quoted string for the missing value.
expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""'])
assert ser.to_csv(index=False) == expected

# Case 2: tz-aware datetimes (Europe/Berlin) with NaT appended; with an
# explicit date_format the output must match the same expected CSV text.
ser = pd.Series(
pd.date_range(
start="2021-03-27", freq="D", periods=1, tz="Europe/Berlin"
).append(pd.DatetimeIndex([pd.NaT]))
)
ser = ser.astype("category")
assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected

def test_to_csv_multi_index(self):
# see gh-6618
df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))
Expand Down
9 changes: 1 addition & 8 deletions pandas/tests/series/indexing/test_where.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_integer

import pandas as pd
Expand Down Expand Up @@ -440,15 +438,10 @@ def test_where_categorical(frame_or_series):
tm.assert_equal(exp, res)


def test_where_datetimelike_categorical(request, tz_naive_fixture, using_array_manager):
def test_where_datetimelike_categorical(request, tz_naive_fixture):
# GH#37682
tz = tz_naive_fixture

if using_array_manager and tz is None:
# TODO(ArrayManager) DataFrame.values not yet correctly returning datetime array
# for categorical with datetime categories
td.mark_array_manager_not_yet_implemented(request)

dr = date_range("2001-01-01", periods=3, tz=tz)._with_freq(None)
lvals = pd.DatetimeIndex([dr[0], dr[1], pd.NaT])
rvals = pd.Categorical([dr[0], pd.NaT, dr[2]])
Expand Down

0 comments on commit 079289c

Please sign in to comment.