Skip to content

Commit

Permalink
BUG: read_csv not respecting converter in all cases for index col (pa…
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Feb 26, 2022
1 parent c44871c commit 7ee8ab0
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 8 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ I/O
- Bug in :meth:`DataFrame.info` where a new line at the end of the output is omitted when called on an empty :class:`DataFrame` (:issue:`45494`)
- Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`)
- Bug in :meth:`DataFrame.to_csv` not respecting ``float_format`` for ``Float64`` dtype (:issue:`45991`)
- Bug in :func:`read_csv` not respecting a specified converter for index columns in all cases (:issue:`40589`)
- Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)
- Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`)

Expand Down
15 changes: 12 additions & 3 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def __init__(self, kwds):
self.keep_default_na = kwds.get("keep_default_na", True)

self.dtype = copy(kwds.get("dtype", None))
self.converters = kwds.get("converters")

self.true_values = kwds.get("true_values")
self.false_values = kwds.get("false_values")
Expand Down Expand Up @@ -476,6 +477,7 @@ def _clean_mapping(self, mapping):
@final
def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
arrays = []
converters = self._clean_mapping(self.converters)

for i, arr in enumerate(index):

Expand All @@ -500,10 +502,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
clean_dtypes = self._clean_mapping(self.dtype)

cast_type = None
if isinstance(clean_dtypes, dict) and self.index_names is not None:
cast_type = clean_dtypes.get(self.index_names[i], None)
index_converter = False
if self.index_names is not None:
if isinstance(clean_dtypes, dict):
cast_type = clean_dtypes.get(self.index_names[i], None)

if isinstance(converters, dict):
index_converter = converters.get(self.index_names[i]) is not None

try_num_bool = not (cast_type and is_string_dtype(cast_type))
try_num_bool = not (
cast_type and is_string_dtype(cast_type) or index_converter
)

arr, _ = self._infer_types(
arr, col_na_values | col_na_fvalues, try_num_bool
Expand Down
1 change: 0 additions & 1 deletion pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds):
self.has_index_names = kwds["has_index_names"]

self.verbose = kwds["verbose"]
self.converters = kwds["converters"]

self.thousands = kwds["thousands"]
self.decimal = kwds["decimal"]
Expand Down
20 changes: 16 additions & 4 deletions pandas/tests/io/parser/test_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,16 +152,28 @@ def convert_score(x):
tm.assert_frame_equal(results[0], results[1])


def test_converter_index_col_bug(all_parsers):
# see gh-1835
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
# see gh-1835, GH#40589
parser = all_parsers
data = "A;B\n1;2\n3;4"

rs = parser.read_csv(
StringIO(data), sep=";", index_col="A", converters={"A": lambda x: x}
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
)

xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A"))
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
tm.assert_frame_equal(rs, xp)


def test_converter_identity_object(all_parsers):
    """Check that an identity converter leaves a column's values as strings.

    Column "A" is read through ``lambda x: x`` and is expected to come back
    as the strings "1" and "3"; column "B" has no converter and is expected
    as the integers 2 and 4.
    """
    # GH#40589
    csv_text = "A,B\n1,2\n3,4"
    expected = DataFrame({"A": ["1", "3"], "B": [2, 4]})

    result = all_parsers.read_csv(
        StringIO(csv_text), converters={"A": lambda x: x}
    )

    tm.assert_frame_equal(result, expected)


Expand Down

0 comments on commit 7ee8ab0

Please sign in to comment.