BUG: read_csv not respecting converter in all cases for index col (pandas-dev#46053)
me-kbs · Feb 26, 2022 · 7ee8ab0 · 7ee8ab0
1 parent c44871c
commit 7ee8ab0
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 8 deletions.
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -360,6 +360,7 @@ I/O
 - Bug in :meth:`DataFrame.info` where a new line at the end of the output is omitted when called on an empty :class:`DataFrame` (:issue:`45494`)
 - Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`)
 - Bug in :meth:`DataFrame.to_csv` not respecting ``float_format`` for ``Float64`` dtype (:issue:`45991`)
+- Bug in :func:`read_csv` not respecting a specified converter to index columns in all cases (:issue:`40589`)
 - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)
 - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`)
 

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -103,6 +103,7 @@ def __init__(self, kwds):
         self.keep_default_na = kwds.get("keep_default_na", True)
 
         self.dtype = copy(kwds.get("dtype", None))
+        self.converters = kwds.get("converters")
 
         self.true_values = kwds.get("true_values")
         self.false_values = kwds.get("false_values")
@@ -476,6 +477,7 @@ def _clean_mapping(self, mapping):
     @final
     def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
         arrays = []
+        converters = self._clean_mapping(self.converters)
 
         for i, arr in enumerate(index):
 
@@ -500,10 +502,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
             clean_dtypes = self._clean_mapping(self.dtype)
 
             cast_type = None
-            if isinstance(clean_dtypes, dict) and self.index_names is not None:
-                cast_type = clean_dtypes.get(self.index_names[i], None)
+            index_converter = False
+            if self.index_names is not None:
+                if isinstance(clean_dtypes, dict):
+                    cast_type = clean_dtypes.get(self.index_names[i], None)
+
+                if isinstance(converters, dict):
+                    index_converter = converters.get(self.index_names[i]) is not None
 
-            try_num_bool = not (cast_type and is_string_dtype(cast_type))
+            try_num_bool = not (
+                cast_type and is_string_dtype(cast_type) or index_converter
+            )
 
             arr, _ = self._infer_types(
                 arr, col_na_values | col_na_fvalues, try_num_bool

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -95,7 +95,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds):
             self.has_index_names = kwds["has_index_names"]
 
         self.verbose = kwds["verbose"]
-        self.converters = kwds["converters"]
 
         self.thousands = kwds["thousands"]
         self.decimal = kwds["decimal"]

diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py
@@ -152,16 +152,28 @@ def convert_score(x):
     tm.assert_frame_equal(results[0], results[1])
 
 
-def test_converter_index_col_bug(all_parsers):
-    # see gh-1835
+@pytest.mark.parametrize("conv_f", [lambda x: x, str])
+def test_converter_index_col_bug(all_parsers, conv_f):
+    # see gh-1835 , GH#40589
     parser = all_parsers
     data = "A;B\n1;2\n3;4"
 
     rs = parser.read_csv(
-        StringIO(data), sep=";", index_col="A", converters={"A": lambda x: x}
+        StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
     )
 
-    xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A"))
+    xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
+    tm.assert_frame_equal(rs, xp)
+
+
+def test_converter_identity_object(all_parsers):
+    # GH#40589
+    parser = all_parsers
+    data = "A,B\n1,2\n3,4"
+
+    rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x})
+
+    xp = DataFrame({"A": ["1", "3"], "B": [2, 4]})
     tm.assert_frame_equal(rs, xp)