diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index b769383281880..76e3bd9825956 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -12,8 +12,10 @@ DefaultDict, Hashable, Iterable, + List, Mapping, Sequence, + Tuple, cast, final, overload, @@ -441,10 +443,15 @@ def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]: return names @final - def _maybe_make_multi_index_columns(self, columns, col_names=None): + def _maybe_make_multi_index_columns( + self, + columns: Sequence[Hashable], + col_names: Sequence[Hashable] | None = None, + ) -> Sequence[Hashable] | MultiIndex: # possibly create a column mi here if _is_potential_multi_index(columns): - columns = MultiIndex.from_tuples(columns, names=col_names) + list_columns = cast(List[Tuple], columns) + return MultiIndex.from_tuples(list_columns, names=col_names) return columns @final @@ -923,7 +930,25 @@ def _check_data_length( stacklevel=find_stack_level(), ) - def _evaluate_usecols(self, usecols, names): + @overload + def _evaluate_usecols( + self, + usecols: set[int] | Callable[[Hashable], object], + names: Sequence[Hashable], + ) -> set[int]: + ... + + @overload + def _evaluate_usecols( + self, usecols: set[str], names: Sequence[Hashable] + ) -> set[str]: + ... + + def _evaluate_usecols( + self, + usecols: Callable[[Hashable], object] | set[str] | set[int], + names: Sequence[Hashable], + ) -> set[str] | set[int]: """ Check whether or not the 'usecols' parameter is a callable. If so, enumerates the 'names' @@ -1289,7 +1314,8 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na): def _is_potential_multi_index( - columns, index_col: bool | Sequence[int] | None = None + columns: Sequence[Hashable] | MultiIndex, + index_col: bool | Sequence[int] | None = None, ) -> bool: """ Check whether or not the `columns` parameter diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 05c963f2d2552..988dcd3d8a124 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -1,5 +1,10 @@ from __future__ import annotations +from typing import ( + Hashable, + Mapping, + Sequence, +) import warnings import numpy as np @@ -7,6 +12,8 @@ import pandas._libs.parsers as parsers from pandas._typing import ( ArrayLike, + DtypeArg, + DtypeObj, FilePath, ReadCsvBuffer, ) @@ -20,6 +27,10 @@ from pandas.core.dtypes.concat import union_categoricals from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas import ( + Index, + MultiIndex, +) from pandas.core.indexes.api import ensure_index_from_sequences from pandas.io.parsers.base_parser import ( @@ -193,7 +204,7 @@ def close(self) -> None: except ValueError: pass - def _set_noconvert_columns(self): + def _set_noconvert_columns(self) -> None: """ Set the columns that should not undergo dtype conversions. @@ -214,7 +225,14 @@ def _set_noconvert_columns(self): for col in noconvert_columns: self._reader.set_noconvert(col) - def read(self, nrows=None): + def read( + self, + nrows: int | None = None, + ) -> tuple[ + Index | MultiIndex | None, + Sequence[Hashable] | MultiIndex, + Mapping[Hashable, ArrayLike], + ]: try: if self.low_memory: chunks = self._reader.read_low_memory(nrows) @@ -306,11 +324,11 @@ def read(self, nrows=None): index, names = self._make_index(date_data, alldata, names) # maybe create a mi on the columns - names = self._maybe_make_multi_index_columns(names, self.col_names) + conv_names = self._maybe_make_multi_index_columns(names, self.col_names) - return index, names, date_data + return index, conv_names, date_data - def _filter_usecols(self, names): + def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]: # hackish usecols = self._evaluate_usecols(self.usecols, names) if usecols is not None and len(names) != len(usecols): @@ -395,13 +413,15 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: return result -def ensure_dtype_objs(dtype): +def ensure_dtype_objs( + dtype: DtypeArg | dict[Hashable, DtypeArg] | None +) -> DtypeObj | dict[Hashable, DtypeObj] | None: """ Ensure we have either None, a dtype object, or a dictionary mapping to dtype objects. """ if isinstance(dtype, dict): - dtype = {k: pandas_dtype(dtype[k]) for k in dtype} + return {k: pandas_dtype(dtype[k]) for k in dtype} elif dtype is not None: - dtype = pandas_dtype(dtype) + return pandas_dtype(dtype) return dtype diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index b493c4f12fb31..12d5e4599cee0 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -270,8 +270,8 @@ def read(self, rows: int | None = None): self.index_names, self.dtype, ) - columns = self._maybe_make_multi_index_columns(columns, self.col_names) - return index, columns, col_dict + conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names) + return index, conv_columns, col_dict # handle new style for names in index count_empty_content_vals = count_empty_vals(content[0]) @@ -560,6 +560,7 @@ def _handle_usecols( usecols_key is used if there are string usecols. """ + col_indices: set[int] | list[int] if self.usecols is not None: if callable(self.usecols): col_indices = self._evaluate_usecols(self.usecols, usecols_key)