From 28f1bd0d9a940bdc182e2c5c1efb856d197fa558 Mon Sep 17 00:00:00 2001
From: Alexander Beedie
Date: Mon, 23 Dec 2024 22:33:49 +0000
Subject: [PATCH] feat(python): Add "drop_empty_cols" parameter for `read_excel` and `read_ods`

---
 py-polars/polars/_utils/various.py           |  2 +-
 py-polars/polars/io/spreadsheet/functions.py | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py
index f40249ef9bf..126929d6d62 100644
--- a/py-polars/polars/_utils/various.py
+++ b/py-polars/polars/_utils/various.py
@@ -253,7 +253,7 @@ def deduplicate_names(names: Iterable[str]) -> list[str]:
     seen: MutableMapping[str, int] = Counter()
     deduped = []
     for nm in names:
-        deduped.append(f"{nm}{seen[nm]}" if nm in seen else nm)
+        deduped.append(f"{nm}{seen[nm] - 1}" if nm in seen else nm)
         seen[nm] += 1
     return deduped
 
diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py
index 766b15c5d21..1d32589302b 100644
--- a/py-polars/polars/io/spreadsheet/functions.py
+++ b/py-polars/polars/io/spreadsheet/functions.py
@@ -66,7 +66,7 @@ def _sources(
     return sources, read_multiple_workbooks
 
 
-def _standardize_duplicate_names(s: str) -> str:
+def _standardize_duplicates(s: str) -> str:
     """Standardize columns with '_duplicated_n' names."""
     return re.sub(r"_duplicated_(\d+)", repl=r"\1", string=s)
 
@@ -277,7 +277,8 @@ def read_excel(
         Indicate whether to omit empty rows when reading data into the DataFrame.
     drop_empty_cols
         Indicate whether to omit empty columns (with no headers) when reading data into
-        the DataFrame.
+        the DataFrame (note that empty column identification may vary depending on the
+        underlying engine being used).
     raise_if_empty
         When there is no data in the sheet,`NoDataError` is raised. If this parameter
         is set to False, an empty DataFrame (with no columns) is returned instead.
@@ -505,7 +506,8 @@ def read_ods(
         Indicate whether to omit empty rows when reading data into the DataFrame.
     drop_empty_cols
         Indicate whether to omit empty columns (with no headers) when reading data into
-        the DataFrame.
+        the DataFrame (note that empty column identification may vary depending on the
+        underlying engine being used).
     raise_if_empty
         When there is no data in the sheet,`NoDataError` is raised. If this parameter
         is set to False, an empty DataFrame (with no columns) is returned instead.
@@ -1129,5 +1131,5 @@ def _read_spreadsheet_xlsx2csv(
     if cast_to_boolean:
         df = df.with_columns(*cast_to_boolean)
 
-    df = df.rename(_standardize_duplicate_names)
+    df = df.rename(_standardize_duplicates)
     return _reorder_columns(df, columns)