From a916062f103dbb669893d5577ab8e131a35962c0 Mon Sep 17 00:00:00 2001 From: Javier Martinez Date: Tue, 9 Jul 2024 12:16:49 +0200 Subject: [PATCH 1/3] fix: csv encoding --- unstructured/partition/auto.py | 1 + unstructured/partition/csv.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index f392819fa1..fe24aaee01 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -532,6 +532,7 @@ def partition( infer_table_structure=infer_table_structure, languages=languages, detect_language_per_element=detect_language_per_element, + encoding=encoding, **kwargs, ) elif filetype == FileType.TSV: diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 78ed29ea5b..611a043bfc 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -40,6 +40,7 @@ def partition_csv( # NOTE (jennings) partition_csv generates a single TableElement # so detect_language_per_element is not included as a param date_from_file_object: bool = False, + encoding: Optional[str] = "utf-8", **kwargs: Any, ) -> list[Element]: """Partitions Microsoft Excel Documents in .csv format into its document elements. @@ -71,14 +72,16 @@ def partition_csv( date_from_file_object Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. + encoding + The encoding to use when reading the CSV file. 
""" exactly_one(filename=filename, file=file) header = 0 if include_header else None if filename: - delimiter = get_delimiter(file_path=filename) - table = pd.read_csv(filename, header=header, sep=delimiter) + delimiter = get_delimiter(file_path=filename, encoding=encoding) + table = pd.read_csv(filename, header=header, sep=delimiter, encoding=encoding) last_modification_date = get_last_modified_date(filename) elif file: @@ -86,8 +89,8 @@ def partition_csv( get_last_modified_date_from_file(file) if date_from_file_object else None ) f = spooled_to_bytes_io_if_needed(file) - delimiter = get_delimiter(file=f) - table = pd.read_csv(f, header=header, sep=delimiter) + delimiter = get_delimiter(file=f, encoding=encoding) + table = pd.read_csv(f, header=header, sep=delimiter, encoding=encoding) html_text = table.to_html(index=False, header=include_header, na_rep="") text = cast(str, soupparser_fromstring(html_text).text_content()) @@ -111,7 +114,9 @@ def partition_csv( return list(elements) -def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None): +def get_delimiter( + file_path: str | None = None, file: IO[bytes] | None = None, encoding: str = "utf-8" +) -> str: """Use the standard csv sniffer to determine the delimiter. Reads just a small portion in case the file is large. 
@@ -123,9 +128,9 @@ def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None): if file: lines = file.readlines(num_bytes) file.seek(0) - data = "\n".join(ln.decode("utf-8") for ln in lines) + data = "\n".join(ln.decode(encoding) for ln in lines) elif file_path is not None: - with open(file_path) as f: + with open(file_path, encoding=encoding) as f: data = "\n".join(f.readlines(num_bytes)) else: raise ValueError("either `file_path` or `file` argument must be provided") From d05d11a0d7a30512ea69d1b66f8a1c6c0dd9db50 Mon Sep 17 00:00:00 2001 From: Javier Martinez Date: Tue, 9 Jul 2024 12:19:13 +0200 Subject: [PATCH 2/3] fix: tsv encoding --- unstructured/partition/auto.py | 1 + unstructured/partition/tsv.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index fe24aaee01..7a31c88fd0 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -542,6 +542,7 @@ def partition( file=file, languages=languages, detect_language_per_element=detect_language_per_element, + encoding=encoding, **kwargs, ) elif filetype == FileType.EMPTY: diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py index 62add9cec5..bc67c6c161 100644 --- a/unstructured/partition/tsv.py +++ b/unstructured/partition/tsv.py @@ -38,6 +38,7 @@ def partition_tsv( # NOTE (jennings) partition_tsv generates a single TableElement # so detect_language_per_element is not included as a param date_from_file_object: bool = False, + encoding: Optional[str] = "utf-8", **kwargs: Any, ) -> list[Element]: """Partitions TSV files into document elements. @@ -61,6 +62,8 @@ def partition_tsv( date_from_file_object Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. + encoding + The encoding to use when reading the TSV file. 
""" exactly_one(filename=filename, file=file) @@ -68,13 +71,13 @@ def partition_tsv( header = 0 if include_header else None if filename: - table = pd.read_csv(filename, sep="\t", header=header) + table = pd.read_csv(filename, sep="\t", header=header, encoding=encoding) last_modification_date = get_last_modified_date(filename) elif file: # -- Note(scanny): `SpooledTemporaryFile` on Python<3.11 does not implement `.readable()` # -- which triggers an exception on `pd.DataFrame.read_csv()` call. f = spooled_to_bytes_io_if_needed(file) - table = pd.read_csv(f, sep="\t", header=header) + table = pd.read_csv(f, sep="\t", header=header, encoding=encoding) last_modification_date = ( get_last_modified_date_from_file(file) if date_from_file_object else None ) From e5d2889091cf3d52f88465d7ca39c34f6ac3c7b1 Mon Sep 17 00:00:00 2001 From: Javier Martinez Date: Tue, 9 Jul 2024 13:43:16 +0200 Subject: [PATCH 3/3] fix: ensure that some encoding is used --- unstructured/partition/csv.py | 3 ++- unstructured/partition/tsv.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 611a043bfc..49b0e65fbd 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -40,7 +40,7 @@ def partition_csv( # NOTE (jennings) partition_csv generates a single TableElement # so detect_language_per_element is not included as a param date_from_file_object: bool = False, - encoding: Optional[str] = "utf-8", + encoding: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partitions Microsoft Excel Documents in .csv format into its document elements. 
@@ -78,6 +78,7 @@ def partition_csv( exactly_one(filename=filename, file=file) header = 0 if include_header else None + encoding = encoding or "utf-8" if filename: delimiter = get_delimiter(file_path=filename, encoding=encoding) diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py index bc67c6c161..12d87921e5 100644 --- a/unstructured/partition/tsv.py +++ b/unstructured/partition/tsv.py @@ -38,7 +38,7 @@ def partition_tsv( # NOTE (jennings) partition_tsv generates a single TableElement # so detect_language_per_element is not included as a param date_from_file_object: bool = False, - encoding: Optional[str] = "utf-8", + encoding: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partitions TSV files into document elements. @@ -69,6 +69,7 @@ def partition_tsv( last_modification_date = None header = 0 if include_header else None + encoding = encoding or "utf-8" if filename: table = pd.read_csv(filename, sep="\t", header=header, encoding=encoding)