diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index f392819fa1..7a31c88fd0 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -532,6 +532,7 @@ def partition( infer_table_structure=infer_table_structure, languages=languages, detect_language_per_element=detect_language_per_element, + encoding=encoding, **kwargs, ) elif filetype == FileType.TSV: @@ -541,6 +542,7 @@ def partition( file=file, languages=languages, detect_language_per_element=detect_language_per_element, + encoding=encoding, **kwargs, ) elif filetype == FileType.EMPTY: diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 78ed29ea5b..49b0e65fbd 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -40,6 +40,7 @@ def partition_csv( # NOTE (jennings) partition_csv generates a single TableElement # so detect_language_per_element is not included as a param date_from_file_object: bool = False, + encoding: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partitions Microsoft Excel Documents in .csv format into its document elements. @@ -71,14 +72,17 @@ def partition_csv( date_from_file_object Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. + encoding + The encoding to use when reading the CSV file. 
""" exactly_one(filename=filename, file=file) header = 0 if include_header else None + encoding = encoding or "utf-8" if filename: - delimiter = get_delimiter(file_path=filename) - table = pd.read_csv(filename, header=header, sep=delimiter) + delimiter = get_delimiter(file_path=filename, encoding=encoding) + table = pd.read_csv(filename, header=header, sep=delimiter, encoding=encoding) last_modification_date = get_last_modified_date(filename) elif file: @@ -86,8 +90,8 @@ def partition_csv( get_last_modified_date_from_file(file) if date_from_file_object else None ) f = spooled_to_bytes_io_if_needed(file) - delimiter = get_delimiter(file=f) - table = pd.read_csv(f, header=header, sep=delimiter) + delimiter = get_delimiter(file=f, encoding=encoding) + table = pd.read_csv(f, header=header, sep=delimiter, encoding=encoding) html_text = table.to_html(index=False, header=include_header, na_rep="") text = cast(str, soupparser_fromstring(html_text).text_content()) @@ -111,7 +115,9 @@ def partition_csv( return list(elements) -def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None): +def get_delimiter( + file_path: str | None = None, file: IO[bytes] | None = None, encoding: str = "utf-8" +) -> str: """Use the standard csv sniffer to determine the delimiter. Reads just a small portion in case the file is large. 
@@ -123,9 +129,9 @@ def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None): if file: lines = file.readlines(num_bytes) file.seek(0) - data = "\n".join(ln.decode("utf-8") for ln in lines) + data = "\n".join(ln.decode(encoding) for ln in lines) elif file_path is not None: - with open(file_path) as f: + with open(file_path, encoding=encoding) as f: data = "\n".join(f.readlines(num_bytes)) else: raise ValueError("either `file_path` or `file` argument must be provided") diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py index 62add9cec5..12d87921e5 100644 --- a/unstructured/partition/tsv.py +++ b/unstructured/partition/tsv.py @@ -38,6 +38,7 @@ def partition_tsv( # NOTE (jennings) partition_tsv generates a single TableElement # so detect_language_per_element is not included as a param date_from_file_object: bool = False, + encoding: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partitions TSV files into document elements. @@ -61,20 +62,23 @@ def partition_tsv( date_from_file_object Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. + encoding + The encoding to use when reading the TSV file. """ exactly_one(filename=filename, file=file) last_modification_date = None header = 0 if include_header else None + encoding = encoding or "utf-8" if filename: - table = pd.read_csv(filename, sep="\t", header=header) + table = pd.read_csv(filename, sep="\t", header=header, encoding=encoding) last_modification_date = get_last_modified_date(filename) elif file: # -- Note(scanny): `SpooledTemporaryFile` on Python<3.11 does not implement `.readable()` # -- which triggers an exception on `pd.DataFrame.read_csv()` call. 
f = spooled_to_bytes_io_if_needed(file) - table = pd.read_csv(f, sep="\t", header=header) + table = pd.read_csv(f, sep="\t", header=header, encoding=encoding) last_modification_date = ( get_last_modified_date_from_file(file) if date_from_file_object else None )