Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: csv/tsv encoding #3369

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,7 @@ def partition(
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
encoding=encoding,
**kwargs,
)
elif filetype == FileType.TSV:
Expand All @@ -541,6 +542,7 @@ def partition(
file=file,
languages=languages,
detect_language_per_element=detect_language_per_element,
encoding=encoding,
**kwargs,
)
elif filetype == FileType.EMPTY:
Expand Down
20 changes: 13 additions & 7 deletions unstructured/partition/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def partition_csv(
# NOTE (jennings) partition_csv generates a single TableElement
# so detect_language_per_element is not included as a param
date_from_file_object: bool = False,
encoding: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions Microsoft Excel documents in .csv format into their document elements.
Expand Down Expand Up @@ -71,23 +72,26 @@ def partition_csv(
date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt
to infer last_modified metadata from bytes; otherwise set it to None.
encoding
The encoding to use when reading the CSV file. Falls back to "utf-8" when not provided.
"""
exactly_one(filename=filename, file=file)

header = 0 if include_header else None
encoding = encoding or "utf-8"

if filename:
delimiter = get_delimiter(file_path=filename)
table = pd.read_csv(filename, header=header, sep=delimiter)
delimiter = get_delimiter(file_path=filename, encoding=encoding)
table = pd.read_csv(filename, header=header, sep=delimiter, encoding=encoding)
last_modification_date = get_last_modified_date(filename)

elif file:
last_modification_date = (
get_last_modified_date_from_file(file) if date_from_file_object else None
)
f = spooled_to_bytes_io_if_needed(file)
delimiter = get_delimiter(file=f)
table = pd.read_csv(f, header=header, sep=delimiter)
delimiter = get_delimiter(file=f, encoding=encoding)
table = pd.read_csv(f, header=header, sep=delimiter, encoding=encoding)

html_text = table.to_html(index=False, header=include_header, na_rep="")
text = cast(str, soupparser_fromstring(html_text).text_content())
Expand All @@ -111,7 +115,9 @@ def partition_csv(
return list(elements)


def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None):
def get_delimiter(
file_path: str | None = None, file: IO[bytes] | None = None, encoding: str = "utf-8"
) -> str:
"""Use the standard csv sniffer to determine the delimiter.

Reads just a small portion in case the file is large.
Expand All @@ -123,9 +129,9 @@ def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None):
if file:
lines = file.readlines(num_bytes)
file.seek(0)
data = "\n".join(ln.decode("utf-8") for ln in lines)
data = "\n".join(ln.decode(encoding) for ln in lines)
elif file_path is not None:
with open(file_path) as f:
with open(file_path, encoding=encoding) as f:
data = "\n".join(f.readlines(num_bytes))
else:
raise ValueError("either `file_path` or `file` argument must be provided")
Expand Down
8 changes: 6 additions & 2 deletions unstructured/partition/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def partition_tsv(
# NOTE (jennings) partition_tsv generates a single TableElement
# so detect_language_per_element is not included as a param
date_from_file_object: bool = False,
encoding: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions TSV files into document elements.
Expand All @@ -61,20 +62,23 @@ def partition_tsv(
date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt
to infer last_modified metadata from bytes; otherwise set it to None.
encoding
The encoding to use when reading the TSV file. Falls back to "utf-8" when not provided.
"""
exactly_one(filename=filename, file=file)

last_modification_date = None
header = 0 if include_header else None
encoding = encoding or "utf-8"

if filename:
table = pd.read_csv(filename, sep="\t", header=header)
table = pd.read_csv(filename, sep="\t", header=header, encoding=encoding)
last_modification_date = get_last_modified_date(filename)
elif file:
# -- Note(scanny): `SpooledTemporaryFile` on Python<3.11 does not implement `.readable()`
# -- which triggers an exception on `pd.DataFrame.read_csv()` call.
f = spooled_to_bytes_io_if_needed(file)
table = pd.read_csv(f, sep="\t", header=header)
table = pd.read_csv(f, sep="\t", header=header, encoding=encoding)
last_modification_date = (
get_last_modified_date_from_file(file) if date_from_file_object else None
)
Expand Down