From a916062f103dbb669893d5577ab8e131a35962c0 Mon Sep 17 00:00:00 2001
From: Javier Martinez <javiermartinezalvarez98@gmail.com>
Date: Tue, 9 Jul 2024 12:16:49 +0200
Subject: [PATCH 1/3] fix: csv encoding

---
 unstructured/partition/auto.py |  1 +
 unstructured/partition/csv.py  | 19 ++++++++++++-------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index f392819fa1..fe24aaee01 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -532,6 +532,7 @@ def partition(
             infer_table_structure=infer_table_structure,
             languages=languages,
             detect_language_per_element=detect_language_per_element,
+            encoding=encoding,
             **kwargs,
         )
     elif filetype == FileType.TSV:
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
index 78ed29ea5b..611a043bfc 100644
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@@ -40,6 +40,7 @@ def partition_csv(
     # NOTE (jennings) partition_csv generates a single TableElement
     # so detect_language_per_element is not included as a param
     date_from_file_object: bool = False,
+    encoding: Optional[str] = "utf-8",
     **kwargs: Any,
 ) -> list[Element]:
     """Partitions Microsoft Excel Documents in .csv format into its document elements.
@@ -71,14 +72,16 @@ def partition_csv(
     date_from_file_object
         Applies only when providing file via `file` parameter. If this option is True, attempt
         infer last_modified metadata from bytes, otherwise set it to None.
+    encoding
+        The encoding to use when reading the CSV file.
     """
     exactly_one(filename=filename, file=file)
 
     header = 0 if include_header else None
 
     if filename:
-        delimiter = get_delimiter(file_path=filename)
-        table = pd.read_csv(filename, header=header, sep=delimiter)
+        delimiter = get_delimiter(file_path=filename, encoding=encoding)
+        table = pd.read_csv(filename, header=header, sep=delimiter, encoding=encoding)
         last_modification_date = get_last_modified_date(filename)
 
     elif file:
@@ -86,8 +89,8 @@ def partition_csv(
             get_last_modified_date_from_file(file) if date_from_file_object else None
         )
         f = spooled_to_bytes_io_if_needed(file)
-        delimiter = get_delimiter(file=f)
-        table = pd.read_csv(f, header=header, sep=delimiter)
+        delimiter = get_delimiter(file=f, encoding=encoding)
+        table = pd.read_csv(f, header=header, sep=delimiter, encoding=encoding)
 
     html_text = table.to_html(index=False, header=include_header, na_rep="")
     text = cast(str, soupparser_fromstring(html_text).text_content())
@@ -111,7 +114,9 @@ def partition_csv(
     return list(elements)
 
 
-def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None):
+def get_delimiter(
+    file_path: str | None = None, file: IO[bytes] | None = None, encoding: str = "utf-8"
+) -> str:
     """Use the standard csv sniffer to determine the delimiter.
 
     Reads just a small portion in case the file is large.
@@ -123,9 +128,9 @@ def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None):
     if file:
         lines = file.readlines(num_bytes)
         file.seek(0)
-        data = "\n".join(ln.decode("utf-8") for ln in lines)
+        data = "\n".join(ln.decode(encoding) for ln in lines)
     elif file_path is not None:
-        with open(file_path) as f:
+        with open(file_path, encoding=encoding) as f:
             data = "\n".join(f.readlines(num_bytes))
     else:
         raise ValueError("either `file_path` or `file` argument must be provided")

From d05d11a0d7a30512ea69d1b66f8a1c6c0dd9db50 Mon Sep 17 00:00:00 2001
From: Javier Martinez <javiermartinezalvarez98@gmail.com>
Date: Tue, 9 Jul 2024 12:19:13 +0200
Subject: [PATCH 2/3] fix: tsv encoding

---
 unstructured/partition/auto.py | 1 +
 unstructured/partition/tsv.py  | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index fe24aaee01..7a31c88fd0 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -542,6 +542,7 @@ def partition(
             file=file,
             languages=languages,
             detect_language_per_element=detect_language_per_element,
+            encoding=encoding,
             **kwargs,
         )
     elif filetype == FileType.EMPTY:
diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py
index 62add9cec5..bc67c6c161 100644
--- a/unstructured/partition/tsv.py
+++ b/unstructured/partition/tsv.py
@@ -38,6 +38,7 @@ def partition_tsv(
     # NOTE (jennings) partition_tsv generates a single TableElement
     # so detect_language_per_element is not included as a param
     date_from_file_object: bool = False,
+    encoding: Optional[str] = "utf-8",
     **kwargs: Any,
 ) -> list[Element]:
     """Partitions TSV files into document elements.
@@ -61,6 +62,8 @@ def partition_tsv(
     date_from_file_object
         Applies only when providing file via `file` parameter. If this option is True, attempt
         infer last_modified metadata from bytes, otherwise set it to None.
+    encoding
+        The encoding to use when reading the TSV file.
     """
     exactly_one(filename=filename, file=file)
 
@@ -68,13 +71,13 @@ def partition_tsv(
     header = 0 if include_header else None
 
     if filename:
-        table = pd.read_csv(filename, sep="\t", header=header)
+        table = pd.read_csv(filename, sep="\t", header=header, encoding=encoding)
         last_modification_date = get_last_modified_date(filename)
     elif file:
         # -- Note(scanny): `SpooledTemporaryFile` on Python<3.11 does not implement `.readable()`
         # -- which triggers an exception on `pd.DataFrame.read_csv()` call.
         f = spooled_to_bytes_io_if_needed(file)
-        table = pd.read_csv(f, sep="\t", header=header)
+        table = pd.read_csv(f, sep="\t", header=header, encoding=encoding)
         last_modification_date = (
             get_last_modified_date_from_file(file) if date_from_file_object else None
         )

From e5d2889091cf3d52f88465d7ca39c34f6ac3c7b1 Mon Sep 17 00:00:00 2001
From: Javier Martinez <javiermartinezalvarez98@gmail.com>
Date: Tue, 9 Jul 2024 13:43:16 +0200
Subject: [PATCH 3/3] fix: ensure that some encoding is used

---
 unstructured/partition/csv.py | 3 ++-
 unstructured/partition/tsv.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
index 611a043bfc..49b0e65fbd 100644
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@@ -40,7 +40,7 @@ def partition_csv(
     # NOTE (jennings) partition_csv generates a single TableElement
     # so detect_language_per_element is not included as a param
     date_from_file_object: bool = False,
-    encoding: Optional[str] = "utf-8",
+    encoding: Optional[str] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Partitions Microsoft Excel Documents in .csv format into its document elements.
@@ -78,6 +78,7 @@ def partition_csv(
     exactly_one(filename=filename, file=file)
 
     header = 0 if include_header else None
+    encoding = encoding or "utf-8"
 
     if filename:
         delimiter = get_delimiter(file_path=filename, encoding=encoding)
diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py
index bc67c6c161..12d87921e5 100644
--- a/unstructured/partition/tsv.py
+++ b/unstructured/partition/tsv.py
@@ -38,7 +38,7 @@ def partition_tsv(
     # NOTE (jennings) partition_tsv generates a single TableElement
     # so detect_language_per_element is not included as a param
     date_from_file_object: bool = False,
-    encoding: Optional[str] = "utf-8",
+    encoding: Optional[str] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Partitions TSV files into document elements.
@@ -69,6 +69,7 @@ def partition_tsv(
 
     last_modification_date = None
     header = 0 if include_header else None
+    encoding = encoding or "utf-8"
 
     if filename:
         table = pd.read_csv(filename, sep="\t", header=header, encoding=encoding)