Compatibility Issue with Chinese Text in Document Parsing #3530

Draft · wants to merge 35 commits into base: main
120fac3
feat(unstructured/partition/docx.py): Add language detection and …
JIAQIA May 23, 2024
e2bbcc8
Handled issues during processing caused by `languages` values like `[""]`.
JIAQIA May 24, 2024
7da3d35
lint check
JIAQIA May 24, 2024
aacd9ca
1. Modify the "languages" parameter in the initialisation function of…
JIAQIA May 29, 2024
c96ef2d
Merge branch 'main' into feature/zh_adaptation
JIAQIA May 29, 2024
57f0afb
Update CHANGELOG.md
JIAQIA May 31, 2024
c8ce18f
Merge remote-tracking branch 'origin/feature/zh_adaptation' into feat…
JIAQIA May 31, 2024
5b07f6f
Update CHANGELOG.md
JIAQIA May 31, 2024
e04e38d
Merge branch 'main' into feature/zh_adaptation
JIAQIA May 31, 2024
f660e2e
Fix Language auto bug
JIAQIA Jun 21, 2024
2bfe800
"Fix incorrect narrative text detection"
JIAQIA Jun 21, 2024
c25eadd
Merge remote-tracking branch 'origin/feature/zh_adaptation' into feat…
JIAQIA Jun 21, 2024
2e5bdd6
Merge branch 'refs/heads/main' into feature/zh_adaptation
JIAQIA Jun 21, 2024
1e489d0
Merge remote-tracking branch 'origin/feature/zh_adaptation' into feat…
JIAQIA Jun 21, 2024
4990143
"Fix incorrect narrative text detection"
JIAQIA Jun 21, 2024
b81cd0c
"Fix incorrect narrative text detection"
JIAQIA Jun 21, 2024
9ad8f5d
fix: resolve compatibility issue with Chinese text parsing and improv…
JIAQIA Jun 21, 2024
c8017fd
Merge branch 'refs/heads/main' into feature/zh_adaptation
JIAQIA Jul 2, 2024
31993ab
Merge from main branch
JIAQIA Jul 2, 2024
338eded
Merge branch 'main' into feature/zh_adaptation
JIAQIA Jul 2, 2024
9bdb338
Improved the make check script to support lint checks on MacOS. On Ma…
JIAQIA Jul 3, 2024
915f275
Merge branch 'main' into feature/zh_adaptation
JIAQIA Jul 3, 2024
3ebabb4
Improved the make check script to support lint checks on MacOS. On Ma…
JIAQIA Jul 3, 2024
4f49450
Merge remote-tracking branch 'origin/feature/zh_adaptation' into feat…
JIAQIA Jul 3, 2024
d12e6c6
Check for existing Weaviate class to avoid duplicate creation
JIAQIA Aug 5, 2024
a7dfad7
Merge branch 'refs/heads/main' into feature/zh_adaptation
JIAQIA Aug 12, 2024
bca6bb1
bug fix:
JIAQIA Aug 12, 2024
afec02f
doc:
JIAQIA Aug 14, 2024
36fb66c
doc:
JIAQIA Aug 14, 2024
96ca057
Merge remote-tracking branch 'refs/remotes/origin/main' into feature/…
JIAQIA Aug 14, 2024
6dc2aed
doc:
JIAQIA Aug 14, 2024
7c0adf7
Merge branch 'refs/heads/main' into feature/zh_adaptation
JIAQIA Aug 15, 2024
1a8adb3
doc:
JIAQIA Aug 15, 2024
660bd5b
Merge branch 'main' into jj/zh_adaptation
Coniferish Aug 16, 2024
f0a6755
Merge branch 'main' into jj/zh_adaptation
Coniferish Aug 16, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,8 @@
### Fixes

* **Update CI for `ingest-test-fixture-update-pr` to resolve NLTK model download errors.**
* **Fix Compatibility Issue with Chinese Text in Document Parsing**



## 0.15.5
23 changes: 23 additions & 0 deletions example-docs/zho_md_partition.md
@@ -0,0 +1,23 @@
## 春节放假通知

## Spring Festival Holiday Notice

庆祝春节假期。

春节放假从大年 30 开始

Celebrate the Spring Festival holiday. Holiday time: 2021年2月6日至2021年3月8日,共计放假一个月。比法定假期长三周。

## 标题 2

### 标题 3

## Another Title 2

正文开始。

- 一组1
- 一组2
- 一组3

正文结束。
16 changes: 13 additions & 3 deletions scripts/version-sync.sh
@@ -25,6 +25,13 @@ function getopts-extra() {
done
}

# Detect OS and set correct sed command
if [[ "$(uname)" == "Darwin" ]]; then
SED_CMD="gsed"
else
SED_CMD="sed"
fi

# Parse input options
declare CHECK=0
declare SOURCE_FILE="CHANGELOG.md"
@@ -135,14 +142,17 @@ for i in "${!FILES_TO_CHECK[@]}"; do
# Replace semver in VERSIONFILE with semver obtained from SOURCE_FILE
TMPFILE=$(mktemp /tmp/new_version.XXXXXX)
# Check sed version, exit if version < 4.3
if ! sed --version >/dev/null 2>&1; then
echo "Checking sed version..."
if ! $SED_CMD --version >/dev/null 2>&1; then
CURRENT_VERSION=1.archaic
else
CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4)
CURRENT_VERSION=$($SED_CMD --version | awk 'NR==1{print $4}')
# CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4)
fi
echo "Detected sed version: $CURRENT_VERSION"
REQUIRED_VERSION="4.3"
if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then
echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1
echo "sed version must be >= ${REQUIRED_VERSION}, now is ${CURRENT_VERSION}" && exit 1
fi
sed -E -r "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" >"$TMPFILE"
if [ $CHECK == 1 ]; then
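The script gates on sed >= 4.3 by piping the required and detected versions through `sort -V` and failing when the required version is not the smallest. A minimal Python sketch of the same numeric comparison (a hypothetical helper, not part of the repo; unlike `sort -V` it simply ignores non-numeric components, which is why the `1.archaic` sentinel still compares as old):

```python
def version_lt(a: str, b: str) -> bool:
    """Compare dotted version strings numerically, mimicking `sort -V`."""
    def key(v: str) -> list[int]:
        # Non-numeric components (e.g. "archaic") are dropped, so the
        # script's fallback sentinel "1.archaic" compares as version 1.
        return [int(part) for part in v.split(".") if part.isdigit()]
    return key(a) < key(b)

# The script's check: fail when the detected version sorts before 4.3.
assert version_lt("4.2.9", "4.3")      # too old -> script would exit 1
assert not version_lt("4.9", "4.3")    # new enough
assert version_lt("1.archaic", "4.3")  # fallback sentinel counts as old
```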
88 changes: 88 additions & 0 deletions test_unstructured/partition/test_docx.py
@@ -1403,3 +1403,91 @@ def it_includes_table_cell_text_in_Footer_text(self, opts_args: dict[str, Any]):

element = next(footer_iter)
assert element.text == "para1\ncell1 a b c d e f\npara2"


def create_test_docx(file_path):
from docx import Document as DocxDocument

doc = DocxDocument()

# Add headings and body text
doc.add_heading("春节放假通知", level=1)
doc.add_paragraph("\n")
doc.add_paragraph("春节放假从大年 30 开始\n共计放假一个月\n比法定假期长三周\n")

doc.add_heading("标题 2", level=2)
doc.add_heading("标题 3", level=3)
doc.add_heading("又一个标题 2", level=2)

doc.add_paragraph("正文普通\n")

# Add a bulleted list
doc.add_paragraph("一组\n", style="ListBullet")
doc.add_paragraph("二组\n", style="ListBullet")
doc.add_paragraph("三组\n", style="ListBullet")

doc.add_paragraph("继续正文\n")

# Save the document
doc.save(file_path)


def test_partition_zh_docs() -> None:
"""
Verify that Chinese narrative text in a DOCX document is no longer
erroneously recognized as Title during partitioning.
"""
with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
create_test_docx(tmp.name)
elements = partition_docx(tmp.name)

# Print the partitioned elements for inspection
for element in elements:
print(element)

# Assertion checks
assert any("春节放假通知" in element.text for element in elements)
assert any("春节放假从大年 30 开始" in element.text for element in elements)
assert any("标题 2" in element.text for element in elements)
assert any("标题 3" in element.text for element in elements)
assert any("又一个标题 2" in element.text for element in elements)
assert any("正文普通" in element.text for element in elements)
assert any("一组" in element.text for element in elements)
assert any("二组" in element.text for element in elements)
assert any("三组" in element.text for element in elements)
assert any("继续正文" in element.text for element in elements)
assert list(filter(lambda x: "正文普通" in x.text, elements))[0].category == "NarrativeText"
assert list(filter(lambda x: "一组" in x.text, elements))[0].category == "ListItem"
assert list(filter(lambda x: "继续正文" in x.text, elements))[0].category == "NarrativeText"


def test_partition_zh_docs_as_eng() -> None:
"""
Verify the failure mode: when the language is forced to English, the
English-oriented heuristics are misled and Chinese narrative text is
incorrectly recognized as Title.
"""
with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
create_test_docx(tmp.name)
elements = partition_docx(tmp.name, languages=["eng"])

# Print the partitioned elements for inspection
for element in elements:
print(element)

# Assertion checks
assert any("春节放假通知" in element.text for element in elements)
assert any("春节放假从大年 30 开始" in element.text for element in elements)
assert any("标题 2" in element.text for element in elements)
assert any("标题 3" in element.text for element in elements)
assert any("又一个标题 2" in element.text for element in elements)
assert any("正文普通" in element.text for element in elements)
assert any("一组" in element.text for element in elements)
assert any("二组" in element.text for element in elements)
assert any("三组" in element.text for element in elements)
assert any("继续正文" in element.text for element in elements)
assert list(filter(lambda x: "正文普通" in x.text, elements))[0].category == "Title"
assert list(filter(lambda x: "一组" in x.text, elements))[0].category == "ListItem"
assert list(filter(lambda x: "继续正文" in x.text, elements))[0].category == "Title"
24 changes: 24 additions & 0 deletions test_unstructured/partition/test_md.py
@@ -323,3 +323,27 @@ def test_partition_md_parse_table():
elements = partition_md(filename=filename)
assert len(elements) > 0
assert elements[0].category == ElementType.TABLE


def test_partition_zh_md() -> None:
"""
Verify that Chinese narrative text in a Markdown document is no longer
erroneously recognized as Title during partitioning.
"""
filename = example_doc_path("zho_md_partition.md")
elements = partition_md(filename=filename)
assert len(elements) > 0
# Assertion checks
assert any("春节放假通知" in element.text for element in elements)
assert any("春节放假从大年 30 开始" in element.text for element in elements)
assert any("标题 2" in element.text for element in elements)
assert any("标题 3" in element.text for element in elements)
assert any("Another Title 2" in element.text for element in elements)
assert any("正文开始" in element.text for element in elements)
assert any("一组1" in element.text for element in elements)
assert any("一组2" in element.text for element in elements)
assert any("一组3" in element.text for element in elements)
assert any("正文结束" in element.text for element in elements)
assert list(filter(lambda x: "正文开始" in x.text, elements))[0].category == "NarrativeText"
assert list(filter(lambda x: "一组" in x.text, elements))[0].category == "ListItem"
assert list(filter(lambda x: "正文结束" in x.text, elements))[0].category == "NarrativeText"
9 changes: 5 additions & 4 deletions test_unstructured/partition/test_odt.py
@@ -14,7 +14,7 @@
function_mock,
)
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import CompositeElement, Table, TableChunk, Title
from unstructured.documents.elements import CompositeElement, NarrativeText, Table, TableChunk
from unstructured.partition.docx import partition_docx
from unstructured.partition.odt import partition_odt
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
@@ -32,9 +32,9 @@ def test_partition_odt_matches_partition_docx():

def test_partition_odt_from_filename():
elements = partition_odt(example_doc_path("fake.odt"))

# "Lorem ipsum dolor sit amet." does not look like English, so it is not a Title.
assert elements == [
Title("Lorem ipsum dolor sit amet."),
NarrativeText("Lorem ipsum dolor sit amet."),
Table(
"Header row Mon Wed Fri"
" Color Blue Red Green"
@@ -53,7 +53,8 @@ def test_partition_odt_from_file():
elements = partition_odt(file=f)

assert elements == [
Title("Lorem ipsum dolor sit amet."),
# "Lorem ipsum dolor sit amet." does not look like English, so it is not a Title.
NarrativeText("Lorem ipsum dolor sit amet."),
Table(
"Header row Mon Wed Fri"
" Color Blue Red Green"
6 changes: 3 additions & 3 deletions test_unstructured/partition/test_text_type.py
@@ -41,7 +41,7 @@ def test_headings_are_not_narrative_text(text, expected):
("Ask Me About Intellectual Property", False), # Exceeds the cap threshold
("7", False), # Fails because it is numeric
("intellectual property", False), # Fails because it does not contain a verb
("Dal;kdjfal adawels adfjwalsdf. Addad jaja fjawlek", False),
("Dal;kdjfal adawels adfjwalsdf. Addad jaja fjawlek", True),
("---------------Aske the teacher for an apple----------", False), # Too many non-alpha
("", False), # Doesn't have english words # Fails because it is empty
],
@@ -59,7 +59,7 @@ def test_narrative_text_language_checks():
# NOTE(robinson) - This is true because we don't check english vocab if language checks
# are set to False
text = "Dal;kdjfal adawels adfjwalsdf. Addad jaja fjawlek"
assert text_type.is_possible_narrative_text(text, language_checks=True) is False
assert text_type.is_possible_narrative_text(text, language_checks=True) is True


def test_text_type_handles_non_english_examples(monkeypatch):
@@ -71,7 +71,7 @@ def test_text_type_handles_non_english_examples(monkeypatch):
assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True

assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False
assert text_type.is_possible_narrative_text(title, languages=[]) is False
assert text_type.is_possible_narrative_text(title, languages=["rus"]) is False

assert text_type.is_possible_title(title, languages=["eng"]) is False
assert text_type.is_possible_title(title, languages=[]) is True
10 changes: 9 additions & 1 deletion test_unstructured/staging/test_weaviate.py
@@ -60,4 +60,12 @@ def test_weaviate_schema_is_valid():
unstructured_class = create_unstructured_weaviate_class()
schema = {"classes": [unstructured_class]}
client = Client(embedded_options=EmbeddedOptions())
client.schema.create(schema)
# Fetch existing schema
existing_schema = client.schema.get()

# Check if the class already exists
class_names = [cls["class"] for cls in existing_schema["classes"]]
if unstructured_class["class"] not in class_names:
client.schema.create(schema)
else:
print(f'Class "{unstructured_class["class"]}" already exists. Skipping creation.')
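The fix makes schema setup idempotent: query the existing schema first and only create the class when it is absent. A sketch of that check against a stubbed client (the stub stands in for `weaviate.Client`'s v3 `schema.get`/`schema.create` surface; class names here are illustrative):

```python
class FakeSchema:
    """Minimal stand-in for the weaviate-client v3 schema API."""
    def __init__(self):
        self.classes: list[str] = []

    def get(self) -> dict:
        return {"classes": [{"class": name} for name in self.classes]}

    def create(self, schema: dict) -> None:
        self.classes.extend(c["class"] for c in schema["classes"])


class FakeClient:
    def __init__(self):
        self.schema = FakeSchema()


def ensure_class(client, new_class: dict) -> bool:
    """Create the class only if it doesn't already exist; True if created."""
    names = {c["class"] for c in client.schema.get()["classes"]}
    if new_class["class"] not in names:
        client.schema.create({"classes": [new_class]})
        return True
    return False


client = FakeClient()
assert ensure_class(client, {"class": "UnstructuredDocument"}) is True
assert ensure_class(client, {"class": "UnstructuredDocument"}) is False  # skipped
```

The same get-then-create pattern is what lets the test run repeatedly against an embedded Weaviate instance without tripping a duplicate-class error.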
30 changes: 27 additions & 3 deletions unstructured/partition/docx.py
@@ -51,7 +51,7 @@
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.lang import apply_lang_metadata, detect_languages
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
@@ -130,6 +130,8 @@ def partition_docx(
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
detect_language_per_element
Detect language per element instead of at the document level.
include_page_breaks
When True, add a `PageBreak` element to the element-stream when a page-break is detected in
the document. Note that not all DOCX files include page-break information.
@@ -168,6 +170,7 @@
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
strategy=strategy,
languages=languages,
)

elements = _DocxPartitioner.iter_document_elements(opts)
@@ -205,6 +208,7 @@ def __init__(
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
strategy: str | None = None,
**kwargs: Any,
):
self._date_from_file_object = date_from_file_object
self._file = file
@@ -216,6 +220,8 @@
self._strategy = strategy
# -- options object maintains page-number state --
self._page_counter = starting_page_number
# -- languages is a list of languages to use for category detection --
self._languages: list[str] = kwargs.get("languages") or ["auto"]

@classmethod
def load(cls, **kwargs: Any) -> DocxPartitionerOptions:
@@ -232,6 +238,10 @@ def document(self) -> Document:
"""The python-docx `Document` object loaded from file or filename."""
return docx.Document(self._docx_file)

@property
def languages(self) -> list[str]:
return self._languages

@lazyproperty
def include_page_breaks(self) -> bool:
"""When True, include `PageBreak` elements in element-stream.
@@ -957,9 +967,23 @@ def _parse_paragraph_text_for_element_type(self, paragraph: Paragraph) -> Option
return Address
if is_email_address(text):
return EmailAddress
if is_possible_narrative_text(text):
if is_possible_narrative_text(
text,
languages=(
self._opts.languages
if "auto" not in self._opts.languages
else detect_languages(text, self._opts.languages)
),
):
return NarrativeText
if is_possible_title(text):
if is_possible_title(
text,
languages=(
self._opts.languages
if "auto" not in self._opts.languages
else detect_languages(text, self._opts.languages)
),
):
return Title

return None
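The core change routes `is_possible_narrative_text` and `is_possible_title` through the configured languages, falling back to per-text detection when `languages` contains `"auto"`. A simplified sketch of that resolution step (the `detect` function below is a toy stand-in for unstructured's `detect_languages`, keyed only on CJK codepoints; the real implementation uses a language-identification model):

```python
def detect(text: str) -> list[str]:
    # Toy stand-in: treat any CJK Unified Ideograph as Chinese.
    return ["zho"] if any("\u4e00" <= ch <= "\u9fff" for ch in text) else ["eng"]


def resolve_languages(text: str, configured: list[str]) -> list[str]:
    """Use the configured languages unless "auto" requests per-text detection."""
    return configured if "auto" not in configured else detect(text)


assert resolve_languages("春节放假通知", ["auto"]) == ["zho"]
assert resolve_languages("春节放假通知", ["eng"]) == ["eng"]  # forced -> heuristics misled
assert resolve_languages("Plain English text", ["auto"]) == ["eng"]
```

This mirrors why the tests above expect `NarrativeText` under the default `["auto"]` but `Title` when `languages=["eng"]` is forced.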
3 changes: 2 additions & 1 deletion unstructured/partition/email.py
@@ -300,7 +300,7 @@ def partition_email(
attachment_partitioner: Optional[Callable[..., list[Element]]] = None,
min_partition: Optional[int] = 0,
chunking_strategy: Optional[str] = None,
languages: Optional[list[str]] = ["auto"],
languages: Optional[list[str]] = None,
detect_language_per_element: bool = False,
date_from_file_object: bool = False,
**kwargs: Any,
@@ -346,6 +346,7 @@
from message header failed, attempt to infer last_modified metadata from bytes,
otherwise set it to None.
"""
languages = languages or ["auto"]
if content_source not in VALID_CONTENT_SOURCES:
raise ValueError(
f"{content_source} is not a valid value for content_source. "
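Changing the default from `languages=["auto"]` to `None` plus `languages = languages or ["auto"]` also sidesteps Python's shared-mutable-default pitfall: a list default is evaluated once at definition time and the same object is reused across calls. A minimal illustration (hypothetical functions, not from the codebase):

```python
def bad(langs=["auto"]):
    # The default list is created once; every default call mutates it.
    langs.append("eng")
    return langs


def good(langs=None):
    # A fresh list is built on each call, so callers never share state.
    langs = list(langs) if langs else ["auto"]
    langs.append("eng")
    return langs


assert bad() == ["auto", "eng"]
assert bad() == ["auto", "eng", "eng"]  # default list mutated across calls
assert good() == ["auto", "eng"]
assert good() == ["auto", "eng"]        # stable on every call
```

The `or ["auto"]` idiom in the diff has the same effect while still normalizing an explicit `None` to the auto-detect default.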
Empty file added unstructured/partition/html.py
Empty file.
8 changes: 8 additions & 0 deletions unstructured/partition/html/partition.py
@@ -98,6 +98,7 @@ def partition_html(
metadata_last_modified=metadata_last_modified,
skip_headers_and_footers=skip_headers_and_footers,
detection_origin=detection_origin,
languages=languages,
)

elements = list(
@@ -128,6 +129,7 @@ def __init__(
metadata_last_modified: str | None,
skip_headers_and_footers: bool,
detection_origin: str | None,
**kwargs: Any,
):
self._file_path = file_path
self._file = file
@@ -140,6 +142,12 @@
self._metadata_last_modified = metadata_last_modified
self._skip_headers_and_footers = skip_headers_and_footers
self._detection_origin = detection_origin
self._languages = kwargs.get("languages")

@property
def languages(self) -> list[str]:
"""Languages to use for language detection."""
return self._languages if self._languages and self._languages != [""] else ["auto"]

@lazyproperty
def detection_origin(self) -> str | None: