Unstructured-IO · sksharma0 · Jul 14, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ### Enhancements
 
+* **Merge words split across two lines.** `partition_text` now rejoins a word that was hyphenated across a line break (e.g. `splitt-` at the end of one line followed by `ed` on the next becomes `splitted`).
 * **Bump unstructured.paddleocr to 2.8.0.1.**
 * **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `<p>`, `<div>`) nested inside a phrasing element (e.g. `<strong>` or `<cite>`). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation.
 * **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner.

diff --git a/example-docs/text_having_splitted_words.txt b/example-docs/text_having_splitted_words.txt
@@ -0,0 +1,6 @@
+This is a test document to test text having splitt-
+ed words to next line.
+
+These words are descri-
+bed as a character followed by a dash and 1+ white-
+spaces
diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
@@ -580,3 +580,12 @@ def test_partition_text_detects_more_than_3_languages():
         {element.metadata.languages[0] for element in elements if element.metadata.languages},
     )
     assert len(langs) > 10
+
+
+def test_partition_text_for_text_having_splitted_words():
+    """Hyphenated line-break splits in the fixture come back as whole words."""
+    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "text_having_splitted_words.txt")
+    assert [el.text for el in partition_text(filename=path)] == [
+        "This is a test document to test text having splitted words to next line.",
+        "These words are described as a character followed by a dash and 1+ whitespaces",
+    ]
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
@@ -104,6 +104,20 @@ def clean_ligatures(text) -> str:
     return cleaned_text
 
 
+def clean_splitted_words(text: str, pattern: str = r"(\w+)-[^\S\n]*\n[^\S\n]*(\w+)") -> str:
+    """Rejoin words that were hyphenated across a line break.
+
+    The default pattern needs exactly one newline after the hyphen, so same-line suspended
+    hyphens ("pre- and post-war") and blank-line paragraph breaks are left untouched.
+
+    :param text: text to clean
+    :param pattern: regex with two capture groups (the word halves); each match is replaced
+        by the two groups concatenated
+    :return: `text` with every match of `pattern` collapsed to its two captured halves
+    """
+    return re.sub(pattern, r"\1\2", text)
+
+
 def group_bullet_paragraph(paragraph: str) -> list:
     """Groups paragraphs with bullets that have line breaks for visual/formatting purposes.
     For example:

diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py
@@ -145,3 +145,5 @@
 # e.g. 1. 2. 3. or 1) 2) 3), not 1.1 1.2 1.3
 NUMBERED_LIST_PATTERN = r"^\d+(\.|\))\s(.+)"
 NUMBERED_LIST_RE = re.compile(NUMBERED_LIST_PATTERN)
+SPLITTED_WORDS_PATTERN = r"(\w+)-[^\S\n]*\n[^\S\n]*(\w+)"  # word, "-", one line break, word
+SPLITTED_WORDS_RE = re.compile(SPLITTED_WORDS_PATTERN)
diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
@@ -6,10 +6,7 @@
 from typing import IO, Any, Callable, Literal, Optional
 
 from unstructured.chunking import add_chunking_strategy
-from unstructured.cleaners.core import (
-    auto_paragraph_grouper,
-    clean_bullets,
-)
+from unstructured.cleaners.core import auto_paragraph_grouper, clean_bullets, clean_splitted_words
 from unstructured.documents.coordinates import CoordinateSystem
 from unstructured.documents.elements import (
     Address,
@@ -37,6 +34,7 @@
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_email_address,
+    is_having_splitted_words,
     is_possible_narrative_text,
     is_possible_numbered_list,
     is_possible_title,
@@ -167,6 +165,11 @@ def _partition_text(
     elif text is not None:
         file_text = str(text)
 
+    # NOTE(sksharma0): rejoin words that were hyphenated at a line end and continue on the
+    # following line, so each one is partitioned as a single word
+    if is_having_splitted_words(file_text):
+        file_text = clean_splitted_words(file_text)
+
     if paragraph_grouper is False:
         pass
     elif paragraph_grouper is not None:

diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py
@@ -19,6 +19,7 @@
     EMAIL_ADDRESS_PATTERN_RE,
     ENDS_IN_PUNCT_RE,
     NUMBERED_LIST_RE,
+    SPLITTED_WORDS_RE,
     UNICODE_BULLETS_RE,
     US_CITY_STATE_ZIP_RE,
     US_PHONE_NUMBERS_RE,
@@ -317,3 +318,8 @@ def is_email_address(text: str) -> bool:
 def is_possible_numbered_list(text: str) -> bool:
     """Checks to see if the text is a potential numbered list."""
     return NUMBERED_LIST_RE.match(text.strip()) is not None
+
+
+def is_having_splitted_words(text: str) -> bool:
+    """Return True when `SPLITTED_WORDS_RE` finds a hyphen-split word anywhere in `text`."""
+    return bool(SPLITTED_WORDS_RE.search(text.strip()))