diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd177a94a6..5c1d38240d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
### Enhancements
+* **Merge words split across two lines.** Merges words which are split across two lines during text partitioning.
* **Bump unstructured.paddleocr to 2.8.0.1.**
* **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `
`, `
`) nested inside a phrasing element (e.g. `` or ``). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation.
* **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner.
diff --git a/example-docs/text_having_splitted_words.txt b/example-docs/text_having_splitted_words.txt
new file mode 100644
index 0000000000..ac4643678f
--- /dev/null
+++ b/example-docs/text_having_splitted_words.txt
@@ -0,0 +1,6 @@
+This is a test document to test text having splitt-
+ed words to next line.
+
+These words are descri-
+bed as a character followed by a dash and 1+ white-
+spaces
\ No newline at end of file
diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
index 50d8210340..1239d812d1 100644
--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@@ -580,3 +580,12 @@ def test_partition_text_detects_more_than_3_languages():
{element.metadata.languages[0] for element in elements if element.metadata.languages},
)
assert len(langs) > 10
+
+
+def test_partition_text_for_text_having_splitted_words():
+ filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "text_having_splitted_words.txt")
+ elements = partition_text(filename=filename)
+ assert [element.text for element in elements] == [
+ "This is a test document to test text having splitted words to next line.",
+ "These words are described as a character followed by a dash and 1+ whitespaces",
+ ]
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 90a58184d1..f9e064318c 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -104,6 +104,20 @@ def clean_ligatures(text) -> str:
return cleaned_text
+def clean_splitted_words(text: str, pattern: str = r"(\w+)-\s+(\w+)") -> str:
+    """
+    Merge words that were hyphen-split across a line break back together.
+
+    :param text: the text to be cleaned
+    :type text: str
+    :param pattern: regex with two capture groups matching the word halves,
+        separated by a hyphen and one or more whitespace characters
+    :return: the input text with each occurrence of a word followed by a
+        hyphen and whitespace, followed by another word, joined into one word
+    """
+ return re.sub(pattern, r"\1\2", text)
+
+
def group_bullet_paragraph(paragraph: str) -> list:
"""Groups paragraphs with bullets that have line breaks for visual/formatting purposes.
For example:
diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py
index e18f067c4a..616cbb44e2 100644
--- a/unstructured/nlp/patterns.py
+++ b/unstructured/nlp/patterns.py
@@ -145,3 +145,5 @@
# e.g. 1. 2. 3. or 1) 2) 3), not 1.1 1.2 1.3
NUMBERED_LIST_PATTERN = r"^\d+(\.|\))\s(.+)"
NUMBERED_LIST_RE = re.compile(NUMBERED_LIST_PATTERN)
+SPLITTED_WORDS_PATTERN = r"(\w+)-\s+(\w+)"
+SPLITTED_WORDS_RE = re.compile(SPLITTED_WORDS_PATTERN)
diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
index 96cd105250..20085357b3 100644
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@@ -6,10 +6,7 @@
from typing import IO, Any, Callable, Literal, Optional
from unstructured.chunking import add_chunking_strategy
-from unstructured.cleaners.core import (
- auto_paragraph_grouper,
- clean_bullets,
-)
+from unstructured.cleaners.core import auto_paragraph_grouper, clean_bullets, clean_splitted_words
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
Address,
@@ -37,6 +34,7 @@
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
+ is_having_splitted_words,
is_possible_narrative_text,
is_possible_numbered_list,
is_possible_title,
@@ -167,6 +165,11 @@ def _partition_text(
elif text is not None:
file_text = str(text)
+    # NOTE(sksharma0): if there are words that continue on the following line,
+    # merge them back into a single word
+ if is_having_splitted_words(file_text):
+ file_text = clean_splitted_words(file_text)
+
if paragraph_grouper is False:
pass
elif paragraph_grouper is not None:
diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py
index 2989c24728..779070a051 100644
--- a/unstructured/partition/text_type.py
+++ b/unstructured/partition/text_type.py
@@ -19,6 +19,7 @@
EMAIL_ADDRESS_PATTERN_RE,
ENDS_IN_PUNCT_RE,
NUMBERED_LIST_RE,
+ SPLITTED_WORDS_RE,
UNICODE_BULLETS_RE,
US_CITY_STATE_ZIP_RE,
US_PHONE_NUMBERS_RE,
@@ -317,3 +318,8 @@ def is_email_address(text: str) -> bool:
def is_possible_numbered_list(text: str) -> bool:
"""Checks to see if the text is a potential numbered list."""
return NUMBERED_LIST_RE.match(text.strip()) is not None
+
+
+def is_having_splitted_words(text: str) -> bool:
+ """Checks if the given text has words that continue on the following line"""
+ return SPLITTED_WORDS_RE.search(text.strip()) is not None