diff --git a/CHANGELOG.md b/CHANGELOG.md index dd177a94a6..5c1d38240d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ### Enhancements +* **Merge words split across two lines** Merges words that are split across two lines in the text partitioner + +### Enhancements + * **Bump unstructured.paddleocr to 2.8.0.1.** * **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `

`, `

`) nested inside a phrasing element (e.g. `` or ``). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation. * **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner. diff --git a/example-docs/text_having_splitted_words.txt b/example-docs/text_having_splitted_words.txt new file mode 100644 index 0000000000..ac4643678f --- /dev/null +++ b/example-docs/text_having_splitted_words.txt @@ -0,0 +1,6 @@ +This is a test document to test text having splitt- +ed words to next line. + +These words are descri- +bed as a character followed by a dash and 1+ white- +spaces \ No newline at end of file diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py index 50d8210340..1239d812d1 100644 --- a/test_unstructured/partition/test_text.py +++ b/test_unstructured/partition/test_text.py @@ -580,3 +580,12 @@ def test_partition_text_detects_more_than_3_languages(): {element.metadata.languages[0] for element in elements if element.metadata.languages}, ) assert len(langs) > 10 + + +def test_partition_text_for_text_having_splitted_words(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "text_having_splitted_words.txt") + elements = partition_text(filename=filename) + assert [element.text for element in elements] == [ + "This is a test document to test text having splitted words to next line.", + "These words are described as a character followed by a dash and 1+ whitespaces", + ] diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 90a58184d1..f9e064318c 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ 
-104,6 +104,20 @@ def clean_ligatures(text) -> str: return cleaned_text +def clean_splitted_words(text: str, pattern: str = r"(\w+)-\s+(\w+)") -> str: + """ + The `clean_splitted_words` function removes the hyphen and whitespace + between two words in a given text. + + :param text: A string that contains the text to be cleaned + :type text: str + :return: modified version of the input text where any occurrence of a word followed by a hyphen + and whitespace, followed by another word, is replaced with just the two words + concatenated together. + """ + return re.sub(pattern, r"\1\2", text) + + def group_bullet_paragraph(paragraph: str) -> list: """Groups paragraphs with bullets that have line breaks for visual/formatting purposes. For example: diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py index e18f067c4a..616cbb44e2 100644 --- a/unstructured/nlp/patterns.py +++ b/unstructured/nlp/patterns.py @@ -145,3 +145,5 @@ # e.g. 1. 2. 3. or 1) 2) 3), not 1.1 1.2 1.3 NUMBERED_LIST_PATTERN = r"^\d+(\.|\))\s(.+)" NUMBERED_LIST_RE = re.compile(NUMBERED_LIST_PATTERN) +SPLITTED_WORDS_PATTERN = r"(\w+)-\s+(\w+)" +SPLITTED_WORDS_RE = re.compile(SPLITTED_WORDS_PATTERN) diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index 96cd105250..20085357b3 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -6,10 +6,7 @@ from typing import IO, Any, Callable, Literal, Optional from unstructured.chunking import add_chunking_strategy -from unstructured.cleaners.core import ( - auto_paragraph_grouper, - clean_bullets, -) +from unstructured.cleaners.core import auto_paragraph_grouper, clean_bullets, clean_splitted_words from unstructured.documents.coordinates import CoordinateSystem from unstructured.documents.elements import ( Address, @@ -37,6 +34,7 @@ from unstructured.partition.text_type import ( is_bulleted_text, is_email_address, + is_having_splitted_words, is_possible_narrative_text, 
is_possible_numbered_list, is_possible_title, @@ -167,6 +165,11 @@ def _partition_text( elif text is not None: file_text = str(text) + # NOTE(sksharma0): if there are words that continue on the following line, + # merge them into single word + if is_having_splitted_words(file_text): + file_text = clean_splitted_words(file_text) + if paragraph_grouper is False: pass elif paragraph_grouper is not None: diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index 2989c24728..779070a051 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -19,6 +19,7 @@ EMAIL_ADDRESS_PATTERN_RE, ENDS_IN_PUNCT_RE, NUMBERED_LIST_RE, + SPLITTED_WORDS_RE, UNICODE_BULLETS_RE, US_CITY_STATE_ZIP_RE, US_PHONE_NUMBERS_RE, @@ -317,3 +318,8 @@ def is_email_address(text: str) -> bool: def is_possible_numbered_list(text: str) -> bool: """Checks to see if the text is a potential numbered list.""" return NUMBERED_LIST_RE.match(text.strip()) is not None + + +def is_having_splitted_words(text: str) -> bool: + """Checks if the given text has words that continue on the following line""" + return SPLITTED_WORDS_RE.search(text.strip()) is not None