Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Merges words which are split across two lines in text partition #3394

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

### Enhancements

* **Merge words split across two lines.** `partition_text` now merges words that are hyphenated across a line break back into a single word.

### Enhancements

* **Bump unstructured.paddleocr to 2.8.0.1.**
* **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `<p>`, `<div>`) nested inside a phrasing element (e.g. `<strong>` or `<cite>`). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation.
* **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner.
Expand Down
6 changes: 6 additions & 0 deletions example-docs/text_having_splitted_words.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
This is a test document to test text having splitt-
ed words to next line.

These words are descri-
bed as a character followed by a dash and 1+ white-
spaces
9 changes: 9 additions & 0 deletions test_unstructured/partition/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,3 +580,12 @@ def test_partition_text_detects_more_than_3_languages():
{element.metadata.languages[0] for element in elements if element.metadata.languages},
)
assert len(langs) > 10


def test_partition_text_for_text_having_splitted_words():
    """Words hyphenated across a line break in the fixture come back as whole words."""
    expected_texts = [
        "This is a test document to test text having splitted words to next line.",
        "These words are described as a character followed by a dash and 1+ whitespaces",
    ]
    fixture_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "text_having_splitted_words.txt")

    elements = partition_text(filename=fixture_path)

    actual_texts = [element.text for element in elements]
    assert actual_texts == expected_texts
14 changes: 14 additions & 0 deletions unstructured/cleaners/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,20 @@ def clean_ligatures(text) -> str:
return cleaned_text


def clean_splitted_words(text: str, pattern: str = r"(\w+)-\s+(\w+)") -> str:
    """Joins word fragments that are separated by a hyphen followed by whitespace.

    Every match of `pattern` in `text` is replaced by its first and second capture groups
    concatenated, so the hyphen and the whitespace between the fragments are dropped. For
    example, "splitt-" at the end of one line and "ed" at the start of the next become
    "splitted".

    Args:
        text: The text to clean.
        pattern: Regular expression with two capture groups, one for each fragment to join.
            The default matches word characters, a hyphen, one or more whitespace characters
            and more word characters. NOTE: "whitespace" is any whitespace, not only line
            breaks, so "pre- and" on a single line is joined as well.

    Returns:
        A copy of `text` with each match collapsed to its two fragments. Text without a match
        is returned unchanged.
    """
    splitted_words_re = re.compile(pattern)
    # -- keep only the two captured fragments; everything matched between them is discarded --
    return splitted_words_re.sub(r"\1\2", text)


def group_bullet_paragraph(paragraph: str) -> list:
"""Groups paragraphs with bullets that have line breaks for visual/formatting purposes.
For example:
Expand Down
2 changes: 2 additions & 0 deletions unstructured/nlp/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,5 @@
# e.g. 1. 2. 3. or 1) 2) 3), not 1.1 1.2 1.3
NUMBERED_LIST_PATTERN = r"^\d+(\.|\))\s(.+)"
NUMBERED_LIST_RE = re.compile(NUMBERED_LIST_PATTERN)
# a word fragment ending in a hyphen, then one or more whitespace characters (line breaks
# included), then the rest of the word, e.g. "splitt-" + newline + "ed". The two fragments are
# capture groups 1 and 2. NOTE: also matches a hyphen followed by spaces on the same line.
SPLITTED_WORDS_PATTERN = r"(\w+)-\s+(\w+)"
SPLITTED_WORDS_RE = re.compile(SPLITTED_WORDS_PATTERN)
11 changes: 7 additions & 4 deletions unstructured/partition/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@
from typing import IO, Any, Callable, Literal, Optional

from unstructured.chunking import add_chunking_strategy
from unstructured.cleaners.core import (
auto_paragraph_grouper,
clean_bullets,
)
from unstructured.cleaners.core import auto_paragraph_grouper, clean_bullets, clean_splitted_words
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
Address,
Expand Down Expand Up @@ -37,6 +34,7 @@
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
is_having_splitted_words,
is_possible_narrative_text,
is_possible_numbered_list,
is_possible_title,
Expand Down Expand Up @@ -167,6 +165,11 @@ def _partition_text(
elif text is not None:
file_text = str(text)

# NOTE(sksharma0): if there are words that continue on the following line,
# merge them into single word
if is_having_splitted_words(file_text):
file_text = clean_splitted_words(file_text)

if paragraph_grouper is False:
pass
elif paragraph_grouper is not None:
Expand Down
6 changes: 6 additions & 0 deletions unstructured/partition/text_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
EMAIL_ADDRESS_PATTERN_RE,
ENDS_IN_PUNCT_RE,
NUMBERED_LIST_RE,
SPLITTED_WORDS_RE,
UNICODE_BULLETS_RE,
US_CITY_STATE_ZIP_RE,
US_PHONE_NUMBERS_RE,
Expand Down Expand Up @@ -317,3 +318,8 @@ def is_email_address(text: str) -> bool:
def is_possible_numbered_list(text: str) -> bool:
    """Checks whether the text, ignoring surrounding whitespace, starts like a numbered list.

    The decision is made by matching `NUMBERED_LIST_RE` at the start of the stripped text.
    """
    stripped_text = text.strip()
    # -- a match object is always truthy, so `bool()` is True exactly when a match was found --
    return bool(NUMBERED_LIST_RE.match(stripped_text))


def is_having_splitted_words(text: str) -> bool:
    """Checks whether the text contains a word that is split by a hyphen and whitespace.

    True when `SPLITTED_WORDS_RE` is found anywhere in the stripped text, e.g. a word that
    ends one line with a hyphen and continues on the following line.
    """
    stripped_text = text.strip()
    # -- a match object is always truthy, so `bool()` is True exactly when a match was found --
    return bool(SPLITTED_WORDS_RE.search(stripped_text))