From d5a981c59a7d2fd1132ffc84b64519eb614e439a Mon Sep 17 00:00:00 2001 From: heya5 Date: Fri, 14 Jun 2024 17:35:23 +0800 Subject: [PATCH] fix missing
text element with one <br>
--- unstructured/documents/html.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 2fb6797d7b..12d70f3b4b 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -385,7 +385,9 @@ def _is_text_tag( # NOTE(robinson) - This indicates that a div tag has no children. If that's the # case and the tag has text, its potential a text tag children = list(tag_elem) - if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0: + # Exclude
<br> tags when counting children + children_count = len([ child for child in children if child.tag not in TEXTBREAK_TAGS]) + if tag_elem.tag in SECTION_TAGS + ["body"] and children_count == 0: return True if _has_adjacent_bulleted_spans(tag_elem, children):