From 19509ed688355728d0a17e4615aa9d90fd759508 Mon Sep 17 00:00:00 2001 From: PhorstenkampFuzzy <106159868+PhorstenkampFuzzy@users.noreply.github.com> Date: Tue, 17 Dec 2024 10:28:27 +0100 Subject: [PATCH] Fix for issue #3815 Removed redundant patch as it is already merged in https://github.com/pdfminer/pdfminer.six/pull/885 --- unstructured/partition/pdf.py | 6 ------ unstructured/patches/__init__.py | 0 unstructured/patches/pdfminer.py | 24 ------------------------ 3 files changed, 30 deletions(-) delete mode 100644 unstructured/patches/__init__.py delete mode 100644 unstructured/patches/pdfminer.py diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index f87812d40b..a648af40aa 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -11,7 +11,6 @@ import numpy as np import wrapt -from pdfminer import psparser from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox from pdfminer.utils import open_filename from pi_heif import register_heif_opener @@ -96,16 +95,11 @@ PartitionStrategy, ) from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements -from unstructured.patches.pdfminer import parse_keyword from unstructured.utils import first, requires_dependencies if TYPE_CHECKING: pass -# NOTE(alan): Patching this to fix a bug in pdfminer.six. 
Submitted this PR into pdfminer.six to fix -# the bug: https://github.com/pdfminer/pdfminer.six/pull/885 -psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore - RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL) diff --git a/unstructured/patches/__init__.py b/unstructured/patches/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py deleted file mode 100644 index 20b938d1ce..0000000000 --- a/unstructured/patches/pdfminer.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Union - -from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword - - -def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int: - """Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR - https://github.com/pdfminer/pdfminer.six/pull/885.""" - m = END_KEYWORD.search(s, i) - if not m: - j = len(s) - self._curtoken += s[i:] - else: - j = m.start(0) - self._curtoken += s[i:j] - if self._curtoken == b"true": - token: Union[bool, PSKeyword] = True - elif self._curtoken == b"false": - token = False - else: - token = KWD(self._curtoken) - self._add_token(token) - self._parse1 = self._parse_main - return j