fix: correctly patch EOF handling in pdfminer (fixes: Unstructured-IO#3815)
dhdaines · Dec 17, 2024 · 14a2d91 · 14a2d91
1 parent 9a9bf4c
commit 14a2d91
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 15 deletions.
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1194,15 +1194,28 @@ def test_partition_pdf_with_fast_finds_headers_footers(
 @pytest.mark.parametrize(
     ("filename", "expected_log"),
     [
+        # This one is *actually* an invalid PDF document
         ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
-        ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
     ],
 )
 def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
     caplog.set_level(logging.INFO)
     assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
     assert expected_log in caplog.text
 
+@pytest.mark.parametrize(
+    ("filename", "unexpected_log"),
+    [
+        # Despite its file name, this one is *not* an invalid PDF document;
+        # make sure we don't try to "repair" it unnecessarily
+        ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
+    ],
+)
+def test_properly_patch_pdfminer(filename, unexpected_log, caplog):
+    """Extraction succeeds on a valid PDF without logging the repair message.
+
+    ``unexpected_log`` is the log message that must NOT be emitted while
+    extracting elements from ``filename``.
+    """
+    caplog.set_level(logging.INFO)
+    assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
+    assert unexpected_log not in caplog.text
+
 
 def assert_element_extraction(
     elements: list[Element],

diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -11,7 +11,6 @@
 
 import numpy as np
 import wrapt
-from pdfminer import psparser
 from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
 from pdfminer.utils import open_filename
 from pi_heif import register_heif_opener
@@ -96,16 +95,14 @@
     PartitionStrategy,
 )
 from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
-from unstructured.patches.pdfminer import parse_keyword
 from unstructured.utils import first, requires_dependencies
+from unstructured.patches.pdfminer import patch_psparser
 
 if TYPE_CHECKING:
     pass
 
-# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
-# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
-psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
 
+patch_psparser()
 RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
 
 

diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py
@@ -1,18 +1,35 @@
-from typing import Union
+from typing import Union, Tuple
 
-from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword
+import functools
+import pdfminer
+from pdfminer.psparser import (
+    PSBaseParser,
+    KWD,
+    PSBaseParserToken,
+    PSEOF,
+    END_KEYWORD,
+    PSKeyword,
+    log,
+)
 
+factory_seek = PSBaseParser.seek
 
-def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
-    """Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
-    https://github.com/pdfminer/pdfminer.six/pull/885."""
+
+@functools.wraps(PSBaseParser.seek)
+def seek(self: PSBaseParser, pos: int) -> None:
+    # Replacement for PSBaseParser.seek: delegate to the original
+    # implementation (saved in factory_seek), then clear the `eof` flag that
+    # the replacement nexttoken in this module sets once input is exhausted.
+    factory_seek(self, pos)
+    self.eof = False
+
+
+@functools.wraps(PSBaseParser._parse_keyword)
+def _parse_keyword(self, s: bytes, i: int) -> int:
     m = END_KEYWORD.search(s, i)
-    if not m:
-        j = len(s)
-        self._curtoken += s[i:]
-    else:
+    if m:
         j = m.start(0)
         self._curtoken += s[i:j]
+    else:
+        self._curtoken += s[i:]
+        return len(s)
     if self._curtoken == b"true":
         token: Union[bool, PSKeyword] = True
     elif self._curtoken == b"false":
@@ -22,3 +39,36 @@ def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
     self._add_token(token)
     self._parse1 = self._parse_main
     return j
+
+
+@functools.wraps(PSBaseParser.nexttoken)
+def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
+    # Replacement for PSBaseParser.nexttoken: return the next queued token,
+    # parsing more input until one is available. Raises PSEOF once the
+    # `eof` flag has been recorded, or when no token can be produced.
+    if self.eof:
+        # EOF was already recorded by a previous call, so there is nothing
+        # left to hand out.
+        raise PSEOF("Unexpected EOF")
+    while not self._tokens:
+        try:
+            self.fillbuf()
+            self.charpos = self._parse1(self.buf, self.charpos)
+        except PSEOF:
+            # If we hit EOF in the middle of a token, try to parse
+            # it by tacking on whitespace, and delay raising PSEOF
+            # until next time around
+            self.charpos = self._parse1(b"\n", 0)
+            self.eof = True
+            # Feeding the newline produced no token, so nothing was
+            # pending: re-raise the PSEOF immediately.
+            if not self._tokens:
+                raise
+    token = self._tokens.pop(0)
+    log.debug("nexttoken: %r", token)
+    return token
+
+
+def patch_psparser():
+    """Install this module's PSBaseParser overrides on affected pdfminer.six.
+
+    Replaces ``seek``, ``_parse_keyword`` and ``nexttoken`` on
+    ``PSBaseParser`` with the functions defined in this module. Nothing is
+    changed when the installed version string compares greater than
+    "20240706".
+    """
+    # Presuming the bug will be fixed in the next release
+    if pdfminer.__version__ > "20240706":
+        return
+    overrides = {
+        "seek": seek,
+        "_parse_keyword": _parse_keyword,
+        "nexttoken": nexttoken,
+    }
+    for attr_name, replacement in overrides.items():
+        setattr(PSBaseParser, attr_name, replacement)