Merge pull request #313 from VikParuchuri/dev

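Fix bugs: run layout detection after OCR and the empty-text check, run reading-order detection just before sorting blocks, and skip zero-area polygons before recognition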
VikParuchuri · Oct 23, 2024 · 2f3f0d7
2 parents 189d660 + 4ae3d74
commit 2f3f0d7
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 6 deletions.
diff --git a/marker/convert.py b/marker/convert.py
@@ -93,8 +93,6 @@ def convert_single_pdf(
 
     # Identify text lines, layout, reading order
     surya_detection(lowres_images, pages, detection_model, batch_multiplier=batch_multiplier)
-    surya_layout(lowres_images, pages, layout_model, batch_multiplier=batch_multiplier)
-    surya_order(lowres_images, pages, order_model, batch_multiplier=batch_multiplier)
 
     # OCR pages as needed
     pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier, ocr_all_pages=ocr_all_pages)
@@ -105,12 +103,17 @@ def convert_single_pdf(
         print(f"Could not extract any text blocks for {fname}")
         return "", {}, out_meta
 
+    surya_layout(lowres_images, pages, layout_model, batch_multiplier=batch_multiplier)
+
     # Find headers and footers
     bad_span_ids = filter_header_footer(pages)
     out_meta["block_stats"] = {"header_footer": len(bad_span_ids)}
 
-    # Add block types from layout and sort from reading order
+    # Add block types from layout
     annotate_block_types(pages)
+
+    # Sort from reading order
+    surya_order(lowres_images, pages, order_model, batch_multiplier=batch_multiplier)
     sort_blocks_in_reading_order(pages)
 
     # Dump debug data if flags are set

diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
@@ -79,10 +79,18 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
     polygons = deepcopy([[b.polygon for b in bboxes] for bboxes in detection_results])
 
     # Scale polygons to get correct image slices
-    for poly in polygons:
-        for p in poly:
+    for j, poly in enumerate(polygons):
+        skip_idxs = []
+        for z, p in enumerate(poly):
             for i in range(len(p)):
                 p[i] = [int(p[i][0] * box_scale), int(p[i][1] * box_scale)]
+            x_coords = [p[i][0] for i in range(len(p))]
+            y_coords = [p[i][1] for i in range(len(p))]
+            bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
+            if (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) == 0:
+                skip_idxs.append(z)
+        if len(skip_idxs) > 0:
+            polygons[j] = [p for i, p in enumerate(poly) if i not in skip_idxs]
 
     results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.3.4"
+version = "0.3.5"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"