Skip to content

Commit

Permalink
Merge pull request #313 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Fix bugs
  • Loading branch information
VikParuchuri authored Oct 23, 2024
2 parents 189d660 + 4ae3d74 commit 2f3f0d7
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 6 deletions.
9 changes: 6 additions & 3 deletions marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ def convert_single_pdf(

# Identify text lines, layout, reading order
surya_detection(lowres_images, pages, detection_model, batch_multiplier=batch_multiplier)
- surya_layout(lowres_images, pages, layout_model, batch_multiplier=batch_multiplier)
- surya_order(lowres_images, pages, order_model, batch_multiplier=batch_multiplier)

# OCR pages as needed
pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier, ocr_all_pages=ocr_all_pages)
Expand All @@ -105,12 +103,17 @@ def convert_single_pdf(
print(f"Could not extract any text blocks for {fname}")
return "", {}, out_meta

+ surya_layout(lowres_images, pages, layout_model, batch_multiplier=batch_multiplier)

# Find headers and footers
bad_span_ids = filter_header_footer(pages)
out_meta["block_stats"] = {"header_footer": len(bad_span_ids)}

- # Add block types from layout and sort from reading order
+ # Add block types from layout
annotate_block_types(pages)

+ # Sort from reading order
+ surya_order(lowres_images, pages, order_model, batch_multiplier=batch_multiplier)
sort_blocks_in_reading_order(pages)

# Dump debug data if flags are set
Expand Down
12 changes: 10 additions & 2 deletions marker/ocr/recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,18 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
polygons = deepcopy([[b.polygon for b in bboxes] for bboxes in detection_results])

# Scale polygons to get correct image slices
- for poly in polygons:
- for p in poly:
+ for j, poly in enumerate(polygons):
+ skip_idxs = []
+ for z, p in enumerate(poly):
for i in range(len(p)):
p[i] = [int(p[i][0] * box_scale), int(p[i][1] * box_scale)]
+ x_coords = [p[i][0] for i in range(len(p))]
+ y_coords = [p[i][1] for i in range(len(p))]
+ bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
+ if (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) == 0:
+ skip_idxs.append(z)
+ if len(skip_idxs) > 0:
+ polygons[j] = [p for i, p in enumerate(poly) if i not in skip_idxs]

results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
- version = "0.3.4"
+ version = "0.3.5"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit 2f3f0d7

Please sign in to comment.