Skip to content

Commit

Permalink
Fix table recognition
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Oct 24, 2024
1 parent b4bf323 commit 534b6b2
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 6 deletions.
16 changes: 11 additions & 5 deletions marker/tables/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
table_counts = []
table_bboxes = []
img_sizes = []
pnums = []

for page in pages:
pnum = page.pnum
for page_idx, page in enumerate(pages):
# The bbox for the entire table
bbox = [b.bbox for b in page.layout.bboxes if b.label == "Table"]
highres_img = render_image(doc[pnum], dpi=settings.SURYA_TABLE_DPI)
highres_img = render_image(doc[page_idx], dpi=settings.SURYA_TABLE_DPI)

page_table_imgs = []
page_bboxes = []
Expand All @@ -48,11 +48,13 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
if len(bbox) == 0:
table_counts.append(0)
img_sizes.append(None)
pnums.append(page.pnum)
continue

# Number of tables per page
table_counts.append(len(bbox))
img_sizes.append(highres_img.size)
pnums.append(page.pnum)

for bb in bbox:
highres_bb = rescale_bbox(page.layout.image_bbox, [0, 0, highres_img.size[0], highres_img.size[1]], bb)
Expand All @@ -62,10 +64,14 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
table_imgs.extend(page_table_imgs)
table_bboxes.extend(page_bboxes)

table_idxs = [i for i, c in enumerate(table_counts) if c > 0]
# The page numbers in doc and in the original document are not the same:
# doc has had pages removed from the start to align to start_page.
# Selecting by page.pnum rather than the loop index corrects for that.
doc_idxs = [pnum for pnum, tc in zip(pnums, table_counts) if tc > 0]
table_idxs = [i for i, tc in enumerate(table_counts) if tc > 0]
sel_text_lines = get_page_text_lines(
fname,
table_idxs,
doc_idxs,
[hr for i, hr in enumerate(img_sizes) if i in table_idxs],
)
text_lines = []
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "0.3.7"
version = "0.3.8"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit 534b6b2

Please sign in to comment.