Skip to content

Commit

Permalink
Merge pull request #319 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Flatten pdfs, fix page separators
  • Loading branch information
VikParuchuri authored Oct 25, 2024
2 parents 1b4b413 + b9c6f73 commit b2cae2e
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 30 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ Set `DEBUG=true` to save data to the `debug` subfolder in the marker root direct
These settings can improve/change output quality:

- `OCR_ALL_PAGES` will force OCR across the document. Many PDFs have bad text embedded due to older OCR engines being used.
- `PAGINATE_OUTPUT` will put a horizontal rule between pages. Default: False.
- `PAGINATE_OUTPUT` will put a horizontal rule between pages. Default: False. The text inserted at each page boundary is `\n\n`, then the page number wrapped in curly braces (`{PAGE_NUMBER}`), then 48 dashes (`-`), then `\n\n`. The dashes and trailing newlines come from the `PAGE_SEPARATOR` setting and can be configured there.
- `EXTRACT_IMAGES` will extract images and save separately. Default: True.
- `BAD_SPAN_TYPES` specifies layout blocks to remove from the markdown output.

Expand Down
2 changes: 1 addition & 1 deletion marker/pdf/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Opt

page_range = range(start_page, start_page + max_pages)

char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=False, workers=settings.PDFTEXT_CPU_WORKERS)
char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=False, workers=settings.PDFTEXT_CPU_WORKERS, flatten_pdf=settings.FLATTEN_PDF)
marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]

return marker_blocks, toc
Expand Down
60 changes: 40 additions & 20 deletions marker/postprocessors/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,19 @@ def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
if len(block_lines) > 0:
page_blocks.append(MergedBlock(
lines=block_lines,
pnum=block.pnum,
pnum=page.pnum,
bbox=block.bbox,
block_type=block.block_type,
heading_level=block.heading_level
))
if len(page_blocks) == 0:
page_blocks.append(MergedBlock(
lines=[],
pnum=page.pnum,
bbox=page.bbox,
block_type="Text",
heading_level=None
))
merged_blocks.append(page_blocks)

return merged_blocks
Expand Down Expand Up @@ -139,9 +147,6 @@ def block_separator(prev_block: FullyMergedBlock, block: FullyMergedBlock):
if prev_block.block_type == "Text":
sep = "\n\n"

if prev_block.page_end:
sep = settings.PAGE_SEPARATOR

return sep + block.text


Expand All @@ -152,22 +157,46 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
block_text = ""
block_type = ""
prev_heading_level = None
pnum = None

for idx, page in enumerate(blocks):
# Insert pagination at every page boundary
if settings.PAGINATE_OUTPUT:
if block_text:
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type, prev_heading_level),
block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
page_start=False,
pnum=pnum
)
)
block_text = ""
text_blocks.append(
FullyMergedBlock(
text="",
block_type="Text",
page_start=True,
pnum=page[0].pnum
)
)

for block in page:
block_type = block.block_type
if (block_type != prev_type and prev_type) or (block.heading_level != prev_heading_level and prev_heading_level):
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type, prev_heading_level),
block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
page_end=False
page_start=False,
pnum=block.pnum
)
)
block_text = ""

prev_type = block_type
prev_heading_level = block.heading_level
pnum = block.pnum
# Join lines in the block together properly
for i, line in enumerate(block.lines):
line_height = line.bbox[3] - line.bbox[1]
Expand All @@ -181,36 +210,27 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
else:
block_text = line.text

# Force blocks to end at page boundaries
if settings.PAGINATE_OUTPUT:
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type, prev_heading_level),
block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
page_end=True
)
)
block_text = ""


# Append the final block
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type, prev_heading_level),
block_type=block_type if block_type else settings.DEFAULT_BLOCK_TYPE,
page_end=False
page_start=False,
pnum=pnum
)
)

text_blocks = [block for block in text_blocks if block.text.strip()]
text_blocks = [block for block in text_blocks if (block.text.strip() or block.page_start)]
return text_blocks


def get_full_text(text_blocks):
full_text = ""
prev_block = None
for block in text_blocks:
if prev_block:
if block.page_start:
full_text += "\n\n{" + str(block.pnum) + "}" + settings.PAGE_SEPARATOR
elif prev_block:
full_text += block_separator(prev_block, block)
else:
full_text += block.text
Expand Down
3 changes: 2 additions & 1 deletion marker/schema/merged.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ class MergedBlock(BboxElement):
class FullyMergedBlock(BaseModel):
text: str
block_type: str
page_end: bool
page_start: bool
pnum: int | None
3 changes: 2 additions & 1 deletion marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class Settings(BaseSettings):
EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them
PAGINATE_OUTPUT: bool = False # Paginate output markdown
BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
FLATTEN_PDF: bool = True # Pull form field values into the PDF before converting to markdown

@computed_field
@property
Expand Down Expand Up @@ -88,7 +89,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
HEADING_DEFAULT_LEVEL: int = 2

# Output
PAGE_SEPARATOR: str = "\n\n" + "-" * 48 + "\n\n"
PAGE_SEPARATOR: str = "-" * 48 + "\n\n"

# Debug
DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
Expand Down
1 change: 1 addition & 0 deletions marker/tables/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
fname,
doc_idxs,
[hr for i, hr in enumerate(img_sizes) if i in table_idxs],
# Add flatten pdf here
)
text_lines = []
out_img_sizes = []
Expand Down
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "0.3.8"
version = "0.3.9"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down Expand Up @@ -32,7 +32,7 @@ tabulate = "^0.9.0"
ftfy = "^6.1.1"
texify = "^0.2.0"
rapidfuzz = "^3.8.1"
surya-ocr = "^0.6.10"
surya-ocr = "^0.6.11"
filetype = "^1.2.0"
regex = "^2024.4.28"
pdftext = "^0.3.17"
Expand Down

0 comments on commit b2cae2e

Please sign in to comment.