Merge pull request #197 from VikParuchuri/dev
Bugfixes and new features
VikParuchuri authored Jun 17, 2024
2 parents 0d6f8da + 26f3890 commit fe9343c
Showing 6 changed files with 42 additions and 14 deletions.
17 changes: 13 additions & 4 deletions README.md
@@ -38,16 +38,16 @@ The above results are with marker and nougat setup so they each take ~4GB of VRAM

See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.

# Hosted API

There is a hosted API for marker available [here](https://www.datalab.to/). It has been tuned for performance, and generally takes 10s + 1s/page for conversion.

# Commercial usage

I want marker to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage.

The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).

# Hosted API

There is a hosted API for marker available [here](https://www.datalab.to/). It's currently in beta, and I'm working on optimizing speed.

# Community

[Discord](https://discord.gg//KuZwXNGnfH) is where we discuss future development.
@@ -147,6 +147,15 @@ There are some settings that you may find useful if things aren't working the way you expect

In general, if output is not what you expect, trying to OCR the PDF is a good first step. Not all PDFs have good text/bboxes embedded in them.

## Useful settings

These settings can improve or change output quality (a usage sketch follows this list):

- `OCR_ALL_PAGES` will force OCR across the document. Many PDFs have bad embedded text because they were processed with older OCR engines.
- `PAGINATE_OUTPUT` will put a horizontal rule between pages. Default: False.
- `EXTRACT_IMAGES` will extract images and save them separately. Default: True.
- `BAD_SPAN_TYPES` specifies layout blocks to remove from the markdown output.
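
As a quick illustration (an editor's sketch, not part of this diff): `marker/settings.py` defines these as fields on a pydantic `BaseSettings` class, so they can be supplied as environment variables before marker is imported. The values below are examples only.

```python
# Sketch only: boolean settings are parsed by pydantic from environment variables.
import os

os.environ["OCR_ALL_PAGES"] = "true"      # force OCR on every page
os.environ["PAGINATE_OUTPUT"] = "true"    # insert a horizontal rule between pages
os.environ["EXTRACT_IMAGES"] = "false"    # don't save extracted images separately

from marker.settings import settings     # the Settings() instance reads the environment when imported

print(settings.PAGINATE_OUTPUT)  # True
```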

# Benchmarks

Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have both a PDF version and a LaTeX source. I convert the LaTeX to text and compare the reference to the output of each text extraction method. It's noisy, but at least directionally correct.
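
To make that comparison concrete, here is a minimal sketch of scoring an extraction against its reference with a simple sequence-similarity ratio; this is illustrative only and is not marker's actual benchmark code.

```python
# Illustrative scoring only; the real benchmark scripts in the repo are more involved.
from difflib import SequenceMatcher

def alignment_score(reference: str, extracted: str) -> float:
    """Rough 0-1 similarity between the reference text and the extracted text."""
    return SequenceMatcher(None, reference, extracted).ratio()

reference = "The quick brown fox jumps over the lazy dog."
extracted = "The quick brown fox jumps over the lazy d0g."
print(f"{alignment_score(reference, extracted):.3f}")  # close to 1.0 for a good extraction
```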
24 changes: 16 additions & 8 deletions convert.py
@@ -23,6 +23,9 @@


def worker_init(shared_model):
if shared_model is None:
shared_model = load_all_models()

global model_refs
model_refs = shared_model

@@ -107,17 +110,22 @@ def main():
else:
total_processes = int(total_processes)

mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
model_lst = load_all_models()
try:
mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
except RuntimeError:
raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.")

for model in model_lst:
if model is None:
continue
if settings.TORCH_DEVICE == "mps" or settings.TORCH_DEVICE_MODEL == "mps":
print("Cannot use MPS with torch multiprocessing share_memory. This will make things less memory efficient. If you want to share memory, you have to use CUDA or CPU. Set the TORCH_DEVICE environment variable to change the device.")

if model.device.type == "mps":
raise ValueError("Cannot use MPS with torch multiprocessing share_memory. You have to use CUDA or CPU. Set the TORCH_DEVICE environment variable to change the device.")
model_lst = None
else:
model_lst = load_all_models()

model.share_memory()
for model in model_lst:
if model is None:
continue
model.share_memory()

print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
task_args = [(f, out_folder, metadata.get(os.path.basename(f)), args.min_length) for f in files_to_convert]
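
The net effect of the convert.py changes: on CUDA or CPU the parent process loads the models once and shares them with the workers, while on MPS (where `share_memory()` can't be used this way) the parent passes `None` and each worker loads its own copy inside `worker_init`. A stripped-down sketch of that pattern, with `load_models()` standing in for `load_all_models()`:

```python
# Editor's sketch of the worker-initialization pattern above; not the exact code in this commit.
import multiprocessing as mp

_models = None  # per-process model reference, mirrors the global model_refs

def load_models():
    # Placeholder for load_all_models(); returns whatever object holds the models.
    return {"layout": "layout-model", "ocr": "ocr-model"}

def worker_init(shared_models):
    global _models
    # When the parent passes None (the MPS path), each worker loads its own models.
    _models = shared_models if shared_models is not None else load_models()

def worker(path):
    return f"processed {path} with {_models}"

if __name__ == "__main__":
    mp.set_start_method("spawn")  # required for CUDA; forkserver doesn't work
    use_shared_models = True      # False would mimic the MPS path
    models = load_models() if use_shared_models else None
    # On CUDA/CPU each real model would also call .share_memory() here before the pool starts.
    with mp.Pool(2, initializer=worker_init, initargs=(models,)) as pool:
        print(pool.map(worker, ["a.pdf", "b.pdf"]))
```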
5 changes: 5 additions & 0 deletions marker/images/extract.py
@@ -39,6 +39,11 @@ def extract_page_images(page_obj, page):
image_blocks = find_image_blocks(page)

for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks):
if block_idx >= len(page.blocks):
block_idx = len(page.blocks) - 1
if block_idx < 0:
continue

block = page.blocks[block_idx]
image = render_bbox_image(page_obj, page, bbox)
image_filename = get_image_filename(page, image_idx)
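
The new guard in extract_page_images clamps an out-of-range block index to the last block on the page and skips the image entirely when the page has no blocks. A tiny sketch of the same defensive check (names are placeholders, not marker's API):

```python
# Illustrative clamp mirroring the guard above.
from typing import Optional

def clamp_block_idx(block_idx: int, num_blocks: int) -> Optional[int]:
    """Return a usable block index, or None when the page has no blocks."""
    if block_idx >= num_blocks:
        block_idx = num_blocks - 1
    return None if block_idx < 0 else block_idx

assert clamp_block_idx(5, 3) == 2      # out of range -> clamp to last block
assert clamp_block_idx(0, 0) is None   # empty page -> skip the image
```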
7 changes: 6 additions & 1 deletion marker/postprocessors/markdown.py
@@ -4,6 +4,8 @@
import regex
from typing import List

from marker.settings import settings


def escape_markdown(text):
# List of characters that need to be escaped in markdown
@@ -143,7 +145,7 @@ def merge_lines(blocks: List[List[MergedBlock]]):
block_text = ""
block_type = ""

for page in blocks:
for idx, page in enumerate(blocks):
for block in page:
block_type = block.block_type
if block_type != prev_type and prev_type:
@@ -168,6 +170,9 @@
else:
block_text = line.text

if settings.PAGINATE_OUTPUT and idx < len(blocks) - 1:
block_text += "\n\n" + "-" * 16 + "\n\n" # Page separator horizontal rule

# Append the final block
text_blocks.append(
FullyMergedBlock(
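
With `PAGINATE_OUTPUT` enabled, merge_lines now appends a sixteen-dash horizontal rule after every page except the last. A small standalone sketch of the resulting separator behaviour (not marker code):

```python
# Sketch of the page-separator logic added above.
pages = ["Page one text.", "Page two text.", "Page three text."]
PAGINATE_OUTPUT = True  # stands in for settings.PAGINATE_OUTPUT

chunks = []
for idx, page_text in enumerate(pages):
    chunks.append(page_text)
    if PAGINATE_OUTPUT and idx < len(pages) - 1:
        chunks.append("\n\n" + "-" * 16 + "\n\n")  # horizontal rule between pages

print("".join(chunks))
```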
1 change: 1 addition & 0 deletions marker/settings.py
@@ -11,6 +11,7 @@ class Settings(BaseSettings):
TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU
IMAGE_DPI: int = 96 # DPI to render images pulled from pdf at
EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them
PAGINATE_OUTPUT: bool = False # Paginate output markdown

@computed_field
@property
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "0.2.13"
version = "0.2.14"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
