Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvements #328

Merged
merged 7 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ There's a hosted API for marker available [here](https://www.datalab.to/):

- Supports PDFs, word documents, and powerpoints
- 1/4th the price of leading cloud-based competitors
- Leverages [Modal](https://modal.com/) for high reliability without latency spikes
- High uptime (99.99%), quality, and speed (0.25s/page for a 50-page doc)

# Community

Expand Down Expand Up @@ -191,6 +191,39 @@ The output will be a markdown file, but there will also be a metadata json file
}
```

## API server

There is a very simple API server you can run like this:

```shell
pip install -U uvicorn fastapi python-multipart
marker_server --port 8001
```

This will start a fastapi server that you can access at `localhost:8001`. You can go to `localhost:8001/docs` to see the endpoint options.

Note that this is not a very robust API, and is only intended for small-scale use. If you want to use this server, but want a more robust conversion option, you can run against the hosted [Datalab API](https://www.datalab.to/plans). You'll need to register and get an API key, then run:

```shell
marker_server --port 8001 --api_key API_KEY
```

Note: This is not the recommended way to use the Datalab API - it's only provided as a convenience for people wrapping the marker repo. The recommended way is to make a POST request to the endpoint directly from your code, rather than proxying through this server.

You can send requests like this:

```python
import requests
import json

post_data = {
'filepath': 'FILEPATH',
# Add other params here
}

requests.post("http://localhost:8001/marker", data=json.dumps(post_data)).json()
```

# Troubleshooting

There are some settings that you may find useful if things aren't working the way you expect:
Expand Down
1 change: 0 additions & 1 deletion marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def convert_single_pdf(

# OCR pages as needed
pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier, ocr_all_pages=ocr_all_pages)
flush_cuda_memory()

out_meta["ocr_stats"] = ocr_stats
if len([b for p in pages for b in p.blocks]) == 0:
Expand Down
1 change: 1 addition & 0 deletions marker_app.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["IN_STREAMLIT"] = "true"
os.environ["PDFTEXT_CPU_WORKERS"] = "1"

import base64
import io
Expand Down
175 changes: 175 additions & 0 deletions marker_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import argparse
import asyncio
import os

import requests
import uvicorn
from pydantic import BaseModel, Field
from starlette.responses import HTMLResponse

# NOTE(review): this is set before the marker imports further down, presumably
# so the worker count is already in the environment when those modules are
# loaded -- confirm against how pdftext reads PDFTEXT_CPU_WORKERS.
os.environ["PDFTEXT_CPU_WORKERS"] = "1"

import base64
from contextlib import asynccontextmanager
from typing import Optional, Annotated
import io

from fastapi import FastAPI, Body
from marker.convert import convert_single_pdf
from marker.models import load_all_models

# Module-level store shared between the lifespan hook and the request handlers.
# The "models" entry is filled at startup when the server runs locally and is
# read by the local conversion path.
app_data = {}

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage model lifetime around the application's run.

    On startup the models are loaded into ``app_data["models"]``, but only when
    ``app.state.LOCAL`` is truthy. After the application stops, the entry is
    removed again if it exists.
    """
    run_locally = app.state.LOCAL
    if run_locally:
        app_data["models"] = load_all_models()

    yield

    # pop() with a default is a no-op when the models were never loaded.
    app_data.pop("models", None)


# The ASGI application. `lifespan` runs around the app's lifetime and takes
# care of loading and releasing the models.
app = FastAPI(lifespan=lifespan)

@app.get("/")
async def root():
    """Serve a small HTML landing page linking to the API docs and the marker endpoint."""
    landing_page = """
<h1>Marker API</h1>
<ul>
<li><a href="/docs">API Documentation</a></li>
<li><a href="/marker">Run marker (post request only)</a></li>
</ul>
"""
    return HTMLResponse(landing_page)


class CommonParams(BaseModel):
    """Request body accepted by the ``/marker`` endpoint.

    The same fields are used for both conversion paths: they are passed to the
    local converter or forwarded as form fields to the Datalab API. The field
    descriptions below are what the generated API documentation displays.
    """

    # Path of the PDF on the machine running this server (it is opened locally).
    filepath: Annotated[
        str,
        Field(description="The path to the PDF file to convert.")
    ]
    # None means no page limit is passed on.
    max_pages: Annotated[
        Optional[int],
        Field(description="The maximum number of pages in the document to convert.", example=None)
    ] = None
    # Comma-separated language names or codes; None leaves the choice to the converter.
    langs: Annotated[
        Optional[str],
        Field(description="The optional languages to use if OCR is needed, comma separated. Must be either the names or codes from https://github.com/VikParuchuri/surya/blob/master/surya/languages.py.", example=None)
    ] = None
    force_ocr: Annotated[
        bool,
        Field(description="Force OCR on all pages of the PDF. Defaults to False. This can lead to worse results if you have good text in your PDFs (which is true in most cases).")
    ] = False
    paginate: Annotated[
        bool,
        Field(description="Whether to paginate the output. Defaults to False. If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines).")
    ] = False
    extract_images: Annotated[
        bool,
        Field(description="Whether to extract images from the PDF. Defaults to True. If set to False, no images will be extracted from the PDF.")
    ] = True


@app.post("/marker")
async def convert_pdf(
    params: CommonParams
):
    """Convert the PDF at ``params.filepath`` to markdown.

    Dispatches to the local converter when ``app.state.LOCAL`` is set and to
    the hosted Datalab API otherwise.

    Returns:
        The dict produced by ``convert_pdf_local`` or ``convert_pdf_remote``.
        If a local request asks for an option the local path does not support
        (``extract_images=False`` or ``paginate=True``), a
        ``{"success": False, "error": ...}`` dict is returned instead.
    """
    if not app.state.LOCAL:
        print(f"Converting {params.filepath} using the Datalab API.")
        return await convert_pdf_remote(params)

    print(f"Converting {params.filepath} locally.")
    # Validate explicitly instead of with `assert`: assertions are removed under
    # `python -O`, and a failed one reaches the client as an opaque 500 rather
    # than the error payload the rest of this API uses.
    if params.extract_images is not True or params.paginate is not False:
        return {
            "success": False,
            "error": "Local conversion API does not support disabling image extraction or enabling pagination.",
        }
    return await convert_pdf_local(params)


async def convert_pdf_remote(
    params: CommonParams,
    *,
    max_polls: int = 300,
    poll_interval: float = 2,
    request_timeout: float = 60,
):
    """Convert a PDF through the hosted Datalab API and wait for the result.

    The file at ``params.filepath`` is uploaded to ``app.state.DATALAB_URL``
    together with the remaining parameters, then the ``request_check_url``
    from the response is polled until the job reports ``"complete"``.

    Args:
        params: The conversion request.
        max_polls: Maximum number of status checks before giving up.
        poll_interval: Seconds to sleep before each status check.
        request_timeout: Timeout in seconds applied to each HTTP request.

    Returns:
        The JSON payload of the completed job. On failure (unreadable file,
        network/HTTP decoding error, rejected submission, or polling timeout)
        a dict with ``"success": False`` and an ``"error"`` message.
    """
    try:
        with open(params.filepath, "rb") as f:
            filedata = f.read()
    except OSError as e:
        return {"success": False, "error": str(e)}

    filename = os.path.basename(params.filepath)
    form_data = {
        'file': (filename, filedata, 'application/pdf'),
        'max_pages': (None, params.max_pages),
        'langs': (None, params.langs),
        'force_ocr': (None, params.force_ocr),
        'paginate': (None, params.paginate),
        'extract_images': (None, params.extract_images),
    }

    headers = {"X-API-Key": app.state.API_KEY}

    try:
        # `requests` is blocking; run it in a worker thread so the event loop
        # keeps serving other requests during the upload and the polls.
        response = await asyncio.to_thread(
            requests.post,
            app.state.DATALAB_URL,
            files=form_data,
            headers=headers,
            timeout=request_timeout,
        )
        data = response.json()

        check_url = data.get("request_check_url")
        if not check_url:
            # The submission was not accepted; report it instead of raising KeyError.
            return {
                "success": False,
                "error": data.get("error") or f"Datalab API did not return a check URL: {data}",
            }

        for _ in range(max_polls):
            await asyncio.sleep(poll_interval)
            response = await asyncio.to_thread(
                requests.get,
                check_url,
                headers=headers,
                timeout=request_timeout,
            )
            data = response.json()

            if data.get("status") == "complete":
                return data
    except (requests.RequestException, ValueError) as e:
        # Covers connection errors, timeouts and non-JSON response bodies.
        return {"success": False, "error": str(e)}

    # Polling budget exhausted: make the failure explicit rather than handing
    # back a payload that still describes an unfinished job.
    return {
        "success": False,
        "error": "Timed out waiting for the Datalab API to finish the conversion.",
        "status": data.get("status"),
    }


async def convert_pdf_local(params: CommonParams):
    """Convert a PDF with the locally loaded models.

    Args:
        params: The conversion request. ``filepath``, ``max_pages``, ``langs``
            and ``force_ocr`` are used here.

    Returns:
        On success, a dict with the ``markdown`` text, ``images`` (a mapping of
        image name to base64-encoded PNG data), ``metadata`` and
        ``"success": True``. If the conversion raises, a dict with
        ``"success": False`` and the exception text under ``"error"``.
    """
    # The request carries languages as one comma-separated string, while the
    # converter works on a sequence of languages; a raw string would be iterated
    # character by character. An empty value falls back to None (converter default).
    langs = None
    if params.langs:
        langs = [lang.strip() for lang in params.langs.split(",") if lang.strip()] or None

    try:
        full_text, images, metadata = convert_single_pdf(
            params.filepath,
            app_data["models"],
            max_pages=params.max_pages,
            langs=langs,
            ocr_all_pages=params.force_ocr
        )
    except Exception as e:
        # Request boundary: report any conversion failure to the client as a
        # payload instead of letting it become an unhandled server error.
        return {
            "success": False,
            "error": str(e),
        }

    # Images are serialized to PNG and base64-encoded so they fit in the JSON response.
    encoded = {}
    for k, v in images.items():
        byte_stream = io.BytesIO()
        v.save(byte_stream, format="PNG")
        encoded[k] = base64.b64encode(byte_stream.getvalue()).decode("utf-8")

    return {
        "markdown": full_text,
        "images": encoded,
        "metadata": metadata,
        "success": True
    }


def main():
    """Parse the command-line options, configure the app state and start uvicorn.

    The server runs in local mode exactly when no ``--api_key`` is given;
    otherwise requests are sent to the URL given by ``--datalab_url``.
    """
    option_specs = (
        ("--port", {"type": int, "default": 8000, "help": "Port to run the server on"}),
        ("--host", {"type": str, "default": "127.0.0.1", "help": "Host to run the server on"}),
        ("--api_key", {"type": str, "default": None, "help": "API key for the Datalab API. If not specified, API will run locally."}),
        ("--datalab_url", {"type": str, "default": "https://api.datalab.to/api/v1/marker", "help": "The URL for the Datalab API"}),
    )

    cli = argparse.ArgumentParser(description="Convert PDFs to markdown.")
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    options = cli.parse_args()

    # Handlers and the lifespan hook read these values from app.state.
    api_key = options.api_key
    state = app.state
    state.API_KEY = api_key
    state.LOCAL = api_key is None
    state.DATALAB_URL = options.datalab_url

    # Blocks until the server is shut down.
    uvicorn.run(app, host=options.host, port=options.port)


# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading
Loading