Skip to content

Commit

Permalink
Merge pull request #24 from velaia/main
Browse files — browse the repository at this point in the history
Improved performance and lower memory usage during PDF indexing
  • Loading branch information
bclavie authored Sep 23, 2024
2 parents fd60959 + 0f6fb60 commit 34fb0cf
Showing 1 changed file with 23 additions and 10 deletions.
33 changes: 23 additions & 10 deletions byaldi/colpali.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
import os
import shutil
import tempfile
from importlib.metadata import version
from pathlib import Path
from typing import Dict, List, Optional, Union, cast
Expand Down Expand Up @@ -464,15 +465,22 @@ def _process_and_add_to_index(
"""TODO: THERE ARE TOO MANY FUNCTIONS DOING THINGS HERE. I blame Claude, but this is temporary anyway."""
if isinstance(item, Path):
if item.suffix.lower() == ".pdf":
images = convert_from_path(item)
for i, image in enumerate(images):
self._add_to_index(
image,
store_collection_with_index,
doc_id,
page_id=i + 1,
metadata=metadata,
with tempfile.TemporaryDirectory() as path:
images = convert_from_path(
item,
thread_count=os.cpu_count()-1,
output_folder=path,
paths_only=True
)
for i, image_path in enumerate(images):
image = Image.open(image_path)
self._add_to_index(
image,
store_collection_with_index,
doc_id,
page_id=i + 1,
metadata=metadata,
)
elif item.suffix.lower() in [".jpg", ".jpeg", ".png", ".bmp"]:
image = Image.open(item)
self._add_to_index(
Expand Down Expand Up @@ -645,8 +653,13 @@ def encode_image(
images.append(Image.open(os.path.join(item, file)))
elif item.lower().endswith(".pdf"):
# Process PDF
pdf_images = convert_from_path(item)
images.extend(pdf_images)
with tempfile.TemporaryDirectory() as path:
pdf_images = convert_from_path(
item,
thread_count=os.cpu_count()-1,
output_folder=path
)
images.extend(pdf_images)
elif item.lower().endswith(
(".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif")
):
Expand Down

0 comments on commit 34fb0cf

Please sign in to comment.