From fb8dc159f9ed3eb9998a9c17f32e05d8f94ca53d Mon Sep 17 00:00:00 2001 From: "Daniel N. Lang" Date: Thu, 19 Sep 2024 23:29:58 +0200 Subject: [PATCH 1/2] =?UTF-8?q?Adjust=20thread=5Fcount=20-=20about=2025s?= =?UTF-8?q?=20faster=20on=208-core=20Intel(R)=20Xeon(R)=20CPU=20@=202.20GH?= =?UTF-8?q?z=20this=20way=20than=20fixed=20thread=5Fcount=3D4=20Also=20add?= =?UTF-8?q?ed=20paths=5Fonly=20option=20to=20convert=5Ffrom=5Fpath=20which?= =?UTF-8?q?=20can=20significantly=20reduce=20memory=20consumption=20for=20?= =?UTF-8?q?large=20PDFs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- byaldi/colpali.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/byaldi/colpali.py b/byaldi/colpali.py index cc08fef..b30b8ad 100644 --- a/byaldi/colpali.py +++ b/byaldi/colpali.py @@ -1,5 +1,6 @@ import os import shutil +import tempfile # Import version directly from the package metadata from importlib.metadata import version @@ -477,15 +478,22 @@ def _process_and_add_to_index( """TODO: THERE ARE TOO MANY FUNCTIONS DOING THINGS HERE. 
I blame Claude, but this is temporary anyway.""" if isinstance(item, Path): if item.suffix.lower() == ".pdf": - images = convert_from_path(item) - for i, image in enumerate(images): - self._add_to_index( - image, - store_collection_with_index, - doc_id, - page_id=i + 1, - metadata=metadata, + with tempfile.TemporaryDirectory() as path: + images = convert_from_path( + item, + thread_count=os.cpu_count()-1, + output_folder=path, + paths_only=True ) + for i, image_path in enumerate(images): + image = Image.open(image_path) + self._add_to_index( + image, + store_collection_with_index, + doc_id, + page_id=i + 1, + metadata=metadata, + ) elif item.suffix.lower() in [".jpg", ".jpeg", ".png", ".bmp"]: image = Image.open(item) self._add_to_index( @@ -664,8 +672,12 @@ def encode_image( images.append(Image.open(os.path.join(item, file))) elif item.lower().endswith(".pdf"): # Process PDF - pdf_images = convert_from_path(item) - images.extend(pdf_images) + with tempfile.TemporaryDirectory() as path: + pdf_images = convert_from_path( + item, + thread_count=os.cpu_count()-1, output_folder=path + ) + images.extend(pdf_images) elif item.lower().endswith( (".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif") ): From dfb069ce724d2248e5e42c88928fd9ff90f596ed Mon Sep 17 00:00:00 2001 From: "Daniel N. Lang" Date: Thu, 19 Sep 2024 23:35:59 +0200 Subject: [PATCH 2/2] formatting --- byaldi/colpali.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/byaldi/colpali.py b/byaldi/colpali.py index b30b8ad..e9f5874 100644 --- a/byaldi/colpali.py +++ b/byaldi/colpali.py @@ -675,7 +675,8 @@ def encode_image( with tempfile.TemporaryDirectory() as path: pdf_images = convert_from_path( item, - thread_count=os.cpu_count()-1, output_folder=path + thread_count=os.cpu_count()-1, + output_folder=path ) images.extend(pdf_images) elif item.lower().endswith(