Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dev deps and apply linter #14

Merged
merged 6 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/ruff.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
name: Ruff
on: pull_request
jobs:
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: "Linting & Flaking"
uses: chartboost/ruff-action@v1
5 changes: 2 additions & 3 deletions byaldi/RAGModel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from pathlib import Path
from typing import Any, List, Optional, Union, Dict
from uuid import uuid4
from typing import Dict, List, Optional, Union

from PIL import Image

Expand Down Expand Up @@ -164,4 +163,4 @@ def search(
return self.model.search(query, k, return_base64_results)

def get_doc_ids_to_file_names(self):
return self.model.get_doc_ids_to_file_names()
return self.model.get_doc_ids_to_file_names()
3 changes: 2 additions & 1 deletion byaldi/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .RAGModel import RAGMultiModalModel
from importlib.metadata import version

from .RAGModel import RAGMultiModalModel

__version__ = version("Byaldi")
__all__ = ["RAGMultiModalModel"]
241 changes: 175 additions & 66 deletions byaldi/colpali.py

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion byaldi/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,14 @@


class Result:
def __init__(self, doc_id: str, page_num: int, score: float, metadata: Optional[dict] = None, base64: Optional[str] = None):
def __init__(
self,
doc_id: str,
page_num: int,
score: float,
metadata: Optional[dict] = None,
base64: Optional[str] = None,
):
self.doc_id = doc_id
self.page_num = page_num
self.score = score
Expand Down
6 changes: 4 additions & 2 deletions byaldi/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from io import StringIO
import sys
from io import StringIO


def capture_print(func):
def wrapper(*args, **kwargs):
Expand All @@ -10,4 +11,5 @@ def wrapper(*args, **kwargs):
finally:
sys.stdout = original_stdout
return result
return wrapper

return wrapper
87 changes: 41 additions & 46 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,51 +1,52 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[tool.setuptools]
packages = [
"byaldi"
]
packages = ["byaldi"]

[project]
name = "Byaldi"
version = "0.0.2post2"
description = "Use late-interaction multi-modal models such as ColPALI in just a few lines of code."
name = "Byaldi"
version = "0.0.4"
description = "Use late-interaction multi-modal models such as ColPali in just a few lines of code."
readme = "README.md"
requires-python = ">=3.8"
license = {file = "LICENSE"}
keywords = ["reranking", "retrieval", "rag", "nlp", "colpali", "colbert", "multi-modal"]
authors = [
{name = "Ben Clavié", email = "[email protected]" }
requires-python = ">=3.9"
license = { file = "LICENSE" }
keywords = [
"reranking",
"retrieval",
"rag",
"nlp",
"colpali",
"colbert",
"multi-modal",
]
authors = [{ name = "Ben Clavié", email = "[email protected]" }]
maintainers = [
{name = "Ben Clavié", email = "[email protected]" }
{ name = "Ben Clavié", email = "[email protected]" },
{ name = "Tony Wu", email = "[email protected]" },
]

dependencies = [
"transformers",
"torch",
"ninja",
"pdf2image",
"srsly",
"colpali-engine==0.2.2",
"mteb==1.6.35",
"colpali-engine==0.2.2",
"ml-dtypes",
"mteb==1.6.35",
"ninja",
"pdf2image",
"srsly",
"torch",
"transformers",
]


[project.optional-dependencies]
server = [
"uvicorn",
"fastapi"
]
dev = ["pytest>=7.4.0", "ruff>=0.1.9"]
server = ["uvicorn", "fastapi"]

[project.urls]
"Homepage" = "https://github.com/answerdotai/byaldi"

[tool.pytest.ini_options]
filterwarnings = [
"ignore::Warning"
]
filterwarnings = ["ignore::Warning"]

[tool.ruff]
# Exclude a variety of commonly ignored directories.
Expand Down Expand Up @@ -83,23 +84,17 @@ target-version = "py39"

[tool.ruff.lint]
select = [
# bugbear rules
"B",
"I",
# remove unused imports
"F401",
# bare except statements
"E722",
# unused arguments
"ARG",
]
ignore = [
"B006",
"B018",
# bugbear rules
"B",
"I",
# remove unused imports
"F401",
# bare except statements
"E722",
# unused arguments
"ARG",
]
ignore = ["B006", "B018"]

unfixable = [
"T201",
"T203",
]
unfixable = ["T201", "T203"]
ignore-init-module-imports = true
105 changes: 61 additions & 44 deletions tests/all.py
Original file line number Diff line number Diff line change
@@ -1,120 +1,137 @@
import os
from pathlib import Path
from byaldi import RAGMultiModalModel


def test_single_pdf():
print("Testing single PDF indexing and retrieval...")

# Initialize the model
model = RAGMultiModalModel.from_pretrained("vidore/colpali")

# Index a single PDF
model.index(
input_path="docs/attention.pdf",
index_name="attention_index",
store_collection_with_index=True,
overwrite=True
overwrite=True,
)

# Test retrieval
queries = [
"How does the positional encoding thing work?",
"what's the BLEU score of this new strange method?"
"what's the BLEU score of this new strange method?",
]

for query in queries:
results = model.search(query, k=3)

print(f"\nQuery: {query}")
for result in results:
print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")

print(
f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
)

# Check if the expected page (6 for positional encoding) is in the top results
if "positional encoding" in query.lower():
assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query"

assert any(
r.page_num == 6 for r in results
), "Expected page 6 for positional encoding query"

# Check if the expected pages (8 and 9 for BLEU score) are in the top results
if "bleu score" in query.lower():
assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query"

assert any(
r.page_num in [8, 9] for r in results
), "Expected pages 8 or 9 for BLEU score query"

print("Single PDF test completed.")


def test_multi_document():
print("\nTesting multi-document indexing and retrieval...")

# Initialize the model
model = RAGMultiModalModel.from_pretrained("vidore/colpali")

# Index a directory of documents
model.index(
input_path="docs/",
index_name="multi_doc_index",
store_collection_with_index=True,
overwrite=True
overwrite=True,
)

# Test retrieval
queries = [
"How does the positional encoding thing work?",
"what's the BLEU score of this new strange method?"
"what's the BLEU score of this new strange method?",
]

for query in queries:
results = model.search(query, k=5)

print(f"\nQuery: {query}")
for result in results:
print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")

print(
f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
)

# Check if the expected page (6 for positional encoding) is in the top results
if "positional encoding" in query.lower():
assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query"

assert any(
r.page_num == 6 for r in results
), "Expected page 6 for positional encoding query"

# Check if the expected pages (8 and 9 for BLEU score) are in the top results
if "bleu score" in query.lower():
assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query"

assert any(
r.page_num in [8, 9] for r in results
), "Expected pages 8 or 9 for BLEU score query"

print("Multi-document test completed.")


def test_add_to_index():
print("\nTesting adding to an existing index...")

# Load the existing index
model = RAGMultiModalModel.from_index("multi_doc_index")

# Add a new document to the index
model.add_to_index(
input_item="docs/",
store_collection_with_index=True,
doc_id=[1002, 1003],
metadata=[{"author": "John Doe", "year": 2023}] * 2
metadata=[{"author": "John Doe", "year": 2023}] * 2,
)

# Test retrieval with the updated index
queries = [
"what's the BLEU score of this new strange method?"
]

queries = ["what's the BLEU score of this new strange method?"]

for query in queries:
results = model.search(query, k=3)

print(f"\nQuery: {query}")
for result in results:
print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")
print(
f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
)
print(f"Metadata: {result.metadata}")

# Check if the expected page (6 for positional encoding) is in the top results
if "positional encoding" in query.lower():
assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query"

assert any(
r.page_num == 6 for r in results
), "Expected page 6 for positional encoding query"

# Check if the expected pages (8 and 9 for BLEU score) are in the top results
if "bleu score" in query.lower():
assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query"

assert any(
r.page_num in [8, 9] for r in results
), "Expected pages 8 or 9 for BLEU score query"

print("Add to index test completed.")


if __name__ == "__main__":
test_single_pdf()
test_multi_document()
test_add_to_index()
test_add_to_index()
Loading