
Commit dd10e68

Merge pull request #14 from AnswerDotAI/add-dev-deps-and-apply-linter
Add dev deps and apply linter
bclavie authored Sep 16, 2024
2 parents 922e0e9 + 2ca7737 commit dd10e68
Showing 8 changed files with 302 additions and 163 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/ruff.yml
@@ -0,0 +1,9 @@
+name: Ruff
+on: pull_request
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: "Linting & Flaking"
+        uses: chartboost/ruff-action@v1
5 changes: 2 additions & 3 deletions byaldi/RAGModel.py
@@ -1,6 +1,5 @@
 from pathlib import Path
-from typing import Any, List, Optional, Union, Dict
-from uuid import uuid4
+from typing import Dict, List, Optional, Union
 
 from PIL import Image
 
@@ -165,4 +164,4 @@ def search(
         return self.model.search(query, k, return_base64_results)
 
     def get_doc_ids_to_file_names(self):
-        return self.model.get_doc_ids_to_file_names()
+        return self.model.get_doc_ids_to_file_names()
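For orientation, a minimal usage sketch of the delegating method at the end of this file's diff. RAGMultiModalModel.from_index and the "attention_index" name are taken from tests/all.py later in this commit; the exact return shape of get_doc_ids_to_file_names is an assumption here, not something the diff confirms.

from byaldi import RAGMultiModalModel

# Load a previously built index (from_index is exercised in tests/all.py below).
model = RAGMultiModalModel.from_index("attention_index")

# Assumed to return a mapping of document IDs to the indexed file names;
# the method simply forwards to the underlying model, as the hunk shows.
print(model.get_doc_ids_to_file_names())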
3 changes: 2 additions & 1 deletion byaldi/__init__.py
@@ -1,5 +1,6 @@
-from .RAGModel import RAGMultiModalModel
 from importlib.metadata import version
 
+from .RAGModel import RAGMultiModalModel
+
 __version__ = version("Byaldi")
 __all__ = ["RAGMultiModalModel"]
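A quick note on the reordered import: version() here is importlib.metadata.version, which looks up the version of the installed "Byaldi" distribution rather than hard-coding a string. A minimal sketch of the same lookup:

from importlib.metadata import version

# Resolves to the installed distribution's version, e.g. "0.0.4" after
# the pyproject.toml bump in this commit.
print(version("Byaldi"))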
241 changes: 175 additions & 66 deletions byaldi/colpali.py

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion byaldi/objects.py
@@ -2,7 +2,14 @@
 
 
 class Result:
-    def __init__(self, doc_id: str, page_num: int, score: float, metadata: Optional[dict] = None, base64: Optional[str] = None):
+    def __init__(
+        self,
+        doc_id: str,
+        page_num: int,
+        score: float,
+        metadata: Optional[dict] = None,
+        base64: Optional[str] = None,
+    ):
         self.doc_id = doc_id
         self.page_num = page_num
         self.score = score
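A hedged usage sketch of the reformatted constructor. The field names come from the hunk above; the concrete values are hypothetical, and in normal use a Result is produced by a search call rather than built by hand.

from byaldi.objects import Result

hit = Result(
    doc_id="attention.pdf",  # hypothetical values, for illustration only
    page_num=6,
    score=0.87,
    metadata={"author": "John Doe", "year": 2023},
    base64=None,
)
print(hit.doc_id, hit.page_num, hit.score)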
6 changes: 4 additions & 2 deletions byaldi/utils.py
@@ -1,5 +1,6 @@
-from io import StringIO
 import sys
+from io import StringIO
 
+
 def capture_print(func):
     def wrapper(*args, **kwargs):
@@ -10,4 +11,5 @@ def wrapper(*args, **kwargs):
         finally:
             sys.stdout = original_stdout
         return result
-    return wrapper
+
+    return wrapper
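The diff only shows fragments of capture_print, so here is a minimal sketch of what the full decorator plausibly looks like, consistent with the imports and the finally/return lines above; the body of the try block is assumed.

import sys
from io import StringIO


def capture_print(func):
    def wrapper(*args, **kwargs):
        # Assumed body: swap sys.stdout for an in-memory buffer so anything
        # the wrapped function prints is swallowed, then restore it.
        original_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            result = func(*args, **kwargs)
        finally:
            sys.stdout = original_stdout
        return result

    return wrapper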
87 changes: 41 additions & 46 deletions pyproject.toml
@@ -1,51 +1,52 @@
 [build-system]
 requires = ["setuptools"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools]
-packages = [
-    "byaldi"
-]
+packages = ["byaldi"]
 
 [project]
-name = "Byaldi"
-version = "0.0.2post2"
-description = "Use late-interaction multi-modal models such as ColPALI in just a few lines of code."
+name = "Byaldi"
+version = "0.0.4"
+description = "Use late-interaction multi-modal models such as ColPali in just a few lines of code."
 readme = "README.md"
-requires-python = ">=3.8"
-license = {file = "LICENSE"}
-keywords = ["reranking", "retrieval", "rag", "nlp", "colpali", "colbert", "multi-modal"]
-authors = [
-    {name = "Ben Clavié", email = "[email protected]" }
-]
+requires-python = ">=3.9"
+license = { file = "LICENSE" }
+keywords = [
+    "reranking",
+    "retrieval",
+    "rag",
+    "nlp",
+    "colpali",
+    "colbert",
+    "multi-modal",
+]
+authors = [{ name = "Ben Clavié", email = "[email protected]" }]
 maintainers = [
-    {name = "Ben Clavié", email = "[email protected]" }
+    { name = "Ben Clavié", email = "[email protected]" },
+    { name = "Tony Wu", email = "[email protected]" },
 ]
 
 dependencies = [
-    "transformers",
-    "torch",
-    "ninja",
-    "pdf2image",
-    "srsly",
-    "colpali-engine==0.2.2",
-    "mteb==1.6.35",
+    "colpali-engine==0.2.2",
+    "ml-dtypes",
+    "mteb==1.6.35",
+    "ninja",
+    "pdf2image",
+    "srsly",
+    "torch",
+    "transformers",
 ]
 
-
 [project.optional-dependencies]
-server = [
-    "uvicorn",
-    "fastapi"
-]
+dev = ["pytest>=7.4.0", "ruff>=0.1.9"]
+server = ["uvicorn", "fastapi"]
 
 [project.urls]
 "Homepage" = "https://github.com/answerdotai/byaldi"
 
 [tool.pytest.ini_options]
-filterwarnings = [
-    "ignore::Warning"
-]
+filterwarnings = ["ignore::Warning"]
 
 [tool.ruff]
 # Exclude a variety of commonly ignored directories.
@@ -83,23 +84,17 @@ target-version = "py39"
 
 [tool.ruff.lint]
 select = [
-  # bugbear rules
-  "B",
-  "I",
-  # remove unused imports
-  "F401",
-  # bare except statements
-  "E722",
-  # unused arguments
-  "ARG",
-]
-ignore = [
-  "B006",
-  "B018",
-]
+    # bugbear rules
+    "B",
+    "I",
+    # remove unused imports
+    "F401",
+    # bare except statements
+    "E722",
+    # unused arguments
+    "ARG",
+]
+ignore = ["B006", "B018"]
 
-unfixable = [
-  "T201",
-  "T203",
-]
+unfixable = ["T201", "T203"]
 ignore-init-module-imports = true
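To make the lint configuration concrete, here is a small hypothetical module (not from this repository) annotated with the kinds of findings the selected rule families would raise, plus one that the ignore list suppresses.

import sys
import os  # I001 + F401: unsorted import block, and "os" is never used


def load(path, verbose):  # ARG001: unused function argument "verbose"
    try:
        return open(path).read()
    except:  # E722: bare "except" is flagged
        print("failed to read", path, file=sys.stderr)
        return None


def append(value, items=[]):  # B006 (mutable default) comes with "B" but sits in the ignore list above
    items.append(value)
    return items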
105 changes: 61 additions & 44 deletions tests/all.py
@@ -1,120 +1,137 @@
 import os
 from pathlib import Path
 from byaldi import RAGMultiModalModel
 
 
 def test_single_pdf():
     print("Testing single PDF indexing and retrieval...")
 
     # Initialize the model
     model = RAGMultiModalModel.from_pretrained("vidore/colpali")
 
     # Index a single PDF
     model.index(
         input_path="docs/attention.pdf",
         index_name="attention_index",
         store_collection_with_index=True,
-        overwrite=True
+        overwrite=True,
     )
 
     # Test retrieval
     queries = [
         "How does the positional encoding thing work?",
-        "what's the BLEU score of this new strange method?"
+        "what's the BLEU score of this new strange method?",
     ]
 
     for query in queries:
         results = model.search(query, k=3)
 
         print(f"\nQuery: {query}")
         for result in results:
-            print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")
+            print(
+                f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
+            )
 
         # Check if the expected page (6 for positional encoding) is in the top results
         if "positional encoding" in query.lower():
-            assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query"
+            assert any(
+                r.page_num == 6 for r in results
+            ), "Expected page 6 for positional encoding query"
 
         # Check if the expected pages (8 and 9 for BLEU score) are in the top results
         if "bleu score" in query.lower():
-            assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query"
+            assert any(
+                r.page_num in [8, 9] for r in results
+            ), "Expected pages 8 or 9 for BLEU score query"
 
     print("Single PDF test completed.")
 
 
 def test_multi_document():
     print("\nTesting multi-document indexing and retrieval...")
 
     # Initialize the model
     model = RAGMultiModalModel.from_pretrained("vidore/colpali")
 
     # Index a directory of documents
     model.index(
         input_path="docs/",
         index_name="multi_doc_index",
         store_collection_with_index=True,
-        overwrite=True
+        overwrite=True,
     )
 
     # Test retrieval
     queries = [
         "How does the positional encoding thing work?",
-        "what's the BLEU score of this new strange method?"
+        "what's the BLEU score of this new strange method?",
    ]
 
     for query in queries:
         results = model.search(query, k=5)
 
         print(f"\nQuery: {query}")
         for result in results:
-            print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")
+            print(
+                f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
+            )
 
         # Check if the expected page (6 for positional encoding) is in the top results
         if "positional encoding" in query.lower():
-            assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query"
+            assert any(
+                r.page_num == 6 for r in results
+            ), "Expected page 6 for positional encoding query"
 
         # Check if the expected pages (8 and 9 for BLEU score) are in the top results
        if "bleu score" in query.lower():
-            assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query"
+            assert any(
+                r.page_num in [8, 9] for r in results
+            ), "Expected pages 8 or 9 for BLEU score query"
 
     print("Multi-document test completed.")
 
 
 def test_add_to_index():
     print("\nTesting adding to an existing index...")
 
     # Load the existing index
     model = RAGMultiModalModel.from_index("multi_doc_index")
 
     # Add a new document to the index
     model.add_to_index(
         input_item="docs/",
         store_collection_with_index=True,
         doc_id=[1002, 1003],
-        metadata=[{"author": "John Doe", "year": 2023}] * 2
+        metadata=[{"author": "John Doe", "year": 2023}] * 2,
     )
 
     # Test retrieval with the updated index
-    queries = [
-        "what's the BLEU score of this new strange method?"
-    ]
+    queries = ["what's the BLEU score of this new strange method?"]
 
     for query in queries:
         results = model.search(query, k=3)
 
         print(f"\nQuery: {query}")
         for result in results:
-            print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")
+            print(
+                f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
+            )
             print(f"Metadata: {result.metadata}")
 
         # Check if the expected page (6 for positional encoding) is in the top results
         if "positional encoding" in query.lower():
-            assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query"
+            assert any(
+                r.page_num == 6 for r in results
+            ), "Expected page 6 for positional encoding query"
 
         # Check if the expected pages (8 and 9 for BLEU score) are in the top results
         if "bleu score" in query.lower():
-            assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query"
+            assert any(
+                r.page_num in [8, 9] for r in results
+            ), "Expected pages 8 or 9 for BLEU score query"
 
     print("Add to index test completed.")
 
 
 if __name__ == "__main__":
     test_single_pdf()
     test_multi_document()
-    test_add_to_index()
+    test_add_to_index()
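The suite uses plain asserts and a __main__ guard, so it runs directly as a script; with the pytest dev dependency added in this commit, the same test_* functions should also run under pytest when the file path is passed explicitly (the docs/ PDFs and a ColPali checkpoint are assumed to be available). A minimal sketch of invoking it programmatically:

import pytest

# Roughly equivalent to running "pytest tests/all.py -v" from the repository root.
raise SystemExit(pytest.main(["tests/all.py", "-v"]))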
