-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #14 from AnswerDotAI/add-dev-deps-and-apply-linter
Add dev deps and apply linter
- Loading branch information
Showing
8 changed files
with
302 additions
and
163 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
name: Ruff | ||
on: pull_request | ||
jobs: | ||
ruff: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v3 | ||
- name: "Linting & Flaking" | ||
uses: chartboost/ruff-action@v1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from .RAGModel import RAGMultiModalModel | ||
from importlib.metadata import version | ||
|
||
from .RAGModel import RAGMultiModalModel | ||
|
||
__version__ = version("Byaldi") | ||
__all__ = ["RAGMultiModalModel"] |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,51 +1,52 @@ | ||
[build-system] | ||
requires = ["setuptools"] | ||
build-backend = "setuptools.build_meta" | ||
requires = ["setuptools"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
[tool.setuptools] | ||
packages = [ | ||
"byaldi" | ||
] | ||
packages = ["byaldi"] | ||
|
||
[project] | ||
name = "Byaldi" | ||
version = "0.0.2post2" | ||
description = "Use late-interaction multi-modal models such as ColPALI in just a few lines of code." | ||
name = "Byaldi" | ||
version = "0.0.4" | ||
description = "Use late-interaction multi-modal models such as ColPali in just a few lines of code." | ||
readme = "README.md" | ||
requires-python = ">=3.8" | ||
license = {file = "LICENSE"} | ||
keywords = ["reranking", "retrieval", "rag", "nlp", "colpali", "colbert", "multi-modal"] | ||
authors = [ | ||
{name = "Ben Clavié", email = "[email protected]" } | ||
requires-python = ">=3.9" | ||
license = { file = "LICENSE" } | ||
keywords = [ | ||
"reranking", | ||
"retrieval", | ||
"rag", | ||
"nlp", | ||
"colpali", | ||
"colbert", | ||
"multi-modal", | ||
] | ||
authors = [{ name = "Ben Clavié", email = "[email protected]" }] | ||
maintainers = [ | ||
{name = "Ben Clavié", email = "[email protected]" } | ||
{ name = "Ben Clavié", email = "[email protected]" }, | ||
{ name = "Tony Wu", email = "[email protected]" }, | ||
] | ||
|
||
dependencies = [ | ||
"transformers", | ||
"torch", | ||
"ninja", | ||
"pdf2image", | ||
"srsly", | ||
"colpali-engine==0.2.2", | ||
"mteb==1.6.35", | ||
"colpali-engine==0.2.2", | ||
"ml-dtypes", | ||
"mteb==1.6.35", | ||
"ninja", | ||
"pdf2image", | ||
"srsly", | ||
"torch", | ||
"transformers", | ||
] | ||
|
||
|
||
[project.optional-dependencies] | ||
server = [ | ||
"uvicorn", | ||
"fastapi" | ||
] | ||
dev = ["pytest>=7.4.0", "ruff>=0.1.9"] | ||
server = ["uvicorn", "fastapi"] | ||
|
||
[project.urls] | ||
"Homepage" = "https://github.com/answerdotai/byaldi" | ||
|
||
[tool.pytest.ini_options] | ||
filterwarnings = [ | ||
"ignore::Warning" | ||
] | ||
filterwarnings = ["ignore::Warning"] | ||
|
||
[tool.ruff] | ||
# Exclude a variety of commonly ignored directories. | ||
|
@@ -83,23 +84,17 @@ target-version = "py39" | |
|
||
[tool.ruff.lint] | ||
select = [ | ||
# bugbear rules | ||
"B", | ||
"I", | ||
# remove unused imports | ||
"F401", | ||
# bare except statements | ||
"E722", | ||
# unused arguments | ||
"ARG", | ||
] | ||
ignore = [ | ||
"B006", | ||
"B018", | ||
# bugbear rules | ||
"B", | ||
"I", | ||
# remove unused imports | ||
"F401", | ||
# bare except statements | ||
"E722", | ||
# unused arguments | ||
"ARG", | ||
] | ||
ignore = ["B006", "B018"] | ||
|
||
unfixable = [ | ||
"T201", | ||
"T203", | ||
] | ||
unfixable = ["T201", "T203"] | ||
ignore-init-module-imports = true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,120 +1,137 @@ | ||
import os | ||
from pathlib import Path | ||
from byaldi import RAGMultiModalModel | ||
|
||
|
||
def test_single_pdf(): | ||
print("Testing single PDF indexing and retrieval...") | ||
|
||
# Initialize the model | ||
model = RAGMultiModalModel.from_pretrained("vidore/colpali") | ||
|
||
# Index a single PDF | ||
model.index( | ||
input_path="docs/attention.pdf", | ||
index_name="attention_index", | ||
store_collection_with_index=True, | ||
overwrite=True | ||
overwrite=True, | ||
) | ||
|
||
# Test retrieval | ||
queries = [ | ||
"How does the positional encoding thing work?", | ||
"what's the BLEU score of this new strange method?" | ||
"what's the BLEU score of this new strange method?", | ||
] | ||
|
||
for query in queries: | ||
results = model.search(query, k=3) | ||
|
||
print(f"\nQuery: {query}") | ||
for result in results: | ||
print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}") | ||
|
||
print( | ||
f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}" | ||
) | ||
|
||
# Check if the expected page (6 for positional encoding) is in the top results | ||
if "positional encoding" in query.lower(): | ||
assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query" | ||
|
||
assert any( | ||
r.page_num == 6 for r in results | ||
), "Expected page 6 for positional encoding query" | ||
|
||
# Check if at least one of the expected pages (8 or 9 for BLEU score) is in the top results | |
if "bleu score" in query.lower(): | ||
assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query" | ||
|
||
assert any( | ||
r.page_num in [8, 9] for r in results | ||
), "Expected pages 8 or 9 for BLEU score query" | ||
|
||
print("Single PDF test completed.") | ||
|
||
|
||
def test_multi_document(): | ||
print("\nTesting multi-document indexing and retrieval...") | ||
|
||
# Initialize the model | ||
model = RAGMultiModalModel.from_pretrained("vidore/colpali") | ||
|
||
# Index a directory of documents | ||
model.index( | ||
input_path="docs/", | ||
index_name="multi_doc_index", | ||
store_collection_with_index=True, | ||
overwrite=True | ||
overwrite=True, | ||
) | ||
|
||
# Test retrieval | ||
queries = [ | ||
"How does the positional encoding thing work?", | ||
"what's the BLEU score of this new strange method?" | ||
"what's the BLEU score of this new strange method?", | ||
] | ||
|
||
for query in queries: | ||
results = model.search(query, k=5) | ||
|
||
print(f"\nQuery: {query}") | ||
for result in results: | ||
print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}") | ||
|
||
print( | ||
f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}" | ||
) | ||
|
||
# Check if the expected page (6 for positional encoding) is in the top results | ||
if "positional encoding" in query.lower(): | ||
assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query" | ||
|
||
assert any( | ||
r.page_num == 6 for r in results | ||
), "Expected page 6 for positional encoding query" | ||
|
||
# Check if at least one of the expected pages (8 or 9 for BLEU score) is in the top results | |
if "bleu score" in query.lower(): | ||
assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query" | ||
|
||
assert any( | ||
r.page_num in [8, 9] for r in results | ||
), "Expected pages 8 or 9 for BLEU score query" | ||
|
||
print("Multi-document test completed.") | ||
|
||
|
||
def test_add_to_index(): | ||
print("\nTesting adding to an existing index...") | ||
|
||
# Load the existing index | ||
model = RAGMultiModalModel.from_index("multi_doc_index") | ||
|
||
# Add new documents (the contents of docs/) to the index | |
model.add_to_index( | ||
input_item="docs/", | ||
store_collection_with_index=True, | ||
doc_id=[1002, 1003], | ||
metadata=[{"author": "John Doe", "year": 2023}] * 2 | ||
metadata=[{"author": "John Doe", "year": 2023}] * 2, | ||
) | ||
|
||
# Test retrieval with the updated index | ||
queries = [ | ||
"what's the BLEU score of this new strange method?" | ||
] | ||
|
||
queries = ["what's the BLEU score of this new strange method?"] | ||
|
||
for query in queries: | ||
results = model.search(query, k=3) | ||
|
||
print(f"\nQuery: {query}") | ||
for result in results: | ||
print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}") | ||
print( | ||
f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}" | ||
) | ||
print(f"Metadata: {result.metadata}") | ||
|
||
# Check if the expected page (6 for positional encoding) is in the top results | ||
if "positional encoding" in query.lower(): | ||
assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query" | ||
|
||
assert any( | ||
r.page_num == 6 for r in results | ||
), "Expected page 6 for positional encoding query" | ||
|
||
# Check if at least one of the expected pages (8 or 9 for BLEU score) is in the top results | |
if "bleu score" in query.lower(): | ||
assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query" | ||
|
||
assert any( | ||
r.page_num in [8, 9] for r in results | ||
), "Expected pages 8 or 9 for BLEU score query" | ||
|
||
print("Add to index test completed.") | ||
|
||
|
||
if __name__ == "__main__": | ||
test_single_pdf() | ||
test_multi_document() | ||
test_add_to_index() | ||
test_add_to_index() |