
Commit dd10e68

Merge pull request #14 from AnswerDotAI/add-dev-deps-and-apply-linter
Add dev deps and apply linter
bclavie authored Sep 16, 2024
2 parents 922e0e9 + 2ca7737 commit dd10e68
Showing 8 changed files with 302 additions and 163 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/ruff.yml
@@ -0,0 +1,9 @@
+name: Ruff
+on: pull_request
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: "Linting & Flaking"
+        uses: chartboost/ruff-action@v1
5 changes: 2 additions & 3 deletions byaldi/RAGModel.py
@@ -1,6 +1,5 @@
 from pathlib import Path
-from typing import Any, List, Optional, Union, Dict
-from uuid import uuid4
+from typing import Dict, List, Optional, Union
 
 from PIL import Image
 
@@ -165,4 +164,4 @@ def search(
         return self.model.search(query, k, return_base64_results)
 
     def get_doc_ids_to_file_names(self):
-        return self.model.get_doc_ids_to_file_names()
+        return self.model.get_doc_ids_to_file_names()
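For orientation, a minimal usage sketch of the delegating method at the end of this file's diff. RAGMultiModalModel.from_index and the "attention_index" name are taken from tests/all.py later in this commit; the exact return shape of get_doc_ids_to_file_names is an assumption here, not something the diff confirms.

from byaldi import RAGMultiModalModel

# Load a previously built index (from_index is exercised in tests/all.py below).
model = RAGMultiModalModel.from_index("attention_index")

# Assumed to return a mapping of document IDs to the indexed file names;
# the method simply forwards to the underlying model, as the hunk shows.
print(model.get_doc_ids_to_file_names())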
3 changes: 2 additions & 1 deletion byaldi/__init__.py
@@ -1,5 +1,6 @@
-from .RAGModel import RAGMultiModalModel
 from importlib.metadata import version
 
+from .RAGModel import RAGMultiModalModel
+
 __version__ = version("Byaldi")
 __all__ = ["RAGMultiModalModel"]
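A quick note on the reordered import: version() here is importlib.metadata.version, which looks up the version of the installed "Byaldi" distribution rather than hard-coding a string. A minimal sketch of the same lookup:

from importlib.metadata import version

# Resolves to the installed distribution's version, e.g. "0.0.4" after
# the pyproject.toml bump in this commit.
print(version("Byaldi"))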
241 changes: 175 additions & 66 deletions byaldi/colpali.py

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion byaldi/objects.py
@@ -2,7 +2,14 @@
 
 
 class Result:
-    def __init__(self, doc_id: str, page_num: int, score: float, metadata: Optional[dict] = None, base64: Optional[str] = None):
+    def __init__(
+        self,
+        doc_id: str,
+        page_num: int,
+        score: float,
+        metadata: Optional[dict] = None,
+        base64: Optional[str] = None,
+    ):
         self.doc_id = doc_id
         self.page_num = page_num
         self.score = score
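A hedged usage sketch of the reformatted constructor. The field names come from the hunk above; the concrete values are hypothetical, and in normal use a Result is produced by a search call rather than built by hand.

from byaldi.objects import Result

hit = Result(
    doc_id="attention.pdf",  # hypothetical values, for illustration only
    page_num=6,
    score=0.87,
    metadata={"author": "John Doe", "year": 2023},
    base64=None,
)
print(hit.doc_id, hit.page_num, hit.score)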
6 changes: 4 additions & 2 deletions byaldi/utils.py
@@ -1,5 +1,6 @@
-from io import StringIO
 import sys
+from io import StringIO
 
+
 def capture_print(func):
     def wrapper(*args, **kwargs):
@@ -10,4 +11,5 @@ def wrapper(*args, **kwargs):
         finally:
             sys.stdout = original_stdout
         return result
-    return wrapper
+
+    return wrapper
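The diff only shows fragments of capture_print, so here is a minimal sketch of what the full decorator plausibly looks like, consistent with the imports and the finally/return lines above; the body of the try block is assumed.

import sys
from io import StringIO


def capture_print(func):
    def wrapper(*args, **kwargs):
        # Assumed body: swap sys.stdout for an in-memory buffer so anything
        # the wrapped function prints is swallowed, then restore it.
        original_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            result = func(*args, **kwargs)
        finally:
            sys.stdout = original_stdout
        return result

    return wrapper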
87 changes: 41 additions & 46 deletions pyproject.toml
@@ -1,51 +1,52 @@
 [build-system]
 requires = ["setuptools"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools]
-packages = [
-    "byaldi"
-]
+packages = ["byaldi"]
 
 [project]
-name = "Byaldi"
-version = "0.0.2post2"
-description = "Use late-interaction multi-modal models such as ColPALI in just a few lines of code."
+name = "Byaldi"
+version = "0.0.4"
+description = "Use late-interaction multi-modal models such as ColPali in just a few lines of code."
 readme = "README.md"
-requires-python = ">=3.8"
-license = {file = "LICENSE"}
-keywords = ["reranking", "retrieval", "rag", "nlp", "colpali", "colbert", "multi-modal"]
-authors = [
-    {name = "Ben Clavié", email = "[email protected]" }
-]
+requires-python = ">=3.9"
+license = { file = "LICENSE" }
+keywords = [
+    "reranking",
+    "retrieval",
+    "rag",
+    "nlp",
+    "colpali",
+    "colbert",
+    "multi-modal",
+]
+authors = [{ name = "Ben Clavié", email = "[email protected]" }]
 maintainers = [
-    {name = "Ben Clavié", email = "[email protected]" }
+    { name = "Ben Clavié", email = "[email protected]" },
+    { name = "Tony Wu", email = "[email protected]" },
 ]
 
 dependencies = [
-    "transformers",
-    "torch",
-    "ninja",
-    "pdf2image",
-    "srsly",
-    "colpali-engine==0.2.2",
-    "mteb==1.6.35",
+    "colpali-engine==0.2.2",
+    "ml-dtypes",
+    "mteb==1.6.35",
+    "ninja",
+    "pdf2image",
+    "srsly",
+    "torch",
+    "transformers",
 ]
 
-
 [project.optional-dependencies]
-server = [
-    "uvicorn",
-    "fastapi"
-]
+dev = ["pytest>=7.4.0", "ruff>=0.1.9"]
+server = ["uvicorn", "fastapi"]
 
 [project.urls]
 "Homepage" = "https://github.com/answerdotai/byaldi"
 
 [tool.pytest.ini_options]
-filterwarnings = [
-    "ignore::Warning"
-]
+filterwarnings = ["ignore::Warning"]
 
 [tool.ruff]
 # Exclude a variety of commonly ignored directories.
@@ -83,23 +84,17 @@ target-version = "py39"
 
 [tool.ruff.lint]
 select = [
-  # bugbear rules
-  "B",
-  "I",
-  # remove unused imports
-  "F401",
-  # bare except statements
-  "E722",
-  # unused arguments
-  "ARG",
-]
-ignore = [
-  "B006",
-  "B018",
-]
+    # bugbear rules
+    "B",
+    "I",
+    # remove unused imports
+    "F401",
+    # bare except statements
+    "E722",
+    # unused arguments
+    "ARG",
+]
+ignore = ["B006", "B018"]
 
-unfixable = [
-  "T201",
-  "T203",
-]
+unfixable = ["T201", "T203"]
 ignore-init-module-imports = true
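To make the lint configuration concrete, here is a small hypothetical module (not from this repository) annotated with the kinds of findings the selected rule families would raise, plus one that the ignore list suppresses.

import sys
import os  # I001 + F401: unsorted import block, and "os" is never used


def load(path, verbose):  # ARG001: unused function argument "verbose"
    try:
        return open(path).read()
    except:  # E722: bare "except" is flagged
        print("failed to read", path, file=sys.stderr)
        return None


def append(value, items=[]):  # B006 (mutable default) comes with "B" but sits in the ignore list above
    items.append(value)
    return items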
105 changes: 61 additions & 44 deletions tests/all.py
@@ -1,120 +1,137 @@
 import os
 from pathlib import Path
 from byaldi import RAGMultiModalModel
 
 
 def test_single_pdf():
     print("Testing single PDF indexing and retrieval...")
 
     # Initialize the model
     model = RAGMultiModalModel.from_pretrained("vidore/colpali")
 
     # Index a single PDF
     model.index(
         input_path="docs/attention.pdf",
         index_name="attention_index",
         store_collection_with_index=True,
-        overwrite=True
+        overwrite=True,
     )
 
     # Test retrieval
     queries = [
         "How does the positional encoding thing work?",
-        "what's the BLEU score of this new strange method?"
+        "what's the BLEU score of this new strange method?",
     ]
 
     for query in queries:
         results = model.search(query, k=3)
 
         print(f"\nQuery: {query}")
         for result in results:
-            print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")
+            print(
+                f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
+            )
 
         # Check if the expected page (6 for positional encoding) is in the top results
         if "positional encoding" in query.lower():
-            assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query"
+            assert any(
+                r.page_num == 6 for r in results
+            ), "Expected page 6 for positional encoding query"
 
         # Check if the expected pages (8 and 9 for BLEU score) are in the top results
         if "bleu score" in query.lower():
-            assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query"
+            assert any(
+                r.page_num in [8, 9] for r in results
+            ), "Expected pages 8 or 9 for BLEU score query"
 
     print("Single PDF test completed.")
 
 
 def test_multi_document():
     print("\nTesting multi-document indexing and retrieval...")
 
     # Initialize the model
     model = RAGMultiModalModel.from_pretrained("vidore/colpali")
 
     # Index a directory of documents
     model.index(
         input_path="docs/",
         index_name="multi_doc_index",
         store_collection_with_index=True,
-        overwrite=True
+        overwrite=True,
     )
 
     # Test retrieval
     queries = [
         "How does the positional encoding thing work?",
-        "what's the BLEU score of this new strange method?"
+        "what's the BLEU score of this new strange method?",
    ]
 
     for query in queries:
         results = model.search(query, k=5)
 
         print(f"\nQuery: {query}")
         for result in results:
-            print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")
+            print(
+                f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
+            )
 
         # Check if the expected page (6 for positional encoding) is in the top results
         if "positional encoding" in query.lower():
-            assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query"
+            assert any(
+                r.page_num == 6 for r in results
+            ), "Expected page 6 for positional encoding query"
 
         # Check if the expected pages (8 and 9 for BLEU score) are in the top results
        if "bleu score" in query.lower():
-            assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query"
+            assert any(
+                r.page_num in [8, 9] for r in results
+            ), "Expected pages 8 or 9 for BLEU score query"
 
     print("Multi-document test completed.")
 
 
 def test_add_to_index():
     print("\nTesting adding to an existing index...")
 
     # Load the existing index
     model = RAGMultiModalModel.from_index("multi_doc_index")
 
     # Add a new document to the index
     model.add_to_index(
         input_item="docs/",
         store_collection_with_index=True,
         doc_id=[1002, 1003],
-        metadata=[{"author": "John Doe", "year": 2023}] * 2
+        metadata=[{"author": "John Doe", "year": 2023}] * 2,
     )
 
     # Test retrieval with the updated index
-    queries = [
-        "what's the BLEU score of this new strange method?"
-    ]
+    queries = ["what's the BLEU score of this new strange method?"]
 
     for query in queries:
         results = model.search(query, k=3)
 
         print(f"\nQuery: {query}")
         for result in results:
-            print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")
+            print(
+                f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
+            )
             print(f"Metadata: {result.metadata}")
 
         # Check if the expected page (6 for positional encoding) is in the top results
         if "positional encoding" in query.lower():
-            assert any(r.page_num == 6 for r in results), "Expected page 6 for positional encoding query"
+            assert any(
+                r.page_num == 6 for r in results
+            ), "Expected page 6 for positional encoding query"
 
         # Check if the expected pages (8 and 9 for BLEU score) are in the top results
         if "bleu score" in query.lower():
-            assert any(r.page_num in [8, 9] for r in results), "Expected pages 8 or 9 for BLEU score query"
+            assert any(
+                r.page_num in [8, 9] for r in results
+            ), "Expected pages 8 or 9 for BLEU score query"
 
     print("Add to index test completed.")
 
 
 if __name__ == "__main__":
     test_single_pdf()
     test_multi_document()
-    test_add_to_index()
+    test_add_to_index()
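The suite uses plain asserts and a __main__ guard, so it runs directly as a script; with the pytest dev dependency added in this commit, the same test_* functions should also run under pytest when the file path is passed explicitly (the docs/ PDFs and a ColPali checkpoint are assumed to be available). A minimal sketch of invoking it programmatically:

import pytest

# Roughly equivalent to running "pytest tests/all.py -v" from the repository root.
raise SystemExit(pytest.main(["tests/all.py", "-v"]))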
