Merge pull request #43 from weni-ai/feature/index-content
add: index document full content; endpoint to return content
zMardone authored Jul 4, 2024
2 parents 3abfc55 + ce7800c commit a0b486f
Showing 9 changed files with 154 additions and 24 deletions.
10 changes: 9 additions & 1 deletion app/celery.py
@@ -26,13 +26,21 @@ def index_file_data(content_base: Dict) -> bool:
os.environ.get("AWS_STORAGE_ACCESS_KEY"),
os.environ.get("AWS_STORAGE_SECRET_KEY")
)
content_base_indexer = main_app.content_base_indexer
text_splitter = TextSplitter(character_text_splitter())
manager = IndexerFileManager(
file_downloader,
main_app.content_base_indexer,
content_base_indexer,
text_splitter,
)
index_result: bool = manager.index_file_url(content_base)
embbed_result: bool = content_base_indexer.check_if_doc_was_embedded_document(
file_uuid=content_base.get("file_uuid"),
content_base_uuid=str(content_base.get('content_base')),
)

index_result = index_result and embbed_result

NexusRESTClient().index_succedded(
task_succeded=index_result,
nexus_task_uuid=content_base.get("task_uuid"),
3 changes: 3 additions & 0 deletions app/config.py
@@ -60,3 +60,6 @@ def __init__(self):
self.sentry_dsn = os.environ.get("SENTRY_DSN", "")
self.environment = os.environ.get("ENVIRONMENT", "local")
self.es_timeout = os.environ.get("ELASTICSEARCH_TIMEOUT", "30")
self.content_base_documents_index_name = os.environ.get(
"INDEX_CONTENTBASEDOCS_NAME", "content_base_documents"
)
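
Note: the new setting reads the documents index name from INDEX_CONTENTBASEDOCS_NAME. A minimal sketch of consuming it instead of a hard-coded index string (the client wiring below is an assumption for illustration, not code from this commit):

import os
from elasticsearch import Elasticsearch

# Assumed wiring: read the index name the same way app/config.py does and pass
# it to whatever client writes full-document payloads.
docs_index_name = os.environ.get("INDEX_CONTENTBASEDOCS_NAME", "content_base_documents")
es_client = Elasticsearch("http://localhost:9200")  # placeholder URL

def save_doc_content(doc: dict) -> None:
    # Index into the configured index rather than a literal "content_base_documents".
    es_client.index(index=docs_index_name, body=doc)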
24 changes: 24 additions & 0 deletions app/handlers/content_bases.py
@@ -46,6 +46,15 @@ class ContentBaseDeleteResponse(BaseModel):
deleted: bool


class ContentBaseSearchDocumentRequest(BaseModel):
file_uuid: str
content_base_uuid: str


class ContentBaseSearchDocumentResponse(BaseModel):
content: str


class ContentBaseHandler(IDocumentHandler):
def __init__(self, content_base_indexer: IDocumentIndexer):
self.content_base_indexer = content_base_indexer
@@ -56,6 +65,9 @@ def __init__(self, content_base_indexer: IDocumentIndexer):
self.router.add_api_route(
"/content_base/search", endpoint=self.search, methods=["POST"]
)
self.router.add_api_route(
"/content_base/search-document", endpoint=self.search_document_content, methods=["POST"]
)
self.router.add_api_route(
"/content_base/delete", endpoint=self.delete, methods=["DELETE"]
)
@@ -102,3 +114,15 @@ def search(
filter=request.filter
)
return ContentBaseSearchResponse(response=response)

def search_document_content(
self,
request: ContentBaseSearchDocumentRequest,
Authorization: Annotated[str | None, Header()] = None
):
token_verification(Authorization)
response = self.content_base_indexer.search_document_content(
file_uuid=request.file_uuid,
content_base_uuid=request.content_base_uuid
)
return ContentBaseSearchDocumentResponse(content=response)
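
A minimal sketch of calling the new route; host, token and UUIDs are placeholders, and the request/response fields follow ContentBaseSearchDocumentRequest/Response above:

import requests

BASE_URL = "http://localhost:8000"              # placeholder host
headers = {"Authorization": "<token>"}          # whatever token_verification expects

payload = {
    "file_uuid": "00000000-0000-0000-0000-000000000001",          # placeholder UUID
    "content_base_uuid": "00000000-0000-0000-0000-000000000002",  # placeholder UUID
}
resp = requests.post(f"{BASE_URL}/content_base/search-document", json=payload, headers=headers)
resp.raise_for_status()
print(resp.json()["content"])  # full indexed document content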
14 changes: 14 additions & 0 deletions app/indexer/content_bases.py
@@ -97,5 +97,19 @@ def delete(self, content_base_uuid: UUID, filename: str, file_uuid: str):
if ids:
self.storage.delete(ids=ids)

def index_doc_content(self, full_content: str, content_base_uuid: UUID, filename: str, file_uuid: str):
self.storage.save_doc_content(
full_content=full_content,
content_base_uuid=content_base_uuid,
filename=filename,
file_uuid=file_uuid
)

def delete_batch(self):
raise NotImplementedError

def search_document_content(self, file_uuid: str, content_base_uuid: str) -> str:
return self.storage.search_doc_content(file_uuid, content_base_uuid)

def check_if_doc_was_embedded_document(self, file_uuid: str, content_base_uuid: str) -> bool:
return self.storage.check_if_doc_was_embedded_document(file_uuid, content_base_uuid)
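
A minimal sketch of how the new indexer methods are meant to be used together; content_base_indexer is the same object the handler and celery task hold, and the UUIDs/filename are placeholders:

# Store the full extracted text for a file (called from index_file_url below).
content_base_indexer.index_doc_content(
    full_content="full text extracted from the uploaded file",
    content_base_uuid="00000000-0000-0000-0000-000000000002",  # placeholder
    filename="report.pdf",                                      # placeholder
    file_uuid="00000000-0000-0000-0000-000000000001",           # placeholder
)

# Later, the /content_base/search-document endpoint retrieves it back as a string.
content = content_base_indexer.search_document_content(
    file_uuid="00000000-0000-0000-0000-000000000001",
    content_base_uuid="00000000-0000-0000-0000-000000000002",
)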
12 changes: 11 additions & 1 deletion app/indexer/indexer_file_manager.py
@@ -50,7 +50,11 @@ def __init__(self,

def index_file_url(self, content_base, **kwargs) -> bool:
load_type = content_base.get("load_type")
docs: List[Document] = load_file_url_and_split_text(

docs: List[Document]
full_content: str

docs, full_content = load_file_url_and_split_text(
content_base.get("file"),
content_base.get('extension_file'),
self.text_splitter,
@@ -59,6 +63,12 @@
document_pages: List[Document] = add_file_metadata(docs, content_base)
try:
self.content_base_indexer.index_documents(document_pages)
self.content_base_indexer.index_doc_content(
full_content=full_content,
content_base_uuid=str(content_base.get('content_base')),
filename=content_base.get("filename"),
file_uuid=content_base.get("file_uuid"),
)
return True
except Exception as e: # TODO: handle exceptions
logger.exception(e)
4 changes: 3 additions & 1 deletion app/loaders/__init__.py
@@ -18,6 +18,8 @@
from langchain.schema.document import Document
from typing import List
from app.text_splitters import ITextSplitter
from typing import Tuple


supported_loaders = {
'txt': txt_loader,
@@ -72,7 +74,7 @@ def load_file_url_and_split_text(
file_type: str,
text_splitter: ITextSplitter,
**kwargs
) -> List[Document]:
) -> Tuple[List[Document], str]:

load_type = kwargs.get("load_type", None)

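
Callers of load_file_url_and_split_text now unpack a tuple of split documents plus the full text. A minimal sketch of the new call shape (the splitter import path follows app/celery.py and is assumed; the file URL is a placeholder):

from app.loaders import load_file_url_and_split_text
from app.text_splitters import TextSplitter, character_text_splitter  # assumed import path

text_splitter = TextSplitter(character_text_splitter())
docs, full_content = load_file_url_and_split_text(
    "https://example.com/some-file.pdf",  # placeholder URL
    "pdf",
    text_splitter,
)
# docs: chunked Documents for embedding; full_content: the whole extracted text.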
41 changes: 28 additions & 13 deletions app/loaders/loaders.py
@@ -2,7 +2,7 @@
import uuid
import requests
from abc import ABC, abstractmethod

from typing import Tuple
from urllib.request import urlretrieve
from urllib.parse import urlparse

@@ -44,7 +44,7 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:
return self.loader.load_and_split_text(text_splitter)

def raw_text(self) -> str:
@@ -101,7 +101,10 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:
data: List[Document] = self.loader.load()
full_content = data[0].page_content

pages = self.load()
split_pages = []
for page in pages:
@@ -117,7 +120,7 @@ def load_and_split_text(
metadata=metadatas
)
)
return split_pages
return (split_pages, full_content)


class PDFLoader(DocumentLoader):
@@ -141,7 +144,10 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:
data: List[Document] = self.loader.load()
full_content = data[0].page_content

pages = self.load()
split_pages = []

@@ -159,7 +165,7 @@ def load_and_split_text(
)
)

return split_pages
return (split_pages, full_content)

def raw_text(self) -> str:
pages = self.load()
@@ -191,7 +197,11 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:

data: List[Document] = self.loader.load()
full_content = data[0].page_content

pages = self.load()
split_pages = []
for page in pages:
Expand All @@ -207,7 +217,7 @@ def load_and_split_text(
metadata=metadatas
)
)
return split_pages
return (split_pages, full_content)


def docx_loader(file: str) -> Callable:
@@ -248,7 +258,11 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:

data: List[Document] = self.load()
full_content: str = data[0].page_content

pages = self.load()
split_pages = []
for page in pages:
Expand All @@ -264,7 +278,7 @@ def load_and_split_text(
metadata=metadatas
)
)
return split_pages
return (split_pages, full_content)


class URLsLoader(DocumentLoader):
@@ -288,9 +302,10 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:
split_pages = []

data: List[Document] = self.load()
full_content: str = data[0].page_content
pages = self.loader.load_and_split()
for page in pages:
page_content = page.page_content
@@ -305,4 +320,4 @@ def load_and_split_text(
metadata=metadatas
)
)
return split_pages
return (split_pages, full_content)
54 changes: 54 additions & 0 deletions app/store/elasticsearch_vector_store.py
@@ -1,5 +1,7 @@
import os

import sentry_sdk

from langchain.vectorstores import VectorStore
from langchain.docstore.document import Document

@@ -141,3 +143,55 @@ def search(self, search: str, filter=None, threshold=0.1) -> list[Document]:

def delete(self, ids: list[str] = []) -> bool:
return self.vectorstore.delete(ids)

def save_doc_content(self, full_content, content_base_uuid, filename, file_uuid) -> None:
elasticsearch_doc = {
"content": full_content,
"content_base_uuid": content_base_uuid,
"filename": filename,
"file_uuid":file_uuid
}
es_client = self.vectorstore.client
res = es_client.index(index="content_base_documents", body=elasticsearch_doc)
return

def search_doc_content(self, file_uuid: str, content_base_uuid: str) -> str:
query = {
"bool": {
"filter": [
{ "term": { "file_uuid.keyword": file_uuid}},
{ "term": { "content_base_uuid.keyword": content_base_uuid}}
]
}
}
es_client = self.vectorstore.client
try:
res = es_client.search(index="content_base_documents", query=query)
hits = res["hits"]["hits"]

if len(hits) > 0:
doc = hits[0]
return doc.get("_source").get("content")
return ""
except Exception as e:
sentry_sdk.capture_message(f"{e}")
return ""

def check_if_doc_was_embedded_document(self, file_uuid: str, content_base_uuid: str) -> bool:
query = {
"bool": {
"filter": [
{ "term": { "metadata.file_uuid.keyword": file_uuid}},
{ "term": { "metadata.content_base_uuid.keyword": content_base_uuid}}
]
}
}
es_client = self.vectorstore.client
try:
res = es_client.search(index=self.vectorstore.index_name, query=query)
hits = res["hits"].get("total").get("value")

return hits > 0
except Exception as e:
sentry_sdk.capture_message(f"{e}")
return False
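
The term filters above rely on ".keyword" sub-fields for file_uuid and content_base_uuid, which Elasticsearch's default dynamic mapping creates automatically. A minimal sketch of an explicit mapping for the documents index, should dynamic mapping be disabled (client construction and the mappings= keyword follow the elasticsearch-py 8.x API and are assumptions, not part of this commit):

from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://localhost:9200")  # placeholder URL
es_client.indices.create(
    index="content_base_documents",
    mappings={
        "properties": {
            "content": {"type": "text"},
            "filename": {"type": "text"},
            "file_uuid": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
            "content_base_uuid": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        }
    },
)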
16 changes: 8 additions & 8 deletions app/tests/test_document_loader.py
@@ -130,13 +130,13 @@ def test_load_xlsx_cls(self, mock_file_url):
mock_file_url.return_value = (file_path, "")
xlsx_loader = XlsxLoader(file_path)
split_pages: List[Document] = xlsx_loader.load_and_split_text(self.text_splitter)
self.assertEqual(list, type(split_pages))
self.assertEqual(tuple, type(split_pages))

def test_pdf_loader_cls(self):
file_path = f'{self.path}/{self.file_name}.pdf'
pdf_loader = PDFLoader(file_path)
split_pages: List[Document] = pdf_loader.load_and_split_text(self.text_splitter)
self.assertEqual(list, type(split_pages))
self.assertEqual(tuple, type(split_pages))

def test_urls_loader_cls(self):
urls_loader = URLsLoader("https://en.wikipedia.org/wiki/Unit_testing")
@@ -146,32 +146,32 @@ def test_urls_loader_cls(self):
def test_urls_loader_and_split_cls(self):
urls_loader = URLsLoader("https://en.wikipedia.org/wiki/Unit_testing")
split_pages: List[Document] = urls_loader.load_and_split_text(self.text_splitter)
self.assertEqual(list, type(split_pages))
self.assertEqual(tuple, type(split_pages))

def test_urls_list_loader_and_split_cls(self):
urls = ["https://en.wikipedia.org/wiki/Unit_testing"]
urls_loader = URLsLoader(urls)
split_pages: List[Document] = urls_loader.load_and_split_text(self.text_splitter)
self.assertEqual(list, type(split_pages))
self.assertEqual(tuple, type(split_pages))

def test_docx_loader_cls(self):
file_path = f'{self.path}/{self.file_name}.docx'
docx_loader = DocxLoader(file_path)
split_pages: List[Document] = docx_loader.load_and_split_text(self.text_splitter)
self.assertEqual(list, type(split_pages))
self.assertEqual(tuple, type(split_pages))

@mock.patch.dict(os.environ, {"AWS_STORAGE_BUCKET_NAME": "file-path"})
def test_txt_loader_cls(self):
file_path = f'{self.path}/{self.file_name}.txt'
docx_loader = TxtLoader(file_path)
split_pages: List[Document] = docx_loader.load_and_split_text(self.text_splitter)
self.assertEqual(list, type(split_pages))
self.assertEqual(tuple, type(split_pages))

def test_load_file_url_and_split_text(self):
file_path = f'{self.path}/{self.file_name}.pdf'
file_type = "pdf"
docs = load_file_url_and_split_text(file_path, file_type, self.text_splitter)
self.assertEqual(list, type(docs))
self.assertEqual(tuple, type(docs))

def test_load_file_url_and_split_text_pdf_miner(self):
file_path = f'{self.path}/{self.file_name}.pdf'
@@ -182,7 +182,7 @@ def test_load_file_url_and_split_text_pdf_miner(self):
self.text_splitter,
load_type="pdfminer"
)
self.assertEqual(list, type(docs))
self.assertEqual(tuple, type(docs))

def test_load_file_url_and_get_pages_text(self): # this function is deprecated
file_path = f'{self.path}/{self.file_name}.pdf'
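
Since load_and_split_text now returns a tuple, a follow-up test could unpack it and assert on both parts; a hypothetical sketch using the same fixtures as the tests above:

    def test_pdf_loader_returns_docs_and_full_content(self):
        # Hypothetical extra test, not part of this commit: unpack the tuple and
        # check both the split documents and the full extracted text.
        file_path = f'{self.path}/{self.file_name}.pdf'
        pdf_loader = PDFLoader(file_path)
        split_pages, full_content = pdf_loader.load_and_split_text(self.text_splitter)
        self.assertEqual(list, type(split_pages))
        self.assertEqual(str, type(full_content))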
