Commit

add: index document full content; endpoint to return content
AlisoSouza committed Jul 1, 2024
1 parent 3abfc55 commit 139cbde
Showing 6 changed files with 112 additions and 15 deletions.
24 changes: 24 additions & 0 deletions app/handlers/content_bases.py
@@ -46,6 +46,15 @@ class ContentBaseDeleteResponse(BaseModel):
deleted: bool


class ContentBaseSearchDocumentRequest(BaseModel):
file_uuid: str
content_base_uuid: str


class ContentBaseSearchDocumentResponse(BaseModel):
content: str


class ContentBaseHandler(IDocumentHandler):
def __init__(self, content_base_indexer: IDocumentIndexer):
self.content_base_indexer = content_base_indexer
@@ -56,6 +65,9 @@ def __init__(self, content_base_indexer: IDocumentIndexer):
self.router.add_api_route(
"/content_base/search", endpoint=self.search, methods=["POST"]
)
self.router.add_api_route(
"/content_base/search-document", endpoint=self.search_document_content, methods=["POST"]
)
self.router.add_api_route(
"/content_base/delete", endpoint=self.delete, methods=["DELETE"]
)
@@ -102,3 +114,15 @@ def search(
filter=request.filter
)
return ContentBaseSearchResponse(response=response)

def search_document_content(
self,
request: ContentBaseSearchDocumentRequest,
Authorization: Annotated[str | None, Header()] = None
):
token_verification(Authorization)
response = self.content_base_indexer.search_document_content(
file_uuid=request.file_uuid,
content_base_uuid=request.content_base_uuid
)
return ContentBaseSearchDocumentResponse(content=response)
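
For reference, a minimal sketch of calling the new route from a client (the base URL, the Authorization value, and both UUIDs below are placeholders, not part of this commit):

import requests

# Placeholder host and credentials; adjust for the actual deployment.
resp = requests.post(
    "http://localhost:8000/content_base/search-document",
    json={
        "file_uuid": "<file-uuid>",
        "content_base_uuid": "<content-base-uuid>",
    },
    headers={"Authorization": "<token>"},
)
resp.raise_for_status()
print(resp.json()["content"])  # ContentBaseSearchDocumentResponse.content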
11 changes: 11 additions & 0 deletions app/indexer/content_bases.py
@@ -97,5 +97,16 @@ def delete(self, content_base_uuid: UUID, filename: str, file_uuid: str):
if ids:
self.storage.delete(ids=ids)

def index_doc_content(self, full_content: str, content_base_uuid: UUID, filename: str, file_uuid: str):
self.storage.save_doc_content(
full_content=full_content,
content_base_uuid=content_base_uuid,
filename=filename,
file_uuid=file_uuid
)

def delete_batch(self):
raise NotImplementedError

def search_document_content(self, file_uuid: str, content_base_uuid: str) -> str:
return self.storage.search_doc_content(file_uuid, content_base_uuid)
12 changes: 11 additions & 1 deletion app/indexer/indexer_file_manager.py
@@ -50,7 +50,11 @@ def __init__(self,

def index_file_url(self, content_base, **kwargs) -> bool:
load_type = content_base.get("load_type")
docs: List[Document] = load_file_url_and_split_text(

docs: List[Document]
full_content: str

docs, full_content = load_file_url_and_split_text(
content_base.get("file"),
content_base.get('extension_file'),
self.text_splitter,
@@ -59,6 +63,12 @@
document_pages: List[Document] = add_file_metadata(docs, content_base)
try:
self.content_base_indexer.index_documents(document_pages)
self.content_base_indexer.index_doc_content(
full_content=full_content,
content_base_uuid=str(content_base.get('content_base')),
filename=content_base.get("filename"),
file_uuid=content_base.get("file_uuid"),
)
return True
except Exception as e: # TODO: handle exceptions
logger.exception(e)
4 changes: 3 additions & 1 deletion app/loaders/__init__.py
@@ -18,6 +18,8 @@
from langchain.schema.document import Document
from typing import List
from app.text_splitters import ITextSplitter
from typing import Tuple


supported_loaders = {
'txt': txt_loader,
@@ -72,7 +74,7 @@ def load_file_url_and_split_text(
file_type: str,
text_splitter: ITextSplitter,
**kwargs
) -> List[Document]:
) -> Tuple[List[Document], str]:

load_type = kwargs.get("load_type", None)

41 changes: 28 additions & 13 deletions app/loaders/loaders.py
@@ -2,7 +2,7 @@
import uuid
import requests
from abc import ABC, abstractmethod

from typing import Tuple
from urllib.request import urlretrieve
from urllib.parse import urlparse

@@ -44,7 +44,7 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:
return self.loader.load_and_split_text(text_splitter)

def raw_text(self) -> str:
@@ -101,7 +101,10 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:
data: List[Document] = self.loader.load()
full_content = data[0].page_content

pages = self.load()
split_pages = []
for page in pages:
@@ -117,7 +120,7 @@ def load_and_split_text(
metadata=metadatas
)
)
return split_pages
return (split_pages, full_content)


class PDFLoader(DocumentLoader):
@@ -141,7 +144,10 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:
data: List[Document] = self.loader.load()
full_content = data[0].page_content

pages = self.load()
split_pages = []

@@ -159,7 +165,7 @@ def load_and_split_text(
)
)

return split_pages
return (split_pages, full_content)

def raw_text(self) -> str:
pages = self.load()
@@ -191,7 +197,11 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:

data: List[Document] = self.loader.load()
full_content = data[0].page_content

pages = self.load()
split_pages = []
for page in pages:
Expand All @@ -207,7 +217,7 @@ def load_and_split_text(
metadata=metadatas
)
)
return split_pages
return (split_pages, full_content)


def docx_loader(file: str) -> Callable:
@@ -248,7 +258,11 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:

data: List[Document] = self.load()
full_content: str = data[0].page_content

pages = self.load()
split_pages = []
for page in pages:
Expand All @@ -264,7 +278,7 @@ def load_and_split_text(
metadata=metadatas
)
)
return split_pages
return (split_pages, full_content)


class URLsLoader(DocumentLoader):
@@ -288,9 +302,10 @@ def load(self) -> List[Document]:
def load_and_split_text(
self,
text_splitter: ITextSplitter
) -> List[Document]:
) -> Tuple[List[Document], str]:
split_pages = []

data: List[Document] = self.load()
full_content: str = data[0].page_content
pages = self.loader.load_and_split()
for page in pages:
page_content = page.page_content
Expand All @@ -305,4 +320,4 @@ def load_and_split_text(
metadata=metadatas
)
)
return split_pages
return (split_pages, full_content)
35 changes: 35 additions & 0 deletions app/store/elasticsearch_vector_store.py
@@ -1,5 +1,7 @@
import os

import sentry_sdk

from langchain.vectorstores import VectorStore
from langchain.docstore.document import Document

@@ -141,3 +143,36 @@ def search(self, search: str, filter=None, threshold=0.1) -> list[Document]:

def delete(self, ids: list[str] = []) -> bool:
return self.vectorstore.delete(ids)

def save_doc_content(self, full_content: str, content_base_uuid: str, filename: str, file_uuid: str) -> None:
elasticsearch_doc = {
"content": full_content,
"content_base_uuid": content_base_uuid,
"filename": filename,
"file_uuid":file_uuid
}
es_client = self.vectorstore.client
es_client.index(index="content_base_documents", body=elasticsearch_doc)

def search_doc_content(self, file_uuid: str, content_base_uuid: str) -> str:
query = {
"bool": {
"filter": [
{ "term": { "file_uuid.keyword": file_uuid}},
{ "term": { "content_base_uuid.keyword": content_base_uuid}}
]
}
}
es_client = self.vectorstore.client
try:
res = es_client.search(index="content_base_documents", query=query)
hits = res["hits"]["hits"]

if len(hits) > 0:
doc = hits[0]
return doc.get("_source").get("content")
return ""
except Exception as e:
sentry_sdk.capture_message(f"{e}")
return ""
