From 139cbde2f8577814624f5a3fc790b7dd3b4df7dc Mon Sep 17 00:00:00 2001
From: Alisson
Date: Mon, 1 Jul 2024 10:36:47 -0300
Subject: [PATCH] add: index document full content; endpoint to return content

---
 app/handlers/content_bases.py           | 24 +++++++++++++++
 app/indexer/content_bases.py            | 11 +++++++
 app/indexer/indexer_file_manager.py     | 12 +++++++-
 app/loaders/__init__.py                 |  4 ++-
 app/loaders/loaders.py                  | 41 +++++++++++++++++--------
 app/store/elasticsearch_vector_store.py | 35 +++++++++++++++++++++
 6 files changed, 112 insertions(+), 15 deletions(-)

diff --git a/app/handlers/content_bases.py b/app/handlers/content_bases.py
index 5cbaecb..eb538f6 100644
--- a/app/handlers/content_bases.py
+++ b/app/handlers/content_bases.py
@@ -46,6 +46,15 @@ class ContentBaseDeleteResponse(BaseModel):
     deleted: bool
 
 
+class ContentBaseSearchDocumentRequest(BaseModel):
+    file_uuid: str
+    content_base_uuid: str
+
+
+class ContentBaseSearchDocumentResponse(BaseModel):
+    content: str
+
+
 class ContentBaseHandler(IDocumentHandler):
     def __init__(self, content_base_indexer: IDocumentIndexer):
         self.content_base_indexer = content_base_indexer
@@ -56,6 +65,9 @@ def __init__(self, content_base_indexer: IDocumentIndexer):
         self.router.add_api_route(
             "/content_base/search", endpoint=self.search, methods=["POST"]
         )
+        self.router.add_api_route(
+            "/content_base/search-document", endpoint=self.search_document_content, methods=["POST"]
+        )
         self.router.add_api_route(
             "/content_base/delete", endpoint=self.delete, methods=["DELETE"]
         )
@@ -102,3 +114,15 @@ def search(
             filter=request.filter
         )
         return ContentBaseSearchResponse(response=response)
+
+    def search_document_content(
+        self,
+        request: ContentBaseSearchDocumentRequest,
+        Authorization: Annotated[str | None, Header()] = None
+    ):
+        token_verification(Authorization)
+        response = self.content_base_indexer.search_document_content(
+            file_uuid=request.file_uuid,
+            content_base_uuid=request.content_base_uuid
+        )
+        return ContentBaseSearchDocumentResponse(content=response)
diff --git a/app/indexer/content_bases.py b/app/indexer/content_bases.py
index 56dd4a3..ea87309 100644
--- a/app/indexer/content_bases.py
+++ b/app/indexer/content_bases.py
@@ -97,5 +97,16 @@ def delete(self, content_base_uuid: UUID, filename: str, file_uuid: str):
         if ids:
             self.storage.delete(ids=ids)
 
+    def index_doc_content(self, full_content: str, content_base_uuid: UUID, filename: str, file_uuid: str):
+        self.storage.save_doc_content(
+            full_content=full_content,
+            content_base_uuid=content_base_uuid,
+            filename=filename,
+            file_uuid=file_uuid
+        )
+
     def delete_batch(self):
         raise NotImplementedError
+
+    def search_document_content(self, file_uuid: str, content_base_uuid: str) -> str:
+        return self.storage.search_doc_content(file_uuid, content_base_uuid)
diff --git a/app/indexer/indexer_file_manager.py b/app/indexer/indexer_file_manager.py
index 9235865..5a2dc71 100644
--- a/app/indexer/indexer_file_manager.py
+++ b/app/indexer/indexer_file_manager.py
@@ -50,7 +50,11 @@ def __init__(self,
 
     def index_file_url(self, content_base, **kwargs) -> bool:
         load_type = content_base.get("load_type")
-        docs: List[Document] = load_file_url_and_split_text(
+
+        docs: List[Document]
+        full_content: str
+
+        docs, full_content = load_file_url_and_split_text(
             content_base.get("file"),
             content_base.get('extension_file'),
             self.text_splitter,
@@ -59,6 +63,12 @@
         document_pages: List[Document] = add_file_metadata(docs, content_base)
         try:
             self.content_base_indexer.index_documents(document_pages)
+            self.content_base_indexer.index_doc_content(
+                full_content=full_content,
+                content_base_uuid=str(content_base.get('content_base')),
+                filename=content_base.get("filename"),
+                file_uuid=content_base.get("file_uuid"),
+            )
             return True
         except Exception as e: # TODO: handle exceptions
             logger.exception(e)
diff --git a/app/loaders/__init__.py b/app/loaders/__init__.py
index 93a66d6..1eace35 100644
--- a/app/loaders/__init__.py
+++ b/app/loaders/__init__.py
@@ -18,6 +18,8 @@
 from langchain.schema.document import Document
 from typing import List
 from app.text_splitters import ITextSplitter
+from typing import Tuple
+
 
 supported_loaders = {
     'txt': txt_loader,
@@ -72,7 +74,7 @@ def load_file_url_and_split_text(
     file_type: str,
     text_splitter: ITextSplitter,
     **kwargs
-) -> List[Document]:
+) -> Tuple[List[Document], str]:
 
     load_type = kwargs.get("load_type", None)
 
diff --git a/app/loaders/loaders.py b/app/loaders/loaders.py
index 009596f..d0c98a0 100644
--- a/app/loaders/loaders.py
+++ b/app/loaders/loaders.py
@@ -2,7 +2,7 @@
 import uuid
 import requests
 from abc import ABC, abstractmethod
-
+from typing import Tuple
 from urllib.request import urlretrieve
 from urllib.parse import urlparse
 
@@ -44,7 +44,7 @@ def load(self) -> List[Document]:
     def load_and_split_text(
         self,
         text_splitter: ITextSplitter
-    ) -> List[Document]:
+    ) -> Tuple[List[Document], str]:
         return self.loader.load_and_split_text(text_splitter)
 
     def raw_text(self) -> str:
@@ -101,7 +101,10 @@ def load(self) -> List[Document]:
     def load_and_split_text(
         self,
         text_splitter: ITextSplitter
-    ) -> List[Document]:
+    ) -> Tuple[List[Document], str]:
+        data: List[Document] = self.loader.load()
+        full_content = "\n".join(doc.page_content for doc in data)
+
         pages = self.load()
         split_pages = []
         for page in pages:
@@ -117,7 +120,7 @@ def load_and_split_text(
                     metadata=metadatas
                 )
             )
-        return split_pages
+        return (split_pages, full_content)
 
 
 class PDFLoader(DocumentLoader):
@@ -141,7 +144,10 @@ def load(self) -> List[Document]:
     def load_and_split_text(
         self,
         text_splitter: ITextSplitter
-    ) -> List[Document]:
+    ) -> Tuple[List[Document], str]:
+        data: List[Document] = self.loader.load()
+        full_content = "\n".join(doc.page_content for doc in data)
+
         pages = self.load()
         split_pages = []
 
@@ -159,7 +165,7 @@ def load_and_split_text(
                 )
             )
 
-        return split_pages
+        return (split_pages, full_content)
 
     def raw_text(self) -> str:
         pages = self.load()
@@ -191,7 +197,11 @@ def load(self) -> List[Document]:
     def load_and_split_text(
         self,
         text_splitter: ITextSplitter
-    ) -> List[Document]:
+    ) -> Tuple[List[Document], str]:
+
+        data: List[Document] = self.loader.load()
+        full_content = "\n".join(doc.page_content for doc in data)
+
         pages = self.load()
         split_pages = []
         for page in pages:
@@ -207,7 +217,7 @@ def load_and_split_text(
                     metadata=metadatas
                 )
             )
-        return split_pages
+        return (split_pages, full_content)
 
 
 def docx_loader(file: str) -> Callable:
@@ -248,7 +258,11 @@ def load(self) -> List[Document]:
     def load_and_split_text(
         self,
         text_splitter: ITextSplitter
-    ) -> List[Document]:
+    ) -> Tuple[List[Document], str]:
+
+        data: List[Document] = self.load()
+        full_content: str = "\n".join(doc.page_content for doc in data)
+
         pages = self.load()
         split_pages = []
         for page in pages:
@@ -264,7 +278,7 @@ def load_and_split_text(
                     metadata=metadatas
                 )
             )
-        return split_pages
+        return (split_pages, full_content)
 
 
 class URLsLoader(DocumentLoader):
@@ -288,9 +302,10 @@ def load(self) -> List[Document]:
     def load_and_split_text(
         self,
         text_splitter: ITextSplitter
-    ) -> List[Document]:
+    ) -> Tuple[List[Document], str]:
         split_pages = []
-
+        data: List[Document] = self.load()
+        full_content: str = "\n".join(doc.page_content for doc in data)
         pages = self.loader.load_and_split()
         for page in pages:
             page_content = page.page_content
@@ -305,4 +320,4 @@ def load_and_split_text(
                     metadata=metadatas
                 )
             )
-        return split_pages
+        return (split_pages, full_content)
diff --git a/app/store/elasticsearch_vector_store.py b/app/store/elasticsearch_vector_store.py
index e8865c3..c126ef2 100644
--- a/app/store/elasticsearch_vector_store.py
+++ b/app/store/elasticsearch_vector_store.py
@@ -1,5 +1,7 @@
 import os
 
+import sentry_sdk
+
 from langchain.vectorstores import VectorStore
 from langchain.docstore.document import Document
 
@@ -141,3 +143,36 @@ def search(self, search: str, filter=None, threshold=0.1) -> list[Document]:
 
     def delete(self, ids: list[str] = []) -> bool:
         return self.vectorstore.delete(ids)
+
+    def save_doc_content(self, full_content, content_base_uuid, filename, file_uuid) -> None:
+        elasticsearch_doc = {
+            "content": full_content,
+            "content_base_uuid": content_base_uuid,
+            "filename": filename,
+            "file_uuid": file_uuid
+        }
+        es_client = self.vectorstore.client
+        es_client.index(index="content_base_documents", body=elasticsearch_doc)
+        return
+
+    def search_doc_content(self, file_uuid: str, content_base_uuid: str) -> str:
+        query = {
+            "bool": {
+                "filter": [
+                    {"term": {"file_uuid.keyword": file_uuid}},
+                    {"term": {"content_base_uuid.keyword": content_base_uuid}}
+                ]
+            }
+        }
+        es_client = self.vectorstore.client
+        try:
+            res = es_client.search(index="content_base_documents", query=query)
+            hits = res["hits"]["hits"]
+
+            if len(hits) > 0:
+                doc = hits[0]
+                return doc.get("_source").get("content")
+            return ""
+        except Exception as e:
+            sentry_sdk.capture_exception(e)
+            return ""
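
Reviewer note (not part of the patch): a minimal sketch of how the new
/content_base/search-document endpoint could be exercised once this lands.
The base URL, token, and UUIDs below are placeholders; the request and
response shapes follow ContentBaseSearchDocumentRequest and
ContentBaseSearchDocumentResponse above.

    import requests

    BASE_URL = "http://localhost:8000"  # hypothetical host/port

    # token_verification() reads the Authorization header; "<token>" is a placeholder.
    resp = requests.post(
        f"{BASE_URL}/content_base/search-document",
        headers={"Authorization": "<token>"},
        json={
            "file_uuid": "2f3a8b1c-0000-0000-0000-000000000000",          # illustrative
            "content_base_uuid": "7c9d4e2a-0000-0000-0000-000000000000",  # illustrative
        },
    )
    resp.raise_for_status()
    print(resp.json()["content"])  # full document text stored by save_doc_content()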
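Reviewer note (not part of the patch): the stored document can also be checked
directly against Elasticsearch, assuming a reachable node on :9200. The index
name content_base_documents and the .keyword term filters come from
save_doc_content and search_doc_content above; the UUIDs are again
illustrative.

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")  # assumed local node

    # Same term filters that search_doc_content() issues.
    res = es.search(
        index="content_base_documents",
        query={
            "bool": {
                "filter": [
                    {"term": {"file_uuid.keyword": "2f3a8b1c-0000-0000-0000-000000000000"}},
                    {"term": {"content_base_uuid.keyword": "7c9d4e2a-0000-0000-0000-000000000000"}},
                ]
            }
        },
    )
    for hit in res["hits"]["hits"]:
        print(hit["_source"]["filename"], len(hit["_source"]["content"]))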