Skip to content

Commit

Permalink
Merge pull request #44 from weni-ai/feature/index-add-verification-step
Browse files Browse the repository at this point in the history
add verification step in indexing
  • Loading branch information
zMardone authored Jul 3, 2024
2 parents 139cbde + 8c271cc commit cd341da
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 1 deletion.
10 changes: 9 additions & 1 deletion app/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,21 @@ def index_file_data(content_base: Dict) -> bool:
os.environ.get("AWS_STORAGE_ACCESS_KEY"),
os.environ.get("AWS_STORAGE_SECRET_KEY")
)
content_base_indexer = main_app.content_base_indexer
text_splitter = TextSplitter(character_text_splitter())
manager = IndexerFileManager(
file_downloader,
main_app.content_base_indexer,
content_base_indexer,
text_splitter,
)
index_result: bool = manager.index_file_url(content_base)
embbed_result: bool = content_base_indexer.check_if_doc_was_embedded_document(
file_uuid=content_base.get("file_uuid"),
content_base_uuid=str(content_base.get('content_base')),
)

index_result = index_result and embbed_result

NexusRESTClient().index_succedded(
task_succeded=index_result,
nexus_task_uuid=content_base.get("task_uuid"),
Expand Down
3 changes: 3 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,6 @@ def __init__(self):
self.sentry_dsn = os.environ.get("SENTRY_DSN", "")
self.environment = os.environ.get("ENVIRONMENT", "local")
self.es_timeout = os.environ.get("ELASTICSEARCH_TIMEOUT", "30")
self.content_base_documents_index_name = os.environ.get(
"INDEX_CONTENTBASEDOCS_NAME", "content_base_documents"
)
3 changes: 3 additions & 0 deletions app/indexer/content_bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,6 @@ def delete_batch(self):

def search_document_content(self, file_uuid: str, content_base_uuid: str) -> str:
    """Look up the stored content of one file inside a content base.

    The lookup is delegated to ``self.storage.search_doc_content`` and its
    result is handed back unchanged.

    Args:
        file_uuid: Identifier of the file, forwarded as the first argument.
        content_base_uuid: Identifier of the content base, forwarded as the
            second argument.

    Returns:
        Whatever the storage backend returns for that pair of identifiers.
    """
    document_content = self.storage.search_doc_content(file_uuid, content_base_uuid)
    return document_content

def check_if_doc_was_embedded_document(self, file_uuid: str, content_base_uuid: str) -> bool:
    """Ask the storage backend whether a file has indexed documents.

    This is a pass-through to
    ``self.storage.check_if_doc_was_embedded_document``; the backend's answer
    is returned as-is.

    Args:
        file_uuid: Identifier of the file, forwarded as the first argument.
        content_base_uuid: Identifier of the content base, forwarded as the
            second argument.

    Returns:
        The value returned by the storage backend.
    """
    was_embedded = self.storage.check_if_doc_was_embedded_document(
        file_uuid,
        content_base_uuid,
    )
    return was_embedded
19 changes: 19 additions & 0 deletions app/store/elasticsearch_vector_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,22 @@ def search_doc_content(self, file_uuid: str, content_base_uuid: str) -> str:
except Exception as e:
sentry_sdk.capture_message(f"{e}")
return ""

def check_if_doc_was_embedded_document(self, file_uuid: str, content_base_uuid: str) -> bool:
    """Report whether at least one indexed document exists for a file.

    Searches ``self.vectorstore.index_name`` for documents whose
    ``metadata.file_uuid.keyword`` and ``metadata.content_base_uuid.keyword``
    fields both match the given values exactly.

    Args:
        file_uuid: Value matched against ``metadata.file_uuid.keyword``.
        content_base_uuid: Value matched against
            ``metadata.content_base_uuid.keyword``.

    Returns:
        True when the search reports at least one hit. False when it reports
        none, or when the lookup raises; in that case the error text is sent
        to Sentry and the failure is not propagated to the caller.
    """
    query = {
        "bool": {
            "filter": [
                {"term": {"metadata.file_uuid.keyword": file_uuid}},
                {"term": {"metadata.content_base_uuid.keyword": content_base_uuid}},
            ]
        }
    }
    es_client = self.vectorstore.client
    try:
        # Only the hit count is needed, so size=0 keeps Elasticsearch from
        # returning document bodies along with the total.
        res = es_client.search(
            index=self.vectorstore.index_name,
            query=query,
            size=0,
        )
        total = res["hits"]["total"]
        # "total" is an object ({"value": n, ...}) by default and a plain
        # integer when the response was requested with totals as int.
        hit_count = total["value"] if isinstance(total, dict) else total
        return hit_count > 0
    except Exception as e:
        # Best-effort check: report the problem and treat it as "not embedded".
        sentry_sdk.capture_message(f"{e}")
        return False

0 comments on commit cd341da

Please sign in to comment.