Skip to content

Commit

Permalink
chore: Weaviate pyv4 example (#3151)
Browse files Browse the repository at this point in the history
Update Unstructured example for Weaviate, now using the latest Python v4
client.

---------

Co-authored-by: Matt Robinson <[email protected]>
  • Loading branch information
dudanogueira and MthwRobinson authored Jun 10, 2024
1 parent a66661a commit 657a949
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 54 deletions.
6 changes: 3 additions & 3 deletions examples/weaviate/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
version: '3.4'
services:
weaviate:
image: cr.weaviate.io/semitechnologies/weaviate:1.19.6
image: cr.weaviate.io/semitechnologies/weaviate:1.25.2
restart: on-failure:0
ports:
- "8080:8080"
- "50051:50051"
environment:
QUERY_DEFAULTS_LIMIT: 20
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
PERSISTENCE_DATA_PATH: "./data"
DEFAULT_VECTORIZER_MODULE: text2vec-transformers
ENABLE_MODULES: text2vec-transformers
TRANSFORMERS_INFERENCE_API: http://t2v-transformers:8080
Expand All @@ -17,4 +17,4 @@ services:
image: semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1
environment:
ENABLE_CUDA: 0 # set to 1 to enable
# NVIDIA_VISIBLE_DEVICES: all # enable if running with CUDA
# NVIDIA_VISIBLE_DEVICES: all # enable if running with CUDA
192 changes: 141 additions & 51 deletions examples/weaviate/weaviate.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "f9fc0cf9",
"metadata": {
"ExecuteTime": {
Expand Down Expand Up @@ -80,7 +80,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "78e804bb",
"metadata": {
"ExecuteTime": {
Expand All @@ -90,13 +90,14 @@
},
"outputs": [],
"source": [
"unstructured_class = create_unstructured_weaviate_class(unstructured_class_name)\n",
"schema = {\"classes\": [unstructured_class]}"
"# not used, we are creating the schema from the provided data\n",
"# unstructured_class = create_unstructured_weaviate_class(unstructured_class_name)\n",
"# schema = {\"classes\": [unstructured_class]}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "3e317a2d",
"metadata": {
"ExecuteTime": {
Expand All @@ -106,12 +107,14 @@
},
"outputs": [],
"source": [
"client = weaviate.Client(\"http://localhost:8080\")"
"# Connecting to Weaviate\n",
"# https://weaviate.io/developers/weaviate/starter-guides/connect\n",
"client = weaviate.connect_to_local()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"id": "0c508784",
"metadata": {
"ExecuteTime": {
Expand All @@ -121,8 +124,12 @@
},
"outputs": [],
"source": [
"client.schema.delete_all()\n",
"client.schema.create(schema)"
"client.collections.delete(unstructured_class_name)\n",
"collection = client.collections.create(\n",
" name=unstructured_class_name\n",
")\n",
"# we can get our collection at any time:\n",
"collection = client.collections.get(unstructured_class_name)"
]
},
{
Expand All @@ -135,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"id": "a7018bb1",
"metadata": {
"ExecuteTime": {
Expand All @@ -148,6 +155,35 @@
"data_objects = stage_for_weaviate(elements)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1a077829",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'file_directory': '../../example-docs',\n",
" 'filename': 'layout-parser-paper-fast.pdf',\n",
" 'languages': ['eng'],\n",
" 'last_modified': '2024-06-04T17:26:18',\n",
" 'page_number': 1,\n",
" 'filetype': 'application/pdf',\n",
" 'text': '1 2 0 2',\n",
" 'category': 'UncategorizedText'}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# this is one of our objects\n",
"data_objects[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
Expand All @@ -163,18 +199,25 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 28/28 [00:00<00:00, 69.56it/s]\n"
"100%|██████████| 25/25 [00:00<00:00, 26620.36it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"FAILED: []\n"
]
}
],
"source": [
"with client.batch(batch_size=10) as batch:\n",
"with collection.batch.dynamic() as batch:\n",
" for data_object in tqdm.tqdm(data_objects):\n",
" batch.add_data_object(\n",
" data_object,\n",
" unstructured_class_name,\n",
" uuid=generate_uuid5(data_object),\n",
" )"
" batch.add_object(\n",
" properties=data_object\n",
" )\n",
" failed_objs_a = client.batch.failed_objects # check if we have failed objects\n",
" print(\"FAILED: \", failed_objs_a)"
]
},
{
Expand All @@ -188,50 +231,87 @@
{
"cell_type": "code",
"execution_count": 9,
"id": "14098434",
"metadata": {
"ExecuteTime": {
"end_time": "2023-08-09T22:59:53.384425Z",
"start_time": "2023-08-09T22:59:53.202823Z"
"id": "25d5bebc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Object(uuid=_WeaviateUUIDInt('117e4b2d-1222-4d2e-9a40-2e761ecdafe8'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'text': '2', 'languages': ['eng'], 'page_number': 2.0, 'category': 'UncategorizedText', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'filename': 'layout-parser-paper-fast.pdf', 'parent_id': UUID('47f9bb4b-20e0-5b9f-1ac6-bbb60cd9c2f9'), 'file_directory': '../../example-docs'}, references=None, vector={}, collection='UnstructuredDocument')\n"
]
}
},
],
"source": [
"# let's just get a single object\n",
"object = collection.query.fetch_objects(limit=1).objects[0]\n",
"print(object)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6477a112",
"metadata": {},
"outputs": [],
"source": [
"# We leveraged Weaviate AUTO SCHEMA to generate our collection\n",
"# you can get the collection schema dict like this\n",
"# collection.config.get().to_dict()\n",
"# we can use this same dict to create the collection\n",
"# new_collection = client.collections.create_from_dict(collection.config.get().to_dict())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "82f67c21",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"data\": {\n",
" \"Get\": {\n",
" \"UnstructuredDocument\": [\n",
" {\n",
" \"_additional\": {\n",
" \"score\": \"0.23643185\"\n",
" },\n",
" \"text\": \"Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classi\\ufb01cation [11,\"\n",
" },\n",
" {\n",
" \"_additional\": {\n",
" \"score\": \"0.22914983\"\n",
" },\n",
" \"text\": \"LayoutParser: A Uni\\ufb01ed Toolkit for Deep Learning Based Document Image Analysis\"\n",
" }\n",
" ]\n",
" }\n",
" }\n",
"}\n"
"0.36298108100891113 {'text': 'Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classification [11,', 'languages': ['eng'], 'page_number': 1.0, 'category': 'NarrativeText', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'parent_id': UUID('47f9bb4b-20e0-5b9f-1ac6-bbb60cd9c2f9'), 'filename': 'layout-parser-paper-fast.pdf', 'file_directory': '../../example-docs'}\n",
"0.3443584442138672 {'text': 'LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis', 'languages': ['eng'], 'page_number': 1.0, 'category': 'Title', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'parent_id': None, 'filename': 'layout-parser-paper-fast.pdf', 'file_directory': '../../example-docs'}\n"
]
}
],
"source": [
"response = (\n",
" client.query.get(\"UnstructuredDocument\", [\"text\", \"_additional {score}\"])\n",
" .with_bm25(query=\"document understanding\")\n",
" .with_limit(2)\n",
" .do()\n",
"results = collection.query.bm25(\n",
" query=\"document understanding\",\n",
" limit=2,\n",
" return_metadata=weaviate.classes.query.MetadataQuery(score=True)\n",
")\n",
"\n",
"print(json.dumps(response, indent=4))"
"for object in results.objects:\n",
" print(object.metadata.score, object.properties)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "905e02ca",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'text': 'Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classification [11,', 'languages': ['eng'], 'page_number': 1.0, 'category': 'NarrativeText', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'parent_id': UUID('47f9bb4b-20e0-5b9f-1ac6-bbb60cd9c2f9'), 'filename': 'layout-parser-paper-fast.pdf', 'file_directory': '../../example-docs'}\n",
"{'text': 'Z. Shen et al.', 'languages': ['eng'], 'page_number': 2.0, 'category': 'NarrativeText', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'parent_id': UUID('47f9bb4b-20e0-5b9f-1ac6-bbb60cd9c2f9'), 'filename': 'layout-parser-paper-fast.pdf', 'file_directory': '../../example-docs'}\n",
"{'text': 'The library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research.', 'languages': ['eng'], 'page_number': 2.0, 'category': 'NarrativeText', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'filename': 'layout-parser-paper-fast.pdf', 'parent_id': UUID('47f9bb4b-20e0-5b9f-1ac6-bbb60cd9c2f9'), 'file_directory': '../../example-docs'}\n",
"{'text': 'Introduction', 'languages': ['eng'], 'page_number': 1.0, 'category': 'Title', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'parent_id': None, 'filename': 'layout-parser-paper-fast.pdf', 'file_directory': '../../example-docs'}\n"
]
}
],
"source": [
"# We can also perform similarity search\n",
"results = collection.query.near_text(\n",
" query=\"document understanding\",\n",
" limit=4\n",
")\n",
"for object in results.objects:\n",
" print(object.properties)"
]
},
{
Expand All @@ -244,6 +324,16 @@
}
},
"source": []
},
{
"cell_type": "code",
"execution_count": 12,
"id": "20bca8f9",
"metadata": {},
"outputs": [],
"source": [
"client.close()"
]
}
],
"metadata": {
Expand All @@ -262,7 +352,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.17"
"version": "3.12.3"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 657a949

Please sign in to comment.