chore: Weaviate pyv4 example (#3151)

Update the Unstructured example for Weaviate to use the latest Python v4 client. --------- Co-authored-by: Matt Robinson <[email protected]>
Unstructured-IO · Jun 10, 2024 · 657a949 · 657a949
1 parent a66661a
commit 657a949
Show file tree

Hide file tree

Showing 2 changed files with 144 additions and 54 deletions.
diff --git a/examples/weaviate/docker-compose.yml b/examples/weaviate/docker-compose.yml
@@ -1,14 +1,14 @@
 version: '3.4'
 services:
   weaviate:
-    image: cr.weaviate.io/semitechnologies/weaviate:1.19.6
+    image: cr.weaviate.io/semitechnologies/weaviate:1.25.2
     restart: on-failure:0
     ports:
      - "8080:8080"
+     - "50051:50051"
     environment:
       QUERY_DEFAULTS_LIMIT: 20
       AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
-      PERSISTENCE_DATA_PATH: "./data"
       DEFAULT_VECTORIZER_MODULE: text2vec-transformers
       ENABLE_MODULES: text2vec-transformers
       TRANSFORMERS_INFERENCE_API: http://t2v-transformers:8080
@@ -17,4 +17,4 @@ services:
     image: semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1
     environment:
       ENABLE_CUDA: 0 # set to 1 to enable
-      # NVIDIA_VISIBLE_DEVICES: all # enable if running with CUDA
+      # NVIDIA_VISIBLE_DEVICES: all # enable if running with CUDA
diff --git a/examples/weaviate/weaviate.ipynb b/examples/weaviate/weaviate.ipynb
@@ -41,7 +41,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "f9fc0cf9",
    "metadata": {
     "ExecuteTime": {
@@ -80,7 +80,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "78e804bb",
    "metadata": {
     "ExecuteTime": {
@@ -90,13 +90,14 @@
    },
    "outputs": [],
    "source": [
-    "unstructured_class = create_unstructured_weaviate_class(unstructured_class_name)\n",
-    "schema = {\"classes\": [unstructured_class]}"
+    "# not used, we are creating the schema from the provided data\n",
+    "# unstructured_class = create_unstructured_weaviate_class(unstructured_class_name)\n",
+    "# schema = {\"classes\": [unstructured_class]}"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "id": "3e317a2d",
    "metadata": {
     "ExecuteTime": {
@@ -106,12 +107,14 @@
    },
    "outputs": [],
    "source": [
-    "client = weaviate.Client(\"http://localhost:8080\")"
+    "# Connecting to Weaviate\n",
+    "# https://weaviate.io/developers/weaviate/starter-guides/connect\n",
+    "client = weaviate.connect_to_local()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "id": "0c508784",
    "metadata": {
     "ExecuteTime": {
@@ -121,8 +124,12 @@
    },
    "outputs": [],
    "source": [
-    "client.schema.delete_all()\n",
-    "client.schema.create(schema)"
+    "client.collections.delete(unstructured_class_name)\n",
+    "collection = client.collections.create(\n",
+    "    name=unstructured_class_name\n",
+    ")\n",
+    "# we can get our collection at any time:\n",
+    "collection = client.collections.get(unstructured_class_name)"
    ]
   },
   {
@@ -135,7 +142,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "id": "a7018bb1",
    "metadata": {
     "ExecuteTime": {
@@ -148,6 +155,35 @@
     "data_objects = stage_for_weaviate(elements)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "1a077829",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'file_directory': '../../example-docs',\n",
+       " 'filename': 'layout-parser-paper-fast.pdf',\n",
+       " 'languages': ['eng'],\n",
+       " 'last_modified': '2024-06-04T17:26:18',\n",
+       " 'page_number': 1,\n",
+       " 'filetype': 'application/pdf',\n",
+       " 'text': '1 2 0 2',\n",
+       " 'category': 'UncategorizedText'}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# this is one of our objects\n",
+    "data_objects[0]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 8,
@@ -163,18 +199,25 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 28/28 [00:00<00:00, 69.56it/s]\n"
+      "100%|██████████| 25/25 [00:00<00:00, 26620.36it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "FAILED:  []\n"
      ]
     }
    ],
    "source": [
-    "with client.batch(batch_size=10) as batch:\n",
+    "with collection.batch.dynamic() as batch:\n",
     "    for data_object in tqdm.tqdm(data_objects):\n",
-    "        batch.add_data_object(\n",
-    "            data_object,\n",
-    "            unstructured_class_name,\n",
-    "            uuid=generate_uuid5(data_object),\n",
-    "        )"
+    "        batch.add_object(\n",
+    "            properties=data_object\n",
+    "        )\n",
+    "failed_objs_a = collection.batch.failed_objects  # read from the collection's batch after the context exits, once every object has been flushed\n",
+    "print(\"FAILED: \", failed_objs_a)"
    ]
   },
   {
@@ -188,50 +231,87 @@
   {
    "cell_type": "code",
    "execution_count": 9,
-   "id": "14098434",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-08-09T22:59:53.384425Z",
-     "start_time": "2023-08-09T22:59:53.202823Z"
+   "id": "25d5bebc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Object(uuid=_WeaviateUUIDInt('117e4b2d-1222-4d2e-9a40-2e761ecdafe8'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'text': '2', 'languages': ['eng'], 'page_number': 2.0, 'category': 'UncategorizedText', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'filename': 'layout-parser-paper-fast.pdf', 'parent_id': UUID('47f9bb4b-20e0-5b9f-1ac6-bbb60cd9c2f9'), 'file_directory': '../../example-docs'}, references=None, vector={}, collection='UnstructuredDocument')\n"
+     ]
     }
-   },
+   ],
+   "source": [
+    "# let's just get a single object\n",
+    "first_object = collection.query.fetch_objects(limit=1).objects[0]  # named to avoid shadowing the built-in `object`\n",
+    "print(first_object)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6477a112",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We leveraged Weaviate AUTO SCHEMA to generate our collection\n",
+    "# you can get the collection schema dict like this\n",
+    "# collection.config.get().to_dict()\n",
+    "# we can use this same dict to create the collection\n",
+    "# new_collection = client.collections.create_from_dict(collection.config.get().to_dict())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "82f67c21",
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{\n",
-      "    \"data\": {\n",
-      "        \"Get\": {\n",
-      "            \"UnstructuredDocument\": [\n",
-      "                {\n",
-      "                    \"_additional\": {\n",
-      "                        \"score\": \"0.23643185\"\n",
-      "                    },\n",
-      "                    \"text\": \"Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classi\\ufb01cation [11,\"\n",
-      "                },\n",
-      "                {\n",
-      "                    \"_additional\": {\n",
-      "                        \"score\": \"0.22914983\"\n",
-      "                    },\n",
-      "                    \"text\": \"LayoutParser: A Uni\\ufb01ed Toolkit for Deep Learning Based Document Image Analysis\"\n",
-      "                }\n",
-      "            ]\n",
-      "        }\n",
-      "    }\n",
-      "}\n"
+      "0.36298108100891113 {'text': 'Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classiﬁcation [11,', 'languages': ['eng'], 'page_number': 1.0, 'category': 'NarrativeText', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'parent_id': UUID('47f9bb4b-20e0-5b9f-1ac6-bbb60cd9c2f9'), 'filename': 'layout-parser-paper-fast.pdf', 'file_directory': '../../example-docs'}\n",
+      "0.3443584442138672 {'text': 'LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis', 'languages': ['eng'], 'page_number': 1.0, 'category': 'Title', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'parent_id': None, 'filename': 'layout-parser-paper-fast.pdf', 'file_directory': '../../example-docs'}\n"
      ]
     }
    ],
    "source": [
-    "response = (\n",
-    "    client.query.get(\"UnstructuredDocument\", [\"text\", \"_additional {score}\"])\n",
-    "    .with_bm25(query=\"document understanding\")\n",
-    "    .with_limit(2)\n",
-    "    .do()\n",
+    "results = collection.query.bm25(\n",
+    "    query=\"document understanding\",\n",
+    "    limit=2,\n",
+    "    return_metadata=weaviate.classes.query.MetadataQuery(score=True)\n",
     ")\n",
-    "\n",
-    "print(json.dumps(response, indent=4))"
+    "for obj in results.objects:  # `obj`, not `object`, so the built-in is not shadowed\n",
+    "    print(obj.metadata.score, obj.properties)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "905e02ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'text': 'Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classiﬁcation [11,', 'languages': ['eng'], 'page_number': 1.0, 'category': 'NarrativeText', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'parent_id': UUID('47f9bb4b-20e0-5b9f-1ac6-bbb60cd9c2f9'), 'filename': 'layout-parser-paper-fast.pdf', 'file_directory': '../../example-docs'}\n",
+      "{'text': 'Z. Shen et al.', 'languages': ['eng'], 'page_number': 2.0, 'category': 'NarrativeText', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'parent_id': UUID('47f9bb4b-20e0-5b9f-1ac6-bbb60cd9c2f9'), 'filename': 'layout-parser-paper-fast.pdf', 'file_directory': '../../example-docs'}\n",
+      "{'text': 'The library implements simple and intuitive Python APIs without sacriﬁcing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will beneﬁt a variety of end-users, and will lead to advances in applications in both industry and academic research.', 'languages': ['eng'], 'page_number': 2.0, 'category': 'NarrativeText', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'filename': 'layout-parser-paper-fast.pdf', 'parent_id': UUID('47f9bb4b-20e0-5b9f-1ac6-bbb60cd9c2f9'), 'file_directory': '../../example-docs'}\n",
+      "{'text': 'Introduction', 'languages': ['eng'], 'page_number': 1.0, 'category': 'Title', 'filetype': 'application/pdf', 'last_modified': '2024-06-04T17:26:18', 'parent_id': None, 'filename': 'layout-parser-paper-fast.pdf', 'file_directory': '../../example-docs'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# We can also perform similarity search\n",
+    "results = collection.query.near_text(\n",
+    "    query=\"document understanding\",\n",
+    "    limit=4\n",
+    ")\n",
+    "for obj in results.objects:  # `obj`, not `object`, so the built-in is not shadowed\n",
+    "    print(obj.properties)"
    ]
   },
   {
@@ -244,6 +324,16 @@
     }
    },
    "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "20bca8f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.close()"
+   ]
   }
  ],
  "metadata": {
@@ -262,7 +352,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.17"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,