Merge pull request #1383 from wxywb/master

Add demo of hybrid retrieval.
milvus-io · Jul 19, 2024 · 80dce46 · 80dce46
2 parents 3721e11 + 6f7480e
commit 80dce46
Show file tree

Hide file tree

Showing 9 changed files with 320 additions and 0 deletions.
diff --git a/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/.streamlit/config.toml b/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/.streamlit/config.toml
@@ -0,0 +1,3 @@
+[theme]
+base = "dark"
+primaryColor = "#4fc4f9"
diff --git a/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/README.md b/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/README.md
@@ -0,0 +1,57 @@
+# Hybrid Semantic Search with Milvus
+
+<div style="text-align: center;">
+  <figure>
+    <img src="./pics/demo.jpg" alt="Description of Image" width="700"/>
+  </figure>
+</div>
+
+The Milvus Hybrid Search Demo uses the BGE-M3 model to provide advanced search results. Users can enter queries to receive Dense, Sparse, and Hybrid responses. Dense responses focus on the semantic context, while Sparse responses emphasize keyword matching. The Hybrid approach combines both methods, offering comprehensive results that capture both context and specific keywords. This demo highlights the effectiveness of integrating multiple retrieval strategies to enhance search result relevance by balancing semantic and lexical similarity.
+
+## Features
+1. Embed the text as dense and sparse vectors.
+2. Set up a Milvus collection to store the dense and sparse vectors.
+3. Insert the data into Milvus.
+4. Search and inspect the results.
+
+## Quick Deploy
+
+Follow these steps to quickly deploy the application locally:
+
+### Preparation
+
+> Prerequisites: Python 3.8 or higher
+
+**1. Download Codes**
+
+```bash
+$ git clone https://github.com/milvus-io/bootcamp.git
+$ cd bootcamp/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus
+```
+
+**2. Installation**
+
+Run the following commands to install the required libraries:
+
+```bash
+$ pip install -r requirements.txt
+```
+
+**3. Data Download**
+
+Download the Quora Duplicate Questions dataset and place it in the same directory:
+
+```bash
+wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv
+```
+
+Credit for the dataset: [First Quora Dataset Release: Question Pairs](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs)
+
+
+### Start Service
+
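+Build the Milvus collection first (this embeds the questions and stores them in `milvus.db`), then run the Streamlit application:
+
+```bash
+$ python index.py
+$ streamlit run ui.py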
+```
diff --git a/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/index.py b/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/index.py
@@ -0,0 +1,122 @@
+"""
+Hybrid Semantic Search with Milvus
+
+This demo showcases hybrid semantic search using both dense and sparse vectors with Milvus.
+You can optionally use the BGE-M3 model to embed text into dense and sparse vectors, or use randomly generated vectors as an example.
+Additionally, you can rerank the search results using the BGE CrossEncoder model.
+
+Prerequisites:
+- Milvus 2.4.0 or higher (sparse vector search is available only in these versions). 
+  Follow this guide to set up Milvus: https://milvus.io/docs/install_standalone-docker.md
+- pymilvus Python client library to connect to the Milvus server.
+- Optional `model` module in pymilvus for BGE-M3 model.
+
+Installation:
+Run the following commands to install the required libraries:
+  pip install pymilvus
+  pip install pymilvus[model]
+
+Steps:
+1. Embed the text as dense and sparse vectors.
+2. Set up a Milvus collection to store the dense and sparse vectors.
+3. Insert the data into Milvus.
+4. Search and inspect the results.
+"""
+
+# Demo toggles.
+# Intended to choose BGE-M3 embeddings over the random placeholder vectors.
+use_bge_m3 = True
+# Intended to enable reranking of the search results.
+# NOTE(review): no code visible in this script reads this flag -- confirm it is still needed.
+use_reranker = True
+
+import random
+import numpy as np
+import pandas as pd
+
+from pymilvus import (
+    FieldSchema,
+    CollectionSchema,
+    DataType,
+    Collection,
+    connections,
+)
+
+# 1. prepare a small corpus to search
+file_path = "quora_duplicate_questions.tsv"
+df = pd.read_csv(file_path, sep="\t")
+questions = set()
+# Walk the two question columns in lockstep; this avoids building a Series
+# and a dict for every row the way iterrows()/to_dict() does.
+for question_pair in zip(df["question1"], df["question2"]):
+    for question in question_pair:
+        # pandas parses empty TSV cells as float NaN, which cannot be sliced,
+        # so only real text is kept.
+        if isinstance(question, str):
+            # Keep at most 512 characters, matching max_length of the "text" field.
+            questions.add(question[:512])
+    # Stop as soon as more than 10,000 unique questions have been collected.
+    if len(questions) > 10000:
+        break
+
+docs = list(questions)
+
+# add some randomly generated texts
+
+
+def random_embedding(texts):
+    """Return placeholder embeddings for ``texts`` without loading a model.
+
+    The result is a dict with two entries:
+    - "dense": a ``len(texts) x 768`` array filled by ``np.random.rand``.
+    - "sparse": one dict per text, mapping 20 to 30 distinct indices sampled
+      from ``range(1000)`` to random float weights.
+    """
+    rng = np.random.default_rng()
+    dense_vectors = np.random.rand(len(texts), 768)
+    sparse_vectors = []
+    for _ in texts:
+        nonzero_count = random.randint(20, 30)
+        sparse_row = {}
+        for dim in random.sample(range(1000), nonzero_count):
+            sparse_row[dim] = rng.random()
+        sparse_vectors.append(sparse_row)
+    return {"dense": dense_vectors, "sparse": sparse_vectors}
+
+
+# Choose the embedding function according to the `use_bge_m3` toggle.
+if use_bge_m3:
+    # BGE-M3 model can embed texts as dense and sparse vectors.
+    # It is included in the optional `model` module in pymilvus, to install it,
+    # simply run "pip install pymilvus[model]".
+    from pymilvus.model.hybrid import BGEM3EmbeddingFunction
+
+    # NOTE(review): device="cuda" presumably requires a GPU; the Streamlit UI of
+    # this demo loads the same model with device="cpu" -- confirm which is intended.
+    ef = BGEM3EmbeddingFunction(use_fp16=False, device="cuda")
+    dense_dim = ef.dim["dense"]
+else:
+    # Fallback: random vectors, useful only to exercise the Milvus pipeline
+    # without downloading the model.
+    ef = random_embedding
+    dense_dim = 768
+
+# Embed the whole corpus once; the result exposes "dense" and "sparse" entries.
+docs_embeddings = ef(docs)
+
+# 2. setup Milvus collection and index
+# Register the "default" connection against the "milvus.db" URI
+# (presumably a local Milvus Lite file -- TODO confirm).
+connections.connect("default", uri="milvus.db")
+
+# Specify the data schema for the new Collection.
+fields = [
+    # Use an auto-generated id as the primary key.
+    FieldSchema(
+        name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=100
+    ),
+    # Store the original text so it can be returned with the search hits.
+    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512),
+    # Milvus supports both sparse and dense vectors; each is stored in its own
+    # field so a hybrid search can query both.
+    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
+    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=dense_dim),
+]
+schema = CollectionSchema(fields, "")
+col_name = "hybrid_demo"
+# Now we can create the new collection with the above name and schema.
+# NOTE(review): nothing here drops a pre-existing "hybrid_demo" collection, so
+# re-running this script presumably appends duplicate rows -- confirm.
+col = Collection(col_name, schema, consistency_level="Strong")
+
+# We need to create indices for the vector fields. The indices will be loaded
+# into memory for efficient search. Both use the inner-product ("IP") metric.
+sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
+col.create_index("sparse_vector", sparse_index)
+dense_index = {"index_type": "FLAT", "metric_type": "IP"}
+col.create_index("dense_vector", dense_index)
+col.load()
+
+# 3. insert text and sparse/dense vector representations into the collection
+# Rows are sent in fixed-size batches. Each batch lists its columns in schema
+# order (text, sparse_vector, dense_vector); the "pk" field is auto-generated.
+insert_batch_size = 50
+for i in range(0, len(docs), insert_batch_size):
+    batch_end = i + insert_batch_size
+    batched_entities = [
+        docs[i:batch_end],
+        docs_embeddings["sparse"][i:batch_end],
+        docs_embeddings["dense"][i:batch_end],
+    ]
+    col.insert(batched_entities)
+# Flush once after all batches have been inserted.
+col.flush()
diff --git a/...tutorials/quickstart/apps/hybrid_demo_with_milvus/pics/Milvus_Logo_Official.png b/...tutorials/quickstart/apps/hybrid_demo_with_milvus/pics/Milvus_Logo_Official.png
diff --git a/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/pics/demo.jpg b/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/pics/demo.jpg
diff --git a/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/requirements.txt b/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/requirements.txt
@@ -0,0 +1,5 @@
+pandas
+numpy
+pymilvus
+pymilvus[model]
+streamlit
diff --git a/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/ui.py b/bootcamp/tutorials/quickstart/apps/hybrid_demo_with_milvus/ui.py
@@ -0,0 +1,127 @@
+import streamlit as st
+from streamlit import cache_resource
+from pymilvus.model.hybrid import BGEM3EmbeddingFunction
+from pymilvus import (
+    Collection,
+    AnnSearchRequest,
+    WeightedRanker,
+    connections,
+)
+
+# Logo: show the Milvus logo from the local pics/ directory at the top of the page.
+st.image("./pics/Milvus_Logo_Official.png", width=200)
+
+
+@cache_resource
+def get_model():
+    """Build the BGE-M3 embedding function on CPU with fp16 disabled.
+
+    Decorated with ``cache_resource`` (presumably so the model is created once
+    and reused across Streamlit reruns -- confirm against Streamlit docs).
+    """
+    return BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
+
+
+@cache_resource
+def get_collection():
+    """Connect to the "milvus.db" URI and return the "hybrid_demo" collection."""
+    connections.connect("default", uri="milvus.db")
+    return Collection("hybrid_demo")
+
+
+def search_from_source(source, query):
+    """Return five placeholder result strings labelled with ``source`` and ``query``."""
+    placeholders = []
+    for rank in range(1, 6):
+        placeholders.append(f"{source} Result {rank} for {query}")
+    return placeholders
+
+
+# Page title (spelling fixed: "Hybrid", matching the README's name for the demo).
+st.title("Milvus Hybrid Search Demo")
+
+# Query box and trigger button; the results section only renders when the
+# button was clicked and the query is non-empty.
+query = st.text_input("Enter your search query:")
+search_button = st.button("Search")
+
+
+@cache_resource
+def get_tokenizer():
+    """Return the tokenizer attached to the embedding model from ``get_model()``."""
+    return get_model().model.tokenizer
+
+
+def doc_text_colorization(query, docs):
+    """Wrap the parts of each document that share a token with the query in ``:red[...]``.
+
+    Args:
+        query: The search query text.
+        docs: Iterable of document strings to annotate.
+
+    Returns:
+        A list with one string per document. Every character span covered by
+        a token that also occurs among the query's tokens is wrapped in the
+        ``:red[...]`` markup; all other text is copied unchanged.
+    """
+    tokenizer = get_tokenizer()
+    # A set gives O(1) membership tests in the per-token loop below.
+    query_tokens = set(tokenizer.convert_ids_to_tokens(tokenizer.encode(query)))
+    colored_texts = []
+
+    for doc in docs:
+        encoding = tokenizer.encode_plus(doc, return_offsets_mapping=True)
+        # Drop the first and last entries (presumably the special tokens the
+        # tokenizer adds around the text -- confirm for other tokenizers).
+        tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])[1:-1]
+        offsets = encoding["offset_mapping"][1:-1]
+
+        # Collect [start, end) character spans of matching tokens. Spans that
+        # touch or overlap are merged so the markup never nests, and
+        # zero-width offsets are skipped because they cover no characters.
+        spans = []
+        for token, (start, end) in zip(tokens, offsets):
+            if token not in query_tokens or start >= end:
+                continue
+            if spans and start <= spans[-1][1]:
+                spans[-1][1] = max(spans[-1][1], end)
+            else:
+                spans.append([start, end])
+
+        # Rebuild the document from slices, wrapping each span; joining the
+        # pieces once avoids repeated string concatenation.
+        pieces = []
+        cursor = 0
+        for start, end in spans:
+            pieces.append(doc[cursor:start])
+            pieces.append(":red[" + doc[start:end] + "]")
+            cursor = end
+        pieces.append(doc[cursor:])
+        colored_texts.append("".join(pieces))
+    return colored_texts
+
+
+def hybrid_search(query_embeddings, sparse_weight=1.0, dense_weight=1.0):
+    """Run a weighted sparse + dense search and return the matched texts.
+
+    Args:
+        query_embeddings: Mapping with "sparse" and "dense" query vectors.
+        sparse_weight: Weight passed to ``WeightedRanker`` for the sparse request.
+        dense_weight: Weight passed to ``WeightedRanker`` for the dense request.
+
+    Returns:
+        The "text" field of each hit in the first result set (at most 10),
+        or an empty list when the search returns no result sets.
+    """
+    collection = get_collection()
+    top_k = 10
+    requests = [
+        AnnSearchRequest(
+            query_embeddings["sparse"],
+            "sparse_vector",
+            {"metric_type": "IP"},
+            limit=top_k,
+        ),
+        AnnSearchRequest(
+            query_embeddings["dense"],
+            "dense_vector",
+            {"metric_type": "IP"},
+            limit=top_k,
+        ),
+    ]
+    ranker = WeightedRanker(sparse_weight, dense_weight)
+    res = collection.hybrid_search(
+        requests, rerank=ranker, limit=top_k, output_fields=["text"]
+    )
+    if len(res) == 0:
+        return []
+    return [hit.fields["text"] for hit in res[0]]
+
+
+# Display search results when the button is clicked
+if search_button and query:
+    query_embeddings = get_model()([query])
+    # One entry per column: (header, sparse weight, dense weight, highlight?).
+    # The Dense column shows the raw text; the other two highlight query tokens.
+    panels = (
+        ("Dense", 0.0, 1.0, False),
+        ("Sparse", 1.0, 0.0, True),
+        ("Hybrid", 0.7, 1.0, True),
+    )
+    for column, (title, sparse_w, dense_w, highlight) in zip(st.columns(3), panels):
+        with column:
+            st.header(title)
+            results = hybrid_search(
+                query_embeddings, sparse_weight=sparse_w, dense_weight=dense_w
+            )
+            if highlight:
+                results = doc_text_colorization(query, results)
+            for result in results:
+                st.markdown(result)
diff --git a/bootcamp/tutorials/quickstart/apps/image_search_with_milvus/.streamlit/config.toml b/bootcamp/tutorials/quickstart/apps/image_search_with_milvus/.streamlit/config.toml
@@ -0,0 +1,3 @@
+[theme]
+base = "dark"
+primaryColor = "#4fc4f9"
diff --git a/bootcamp/tutorials/quickstart/apps/rag_search_with_milvus/.streamlit/config.toml b/bootcamp/tutorials/quickstart/apps/rag_search_with_milvus/.streamlit/config.toml
@@ -0,0 +1,3 @@
+[theme]
+base = "dark"
+primaryColor = "#4fc4f9"