From 4ce397e7eff934f0d270106e400bec094fc8b331 Mon Sep 17 00:00:00 2001 From: Christy Bergman Date: Mon, 1 Jul 2024 14:46:14 -0700 Subject: [PATCH] Update basic RAG chunking and eval notebook (#1367) Signed-off-by: Christy Bergman --- .../dbta_may_2024/1. RAG_basic.ipynb | 2710 +++++++++-------- 1 file changed, 1504 insertions(+), 1206 deletions(-) diff --git a/bootcamp/workshops/dbta_may_2024/1. RAG_basic.ipynb b/bootcamp/workshops/dbta_may_2024/1. RAG_basic.ipynb index 8ac782058..229fcdda1 100644 --- a/bootcamp/workshops/dbta_may_2024/1. RAG_basic.ipynb +++ b/bootcamp/workshops/dbta_may_2024/1. RAG_basic.ipynb @@ -50,99 +50,121 @@ "# Import common libraries.\n", "import sys, os, time, pprint\n", "\n", - "# Import custom functions for splitting and search.\n", - "sys.path.append(\"../..\") # Adds higher directory to python modules path.\n", - "import milvus_utilities as _utils" - ] - }, - { - "cell_type": "markdown", - "id": "e059b674", - "metadata": {}, - "source": [ - "## Download Data\n", + "# # Import custom functions for splitting and search.\n", + "# sys.path.append(\"../..\") # Adds higher directory to python modules path.\n", + "# import milvus_utilities as _utils\n", "\n", - "The data used in this notebook is Milvus documentation web pages.\n", + "import grpc\n", + "from concurrent.futures import ThreadPoolExecutor\n", "\n", - "The code block below downloads all the web pages into a local directory called `rtdocs`. \n", + "# Server side.\n", + "server = grpc.server(ThreadPoolExecutor(max_workers=10),\n", + " options=[('grpc.max_receive_message_length', 50 * 1024 * 1024)]) # 50 MB\n", "\n", - "I've already uploaded the `rtdocs` data folder to github, so you should see it if you cloned my repo." 
+ "# Client side.\n", + "channel = grpc.insecure_channel('localhost:50051',\n", + " options=[('grpc.max_send_message_length', 50 * 1024 * 1024)]) # 50 MB" ] }, { "cell_type": "code", "execution_count": 3, - "id": "25686cc7", + "id": "0cac5a60", "metadata": {}, "outputs": [], "source": [ - "# # UNCOMMENT TO DOWNLOAD THE DOCS.\n", + "# Function to remove newlines and double spaces from a string.\n", + "def clean_text(text):\n", + " clean_text = text.replace(\"\\n\\n\", \" \")\\\n", + " .replace(\"\\n\", \" \")\\\n", + " .replace(\"

\", \" \")\\\n", + " .replace(\"

\", \" \")\n", + " \n", + " # Remove extra whitespace.\n", + " clean_text = ' '.join(clean_text.split())\n", + " return clean_text\n", "\n", - "# # !pip install -U langchain\n", - "# from langchain_community.document_loaders import RecursiveUrlLoader\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", "\n", - "# DOCS_PAGE=\"https://milvus.io/docs/\"\n", + "# Function to inspect chunk lengths in matplotlit.\n", + "def plot_chunk_lengths(chunked_docs, title_keyword):\n", + " # Get chunk lengths.\n", + " lengths = [len(doc.page_content) for doc in chunked_docs]\n", "\n", - "# loader = RecursiveUrlLoader(DOCS_PAGE)\n", - "# docs = loader.load()\n", + " # Mean, median lengths.\n", + " mean_length = np.mean(lengths)\n", + " median_length = np.median(lengths)\n", "\n", - "# num_documents = len(docs)\n", - "# print(f\"loaded {num_documents} documents\")" + " # Assemble the title.\n", + " title = f\"Chunk Lengths from {title_keyword} Chunking\"\n", + "\n", + " # Plot the lengths.\n", + " plt.figure(figsize=(10, 6)) # Adjust figure size\n", + " plt.plot(lengths, marker='o') # Plot lengths with circle markers\n", + " plt.title(title, fontsize=20, fontweight='bold')\n", + " plt.xlabel('Document Index') # X-axis label\n", + " plt.ylabel('Length') # Y-axis label\n", + " plt.grid(True) # Show grid\n", + "\n", + " # Add a horizontal line at mean and median length\n", + " plt.axhline(y=mean_length, color='g', linestyle='-')\n", + " plt.axhline(y=median_length, color='r', linestyle='-')\n", + " plt.text(len(lengths)-1, mean_length, f'mean = {mean_length:.0f}', va='center', ha='left', backgroundcolor='w', fontsize=12)\n", + " plt.text(0, median_length, f'median = {median_length:.0f}', va='center', ha='right', backgroundcolor='w', fontsize=12)\n", + "\n", + " plt.show() # Display the plot" ] }, { - "cell_type": "code", - "execution_count": 4, - "id": "3b5d81f4", + "cell_type": "markdown", + "id": "e059b674", "metadata": {}, - "outputs": [], "source": [ - "# # Save 
Langchain docs to a local directory.\n", - "# OUTPUT_DIR = \"../../RAG/rtdocs_new/\"\n", - "# os.makedirs(OUTPUT_DIR, exist_ok=True)\n", + "## Download Data\n", "\n", - "# # Convert each doc to HTML and save to the specified directory\n", - "# for doc in docs:\n", - "# # Extract file name\n", - "# filename = doc.metadata['source'].split('/')[-1].replace(\".md\", \".html\")\n", - " \n", - "# # Check that filename is not empty\n", - "# if filename:\n", - "# with open(os.path.join(OUTPUT_DIR, filename), \"w\") as f:\n", - "# f.write(doc.page_content)\n", - "# else:\n", - "# print(\"Filename is empty. Skipping this doc.\")\n", - "# pprint.pprint(doc.metadata)\n", - "# pprint.pprint(doc.page_content[:500])" + "The data used in this notebook is Milvus documentation web pages.\n", + "\n", + "The code block below downloads all the web pages into a local directory called `rtdocs`. \n", + "\n", + "I've already uploaded the `rtdocs` data folder to github, so you should see it if you cloned my repo." ] }, { "cell_type": "code", - "execution_count": 5, - "id": "83b232dd", + "execution_count": 4, + "id": "25686cc7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "loaded 22 documents\n", + "loaded 23 documents\n", "\n", "\n" ] + }, + { + "data": { + "text/plain": [ + "'Milvus vector database documentation\n", - "It can run in Jupyter notebooks, Colab, or locally. Requires pymilvus>=2.4.3.\n", + "# # Save Langchain docs to a local directory.\n", + "# OUTPUT_DIR = \"../../RAG/rtdocs_new/\"\n", + "# os.makedirs(OUTPUT_DIR, exist_ok=True)\n", "\n", - "⛔️ Milvus Lite is not meant for production workloads." 
+ "# # Convert each doc to HTML and save to the specified directory\n", + "# for doc in docs:\n", + "# # Extract file name\n", + "# filename = doc.metadata['source'].split('/')[-1].replace(\".md\", \".html\")\n", + " \n", + "# # Check that filename is not empty\n", + "# if filename:\n", + "# with open(os.path.join(OUTPUT_DIR, filename), \"w\") as f:\n", + "# f.write(doc.page_content)\n", + "# else:\n", + "# print(\"Filename is empty. Skipping this doc.\")\n", + "# pprint.pprint(doc.metadata)\n", + "# pprint.pprint(doc.page_content[:500])" ] }, { - "cell_type": "code", - "execution_count": 6, - "id": "953bf30d", + "cell_type": "markdown", + "id": "f19236fe", "metadata": {}, - "outputs": [], "source": [ - "# !python -m pip install -U pymilvus" + "## Load the Embedding Model checkpoint and use it to create vector embeddings\n", + "\n", + "#### What are Embeddings?\n", + "\n", + "Check out [this blog](https://zilliz.com/glossary/vector-embeddings) for an introduction to embeddings. \n", + "\n", + "An excellent place to start is by selecting an embedding model from the [HuggingFace MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard), sorted descending by the \"Retrieval Average'' column since this task is most relevant to RAG. Then, choose the smallest, highest-ranking embedding model. But, Beware!! some models listed are overfit to the training data, so they won't perform on your data as promised. \n", + "\n", + "Milvus (and Zilliz) only supports tested embedding models that are **not overfit**!" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "6fc289e8", + "execution_count": 6, + "id": "f043c02e", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/miniconda3/envs/py311-unum/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "pymilvus:2.4.4\n" + "MODEL: BAAI/bge-large-en-v1.5, EMBEDDING_DIM: 1024\n" ] } ], "source": [ - "# STEP 1. CONNECT A CLIENT TO LIGHT MILVUS PYTHON SERVER.\n", - "\n", - "# !python -m pip install -U pymilvus\n", - "import pymilvus\n", - "print(f\"pymilvus:{pymilvus.__version__}\")\n", + "from langchain_huggingface import HuggingFaceEmbeddings\n", "\n", - "# Connect a client to the Milvus Lite server.\n", - "from pymilvus import MilvusClient\n", - "mc = MilvusClient(\"milvus_demo.db\")" + "# Use an embedding model.\n", + "model_name = \"BAAI/bge-large-en-v1.5\"\n", + "model_kwargs = {'device': 'cpu'}\n", + "encode_kwargs = {'normalize_embeddings': True}\n", + "embed_model = HuggingFaceEmbeddings(\n", + " model_name=model_name,\n", + " model_kwargs=model_kwargs,\n", + " encode_kwargs=encode_kwargs\n", + ")\n", + "EMBEDDING_DIM = embed_model.dict()['client'].get_sentence_embedding_dimension()\n", + "print(f\"MODEL: {model_name}, EMBEDDING_DIM: {EMBEDDING_DIM}\")" ] }, { - "cell_type": "markdown", - "id": "f9d758e4", + "cell_type": "code", + "execution_count": 7, + "id": "de45cdd4", "metadata": {}, + "outputs": [], "source": [ - "# Optional - Connect to Zilliz Cloud free tier cluster\n", - "To use fully-managed Milvus on [Ziliz Cloud free trial](https://cloud.zilliz.com/login). \n", - " 1. Choose the default \"Starter\" option and accept the default Cloud Provider and Region when you create a cluster. \n", - " 2. On the Cluster main page, copy your `API Key` and store it locally in a .env variable. See [this note](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety) how to do that.\n", - " 3. Also on the Cluster main page, copy the `Public Endpoint URI` and store it somewhere convenient.\n", - " 4. Jupyter also requires them in a local .env file.
\n", - "Anywhere in the bootcamp directory, create a .env file\n", - "Insert lines like this, substituting your actual API keys for the sample text:
\n", - "ZILLIZ_API_KEY=f370c
\n", - "OPENAI_API_KEY=sk-H
\n", - "ANYSCALE_ENPOINT_KEY=es
\n", - "ANTHROPIC_API_KEY=sk-an
\n", - "VARIABLE_NAME=value
\n", - "Save the .env file
" + "# !python -m pip install --upgrade langchain_openai" ] }, { "cell_type": "code", "execution_count": 8, - "id": "0806d2db", + "id": "1fab8a0e", "metadata": {}, "outputs": [], "source": [ - "# # STEP 1. CONNECT TO ZILLIZ CLOUD\n", - "# import os\n", - "# import pymilvus\n", - "# print(f\"pymilvus version: {pymilvus.__version__}\")\n", - "# from pymilvus import connections, utility, MilvusClient\n", - "# TOKEN = os.getenv(\"ZILLIZ_API_KEY\")\n", - "\n", - "# # Connect to Zilliz cloud using endpoint URI and API key TOKEN.\n", - "# # TODO change this.\n", - "# CLUSTER_ENDPOINT=\"https://in03-xxxx.api.gcp-us-west1.zillizcloud.com:443\"\n", - "# CLUSTER_ENDPOINT=\"https://in03-48a5b11fae525c9.api.gcp-us-west1.zillizcloud.com:443\"\n", - "# connections.connect(\n", - "# alias='default',\n", - "# # Public endpoint obtained from Zilliz Cloud\n", - "# uri=CLUSTER_ENDPOINT,\n", - "# # API key or a colon-separated cluster username and password\n", - "# token=TOKEN,\n", - "# )\n", + "# from langchain_openai.embeddings import OpenAIEmbeddings\n", + "\n", + "# # 1. Get your API key: https://platform.openai.com/api-keys\n", + "# # 2. Save your api key in env variable.\n", + "# # https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety\n", "\n", - "# # Use no-schema Milvus client uses flexible json key:value format.\n", - "# # https://milvus.io/docs/using_milvusclient.md\n", - "# mc = MilvusClient(\n", - "# uri=CLUSTER_ENDPOINT,\n", - "# # API key or a colon-separated cluster username and password\n", - "# token=TOKEN)\n", + "# # 3. 
OpenAI embedding model name, `text-embedding-3-large` or `text-embedding-3-small`.\n", + "# # release notes: https://openai.com/index/new-embedding-models-and-api-updates/\n", + "# model_name = \"text-embedding-3-small\"\n", + "# EMBEDDING_DIM = 512\n", "\n", - "# # Check if the server is ready and get colleciton name.\n", - "# print(f\"Type of server: {utility.get_server_version()}\")" + "# model_kwargs = {'device': 'cpu'}\n", + "# embed_model = OpenAIEmbeddings(\n", + "# model=model_name,\n", + "# dimensions=EMBEDDING_DIM,\n", + "# model_kwargs=model_kwargs\n", + "# )\n", + "# print(f\"MODEL: {model_name}, EMBEDDING_DIM: {EMBEDDING_DIM}\")" ] }, { "cell_type": "markdown", - "id": "103354dc", + "id": "0804d3ff", "metadata": {}, "source": [ - "## Optional - Start up Milvus running in local Docker\n", + "## HTML Chunking using LangChain\n", + "\n", + "Before embedding, it is necessary to decide your chunk strategy, chunk size, and chunk overlap. This section uses:\n", + "- **Strategy** = Use markdown header hierarchies. Keep markdown sections together unless they are too long.\n", + "- **Chunk size** = Use the embedding model's parameter `MAX_SEQ_LENGTH`\n", + "- **Overlap** = Rule-of-thumb 10-15%\n", + "- **Function** = \n", + " - Langchain's `HTMLHeaderTextSplitter` to split markdown sections.\n", + " - Langchain's `RecursiveCharacterTextSplitter` to split up long sections recursively.\n", "\n", - ">⛔️ Make sure you pip install the correct version of pymilvus and server yml file. **Versions (major and minor) should all match**.\n", "\n", - "1. [Install Docker](https://docs.docker.com/get-docker/)\n", - "2. Start your Docker Desktop\n", - "3. 
Download the latest [docker-compose.yml](https://milvus.io/docs/install_standalone-docker.md#Download-the-YAML-file) (or run the wget command, replacing version to what you are using)\n", - "> wget https://github.com/milvus-io/milvus/releases/download/v2.4.0-rc.1/milvus-standalone-docker-compose.yml -O docker-compose.yml\n", - "4. From your terminal: \n", - " - cd into directory where you saved the .yml file (usualy same dir as this notebook)\n", - " - docker compose up -d\n", - " - verify (either in terminal or on Docker Desktop) the containers are running\n", - "5. From your code (see notebook code below):\n", - " - Import milvus\n", - " - Connect to the local milvus server" + "Notice below, each chunk is grounded with the document source page.
\n", + "In addition, header titles are kept together with the chunk of markdown text." ] }, { "cell_type": "code", "execution_count": 9, - "id": "9dd9b467", + "id": "a9b8a0da", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pattern for splitting: ]*)?>||]*)?/>\n", + "docs: 23, split into 23 html chunks.\n", + "DocsBlogCommunity Why Milvus What is MilvusUse Cases Tutorials CodelabsBootcampDemosVideo Tools AttuMilvus CLISizing ToolMilvus Backup Get Started Docs Tutorials Tools Blog Community Get Started Searc\n", + "{'h1': 'Welcome to Milvus Docs!', 'h2': 'Here you will learn about', 'h3': '', 'source': 'https://milvus.io/docs/', 'doc_index': '5a75d188-5760-42ed-908a-b9c35c6998a2'}\n" + ] + } + ], "source": [ - "# # CONNECT TO MILVUS STANDALONE DOCKER.\n", - "\n", - "# import pymilvus, time\n", - "# from pymilvus import (connections, MilvusClient, utility)\n", - "# print(f\"Pymilvus: {pymilvus.__version__}\")\n", + "# !python -m pip install lxml\n", + "from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter\n", + "import uuid, re\n", "\n", - "# # ####################################################################################################\n", - "# # # Connect to local server running in Docker container.\n", - "# # # Download the latest .yaml file: https://milvus.io/docs/install_standalone-docker.md\n", - "# # # Or, download directly from milvus github (replace with desired version):\n", - "# !wget https://github.com/milvus-io/milvus/releases/download/v2.4.4/milvus-standalone-docker-compose.yml -O docker-compose.yml\n", - "# # ####################################################################################################\n", + "doc_ids = [str(uuid.uuid4()) for _ in docs]\n", "\n", - "# # Start Milvus standalone on docker, running quietly in the background.\n", - "# !docker compose up -d\n", + "# Define the headers to split on for the 
HTMLHeaderTextSplitter\n", + "headers_to_split_on = [\n", + " (\"<h1>\", \"Header 1\"),\n", + " (\"<h2>\", \"Header 2\"),\n", + " (\"<h3>
\", \"Header 3\"),\n", + "]\n", + "# Create an instance of the HTMLHeaderTextSplitter\n", + "html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", "\n", - "# # Verify which local port the Milvus server is listening on\n", - "# !docker ps -a #19530/tcp\n", + "# Construct a regex pattern to match any of the specified headers\n", + "pattern = '|'.join(re.escape(header[0]) for header in headers_to_split_on)\n", + "# More complex pattern to match opening, closing, and self-closing tags.\n", + "pattern = r\"]*)?>||]*)?/>\"\n", + "print(\"Pattern for splitting:\", pattern)\n", "\n", - "# # Connect to the local server.\n", - "# connection = connections.connect(\n", - "# alias=\"default\", \n", - "# host='localhost', # or '0.0.0.0' or 'localhost'\n", - "# port='19530'\n", - "# )\n", + "# Function to remove all HTML tags\n", + "def remove_html_tags(text):\n", + " return re.sub(r\"<[^>]+>\", \"\", text)\n", "\n", - "# # Get server version.\n", - "# print(utility.get_server_version())\n", + "# Split the HTML text using the HTMLHeaderTextSplitter.\n", + "html_docs = []\n", + "for doc_id, doc in zip(doc_ids, docs):\n", + "\n", + " # Grab the headers\n", + " new_text = doc.page_content\n", + " new_text = clean_text(new_text).replace('\\n', ' ')\n", + " html_split_list = re.split(pattern, new_text)\n", + " # print(html_split_list[0])\n", + " # Remove HTML tags from each extracted header text\n", + " html_split_list = [remove_html_tags(header) for header in html_split_list]\n", + " # print(html_split_list[0])\n", + " # print(f\"Split into {len(html_split_list)} parts.\")\n", + "\n", + " # Split the text into chunks using the HTMLHeaderTextSplitter.\n", + " splits = html_splitter.split_text(doc.page_content)\n", "\n", - "# # Use no-schema Milvus client uses flexible json key:value format.\n", - "# mc = MilvusClient(connections=connection)" - ] - }, - { - "cell_type": "markdown", - "id": "f39af3fd", - "metadata": {}, - "source": [ - "## Load the 
Embedding Model checkpoint and use it to create vector embeddings\n", + " for split in splits:\n", "\n", - "#### What are Embeddings?\n", + " # Clean the text.\n", + " split.page_content = clean_text(split.page_content)\n", "\n", - "Check out [this blog](https://zilliz.com/glossary/vector-embeddings) for an introduction to embeddings. \n", + " # Assemble the metadata.\n", + " metadata = {}\n", + " # Handle exception if h1 does not exist.\n", + " try:\n", + " header_value1 = html_split_list[1].strip()[:25]\n", + " metadata[\"h1\"] = header_value1\n", + " # print(f\"header_name: h1, header_value: {header_value1}\")\n", + " except:\n", + " break\n", + " # Handle exception if h2 does not exist.\n", + " try:\n", + " header_value2 = html_split_list[2].strip()[:25]\n", + " if len(header_value2) <= 0:\n", + " header_value2 = html_split_list[3].strip()[:25]\n", + " metadata[\"h2\"] = header_value2\n", + " # print(f\"header_name: h2, header_value: {header_value2}\")\n", + " except:\n", + " break\n", + " # Handle exception if h3 does not exist.\n", + " try:\n", + " header_value3 = html_split_list[3].strip()[:25]\n", + " if (len(header_value3) <= 0 or header_value3 == header_value2):\n", + " header_value3 = html_split_list[4].strip()[:25]\n", + " if header_value3 == header_value2:\n", + " header_value3 = html_split_list[5].strip()[:25]\n", + " metadata[\"h3\"] = header_value3\n", + " # print(f\"header_name: h3, header_value: {header_value3}\")\n", + " except:\n", + " break\n", + " split.metadata = {\n", + " **metadata,\n", + " \"source\": doc.metadata[\"source\"],\n", + " 'doc_index': doc_id\n", + " }\n", + " html_docs.extend(splits)\n", "\n", - "An excellent place to start is by selecting an embedding model from the [HuggingFace MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard), sorted descending by the \"Retrieval Average'' column since this task is most relevant to RAG. Then, choose the smallest, highest-ranking embedding model. But, Beware!! 
some models listed are overfit to the training data, so they won't perform on your data as promised. \n", + "print(f\"docs: {len(docs)}, split into {len(html_docs)} html chunks.\")\n", "\n", - "Milvus (and Zilliz) only supports tested embedding models that are **not overfit**!" + "# Inspect a chunk\n", + "print(html_docs[0].page_content[:200])\n", + "print(html_docs[0].metadata)" ] }, { "cell_type": "code", "execution_count": 10, - "id": "0a6c58ab", + "id": "2e3acd77", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DocsBlogCommunity Why Milvus What is MilvusUse Cases Tutorials CodelabsBootcampDemosVideo Tools AttuMilvus CLISizing ToolMilvus Backup Get Started Docs Tutorials Tools Blog Community Get Started Searc\n", + "{'h1': 'Hybrid Search', 'h2': 'Since Milvus 2.4, we intr', 'h3': 'Preparations', 'source': 'https://milvus.io/docs/multi-vector-search.md', 'doc_index': 'd86ecd0c-39d2-432d-ac1e-a856274804af'}\n" + ] + } + ], "source": [ - "# !python -m pip install -U sentence-transformers transformers" + "# Inspect another chunk\n", + "print(html_docs[20].page_content[:200])\n", + "print(html_docs[20].metadata)" ] }, { "cell_type": "code", "execution_count": 11, - "id": "1805f966", + "id": "961af854", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/miniconda3/envs/py311-unum/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model_name: BAAI/bge-large-en-v1.5\n", - "EMBEDDING_DIM: 1024\n", - "MAX_SEQ_LENGTH: 1536\n" - ] + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "# STEP 2. DOWNLOAD AN OPEN SOURCE EMBEDDING MODEL.\n", - "\n", - "# Import torch.\n", - "import torch\n", - "from sentence_transformers import SentenceTransformer\n", - "\n", - "# Initialize torch settings\n", - "torch.backends.cudnn.deterministic = True\n", - "DEVICE = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')\n", - "\n", - "# Load the model from huggingface model hub.\n", - "model_name = \"BAAI/bge-large-en-v1.5\"\n", - "# model_name = \"BAAI/bge-m3\"\n", - "encoder = SentenceTransformer(model_name, device=DEVICE)\n", - "# print(encoder)\n", - "\n", - "# Get the model parameters and save for later.\n", - "EMBEDDING_DIM = encoder.get_sentence_embedding_dimension()\n", - "MAX_SEQ_LENGTH_IN_TOKENS = encoder.get_max_seq_length() \n", - "# Assume tokens are 3 characters long.\n", - "MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS * 3\n", - "EOS_TOKEN_LENGTH = 1 * 3\n", - "\n", - "# Inspect model parameters.\n", - "print(f\"model_name: {model_name}\")\n", - "print(f\"EMBEDDING_DIM: {EMBEDDING_DIM}\")\n", - "print(f\"MAX_SEQ_LENGTH: {MAX_SEQ_LENGTH}\")" + "# Inspect chunk lengths\n", + "plot_chunk_lengths(html_docs, 'HTML')" ] }, { "cell_type": "markdown", - "id": "9609497f", + "id": "1fa9700d", "metadata": {}, "source": [ - "## Create a Milvus collection\n", - "\n", - "You can think of a collection in Milvus like a \"table\" in SQL databases. The **collection** will contain the \n", - "- **Schema** (or [no-schema Milvus client](https://milvus.io/docs/using_milvusclient.md)). 
\n", - "💡 You'll need the vector `EMBEDDING_DIM` parameter from your embedding model.\n", - "Typical values are:\n", - " - 1024 for sbert embedding models\n", - " - 1536 for ada-002 OpenAI embedding models\n", - "- **Vector index** for efficient vector search\n", - "- **Vector distance metric** for measuring nearest neighbor vectors\n", - "- **Consistency level**\n", - "In Milvus, transactional consistency is possible; however, according to the [CAP theorem](https://en.wikipedia.org/wiki/CAP_theorem), some latency must be sacrificed. 💡 Searching movie reviews is not mission-critical, so [`eventually`](https://milvus.io/docs/consistency.md) consistent is fine here.\n", - "\n", - "## Add a Vector Index\n", - "\n", - "The vector index determines the vector **search algorithm** used to find the closest vectors in your data to the query a user submits. \n", + "## Small-to-Big using LangChain\n", "\n", - "Most vector indexes use different sets of parameters depending on whether the database is:\n", - "- **inserting vectors** (creation mode) - vs - \n", - "- **searching vectors** (search mode) \n", + "Often times it can be useful to retrieve larger chunks of information, but embed smaller chunks. This allows for embeddings to capture the semantic meaning as closely as possible, but for as much context as possible to be passed downstream. Note that this is what the ParentDocumentRetriever does. Here we show what is going on under the hood.\n", "\n", - "Scroll down the [docs page](https://milvus.io/docs/index.md) to see a table listing different vector indexes available on Milvus. 
For example:\n", - "- FLAT - deterministic exhaustive search\n", - "- IVF_FLAT or IVF_SQ8 - Hash index (stochastic approximate search)\n", - "- HNSW - Graph index (stochastic approximate search)\n", - "- AUTOINDEX - OSS or [Zilliz cloud](https://docs.zilliz.com/docs/autoindex-explained) automatic index based on type of GPU, size of data.\n", + "> - The vector store indexes and searches embeddings of the smallest (sub) documents. \n", + "> - The document store houses the \"parent\" documents and associates them with an identifier.\n", "\n", - "Besides a search algorithm, we also need to specify a **distance metric**, that is, a definition of what is considered \"close\" in vector space. In the cell below, the [`HNSW`](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) search index is chosen. Its possible distance metrics are one of:\n", - "- L2 - L2-norm\n", - "- IP - Dot-product\n", - "- COSINE - Angular distance\n", - "\n", - "💡 Most use cases work better with normalized embeddings, in which case L2 is useless (every vector has length=1) and IP and COSINE are the same. Only choose L2 if you plan to keep your embeddings unnormalized." + "- See MultiVectorRetriever [api](https://api.python.langchain.com/en/latest/retrievers/langchain.retrievers.multi_vector.MultiVectorRetriever.html).\n", + "- See ParentDocumentRetriever [docs](https://python.langchain.com/v0.2/docs/how_to/multi_vector/#smaller-chunks) and [api](https://api.python.langchain.com/en/latest/retrievers/langchain.retrievers.parent_document_retriever.ParentDocumentRetriever.html)." ] }, { - "cell_type": "markdown", - "id": "42dff3b3", + "cell_type": "code", + "execution_count": 12, + "id": "19a45770", "metadata": {}, + "outputs": [], "source": [ - "### Exercise #1 (2 min):\n", - "Create a collection named \"movies\". 
Use the default AUTOINDEX.\n", - "> 💡 AUTOINDEX works on both Milvus and Zilliz Cloud (where it is the fastest!)" + "# !python -m pip install lxml\n", + "from langchain_milvus import Milvus\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "import time, pprint, uuid\n", + "import numpy as np\n", + "from langchain.storage import InMemoryByteStore\n", + "from langchain.retrievers.multi_vector import MultiVectorRetriever\n", + "\n", + "# Create doc storage for the parent documents\n", + "store = InMemoryByteStore()\n", + "id_key = \"doc_id\"\n", + "\n", + "# Create vectorstore for vector index and retrieval.\n", + "COLLECTION_NAME = \"MilvusDocs\"\n", + "vectorstore = Milvus(\n", + " collection_name=COLLECTION_NAME,\n", + " embedding_function=embed_model,\n", + " connection_args={\"uri\": \"./milvus_demo.db\"},\n", + " auto_id=True,\n", + " # Set to True to drop the existing collection if it exists.\n", + " drop_old=True,\n", + ")\n", + "\n", + "# The MultiVectorRetriever (empty to start)\n", + "retriever = MultiVectorRetriever(\n", + " vectorstore=vectorstore,\n", + " byte_store=store,\n", + " id_key=id_key,\n", + ")" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "e6197605", + "execution_count": 85, + "id": "2ffc217b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Successfully dropped collection: `MilvusDocs`\n", - "Successfully created collection: `MilvusDocs`\n" + "23 docs split into 3694 parent documents.\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# # STEP 3. DOCKER OR ZILLIZ CLOUD: CREATE A NO-SCHEMA MILVUS COLLECTION AND DEFINE THE DATABASE INDEX.\n", - "\n", - "# # Set the Milvus collection name.\n", - "# COLLECTION_NAME = \"MilvusDocs\"\n", - "\n", - "# # Add custom HNSW search index to the collection.\n", - "# # M = max number graph connections per layer. 
Large M = denser graph.\n", - "# # Choice of M: 4~64, larger M for larger data and larger embedding lengths.\n", - "# M = 16\n", - "# # efConstruction = num_candidate_nearest_neighbors per layer. \n", - "# # Use Rule of thumb: int. 8~512, efConstruction = M * 2.\n", - "# efConstruction = M * 2\n", - "# # Create the search index for local Milvus server.\n", - "# INDEX_PARAMS = dict({\n", - "# 'M': M, \n", - "# \"efConstruction\": efConstruction })\n", - "# index_params = {\n", - "# \"index_type\": \"HNSW\", \n", - "# \"metric_type\": \"COSINE\", \n", - "# \"params\": INDEX_PARAMS\n", - "# }\n", - "\n", - "# # Check if collection already exists, if so drop it.\n", - "# has = utility.has_collection(COLLECTION_NAME)\n", - "# if has:\n", - "# drop_result = utility.drop_collection(COLLECTION_NAME)\n", - "# print(f\"Successfully dropped collection: `{COLLECTION_NAME}`\")\n", - "\n", - "# # Create the collection.\n", - "# mc.create_collection(\n", - "# COLLECTION_NAME, \n", - "# EMBEDDING_DIM,\n", - "# consistency_level=\"Eventually\", \n", - "# auto_id=True, \n", - "# overwrite=True,\n", - "# # skip setting params below, if using AUTOINDEX\n", - "# params=index_params\n", - "# )\n", - "# print(f\"Successfully created collection: `{COLLECTION_NAME}`\")" + "# Inspect chunk lengths\n", + "plot_chunk_lengths(parent_docs, 'Parent-HTML')" ] }, { "cell_type": "markdown", - "id": "3b2cac7c", + "id": "a3821b33", "metadata": {}, "source": [ - "## Simple Chunking\n", - "\n", - "Before embedding, it is necessary to decide your chunk strategy, chunk size, and chunk overlap. This section uses:\n", - "- **Strategy** = Simple fixed chunk lengths.\n", - "- **Chunk size** = Use the embedding model's parameter `MAX_SEQ_LENGTH`\n", - "- **Overlap** = Rule-of-thumb 10-15%\n", - "- **Function** = \n", - " - Langchain's `RecursiveCharacterTextSplitter` to split up long reviews recursively." 
+ "## Small-to-big Chunking without HTML chunking" ] }, { - "cell_type": "markdown", - "id": "45c00e6e", + "cell_type": "code", + "execution_count": 86, + "id": "10770e5c", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loaded 22 documents\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "'Why Milvus\\n\\nDocs\\n\\nTutorials\\n\\nTools\\n\\nBlog\\n\\nCommunity\\n\\nStars0\\n\\nTry Managed Milvus FREE\\n\\nSearch\\n\\nHome\\n\\nv2.4.x\\n\\nAbout Milvus\\n\\nGet StartedPrerequisitesInstall MilvusInstall SDKsQuickstart\\n\\nConcepts\\n\\nUser Guide\\n\\nModels\\n\\nAdministration Guide\\n\\nTools\\n\\nIntegrations\\n\\nExample Applications\\n\\nFAQs\\n\\nAPI reference\\n\\nQuickstart\\n\\nThis guide explains how to connect to your Milvus cluster and performs CRUD operations in minutes\\n\\nBefore you start\\n\\nYou have installed Milvus standalone or Milvus cluster.\\n\\nYou have insta'" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "### Exercise #2 (2 min):\n", - "Change the chunk_size and see what happens? 
Model default is 1536.\n", + "# From raw docs\n", + "\n", + "# UNCOMMENT TO READ THE DOCS FROM A LOCAL DIRECTORY.\n", + "\n", + "# Read docs into LangChain\n", + "# !pip install -U langchain\n", + "# !pip install unstructured\n", + "from langchain.document_loaders import DirectoryLoader\n", + "\n", + "# Load HTML files from a local directory\n", + "path = \"../../RAG/rtdocs_new/\"\n", + "global_pattern = '*.html'\n", + "loader = DirectoryLoader(path=path, glob=global_pattern)\n", + "docs = loader.load()\n", + "\n", + "num_documents = len(docs)\n", + "print(f\"loaded {num_documents} documents\")\n", + "\n", + "# # Subset docs for faster testing\n", + "# docs = docs[5:7].copy()\n", + "# num_documents = len(docs)\n", + "# print(f\"testing with {num_documents} documents\")\n", "\n", - "- What do your observations imply about changing the chunk_size and the number of vectors?\n", - "- How many vectors are there with chunk_size=512?" + "# Print the type of the docs.\n", + "print(type(docs))\n", + "print(type(docs[0]))\n", + "\n", + "docs[0].page_content[:500]" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "b1499b20", + "execution_count": 88, + "id": "7424f6ca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23 docs split into 130 parent documents.\n", + "Why Milvus Docs Tutorials Tools Blog Community Stars0 Try Managed Milvus FREE Search Home v2.4.x About Milvus Get StartedPrerequisitesInstall MilvusInstall SDKsQuickstart Concepts User Guide Models Ad\n", + "{'doc_index': 'd6cda686-b369-40de-9784-666ee6a5952e',\n", + " 'source': 'https://milvus.io/docs/quickstart.md'}\n" + ] + } + ], "source": [ - "# from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "# import numpy as np\n", - "# import pprint\n", - "\n", - "# ###############\n", - "# ## EXERCISE #2: Change chunk_size to 512 below. 
How many chunks (vectors) does this create?\n", - "# ## ANSWER: 427\n", - "# ## BONUS: Can you explain why the number of vectors changed from 134 to 427? \n", - "# ## Hint: What is the default chunk overlap? 134 * (3 + 0.10) approx. equals 804.\n", - "# ###############\n", - "# chunk_size = #(exercise): code here\n", - "# chunk_overlap = np.round(chunk_size * 0.10, 0)\n", - "# print(f\"chunk_size: {chunk_size}, chunk_overlap: {chunk_overlap}\")\n", - "\n", - "# # Create an instance of the RecursiveCharacterTextSplitter\n", - "# child_splitter = RecursiveCharacterTextSplitter(\n", - "# chunk_size = chunk_size,\n", - "# chunk_overlap = chunk_overlap,\n", - "# length_function = len, # using built-in Python len function\n", - "# )\n", + "# Parent docs directly from raw docs\n", + "parent_docs = parent_text_splitter.split_documents(docs)\n", + "doc_ids = [str(uuid.uuid4()) for _ in parent_docs]\n", + "\n", + "# Clean the parent docs.\n", + "for doc_id, doc in zip(doc_ids, parent_docs):\n", + " doc.page_content = clean_text(doc.page_content)\n", + " doc.metadata[\"source\"] = \\\n", + " doc.metadata[\"source\"]\\\n", + " .replace(\"../../RAG/rtdocs_new\", \"https://milvus.io/docs\")\\\n", + " .replace(\".html\", \".md\")\n", + " doc.metadata = {\n", + " **doc.metadata,\n", + " \"doc_index\": doc_id\n", + " }\n", "\n", - "# # Split the documents further into smaller, recursive chunks.\n", - "# chunks = child_splitter.split_documents(docs)\n", - "# print(f\"docs: {len(docs)}, split into: {len(chunks)}\")" + "print(f\"{len(html_docs)} docs split into {len(parent_docs)} parent documents.\")\n", + "# inspect a parent doc.\n", + "print(parent_docs[0].page_content[:200])\n", + "pprint.pprint(parent_docs[0].metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "5062ca09", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Inspect chunk lengths\n", + "plot_chunk_lengths(parent_docs, 'Parent')" ] }, { "cell_type": "code", "execution_count": 15, - "id": "075a3022", + "id": "b9f3b559", "metadata": {}, "outputs": [ { @@ -613,486 +717,351 @@ "output_type": "stream", "text": [ "chunk_size: 512, chunk_overlap: 51.0\n", - "docs: 22, split into chunks: 427\n", - "type: list of \n", - "\n", - "Looking at a sample chunk...\n", - "('Why Milvus Docs Tutorials Tools Blog Community Stars0 Try Managed '\n", - " 'Milvus FREE Search Home ')\n", - "{'source': 'https://milvus.io/docs_new/quickstart.md'}\n" + "228 docs split into 873 child documents.\n" ] } ], "source": [ - "# STEP 4. PREPARE DATA: CHUNK AND EMBED\n", - "\n", - "# !python -m pip install lxml\n", - "from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter\n", - "import numpy as np\n", - "import pprint\n", - "\n", - "# Define chunk size and overlap 10% chunk_size.\n", "CHUNK_SIZE = 512\n", "chunk_overlap = np.round(CHUNK_SIZE * 0.10, 0)\n", "print(f\"chunk_size: {CHUNK_SIZE}, chunk_overlap: {chunk_overlap}\")\n", "\n", - "# Create an instance of the RecursiveCharacterTextSplitter\n", - "child_splitter = RecursiveCharacterTextSplitter(\n", - " chunk_size = CHUNK_SIZE,\n", - " chunk_overlap = chunk_overlap,\n", + "# The splitter to use to create smaller (child) chunks\n", + "child_text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=CHUNK_SIZE, \n", + " chunk_overlap=chunk_overlap,\n", " length_function = len, # use built-in Python len function\n", - " # separators=[\"\\n\\n\"],\n", + " # separators=[\"\\n\\n\"], # split at end of paragraphs\n", ")\n", "\n", - "# Split the documents further into smaller, recursive chunks.\n", - "smaller_chunks = child_splitter.split_documents(docs)\n", - "print(f\"docs: {len(docs)}, split into chunks: {len(smaller_chunks)}\")\n", - "print(f\"type: list of 
{type(smaller_chunks[0])}\") \n", - "\n", - "# Clean up newlines in the chunks.\n", - "for chunk in smaller_chunks:\n", - " chunk.page_content = chunk.page_content.replace(\"\\n\", \" \")\n", - " \n", - "# Clean up the metadata urls\n", - "for chunk in smaller_chunks:\n", - " new_url = chunk.metadata[\"source\"]\n", - " new_url = new_url.replace(\"../../RAG/rtdocs\", \"https://milvus.io/docs\")\n", - " new_url = new_url.replace(\".html\", \".md\")\n", - " chunk.metadata.update({\"source\": new_url})\n", - "\n", - "# Inspect a chunk.\n", - "print()\n", - "print(\"Looking at a sample chunk...\")\n", - "pprint.pprint(smaller_chunks[0].page_content[:100])\n", - "pprint.pprint(smaller_chunks[0].metadata)" - ] - }, - { - "cell_type": "markdown", - "id": "c60423a5", - "metadata": {}, - "source": [ - "## HTML Chunking\n", - "\n", - "Before embedding, it is necessary to decide your chunk strategy, chunk size, and chunk overlap. This section uses:\n", - "- **Strategy** = Use markdown header hierarchies. Keep markdown sections together unless they are too long.\n", - "- **Chunk size** = Use the embedding model's parameter `MAX_SEQ_LENGTH`\n", - "- **Overlap** = Rule-of-thumb 10-15%\n", - "- **Function** = \n", - " - Langchain's `HTMLHeaderTextSplitter` to split markdown sections.\n", - " - Langchain's `RecursiveCharacterTextSplitter` to split up long reviews recursively.\n", + "# Generate the \"sub\" documents by splitting the original documents. 
\n", + "# Store the doc_id in the metadata of the corresponding Document object.\n", + "sub_docs = []\n", + "for i, doc in enumerate(parent_docs):\n", + " _id = doc_ids[i]\n", + " _sub_docs = child_text_splitter.split_documents([doc])\n", + " for _doc in _sub_docs:\n", + " _doc.metadata[id_key] = _id\n", + " sub_docs.extend(_sub_docs)\n", "\n", + "# # Insert HTML headers into smaller chunks (extends their \"context\").\n", + "# for chunk in sub_docs:\n", + "# if chunk.page_content.startswith(chunk.metadata['h1'][:20]):\n", + "# continue\n", + "# metadata_str = ' '.join(str(v) for k, v in chunk.metadata.items() if k in ['h1', 'h2'])\n", + "# chunk.page_content = f'{metadata_str} {chunk.page_content}'\n", "\n", - "Notice below, each chunk is grounded with the document source page.
\n", - "In addition, header titles are kept together with the chunk of markdown text." + "print(f\"{len(parent_docs)} docs split into {len(sub_docs)} child documents.\")" ] }, { "cell_type": "code", "execution_count": 16, - "id": "5ab9cd1e", + "id": "6631ed2c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "chunk_size: 512, chunk_overlap: 51.0\n", - "docs: 22, split into parent chunks: 129\n", - "parent_chunks: 129, split into smaller chunks: 586\n", - "type: list of \n", - "\n", - "Looking at a sample chunk...\n", - "('immediately after data insertions may result in empty result set. To avoid '\n", - " 'this, you are advised to wait for a few seconds. Single-vector search The '\n", - " 'value of the query_vectors variable is a list conta')\n", - "{'doc_index': 0,\n", - " 'h1': 'Why Milvus Docs Tutorials Tools Blog Community Sta',\n", - " 'source': '../../RAG/rtdocs_new/quickstart.html'}\n" + "using PyMilvus. Added guidance on how to enable RBAC with Milvus Operator. Added descriptions of Milvus CDC. Introducing PyMilvus Integration with Embedding Models Engineering Made with Love by the De\n", + "{'doc_id': 'ab615aaf-af19-4b50-ac9d-8bbd1d1c6d51',\n", + " 'doc_index': '5a75d188-5760-42ed-908a-b9c35c6998a2',\n", + " 'h1': 'Welcome to Milvus Docs!',\n", + " 'h2': 'Here you will learn about',\n", + " 'h3': '',\n", + " 'source': 'https://milvus.io/docs/'}\n" ] } ], "source": [ - "# STEP 4. 
PREPARE DATA: CHUNK AND EMBED\n", - "\n", - "# !python -m pip install lxml\n", - "from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter\n", - "\n", - "# Define chunk size 512 and overlap 10% chunk_size.\n", - "# These will be ANN search vectors.\n", - "CHUNK_SIZE = 512\n", - "chunk_overlap = round(CHUNK_SIZE * 0.10, 0)\n", - "print(f\"chunk_size: {CHUNK_SIZE}, chunk_overlap: {chunk_overlap}\")\n", - "\n", - "# Define chunk size for \"larger\" chunks.\n", - "# These will be parent chunks retrieved to stuff in the Prompt.\n", - "PARENT_CHUNK_SIZE = MAX_SEQ_LENGTH #2000\n", - "\n", - "# Splitter is used to create the parent \"larger\" chunks.\n", - "parent_splitter = RecursiveCharacterTextSplitter(\n", - " chunk_size=PARENT_CHUNK_SIZE,\n", - " length_function = len,\n", - " # add_start_index=True\n", - " )\n", - "\n", - "# Splitter is used to create the child \"smaller\" chunks for ANN search.\n", - "child_splitter = RecursiveCharacterTextSplitter(\n", - " chunk_size = CHUNK_SIZE,\n", - " # chunk_overlap = chunk_overlap,\n", - " length_function = len,\n", - " add_start_index=True\n", - " )\n", - "\n", - "# Define the headers to split on for the HTMLHeaderTextSplitter\n", - "headers_to_split_on = [\n", - " (\"h1\", \"Header 1\"),\n", - " (\"h2\", \"Header 2\"),\n", - "]\n", - "# Create an instance of the HTMLHeaderTextSplitter\n", - "html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", - "\n", - "# Split the HTML text using the HTMLHeaderTextSplitter.\n", - "html_header_splits = []\n", - "doc_index = 0\n", - "for doc in docs:\n", - " splits = html_splitter.split_text(doc.page_content)\n", - " for split in splits:\n", - " # Add the source URL and header values to the metadata\n", - " metadata = {}\n", - " new_text = split.page_content\n", - " for header_name, metadata_header_name in headers_to_split_on:\n", - " # Handle exception if h1 does not exist.\n", - " try:\n", - " header_value = 
new_text.split(\"¶ \")[0].strip()[:50]\n", - " metadata[header_name] = header_value\n", - " except:\n", - " break\n", - " # Handle exception if h2 does not exist.\n", - " try:\n", - " new_text = new_text.split(\"¶ \")[1].strip()[:50]\n", - " metadata[header_name] = new_text\n", - " except:\n", - " break\n", - " split.metadata = {\n", - " **metadata,\n", - " \"source\": doc.metadata[\"source\"],\n", - " 'doc_index': doc_index\n", - " }\n", - " html_header_splits.extend(splits)\n", - " doc_index += 1\n", - "\n", - "# Split the HTML chunks into parent chunks.\n", - "chunks = parent_splitter.split_documents(html_header_splits)\n", - "print(f\"docs: {len(docs)}, split into parent chunks: {len(chunks)}\")\n", - "\n", - "# Split HTML header chunks into smaller chunks for ANN search.\n", - "smaller_chunks = []\n", - "for chunk in chunks:\n", - " smaller_chunks.extend(child_splitter.split_documents([chunk]))\n", - "print(f\"parent_chunks: {len(chunks)}, split into smaller chunks: {len(smaller_chunks)}\")\n", - "print(f\"type: list of {type(smaller_chunks[0])}\") \n", - "\n", - "# Insert HTML headers into smaller chunks (extends their \"context\").\n", - "for chunk in smaller_chunks:\n", - " if chunk.page_content.startswith(chunk.metadata['h1'][:20]):\n", - " continue\n", - " metadata_str = ' '.join(str(v) for k, v in chunk.metadata.items() if k in ['h1', 'h2'])\n", - " chunk.page_content = f'{metadata_str} {chunk.page_content}'\n", - "\n", - "# Inspect a parent chunk.\n", - "print()\n", - "print(\"Looking at a sample chunk...\")\n", - "pprint.pprint(chunks[4].page_content[:200])\n", - "pprint.pprint(chunks[4].metadata)" + "# Inspect a sub_doc.\n", + "print(sub_docs[4].page_content[:200])\n", + "pprint.pprint(sub_docs[4].metadata)" ] }, { "cell_type": "code", "execution_count": 17, - "id": "bf206547", + "id": "4c8801a0", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Inspect chunk lengths\n", + "plot_chunk_lengths(sub_docs, 'Small-to-big')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "68fd9dcb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Looking at a sample chunk...\n", - "('Why Milvus Docs Tutorials Tools Blog Community Sta loaded collection, refer '\n", - " 'to Manage Collections. Collections created using the RESTful API are always '\n", - " 'automatically loaded. Insert Data Collections cr')\n", - "{'doc_index': 0,\n", - " 'h1': 'Why Milvus Docs Tutorials Tools Blog Community Sta',\n", - " 'source': 'https://milvus.io/docs/quickstart.md',\n", - " 'start_index': 0}\n" + "Indexed 228 parent documents in 0.00 seconds and \n", + "Indexed a total of 873 child documents in 116.82 seconds\n" ] } ], "source": [ - "# Function to remove newlines and double spaces from a string.\n", - "def clean_text(text):\n", - " clean_text = text.replace(\"\\n\\n\", \" \")\\\n", - " .replace(\"

\", \" \")\\\n", - " .replace(\"

\", \" \")\n", - " return clean_text\n", + "# Index the documents in the document store.\n", + "start_time = time.time()\n", + "retriever.docstore.mset(list(zip(doc_ids, parent_docs)))\n", + "print(f\"Indexed {len(parent_docs)} parent documents in {time.time()-start_time:.2f} seconds and \")\n", + "\n", + "# Index the documents in the vector store.\n", + "batch_size = 300\n", + "total_docs = len(sub_docs)\n", + "# Calculate the number of batches needed\n", + "batches = (total_docs + batch_size - 1) // batch_size\n", + "start_time = time.time()\n", + "for i in range(batches):\n", + " batch_start = i * batch_size\n", + " batch_end = min(batch_start + batch_size, total_docs)\n", + " batch = sub_docs[batch_start:batch_end]\n", + " retriever.vectorstore.add_documents(batch)\n", "\n", - "# Clean up the metadata urls and chunk texts.\n", - "for doc in smaller_chunks:\n", - " doc.metadata[\"source\"] = \\\n", - " doc.metadata[\"source\"]\\\n", - " .replace(\"../../RAG/rtdocs_new\", \"https://milvus.io/docs\")\\\n", - " .replace(\".html\", \".md\")\n", - " doc.page_content = clean_text(doc.page_content)\n", + "print(f\"Indexed a total of {total_docs} child documents in {time.time()-start_time:.2f} seconds\")\n", + "\n", + "# Indexed 507 documents in 56.15 seconds." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "88aa1894", + "metadata": {}, + "outputs": [], + "source": [ + "# Delete the Milvus collection and doc store.\n", + "# del vectorstore, retriever, store" + ] + }, + { + "cell_type": "markdown", + "id": "0c324937", + "metadata": {}, + "source": [ + "## Semantic Chunking using LangChain\n", + "\n", + "This chunker works by determining when to \"break\" apart sentences. This is done by calculating cosine distances between adjacent sentences. Looking across all these cosine distances, look for outlier distances past some threshold. 
These outlier distances determine when chunks are split.\n", + "\n", + "There are a few ways to determine what that threshold is, which are controlled by the breakpoint_threshold_type kwarg.\n", + "\n", + "- `percentile` (default) — In this method, any distance greater than the Xth percentile triggers a split.\n", "\n", - "# Inspect a child chunk.\n", - "print()\n", - "print(\"Looking at a sample chunk...\")\n", - "pprint.pprint(smaller_chunks[15].page_content[:200])\n", - "pprint.pprint(smaller_chunks[15].metadata)" + "- `standard_deviation` — In this method, any difference greater than X standard deviations is split.\n", + "\n", + "- `interquartile` — In this method, the interquartile distance is used to split chunks.\n", + "\n", + "- See Semantic Chunking [docs](https://python.langchain.com/v0.2/docs/how_to/semantic-chunker/#standard-deviation) and [api](https://api.python.langchain.com/en/latest/text_splitter/langchain_experimental.text_splitter.SemanticChunker.html)." ] }, { "cell_type": "code", - "execution_count": 18, - "id": "f2f7463b", + "execution_count": 20, + "id": "23149b10", "metadata": {}, "outputs": [], "source": [ - "# # Double-check if parent chunks are correctly indexed.\n", - "# temp_doc_index = smaller_chunks[15].metadata['doc_index']\n", - "# temp_start_index = smaller_chunks[15].metadata['start_index']\n", - "# temp_start_index = temp_start_index - 512\n", - "# temp_end_index = temp_start_index + 1536\n", - "\n", - "# temp = docs[temp_doc_index].page_content\n", - "# print(len(temp))\n", - "# pprint.pprint(temp[temp_start_index:temp_end_index].replace(\"\\n\\n\", \" \"))" + "# !python -m pip install langchain_experimental" ] }, { - "cell_type": "markdown", - "id": "e338e0a5", + "cell_type": "code", + "execution_count": 21, + "id": "574e5387", "metadata": {}, + "outputs": [], "source": [ - "### Transform chunks into vectors using the embedding model" + "# # UNCOMMENT TO READ THE DOCS FROM A LOCAL DIRECTORY.\n", + "\n", + "# # Read docs into 
LangChain\n", + "# # !pip install -U langchain\n", + "# # !pip install unstructured\n", + "# from langchain.document_loaders import DirectoryLoader\n", + "\n", + "# # Load HTML files from a local directory\n", + "# path = \"../../RAG/rtdocs_new/\"\n", + "# global_pattern = '*.html'\n", + "# loader = DirectoryLoader(path=path, glob=global_pattern)\n", + "# docs = loader.load()\n", + "\n", + "# num_documents = len(docs)\n", + "# print(f\"loaded {num_documents} documents\")\n", + "\n", + "# # # Subset docs for faster testing\n", + "# # docs = docs[5:7].copy()\n", + "# # num_documents = len(docs)\n", + "# # print(f\"testing with {num_documents} documents\")\n", + "\n", + "# # Print the type of the docs.\n", + "# print(type(docs))\n", + "# print(type(docs[0]))\n", + "\n", + "# docs[0].page_content[:500]" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "d223c6f1", + "execution_count": 22, + "id": "bf87ad6a", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Embedding time for 586 chunks: 59.12 seconds\n" - ] - } - ], + "outputs": [], "source": [ - "# STEP 5. 
TRANSFORM CHUNKS INTO VECTORS USING EMBEDDING MODEL INFERENCE.\n", - "\n", - "# Encoder input is docs as a list of strings.\n", - "list_of_strings = [doc.page_content for doc in smaller_chunks if hasattr(doc, 'page_content')]\n", - "\n", - "# Embedding inference using the Milvus built-in sparse-dense-reranking encoder.\n", - "start_time = time.time()\n", - "embeddings = torch.tensor(encoder.encode(list_of_strings))\n", - "end_time = time.time()\n", - "\n", - "print(f\"Embedding time for {len(list_of_strings)} chunks: \", end=\"\")\n", - "print(f\"{round(end_time - start_time, 2)} seconds\")\n", - "\n", - "# Inference Embeddings: 100%|██████████| 19/19 [00:35<00:00, 1.86s/it]\n", - "# Embedding time for 127 chunks: 57.92 seconds" + "# TODO - MOVE THIS TO A LOOP\n", + "# # Extract all document content into a single string.\n", + "# all_page_contents = \" \".join(doc.page_content for doc in docs)\n", + "# all_page_contents = clean_text(all_page_contents)\n", + "# # print(f\"len(all_page_contents): {len(all_page_contents)}\")\n", + "# # print(all_page_contents[:500])" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "babb0388", + "execution_count": 23, + "id": "093a58da", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "type embeddings: of \n", - "of numbers: \n" - ] - } - ], + "outputs": [], "source": [ - "# Normalize the embeddings.\n", - "embeddings = np.array(embeddings / np.linalg.norm(embeddings))\n", + "# from langchain_experimental.text_splitter import SemanticChunker\n", "\n", - "# Convert embeddings to list of `numpy.ndarray`, each containing `numpy.float32` numbers.\n", - "converted_values = list(map(np.float32, embeddings))\n", + "# text_splitter = SemanticChunker(embed_model)\n", + "# semantic_docs = text_splitter.create_documents([all_page_contents])\n", "\n", - "# Inspect the embeddings.\n", - "# assert len(chunks[0].page_content) <= MAX_SEQ_LENGTH_IN_TOKENS\n", - "assert 
len(converted_values[0]) == EMBEDDING_DIM\n", - "print(f\"type embeddings: {type(converted_values)} of {type(converted_values[0])}\")\n", - "print(f\"of numbers: {type(converted_values[0][0])}\")" + "# lengths = [len(doc.page_content) for doc in semantic_docs]\n", + "# print(f\"Created {len(semantic_docs)} semantic documents from {len(docs)}.\")\n", + "# print(\", \".join(str(len(length)) for length in lengths))\n", + "# pprint.pprint(semantic_docs[0].page_content)\n", + "# # Created 59 semantic documents from 22." ] }, { - "cell_type": "markdown", - "id": "d9bd8153", + "cell_type": "code", + "execution_count": 24, + "id": "3d70b441", "metadata": {}, + "outputs": [], "source": [ - "## Insert data into Milvus\n", + "# # Inspect chunk lengths\n", + "# lengths = [len(doc.page_content) for doc in html_docs]\n", + "# import matplotlib.pyplot as plt\n", "\n", - "For each original text chunk, we'll write the sextuplet (`chunk, h1, h2, source, dense_vector, sparse_vector`) into the database.\n", + "# # Mean, median lengths.\n", + "# mean_length = np.mean(lengths)\n", + "# median_length = np.median(lengths)\n", "\n", - "
\n", - "\n", - "
\n", + "# # Plotting the lengths.\n", + "# plt.figure(figsize=(10, 6)) # Adjust figure size\n", + "# plt.plot(lengths, marker='o') # Plot normalized lengths with circle markers\n", + "# plt.title('Chunk Lengths from Semantic Chunking', fontsize=20, fontweight='bold')\n", + "# plt.xlabel('Document Index') # X-axis label\n", + "# plt.ylabel('Normalized Length') # Y-axis label, now showing normalized length\n", + "# plt.grid(True) # Show grid\n", "\n", - "**The Milvus Client wrapper can only handle loading data from a list of dictionaries.**\n", + "# # Add a horizontal red line at mean length\n", + "# plt.axhline(y=mean_length, color='g', linestyle='-')\n", + "# plt.axhline(y=median_length, color='r', linestyle='-')\n", + "# plt.text(len(lengths)-1, mean_length, f'mean = {mean_length:.0f}', va='center', ha='left', backgroundcolor='w', fontsize=12)\n", + "# plt.text(0, median_length, f'median = {median_length:.0f}', va='center', ha='right', backgroundcolor='w', fontsize=12)\n", "\n", - "Otherwise, in general, Milvus supports loading data from:\n", - "- pandas dataframes \n", - "- list of dictionaries " + "# plt.show() # Display the plot" ] }, { "cell_type": "code", - "execution_count": 21, - "id": "79dd2299", + "execution_count": 25, + "id": "a76a7304", "metadata": {}, "outputs": [], "source": [ - "# STEP 6. 
INSERT CHUNK LIST INTO MILVUS OR ZILLIZ.\n", - "\n", - "# Create chunk_list and dict_list in a single loop\n", - "dict_list = []\n", - "for chunk, vector in zip(smaller_chunks, converted_values):\n", - " # Assemble embedding vector, original text chunk, metadata.\n", - " chunk_dict = {\n", - " 'chunk': chunk.page_content,\n", - " 'h1': chunk.metadata.get('h1', \"\")[:50],\n", - " 'h2': chunk.metadata.get('h2', \"\")[:50],\n", - " 'source': chunk.metadata.get('source', \"\"),\n", - " 'doc_index': chunk.metadata.get('doc_index', 0),\n", - " 'start_index': chunk.metadata.get('start_index', 0),\n", - " 'vector': vector,\n", - " }\n", - " dict_list.append(chunk_dict)\n", + "# # Use Percentile to determine breakpoints.\n", + "# text_splitter = SemanticChunker(\n", + "# embed_model, \n", + "# breakpoint_threshold_type=\"percentile\",\n", + "# breakpoint_threshold_amount=0.99\n", + "# )\n", + "\n", + "# semantic_docs = text_splitter.create_documents([all_page_contents])\n", + "# print(f\"Created {len(semantic_docs)} semantic documents from {len(docs)}.\")\n", "\n", - "# # TODO - Uncomment to inspect a chunk and its metadata.\n", - "# print(len(dict_list))\n", - "# print(type(dict_list[1]), len(dict_list[1]))\n", - "# pprint.pprint(dict_list[1])" + "# # Too many!\n", + "# # Created 1138 semantic documents from 22." 
] }, { "cell_type": "code", - "execution_count": 22, - "id": "f3ac0d5c", + "execution_count": 26, + "id": "88e376c7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Start inserting entities\n", - "Milvus insert time for 586 vectors: 0.23 seconds\n" - ] - } - ], + "outputs": [], "source": [ - "# Insert data into the Milvus collection.\n", - "print(\"Start inserting entities\")\n", + "# # Use Standard Deviation to determine breakpoints.\n", + "# text_splitter = SemanticChunker(\n", + "# embed_model, \n", + "# breakpoint_threshold_type=\"standard_deviation\",\n", + "# breakpoint_threshold_amount=0.80\n", + "# )\n", + "# semantic_docs = text_splitter.create_documents([all_page_contents])\n", "\n", - "start_time = time.time()\n", - "mc.insert(\n", - " COLLECTION_NAME,\n", - " data=dict_list,\n", - " progress_bar=True)\n", + "# print(f\"Created {len(semantic_docs)} semantic documents from {len(docs)}.\")\n", + "# print(\", \".join(str(len(doc.page_content)) for doc in semantic_docs))\n", + "# pprint.pprint(semantic_docs[0].page_content)\n", "\n", - "end_time = time.time()\n", - "print(f\"Milvus insert time for {len(dict_list)} vectors: \", end=\"\")\n", - "print(f\"{np.round(end_time - start_time, 2)} seconds\")" + "# # Created 195 semantic documents from 22." ] }, { - "cell_type": "markdown", - "id": "277e7ad7", + "cell_type": "code", + "execution_count": 27, + "id": "9fec633e", "metadata": {}, + "outputs": [], "source": [ - "## Aside - example Milvus collection API calls\n", - "https://milvus.io/docs/manage-collections.md#View-Collections\n", - "\n", - "Below are some common API calls for checking a collection.\n", - "- `.num_entities`, flushes data and executes row count.\n", - "- `.describe_collection()`, gives details about the schema, index, collection.\n", - "- `.query()`, gives back selected data from the collection." 
+ "# # Create vectorstore for vector index and retrieval.\n", + "# COLLECTION_NAME = \"MilvusDocs\"\n", + "# start_time = time.time()\n", + "# vectorstore = Milvus.from_documents(\n", + "# collection_name=COLLECTION_NAME,\n", + "# documents=semantic_docs,\n", + "# embedding=embed_model,\n", + "# connection_args={\"uri\": \"./milvus_demo.db\"},\n", + "# # Override LangChain default values for Milvus.\n", + "# consistency_level=\"Eventually\",\n", + "# # auto_id=True,\n", + "# drop_old=True,\n", + "# index_params = {\n", + "# \"metric_type\": \"COSINE\",\n", + "# \"index_type\": \"AUTOINDEX\",\n", + "# \"params\": {},}\n", + "# )\n", + "# end_time = time.time()\n", + "# print(f\"Created Milvus collection from {len(semantic_docs)} docs in {end_time - start_time:.2f} seconds\")" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "c211c099", + "execution_count": 28, + "id": "9d1ec60b", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'aliases': [],\n", - " 'auto_id': True,\n", - " 'collection_id': 0,\n", - " 'collection_name': 'MilvusDocs',\n", - " 'consistency_level': 0,\n", - " 'description': '',\n", - " 'enable_dynamic_field': True,\n", - " 'fields': [{'auto_id': True,\n", - " 'description': '',\n", - " 'field_id': 100,\n", - " 'is_primary': True,\n", - " 'name': 'id',\n", - " 'params': {},\n", - " 'type': },\n", - " {'description': '',\n", - " 'field_id': 101,\n", - " 'name': 'vector',\n", - " 'params': {'dim': 1024},\n", - " 'type': }],\n", - " 'num_partitions': 0,\n", - " 'num_shards': 0,\n", - " 'properties': {}}\n", - "timing: 0.0014 seconds\n", - "\n", - "data: [\"{'count(*)': 586}\"] , extra_info: {'cost': 0}\n", - "timing: 0.0023 seconds\n" - ] - } - ], + "outputs": [], "source": [ - "# Example Milvus Collection utility API calls.\n", - "# https://milvus.io/docs/manage-collections.md#View-Collections\n", - "\n", - "# View collection info, incurs a call to .flush() first.\n", - "start_time = 
time.time()\n", - "pprint.pprint(mc.describe_collection(COLLECTION_NAME))\n", - "end_time = time.time()\n", - "print(f\"timing: {round(end_time - start_time, 4)} seconds\")\n", - "print()\n", - "\n", - "# Milvus Lite - notice a delay, so wait 30 seconds.\n", - "time.sleep(15)\n", - "\n", - "# Count rows, incurs a call to .flush() first.\n", - "start_time = time.time()\n", - "res = mc.query( collection_name=COLLECTION_NAME, \n", - " filter=\"\", \n", - " output_fields = [\"count(*)\"], )\n", - "pprint.pprint(res)\n", - "end_time = time.time()\n", - "print(f\"timing: {round(end_time - start_time, 4)} seconds\")" + "# Delete the Milvus collection and vectorstore.\n", + "# del vectorstore" ] }, { @@ -1122,7 +1091,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 29, "id": "5e7f41f4", "metadata": {}, "outputs": [ @@ -1151,14 +1120,14 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 68, "id": "cd25ffca", "metadata": {}, "outputs": [], "source": [ "# SELECT A PARTICULAR QUESTION TO ASK.\n", "\n", - "SAMPLE_QUESTION = QUESTION1" + "SAMPLE_QUESTION = QUESTION4" ] }, { @@ -1177,278 +1146,300 @@ ] }, { - "cell_type": "markdown", - "id": "532c2758", + "cell_type": "code", + "execution_count": 31, + "id": "2b45afb5", "metadata": {}, + "outputs": [], "source": [ - "### Exercise #3 (2 min):\n", - "Search Milvus using the default search index." 
+ "# # Retrieve semantic chunks.\n", + "# TOP_K = 2\n", + "# semantic_retriever = vectorstore.as_retriever(search_kwargs={\"k\" : TOP_K})\n", + "# semantic_results = semantic_retriever.invoke(SAMPLE_QUESTION)\n", + "\n", + "# print(len(semantic_results))\n", + "\n", + "# # Print the retrieved chunk and metadata.\n", + "# contexts = []\n", + "# for semantic_result in semantic_results:\n", + "# context = semantic_result.page_content\n", + "# print(len(context))\n", + "# pprint.pprint(f\"chunk: {context}\")\n", + "# pprint.pprint(f\"metadata: {semantic_result.metadata}\")\n", + "# context = semantic_result.page_content\n", + "# contexts.append(context)" ] }, { "cell_type": "code", - "execution_count": 26, - "id": "c457f8c6", + "execution_count": 69, + "id": "edf82971", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "512\n", + "chunk: during searches or queries. Milvus provides several index types and metrics to sort field values for efficient similarity searches. The following table lists the supported index types and metrics for different vector field types. For details, refer to In-memory Index and Similarity Metrics. Floating point embeddings Binary embeddings Sparse embeddings Metric Types Index Types Euclidean distance (L2)Inner product (IP)Cosine similarity (COSINE) FLATIVF_FLATIVF_SQ8IVF_PQGPU_IVF_FLATGPU_IVF_PQHNSWDISKANN Metric\n", + "(\"metadata: {'h1': 'Index Vector Fields', 'h2': 'This guide walks you thro', \"\n", + " \"'h3': 'Overview', 'source': 'https://milvus.io/docs/index-vector-fields.md', \"\n", + " \"'doc_index': 'c393a40d-864a-4cc4-9da3-4c53cf2d6602', 'doc_id': \"\n", + " \"'6c120544-d259-4fb1-b6f2-0481de55c492', 'pk': 450852554287612165}\")\n", + "498\n", + "chunk: field. index_params.add_index( field_name=\"vector\", metric_type=\"COSINE\", index_type=\"IVF_FLAT\", index_name=\"vector_index\", params={ \"nlist\": 128 } ) # 4.3. 
Create an index file client.create_index( collection_name=\"customized_setup\", index_params=index_params ) import io.milvus.v2.common.IndexParam; import io.milvus.v2.service.index.request.CreateIndexReq; // 4 Prepare index parameters // 4.2 Add an index for the vector field \"vector\" IndexParam indexParamForVectorField = IndexParam.builder()\n", + "(\"metadata: {'h1': 'Index Vector Fields', 'h2': 'This guide walks you thro', \"\n", + " \"'h3': 'Overview', 'source': 'https://milvus.io/docs/index-vector-fields.md', \"\n", + " \"'doc_index': 'c393a40d-864a-4cc4-9da3-4c53cf2d6602', 'doc_id': \"\n", + " \"'2ba6f33a-058b-46d8-8139-b41b1bef3f45', 'pk': 450852554287612149}\")\n", + "['https://milvus.io/docs/index-vector-fields.md']\n", + "2\n" + ] + } + ], "source": [ - "# query_embeddings = _utils.embed_query(encoder, [SAMPLE_QUESTION])\n", - "# TOP_K = 2\n", + "# The vector store alone will retrieve small chunks:\n", + "TOP_K = 2\n", + "child_results = retriever.vectorstore.similarity_search(\n", + " SAMPLE_QUESTION,\n", + " k=TOP_K)\n", "\n", - "# results = mc.search(\n", - "# #(exercise): code here # Answer: COLLECTION_NAME,\n", - "# data=query_embeddings,\n", - "# limit=TOP_K,\n", - "# consistency_level=\"Eventually\"\n", - "# )\n", - "# print(f\"Found top {len(results[0])} results for question: {SAMPLE_QUESTION}\")" + "# Print the retrieved chunk and metadata.\n", + "# Append each unique context and source to a list.\n", + "contexts = []\n", + "sources = []\n", + "for child_result in child_results:\n", + " context = child_result.page_content\n", + " print(len(context))\n", + " print(f\"chunk: {context}\")\n", + " pprint.pprint(f\"metadata: {child_result.metadata}\")\n", + " context = child_result.page_content\n", + " contexts.append(context)\n", + " source = child_result.metadata['source']\n", + " if source not in sources:\n", + " sources.append(source)\n", + "\n", + "print(sources)\n", + "print(len(contexts))" ] }, { "cell_type": "code", - "execution_count": 27, - 
"id": "1f0cc47f", + "execution_count": 70, + "id": "809ff495", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "output fields: ['chunk', 'h1', 'h2', 'source', 'doc_index', 'start_index']\n", - "Found top 2 results for question: What do the parameters for HNSW mean?\n" + "Num parent results: 4\n", + "1571\n", + "chunk: during searches or queries. Milvus provides several index types and metrics to sort field values for efficient similarity searches. The following table lists the supported index types and metrics for different vector field types. For details, refer to In-memory Index and Similarity Metrics. Floating point embeddings Binary embeddings Sparse embeddings Metric Types Index Types Euclidean distance (L2)Inner product (IP)Cosine similarity (COSINE) FLATIVF_FLATIVF_SQ8IVF_PQGPU_IVF_FLATGPU_IVF_PQHNSWDISKANN Metric Types Index Types Jaccard (JACCARD)Hamming (HAMMING) BIN_FLATBIN_IVF_FLAT Metric Types Index Types IP SPARSE_INVERTED_INDEXSPARSE_WAND It is recommended to create indexes for both the vector field and scalar fields that are frequently accessed. As explained in Manage Collections, Milvus automatically generates an index and loads it into memory when creating a collection if any of the following conditions are specified in the collection creation request: The dimensionality of the vector field and the metric type, or The schema and the index parameters. The code snippet below repurposes the existing code to establish a connection to a Milvus instance and create a collection without specifying its index parameters. In this case, the collection lacks an index and remains unloaded. To prepare for indexing, use MilvusClient to connect to the Milvus server and set up a collection by using create_schema(), add_field(), and create_collection(). 
To prepare for indexing, use MilvusClientV2 to connect to the Milvus server and set up a collection by using\n", + "(\"metadata: {'h1': 'Index Vector Fields', 'h2': 'This guide walks you thro', \"\n", + " \"'h3': 'Overview', 'source': 'https://milvus.io/docs/index-vector-fields.md', \"\n", + " \"'doc_index': 'c393a40d-864a-4cc4-9da3-4c53cf2d6602'}\")\n", + "1584\n", + "chunk: field. index_params.add_index( field_name=\"vector\", metric_type=\"COSINE\", index_type=\"IVF_FLAT\", index_name=\"vector_index\", params={ \"nlist\": 128 } ) # 4.3. Create an index file client.create_index( collection_name=\"customized_setup\", index_params=index_params ) import io.milvus.v2.common.IndexParam; import io.milvus.v2.service.index.request.CreateIndexReq; // 4 Prepare index parameters // 4.2 Add an index for the vector field \"vector\" IndexParam indexParamForVectorField = IndexParam.builder() .fieldName(\"vector\") .indexName(\"vector_index\") .indexType(IndexParam.IndexType.IVF_FLAT) .metricType(IndexParam.MetricType.COSINE) .extraParams(Map.of(\"nlist\", 128)) .build(); List indexParams = new ArrayList<>(); indexParams.add(indexParamForVectorField); // 4.3 Crate an index file CreateIndexReq createIndexReq = CreateIndexReq.builder() .collectionName(\"customized_setup\") .indexParams(indexParams) .build(); client.createIndex(createIndexReq); // 4. Set up index for the collection // 4.1. Set up the index parameters res = await client.createIndex({ collection_name: \"customized_setup\", field_name: \"vector\", index_type: \"AUTOINDEX\", metric_type: \"COSINE\", index_name: \"vector_index\", params: { \"nlist\": 128 } }) console.log(res.error_code) // Output // // Success // Parameter Description field_name The name of the target file to apply this object applies. metric_type The algorithm that is used to measure similarity between vectors. Possible values are IP, L2, COSINE, JACCARD, HAMMING. This is available only when the specified field is a vector field. 
For more\n", + "(\"metadata: {'h1': 'Index Vector Fields', 'h2': 'This guide walks you thro', \"\n", + " \"'h3': 'Overview', 'source': 'https://milvus.io/docs/index-vector-fields.md', \"\n", + " \"'doc_index': 'c393a40d-864a-4cc4-9da3-4c53cf2d6602'}\")\n", + "1582\n", + "chunk: DocsBlogCommunity Why Milvus What is MilvusUse Cases Tutorials CodelabsBootcampDemosVideo Tools AttuMilvus CLISizing ToolMilvus Backup Get Started Docs Tutorials Tools Blog Community Get Started Search Home ​ v2.4.x About Milvus Get Started Concepts Architecture Bitset Consistency Multi-tenancy Timestamp Similarity Metrics Time Synchronization Vector Index In-memory Index On-disk Index GPU Index Scalar Index Reranking Users and Roles In-memory Replica Terminology User Guide Models Milvus Migration Administration Guide Tools Integrations Tutorials FAQs API Reference In-memory Index ANNS vector indexes Indexes supported in Milvus FAQ What's next This topic lists various types of in-memory indexes Milvus supports, scenarios each of them best suits, and parameters users can configure to achieve better search performance. For on-disk indexes, see On-disk Index. Indexing is the process of efficiently organizing data, and it plays a major role in making similarity search useful by dramatically accelerating time-consuming queries on large datasets. To improve query performance, you can specify an index type for each vector field. Currently, a vector field only supports one index type. Milvus automatically deletes the old index when switching the index type. Most of the vector index types supported by Milvus use approximate nearest neighbors search (ANNS) algorithms. 
Compared with accurate retrieval, which is usually very time-consuming, the core idea of ANNS is no longer limited to returning the most accurate result, but only searching for neighbors of the target.\n", + "(\"metadata: {'h1': 'In-memory Index', 'h2': 'This topic lists various ', 'h3': \"\n", + " \"'ANNS vector indexes', 'source': 'https://milvus.io/docs/index.md', \"\n", + " \"'doc_index': '709e587e-ba1f-4e39-900c-433bcfc02990'}\")\n", + "1585\n", + "chunk: DocsBlogCommunity Why Milvus What is MilvusUse Cases Tutorials CodelabsBootcampDemosVideo Tools AttuMilvus CLISizing ToolMilvus Backup Get Started Docs Tutorials Tools Blog Community Get Started Search Home ​ v2.4.x About Milvus Get Started Concepts User Guide Models Milvus Migration Administration Guide Tools Integrations Tutorials FAQs API Reference Blog Get Started Recommended articles Welcome to Milvus Docs! Here you will learn about what Milvus is, and how to install, use, and deploy Milvus to build an application according to your business need. Try Managed Milvus For Free! Try Zilliz Cloud for free! The easiest way to experience Milvus! Zilliz Cloud Zilliz Cloud China Install Milvus Learn how to install Milvus using either Docker Compose or on Kubernetes. Quick Start Learn how to quickly run Milvus with sample code. Bootcamp Learn how to build vector similarity search applications with Milvus. Use Manage Collections Insert, Upsert, and Delete Index Vector Fields Single-Vector Search Get & Scalar Query Deploy Configure Milvus Manage Dependencies Deploy on Clouds Scale a Milvus Cluster Monitor and Alert Learn System Configuration Architecture Overview Vector Index Similarity Metrics Glossary What's new in docs Mar 2024 - Milvus 2.4.0 release Added guidance on how to conduct hybrid search. Added description of GPU index. Added guidance on how to embed your data using PyMilvus. Added guidance on how to enable RBAC with Milvus Operator. Added descriptions of Milvus CDC. 
Introducing PyMilvus Integration with Embedding Models Engineering Made with Love by the\n", + "(\"metadata: {'h1': 'Welcome to Milvus Docs!', 'h2': 'Here you will learn \"\n", + " \"about', 'h3': '', 'source': 'https://milvus.io/docs/', 'doc_index': \"\n", + " \"'5a75d188-5760-42ed-908a-b9c35c6998a2'}\")\n" ] } ], "source": [ - "# Define metadata fields you can filter on.\n", - "OUTPUT_FIELDS = list(dict_list[0].keys())\n", - "OUTPUT_FIELDS.remove('vector')\n", - "print(f\"output fields: {OUTPUT_FIELDS}\")\n", + "# Whereas the retriever will return the larger parent document:\n", + "parent_results = retriever.invoke(\n", + " SAMPLE_QUESTION) #[0].page_content\n", "\n", - "query_embeddings = _utils.embed_query(encoder, [SAMPLE_QUESTION])\n", - "TOP_K = 2\n", + "# Print the retrieved chunk and metadata.\n", + "print(f\"Num parent results: {len(parent_results)}\")\n", + "for parent_result in parent_results:\n", + " print(len(parent_result.page_content))\n", + " print(f\"chunk: {parent_result.page_content}\")\n", + " pprint.pprint(f\"metadata: {parent_result.metadata}\")\n", "\n", - "results = mc.search(\n", - " COLLECTION_NAME,\n", - " data=query_embeddings, \n", - " # search_params=SEARCH_PARAMS,\n", - " output_fields=OUTPUT_FIELDS, \n", - " # Milvus can utilize metadata in boolean expressions to filter search.\n", - " # filter=filter_expression,\n", - " limit=TOP_K,\n", - " consistency_level=\"Eventually\"\n", - ")\n", - "print(f\"Found top {len(results[0])} results for question: {SAMPLE_QUESTION}\")\n", - "\n", - "# Define a convenience function for searching.\n", - "def mc_run_search(question, filter_expression, top_k=3):\n", - " # Embed the question using the same encoder.\n", - " query_embeddings = _utils.embed_query(encoder, [question])\n", - "\n", - " # # Return top k results with HNSW index.\n", - " # SEARCH_PARAMS = dict({\n", - " # # Re-use index param for num_candidate_nearest_neighbors.\n", - " # \"ef\": INDEX_PARAMS['efConstruction']\n", - " # })\n", - 
"\n", - " # Run semantic vector search using your query and the vector database.\n", - " results = mc.search(\n", - " COLLECTION_NAME,\n", - " data=query_embeddings, \n", - " # search_params=SEARCH_PARAMS,\n", - " output_fields=OUTPUT_FIELDS, \n", - " # Milvus can utilize metadata in boolean expressions to filter search.\n", - " filter=filter_expression,\n", - " limit=top_k,\n", - " consistency_level=\"Eventually\"\n", - " )\n", + "# Only the 0th parent chunk seems relevant.\n", + "assert parent_results[0].metadata['source'] == sources[0]" + ] + }, + { + "cell_type": "markdown", + "id": "20cf59f0", + "metadata": {}, + "source": [ + "## Summarize the context using a LLM\n", "\n", - " # Assemble retrieved context and context metadata.\n", - " # The search result is in the variable `results[0]`, which is type \n", - " # 'pymilvus.orm.search.SearchResult'. \n", - " METADATA_FIELDS = [f for f in OUTPUT_FIELDS if f != 'chunk']\n", - " formatted_results, context, context_metadata = _utils.client_assemble_retrieved_context(\n", - " results, metadata_fields=METADATA_FIELDS, num_shot_answers=top_k)\n", - " \n", - " return formatted_results, context, context_metadata" + "Using a small, open source LLM should be good enough for the summarization task." ] }, { "cell_type": "code", - "execution_count": 28, - "id": "40734f49", + "execution_count": 71, + "id": "377d92fc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "filter: \n", - "Milvus Client search time for 586 vectors: 0.12221956253051758 seconds\n", - "type: , count: 2\n" + "Length long text to summarize: 1011\n" ] } ], "source": [ - "# STEP 7. 
RETRIEVE ANSWERS FROM YOUR DOCUMENTS STORED IN MILVUS OR ZILLIZ.\n", - "\n", - "# Metadata filters for CSV dataset.\n", - "# expression = 'film_year >= 2019'\n", - "expression = \"\"\n", - "print(f\"filter: {expression}\")\n", - "TOP_K = 2\n", + "# Join all the contexts into one string, separated by spaces.\n", + "contexts_combined = ' '.join(contexts)\n", "\n", - "start_time = time.time()\n", - "formatted_results, contexts, context_metadata = \\\n", - " mc_run_search(SAMPLE_QUESTION, expression, TOP_K)\n", - "elapsed_time = time.time() - start_time\n", - "print(f\"Milvus Client search time for {len(dict_list)} vectors: {elapsed_time} seconds\")\n", + "# Alternatively, use the parent (bigger) chunks.\n", + "# contexts_combined = \" \".join([result.page_content \n", + "# for result in parent_results[:TOP_K]])\n", + "# contexts_combined = parent_results[0].page_content\n", "\n", - "# Inspect search result.\n", - "print(f\"type: {type(formatted_results)}, count: {len(formatted_results)}\")" + "print(f\"Length long text to summarize: {len(contexts_combined)}\")" ] }, { - "cell_type": "markdown", - "id": "aa3cade1", + "cell_type": "code", + "execution_count": 72, + "id": "cfd45c26", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Length prompt: 1196\n" + ] + } + ], "source": [ - "## Assemble and inspect the search result\n", + "# Define temperature for the LLM and random seed.\n", + "TEMPERATURE = 0.1\n", + "TOP_P = 0.9\n", + "RANDOM_SEED = 415\n", + "MAX_TOKENS = 512\n", + "FREQUENCY_PENALTY = 2\n", "\n", - "The search result is in the variable `results[0]` consisting of top_k-count of objects of type `'pymilvus.client.abstract.Hits'`\n", - "\n" + "# Define a system prompt to use for summarization.\n", + "SYSTEM_PROMPT = f\"\"\"First select the text that is relevant to the \n", + "user's question.
Second, summarize the selected text so it is clear,\n", + "easy to understand, and concise, in fewer than 10 sentences.\n", + "Text: {contexts_combined}\n", + "\"\"\"\n", + "print(f\"Length prompt: {len(SYSTEM_PROMPT)}\")\n" ] }, { "cell_type": "code", - "execution_count": 29, - "id": "a7a2a4dd", + "execution_count": 73, + "id": "c66618c8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Retrieved result #1\n", - "distance = 0.7174309492111206\n", - "('Chunk text: Why Milvus Docs Tutorials Tools Blog Community Sta is a '\n", - " 'range-search parameter and terminates the search process whilst the number '\n", - " 'of consecutive empty buckets reaches the specified value.Increasing this '\n", - " 'value can improve recall rate at the cost of increased search time. [1, '\n", - " '65535] 2 HNSW HNSW (Hierarchical Navigable Small World Graph) is a '\n", - " 'graph-based indexing algorithm. It builds a multi-layer navigation structure '\n", - " 'for an image according to certain rules. In this structure, the upper layers '\n", - " 'are more sparse and the distances between nodes are farther; the')\n", - "h1: Why Milvus Docs Tutorials Tools Blog Community Sta\n", - "h2: \n", - "source: https://milvus.io/docs/index.md\n", - "doc_index: 3\n", - "start_index: 625\n", - "\n", - "Retrieved result #2\n", - "distance = 0.7135186195373535\n", - "('Chunk text: Why Milvus Docs Tutorials Tools Blog Community Sta begin another '\n", - " 'search. After multiple iterations, it can quickly approach the target '\n", - " 'position. In order to improve performance, HNSW limits the maximum degree of '\n", - " 'nodes on each layer of the graph to M. In addition, you can use '\n", - " 'efConstruction (when building index) or ef (when searching targets) to '\n", - " 'specify a search range. Index building parameters Parameter Description '\n", - " 'Range M M defines tha maximum number of outgoing connections in the graph. 
'\n", - " 'Higher M leads to higher accuracy/run_time at fixed')\n", - "h1: Why Milvus Docs Tutorials Tools Blog Community Sta\n", - "h2: \n", - "source: https://milvus.io/docs/index.md\n", - "doc_index: 3\n", - "start_index: 0\n", + "First select the text that is relevant to the \n", + "user's question. Second, summarize the selected text so it is clear,\n", + "easy to understand, and concise, in fewer than 10 sentences.\n", + "Text: during searches or queries. Milvus provides several index types and metrics to sort field values for efficient similarity searches. The following table lists the supported index types and metrics for different vector field types. For details, refer to In-memory Index and Similarity Metrics. Floating point embeddings Binary embeddings Sparse embeddings Metric Types Index Types Euclidean distance (L2)Inner product (IP)Cosine similarity (COSINE) FLATIVF_FLATIVF_SQ8IVF_PQGPU_IVF_FLATGPU_IVF_PQHNSWDISKANN Metric field. index_params.add_index( field_name=\"vector\", metric_type=\"COSINE\", index_type=\"IVF_FLAT\", index_name=\"vector_index\", params={ \"nlist\": 128 } ) # 4.3. 
Create an index file client.create_index( collection_name=\"customized_setup\", index_params=index_params ) import io.milvus.v2.common.IndexParam; import io.milvus.v2.service.index.request.CreateIndexReq; // 4 Prepare index parameters // 4.2 Add an index for the vector field \"vector\" IndexParam indexParamForVectorField = IndexParam.builder()\n", "\n" ] } ], "source": [ - "# Loop through search results, print metadata.\n", - "sources = []\n", - "for i in range(len(contexts)):\n", - " print(f\"Retrieved result #{i+1}\")\n", - " print(f\"distance = {formatted_results[i][0]}\")\n", - " pprint.pprint(f\"Chunk text: {contexts[i]}\")\n", - " for key, value in context_metadata[i].items():\n", - " if key == \"source\":\n", - " sources.append(value)\n", - " print(f\"{key}: {value}\")\n", - " print()" + "print(SYSTEM_PROMPT)" ] }, { "cell_type": "code", - "execution_count": 30, - "id": "b04a0485", + "execution_count": 74, + "id": "3eee4d7b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "match_text: 5] 2 HNSW HNSW (Hierarchical Navigable S\n", - "Duplicate parent chunk text found.\n" + "MODEL:llama3:latest, FORMAT:gguf, PARAMETER_SIZE:8B, QUANTIZATION_LEVEL:Q4_0, \n", + "\n" ] } ], "source": [ - "unique_tuples = []\n", - "parent_chunks = []\n", - "\n", - "# Loop through the search results and keep only unique parent chunks.\n", - "i = 0\n", - "for context, item in zip(contexts, context_metadata):\n", - " # Extract doc_index and start_index from each item.\n", - " doc_index = item['doc_index']\n", - " start_index = item['start_index']\n", - " \n", - " # Create a tuple of (doc_index, start_index).\n", - " current_tuple = (doc_index, start_index)\n", - " \n", - " # Initialize current tuple is unique.\n", - " is_unique = True\n", - " \n", - " # Check if the start_index is within 2000 of any start_index in unique_tuples.\n", - " for unique_tuple in unique_tuples:\n", - " if unique_tuple[0] == current_tuple[0] \\\n", - " and 
abs(unique_tuple[1]-current_tuple[1])<=MAX_SEQ_LENGTH:\n", - " is_unique = False\n", - " print(\"Duplicate parent chunk text found.\")\n", - " break\n", - " \n", - " # Process unique tuples.\n", - " if is_unique:\n", - " # Append it to the list of unique tuples\n", - " unique_tuples.append(current_tuple)\n", - "\n", - " # Get and clean parent chunk text.\n", - " match_text = context\n", - " temp_index = len(match_text) // 2\n", - " match_text = match_text[temp_index:temp_index+40]\n", - " print(f\"match_text: {match_text}\")\n", - " match_text = \"2 HNSW HNSW (Hierarchical Navigable\"\n", - "\n", - " parent_text = docs[current_tuple[0]].page_content\n", - " parent_text = clean_text(parent_text)\n", - " temp_index = parent_text.find(match_text)\n", + "# !python -m pip install ollama\n", + "import ollama\n", "\n", - " if temp_index != -1:\n", - " start_index = max(0, temp_index-200)\n", - " end_index = min(len(parent_text), temp_index+MAX_SEQ_LENGTH-236)\n", - " parent_chunk_text = parent_text[start_index:end_index]\n", - " parent_chunks.append(parent_chunk_text)\n", - " else:\n", - " print(\"Text not found.\")\n", + "# Verify the details of the model you are running.\n", + "ollama_llama3 = ollama.list()['models'][0]\n", "\n", - " # # TODO: comment out debugging check if parents contain retrieved chunks.\n", - " # print(f\"Unique tuple: {current_tuple}\")\n", - " # print(\"Parent Chunk text: \")\n", - " # pprint.pprint(parent_chunks[i])\n", - " # print()\n", + "# Print the model details.\n", + "keys = ['format', 'parameter_size', 'quantization_level']\n", + "print(f\"MODEL:{ollama.list()['models'][0]['name']}\", end=\", \")\n", + "for key in keys:\n", + " print(f\"{str.upper(key)}:{ollama.list()['models'][0]['details'].get(key, 'Key not found in dictionary')}\", end=\", \")\n", + "print(end=\"\\n\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "574da045", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", +
"text": [ + "Answer: According to the provided text, there isn't a specific mention of what the default AUTOINDEX or vector field distance metric are in Milvus.\n", + "\n", + "However, based on other documentation available for Milvus (https://milvus.io/docs/), it appears that:\n", + "\n", + "* The default index type is IVF_PQ.\n", + "* The default distance metric used by this index is Euclidean Distance (L2).\n", + "\n", + "Please note that these defaults might be subject to change or may have changed since the provided text was written. For more accurate and up-to-date information, I recommend consulting Milvus' official documentation or contacting their support team directly. \n", + "\n", + "\n", + "ollama_llama3_time: 14.61 seconds\n" + ] + } + ], + "source": [ + "# Send the question to llama 3 chat.\n", + "start_time = time.time()\n", + "response = ollama.chat(\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": SYSTEM_PROMPT,},\n", + " \n", + " {\"role\": \"user\", \"content\": f\"question: {SAMPLE_QUESTION}\",}\n", + " ],\n", + " model='llama3',\n", + " stream=False,\n", + " options={\"temperature\": TEMPERATURE, \"seed\": RANDOM_SEED,\n", + " \"top_p\": TOP_P, \n", + " # \"max_tokens\": MAX_TOKENS,\n", + " \"frequency_penalty\": FREQUENCY_PENALTY}\n", + ")\n", + "ollama_llama3_time = time.time() - start_time\n", "\n", - " i += 1" + "# Print all answers in the response.\n", + "semantic_summary = \"\"\n", + "semantic_summary += response['message']['content'] + \" \"\n", + "print(f\"Answer: {semantic_summary}\")\n", + "print(\"\\n\")\n", + "print(f\"ollama_llama3_time: {format(ollama_llama3_time, '.2f')} seconds\")" ] }, { @@ -1468,29 +1459,27 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 38, "id": "eb4c323f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length long text to summarize: 1500\n" - ] - } - ], + "outputs": [], "source": [ "# STEP 8. 
LLM-GENERATED ANSWER TO THE QUESTION, GROUNDED BY RETRIEVED CONTEXT.\n", "\n", + "# Join all the sources together, separated by a space.\n", + "source_combined = []\n", + "source_combined = ' '.join(reversed(sources))\n", + "\n", "# Separate all the context together by space.\n", "# Lance Martin, LangChain, says put best contexts at end.\n", - "# contexts_combined = ' '.join(reversed(contexts))\n", - "contexts_combined = ' '.join(reversed(parent_chunks))\n", + "contexts_combined = ' '.join(reversed(contexts))\n", "\n", - "# Separate all the sources together by comma.\n", - "source_combined = ' '.join(reversed(sources))\n", - "print(f\"Length long text to summarize: {len(contexts_combined)}\")\n", + "# # Alternatively use the parent (bigger) chunk.\n", + "# contexts_combined = parent_results[0].page_content\n", + "\n", + "# # # Alternatively use the summary.\n", + "# contexts_combined = semantic_summary\n", + "# print(f\"Length long text to summarize: {len(contexts_combined)}\")\n", "\n", "# Define temperature for the LLM and random seed.\n", "TEMPERATURE = 0.1\n", @@ -1502,7 +1491,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 39, "id": "5fc90b64", "metadata": {}, "outputs": [ @@ -1510,7 +1499,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Length prompt: 1948\n" + "Length prompt: 1465\n" ] } ], @@ -1519,7 +1508,8 @@ "the user's question. Second, only if the context is strongly relevant, \n", "answer the question using the context. Otherwise, if the context is not \n", "strongly relevant, answer the question without using the context. \n", - "Be clear, concise, relevant.
Answer clearly, easy to understand, \n", + "fewer than 2 sentences, and cite unique sources.\n", "Grounding sources: {source_combined}\n", "Context: {contexts_combined}\n", "\"\"\"\n", @@ -1528,13 +1518,42 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 40, "id": "6c31718c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('First, check if the Context below is relevant to \\n'\n", + " \"the user's question. Second, only if the context is strongly relevant, \\n\"\n", + " 'answer the question using the context. Otherwise, if the context is not \\n'\n", + " 'strongly relevant, answer the question without using the context. \\n'\n", + " 'Be clear, concise, relevant. Answer clearly, easy to understand, \\n'\n", + " 'fewer than 2 sentences, and cite unique sources.\\n'\n", + " 'Grounding sources: https://milvus.io/docs/index.md\\n'\n", + " 'Context: at the cost of increased search time.[1, 65535]2 Common search '\n", + " 'Range search HNSW (Hierarchical Navigable Small World Graph) is a '\n", + " 'graph-based indexing algorithm. It builds a multi-layer navigation structure '\n", + " 'for an image according to certain rules. In this structure, the upper layers '\n", + " 'are more sparse and the distances between nodes are farther; the lower '\n", + " 'layers are denser and the distances between nodes are closer. The search '\n", + " 'starts from the uppermost layer, finds the node closest to the target in '\n", + " 'this layer, the node closest to the target in this layer, and then enters '\n", + " 'the next layer to begin another search. After multiple iterations, it can '\n", + " 'quickly approach the target position. In order to improve performance, HNSW '\n", + " 'limits the maximum degree of nodes on each layer of the graph to M. In '\n", + " 'addition, you can use efConstruction (when building index) or ef (when '\n", + " 'searching targets) to specify a search range. 
ParameterDescriptionRange MM '\n", + " 'defines tha maximum number of outgoing connections in the graph. Higher M '\n", + " 'leads to\\n')\n" + ] + } + ], "source": [ - "# # Inspect the prompt.\n", - "# pprint.pprint(SYSTEM_PROMPT)" + "# Inspect the prompt.\n", + "pprint.pprint(SYSTEM_PROMPT)" ] }, { @@ -1571,7 +1590,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 41, "id": "0edc67e3", "metadata": {}, "outputs": [ @@ -1591,6 +1610,7 @@ "# Verify details which model you are running.\n", "ollama_llama3 = ollama.list()['models'][0]\n", "\n", + "\n", "# Print the model details.\n", "keys = ['format', 'parameter_size', 'quantization_level']\n", "print(f\"MODEL:{ollama.list()['models'][0]['name']}\", end=\", \")\n", @@ -1601,7 +1621,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 42, "id": "76042c9a", "metadata": {}, "outputs": [ @@ -1609,19 +1629,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "(\"According to the context and documentation [1], here's what the parameters \"\n", - " 'for Hierarchical Navigable Small World Graph (HNSW) mean: **M**: Maximum '\n", - " 'number of outgoing connections in each layer of the graph. A higher M leads '\n", - " 'to a better accuracy at the cost of increased search time. Range: (2, '\n", - " '2048) **efConstruction**: Controls the trade-off between index build speed '\n", - " 'and quality. Increasing this parameter may improve index quality but also '\n", - " 'increases indexing time. Range: (1, int_max) **ef**: Parameter controlling '\n", - " 'query time vs. recall rate during searching targets in HNSW indexes. No '\n", - " \"specific range is mentioned for ef, as it depends on the application's \"\n", - " 'requirements and constraints. These parameters allow you to fine-tune your '\n", - " 'HNSW index for optimal performance based on your use case needs [1]. 
'\n", - " 'References: [1] https://milvus.io/docs/index.md (Milvus documentation)')\n", - "ollama_llama3_time: 10.43 seconds\n" + "In Hierarchical Navigable Small World Graph (HNSW), there are two main parameters that control its behavior:\n", + "\n", + "1. **M** (`max_degree`): This parameter defines the maximum number of outgoing connections in each layer of the graph. A higher value means more nodes can be connected to a single node, which increases the density and searchability of the index.\n", + "2. **ef** (Efficiency Factor): This is used during both indexing construction and target searching. It controls how much information from previous searches should be reused when traversing through layers.\n", + "\n", + "In other words:\n", + "\n", + "* `M` determines how many \"neighbors\" each node can have, which affects the graph's density and search efficiency.\n", + "* `ef` adjusts the trade-off between precision (search accuracy) and recall (completeness of results). A higher value means more information is reused from previous searches, making it faster but potentially less accurate.\n", + "ollama_llama3_time: 10.23 seconds\n" ] } ], @@ -1631,6 +1648,7 @@ "response = ollama.chat(\n", " messages=[\n", " {\"role\": \"system\", \"content\": SYSTEM_PROMPT,},\n", + " \n", " {\"role\": \"user\", \"content\": f\"question: {SAMPLE_QUESTION}\",}\n", " ],\n", " model='llama3',\n", @@ -1642,7 +1660,7 @@ ")\n", "\n", "ollama_llama3_time = time.time() - start_time\n", - "pprint.pprint(response['message']['content'].replace('\\n', ' '))\n", + "print(response['message']['content'])\n", "print(f\"ollama_llama3_time: {format(ollama_llama3_time, '.2f')} seconds\")" ] }, @@ -1656,7 +1674,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "59f95f52", "metadata": {}, "outputs": [], @@ -1667,26 +1685,10 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "1ef7529f", "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "('According to the provided context, the parameters for HNSW (Hierarchical '\n", - " 'Navigable Small World Graph) are: * M: defines the maximum number of '\n", - " 'outgoing connections in the graph. Higher M leads to higher accuracy and '\n", - " 'longer run time at fixed ef/efConstruction. * efConstruction: controls index '\n", - " 'search speed/build speed tradeoff. Increasing this parameter may enhance '\n", - " 'index quality, but it also tends to lengthen indexing time. * ef: Parameter '\n", - " 'controlling query time/recall rate tradeoff. These parameters help balance '\n", - " 'between search efficiency and recall rate in HNSW indexing algorithm.')\n", - "llama3_anyscale_endpoints_time: 2.54 seconds\n" - ] - } - ], + "outputs": [], "source": [ "# Call Anyscale enpoint using OpenAI API.\n", "import openai\n", @@ -1724,26 +1726,10 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "309d7025", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "('According to the context, the parameters for HNSW (Hierarchical Navigable '\n", - " 'Small World Graph) are: * M: defines the maximum number of outgoing '\n", - " 'connections in the graph. Higher M leads to higher accuracy and longer run '\n", - " 'time at fixed ef/efConstruction. * efConstruction: controls index search '\n", - " 'speed/build speed tradeoff. Increasing this parameter may enhance index '\n", - " 'quality, but it also tends to lengthen indexing time. * ef: Parameter '\n", - " 'controlling query time. 
These parameters can be adjusted to balance between '\n", - " 'accuracy and efficiency in HNSW searches.')\n", - "llama3_octai_endpoints_time: 1.70 seconds\n" - ] - } - ], + "outputs": [], "source": [ "# Also try OctoAI\n", "# !python -m pip install octoai\n", @@ -1785,27 +1771,10 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "e6b94795", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "('According to the context, the parameters for HNSW (Hierarchical Navigable '\n", - " 'Small World Graph) are: * M: defines the maximum number of outgoing '\n", - " 'connections in the graph, with higher M leading to higher accuracy and '\n", - " 'longer run time at fixed ef/efConstruction. * efConstruction: controls the '\n", - " 'index search speed/build speed tradeoff, with increasing efConstruction '\n", - " 'enhancing index quality but lengthening indexing time. * ef: controls query '\n", - " 'time/search time, with higher ef leading to faster search but potentially '\n", - " 'lower accuracy. These parameters can be adjusted to balance accuracy, '\n", - " 'search speed, and indexing time.')\n", - "llama3_groq_endpoints_time: 0.48 seconds\n" - ] - } - ], + "outputs": [], "source": [ "# Also try Groq endpoints\n", "# !python -m pip install groq\n", @@ -1859,7 +1828,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "edf66e04", "metadata": {}, "outputs": [], @@ -1879,7 +1848,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "c87b8428", "metadata": {}, "outputs": [], @@ -1948,30 +1917,10 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "id": "e6b16264", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(' The parameters for HNSW, a graph-based indexing algorithm, are as follows: '\n", - " '1. 
M: Defines the maximum number of outgoing connections in the graph. A '\n", - " 'higher M leads to higher accuracy and longer run time, with a suggested '\n", - " 'range of (2, 2048). 2. efConstruction: Controls the index search speed/build '\n", - " 'speed tradeoff during indexing. Increasing this parameter may enhance index '\n", - " 'quality but also tends to lengthen the indexing time; it accepts values from '\n", - " '1 to int\\\\_max. 3. ef: Controls query time; it is not specified what '\n", - " 'int\\\\_max is in this context but can be any positive integer value '\n", - " '(including zero). This parameter affects query time/accuracy tradeoff during '\n", - " 'target searches within an existing HNSW structure '\n", - " '([1](https://milvus.io/docs/v0.7.0/parameters_overview_HNSW%20index%20type_(standalone).md), '\n", - " '[2](https://www.pinecone-database.com/docs/parameters-and-properties/#hnsw)).')\n", - "mixtral_anyscale_endpoints_time: 4.01 seconds\n" - ] - } - ], + "outputs": [], "source": [ "# Call Anyscale enpoint using OpenAI API.\n", "import openai\n", @@ -2036,8 +1985,8 @@ "the user's question. Second, only if the context is strongly relevant, \n", "answer the question using the context. Otherwise, if the context is not \n", "strongly relevant, answer the question without using the context.\n", - "Be clear, concise, relevant. Answer with fewer than 4 sentences \n", - "and cite unique grounding sources.\n", + "Be clear, concise, relevant. Answer clearly, easy to understand, \n", + "fewer than 4 sentences, and cite unique grounding sources.\n", "Grounding sources: {source_combined}\n", "Context: {contexts_combined}\n", "\"\"\"" @@ -2046,6 +1995,45 @@ { "cell_type": "code", "execution_count": 44, + "id": "ad653053", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('First, check if the Context below is relevant to \\n'\n", + " \"the user's question. 
Second, only if the context is strongly relevant, \\n\"\n", + " 'answer the question using the context. Otherwise, if the context is not \\n'\n", + " 'strongly relevant, answer the question without using the context.\\n'\n", + " 'Be clear, concise, relevant. Answer clearly, easy to understand, \\n'\n", + " 'fewer than 4 sentences, and cite unique grounding sources.\\n'\n", + " 'Grounding sources: https://milvus.io/docs/index.md\\n'\n", + " 'Context: at the cost of increased search time.[1, 65535]2 Common search '\n", + " 'Range search HNSW (Hierarchical Navigable Small World Graph) is a '\n", + " 'graph-based indexing algorithm. It builds a multi-layer navigation structure '\n", + " 'for an image according to certain rules. In this structure, the upper layers '\n", + " 'are more sparse and the distances between nodes are farther; the lower '\n", + " 'layers are denser and the distances between nodes are closer. The search '\n", + " 'starts from the uppermost layer, finds the node closest to the target in '\n", + " 'this layer, the node closest to the target in this layer, and then enters '\n", + " 'the next layer to begin another search. After multiple iterations, it can '\n", + " 'quickly approach the target position. In order to improve performance, HNSW '\n", + " 'limits the maximum degree of nodes on each layer of the graph to M. In '\n", + " 'addition, you can use efConstruction (when building index) or ef (when '\n", + " 'searching targets) to specify a search range. ParameterDescriptionRange MM '\n", + " 'defines tha maximum number of outgoing connections in the graph. 
Higher M '\n", + " 'leads to\\n')\n" + ] + } + ], + "source": [ + "pprint.pprint(SYSTEM_PROMPT)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, "id": "76a62feb", "metadata": {}, "outputs": [ @@ -2054,18 +2042,20 @@ "output_type": "stream", "text": [ "Question: What do the parameters for HNSW mean?\n", - "('Answer: The parameters for HNSW (Hierarchical Navigable Small World Graph) '\n", - " 'are M and efConstruction for index building, and ef for searching targets. \\n'\n", - " '- M defines the maximum number of outgoing connections in the graph, '\n", - " 'affecting accuracy and runtime.\\n'\n", - " '- efConstruction controls search speed/build speed tradeoff during index '\n", - " 'construction.\\n'\n", - " '- ef is a parameter controlling query time/search range. \\n'\n", - " 'These parameters help optimize performance by balancing accuracy, search '\n", - " 'time, and recall rate. [Source: https://milvus.io/docs/index.md]')\n", + "('Answer: In the context of HNSW (Hierarchical Navigable Small World Graph), '\n", + " 'the parameters are as follows:\\n'\n", + " '- M: It defines the maximum number of outgoing connections in the graph. 
A '\n", + " 'higher M value leads to increased recall but also increases search time.\\n'\n", + " '- efConstruction: This parameter is used when building an index and '\n", + " 'specifies a search range for constructing the graph.\\n'\n", + " '- ef: This parameter is used when searching for targets and specifies a '\n", + " 'search range to find nearest neighbors efficiently.\\n'\n", + " '\\n'\n", + " 'Source:\\n'\n", + " 'https://milvus.io/docs/index.md')\n", "\n", "\n", - "chatgpt_3.5_turbo_time: 2.45253\n" + "chatgpt_3.5_turbo_time: 1.98639\n" ] } ], @@ -2154,7 +2144,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "id": "5d0c2299", "metadata": {}, "outputs": [], @@ -2164,7 +2154,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 76, "id": "e1097990", "metadata": {}, "outputs": [ @@ -2191,8 +2181,12 @@ " \n", " Question\n", " ground_truth_answer\n", - " Custom_RAG_context\n", - " simple_context\n", + " naive_context_k_2\n", + " html_context_k_2\n", + " parent_context_k1\n", + " html_context_k_2_summary\n", + " semantic_context_k2_summary\n", + " parent_context_k1_text-embedding-3-small\n", " Custom_RAG_answer\n", " llama3_ollama_answer\n", " llama3_anyscale_answer\n", @@ -2206,10 +2200,14 @@ " 0\n", " What do the parameters for HNSW mean?\n", " * M: maximum degree, or number of connections ...\n", - " HNSW (Hierarchical Navigable Small World Graph...\n", " In order to improve performance, HNSW limits t...\n", - " - M defines the maximum number of outgoing con...\n", - " 1. 
**M**: Maximum degree of nodes on each laye...\n", + " the node closest to the target in this layer, ...\n", + " Search parameters ParameterDescriptionRangeDef...\n", + " * `M`: defines maximum number of outgoing conn...\n", + " The three main parameters for Hierarchical Nav...\n", + " In order to improve performance, HNSW limits t...\n", + " In the context of HNSW (Hierarchical Navigabl...\n", + " * **M** controls the graph's density and compu...\n", " * M: defines the maximum number of outgoing co...\n", " * M: defines the maximum number of outgoing co...\n", " * M: defines the maximum number of outgoing co...\n", @@ -2219,8 +2217,12 @@ " 1\n", " What are good default values for HNSW paramete...\n", " M=16, efConstruction=32, 
ef=32\n", - " HNSW (Hierarchical Navigable Small World Graph...\n", - " Why Milvus Docs Tutorials Tools Blog Community...\n", + " When conducting searches, note that you can se...\n", + " first. When conducting searches, note that you...\n", + " between the target vector and the clustering c...\n", + " * `nlist`: Set to a moderate value such as 128...\n", + " For an index built on top of a dataset with ap...\n", + " Index building parameters Parameter Descriptio...\n", " - M = 32 
- efConstruction = 100\n", " M=16\\nefConstruction=128
ef=64\n", " * M: 16 
* efConstruction: 100 
* ef: top_k\n", @@ -2232,8 +2234,12 @@ " 2\n", " What does nlist vs nprobe mean in ivf_flat?\n", " # nlist: controls how the vector data is part...\n", - " IVF_FLAT divides vector data into nlist cluste...\n", - " `nlist` in IVF-Flat represents the number of c...\n", + " By adjusting nprobe, an ideal balance between ...\n", + " FLAT index and IVF_FLAT index? IVF_FLAT index ...\n", + " FLAT index and IVF_FLAT index? IVF_FLAT index ...\n", + " * `NLIST`: controls how many initial \"buckets\"...\n", + " In IVF_FLAT, `nlist` (short for \"number of lis...\n", + " By adjusting nprobe, an ideal balance between ...\n", " - nlist in IVF_FLAT refers to the number of cl...\n", " - `nlist` refers to the number of cluster unit...\n", " - `nlist` refers to the number of cluster unit...\n", @@ -2245,8 +2251,12 @@ " 3\n", " What is the default AUTOINDEX index and vector...\n", " Index type = HNSW and distance metric=IP Inner...\n", - " \"AUTOINDEX\", metric_type: \"COSINE\", i...\n", - " Index parameters Index parameters dictate how ...\n", + " Leveraging the metadata stored in an index fil...\n", + " during searches or queries. Milvus provides se...\n", + " during searches or queries. Milvus provides se...\n", + " According to the provided text, there are no s...\n", + " According to the provided text, there isn't a ...\n", + " Why Milvus Docs Tutorials Tools Blog Community...\n", " The default AUTOINDEX index in Milvus is IVF_S...\n", " The default `AUTOINDEX` index uses a combinati...\n", " According to the Milvus documentation, the def...\n", @@ -2272,27 +2282,51 @@ "2 # nlist: controls how the vector data is part... \n", "3 Index type = HNSW and distance metric=IP Inner... \n", "\n", - " Custom_RAG_context \\\n", - "0 HNSW (Hierarchical Navigable Small World Graph... \n", - "1 HNSW (Hierarchical Navigable Small World Graph... \n", - "2 IVF_FLAT divides vector data into nlist cluste... \n", - "3 \"AUTOINDEX\", metric_type: \"COSINE\", i... 
\n", + " naive_context_k_2 \\\n", + "0 In order to improve performance, HNSW limits t... \n", + "1 When conducting searches, note that you can se... \n", + "2 By adjusting nprobe, an ideal balance between ... \n", + "3 Leveraging the metadata stored in an index fil... \n", + "\n", + " html_context_k_2 \\\n", + "0 the node closest to the target in this layer, ... \n", + "1 first. When conducting searches, note that you... \n", + "2 FLAT index and IVF_FLAT index? IVF_FLAT index ... \n", + "3 during searches or queries. Milvus provides se... \n", + "\n", + " parent_context_k1 \\\n", + "0 Search parameters ParameterDescriptionRangeDef... \n", + "1 between the target vector and the clustering c... \n", + "2 FLAT index and IVF_FLAT index? IVF_FLAT index ... \n", + "3 during searches or queries. Milvus provides se... \n", "\n", - " simple_context \\\n", + " html_context_k_2_summary \\\n", + "0 * `M`: defines maximum number of outgoing conn... \n", + "1 * `nlist`: Set to a moderate value such as 128... \n", + "2 * `NLIST`: controls how many initial \"buckets\"... \n", + "3 According to the provided text, there are no s... \n", + "\n", + " semantic_context_k2_summary \\\n", + "0 The three main parameters for Hierarchical Nav... \n", + "1 For an index built on top of a dataset with ap... \n", + "2 In IVF_FLAT, `nlist` (short for \"number of lis... \n", + "3 According to the provided text, there isn't a ... \n", + "\n", + " parent_context_k1_text-embedding-3-small \\\n", "0 In order to improve performance, HNSW limits t... \n", - "1 Why Milvus Docs Tutorials Tools Blog Community... \n", - "2 `nlist` in IVF-Flat represents the number of c... \n", - "3 Index parameters Index parameters dictate how ... \n", + "1 Index building parameters Parameter Descriptio... \n", + "2 By adjusting nprobe, an ideal balance between ... \n", + "3 Why Milvus Docs Tutorials Tools Blog Community... \n", "\n", " Custom_RAG_answer \\\n", - "0 - M defines the maximum number of outgoing con... 
\n", + "0 In the context of HNSW (Hierarchical Navigabl... \n", "1 - M = 32 \n", "- efConstruction = 100 \n", "2 - nlist in IVF_FLAT refers to the number of cl... \n", "3 The default AUTOINDEX index in Milvus is IVF_S... \n", "\n", " llama3_ollama_answer \\\n", - "0 1. **M**: Maximum degree of nodes on each laye... \n", + "0 * **M** controls the graph's density and compu... \n", "1 M=16\\nefConstruction=128\n", "ef=64 \n", "2 - `nlist` refers to the number of cluster unit... \n", @@ -2370,7 +2404,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 81, "id": "8ae8d2b2", "metadata": {}, "outputs": [ @@ -2392,7 +2426,49 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1974f8a4724748cf98ff2f22e1341292", + "model_id": "dc8cc97274534e54b258fb08e9d52819", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Evaluating: 0%| | 0/8 [00:00\n", " 0\n", " What do the parameters for HNSW mean?\n", - " [HNSW (Hierarchical Navigable Small World Grap...\n", - " - M defines the maximum number of outgoing con...\n", + " [In order to improve performance, HNSW limits ...\n", + " In the context of HNSW (Hierarchical Navigabl...\n", " * M: maximum degree, or number of connections ...\n", - " 1.0\n", + " 1.000000\n", " 1.0\n", " 1.000000\n", - " Custom_RAG_context\n", + " naive_context_k_2\n", " \n", " \n", " 1\n", " What are good default values for HNSW paramete...\n", - " [HNSW (Hierarchical Navigable Small World Grap...\n", + " [When conducting searches, note that you can s...\n", " - M = 32 
- efConstruction = 100\n", " M=16, efConstruction=32, 
ef=32\n", + " 0.333333\n", " 1.0\n", - " 1.0\n", - " 1.000000\n", - " Custom_RAG_context\n", + " 0.500000\n", + " naive_context_k_2\n", " \n", " \n", " 2\n", " What does nlist vs nprobe mean in ivf_flat?\n", - " [IVF_FLAT divides vector data into nlist clust...\n", + " [By adjusting nprobe, an ideal balance between...\n", " - nlist in IVF_FLAT refers to the number of cl...\n", " # nlist: controls how the vector data is part...\n", - " 0.5\n", - " 1.0\n", " 0.666667\n", - " Custom_RAG_context\n", + " 1.0\n", + " 0.800000\n", + " naive_context_k_2\n", " \n", " \n", " 3\n", " What is the default AUTOINDEX index and vector...\n", - " [\"AUTOINDEX\", metric_type: \"COSINE\", ...\n", + " [Leveraging the metadata stored in an index fi...\n", " The default AUTOINDEX index in Milvus is IVF_S...\n", " Index type = HNSW and distance metric=IP Inner...\n", - " 0.0\n", - " 0.0\n", " 0.000000\n", - " Custom_RAG_context\n", + " 1.0\n", + " 0.000000\n", + " naive_context_k_2\n", " \n", " \n", " 4\n", " What do the parameters for HNSW mean?\n", - " [In order to improve performance, HNSW limits ...\n", - " - M defines the maximum number of outgoing con...\n", + " [the node closest to the target in this layer,...\n", + " In the context of HNSW (Hierarchical Navigabl...\n", " * M: maximum degree, or number of connections ...\n", - " 1.0\n", + " 1.000000\n", " 1.0\n", " 1.000000\n", - " simple_context\n", + " html_context_k_2\n", " \n", " \n", " 5\n", " What are good default values for HNSW paramete...\n", - " [Why Milvus Docs Tutorials Tools Blog Communit...\n", + " [first. When conducting searches, note that yo...\n", " - M = 32 
- efConstruction = 100\n", " M=16, efConstruction=32, 
ef=32\n", - " 0.0\n", + " 0.333333\n", " 1.0\n", - " 0.000000\n", - " simple_context\n", + " 0.500000\n", + " html_context_k_2\n", " \n", " \n", " 6\n", " What does nlist vs nprobe mean in ivf_flat?\n", - " [`nlist` in IVF-Flat represents the number of ...\n", + " [FLAT index and IVF_FLAT index? IVF_FLAT index...\n", " - nlist in IVF_FLAT refers to the number of cl...\n", " # nlist: controls how the vector data is part...\n", - " 0.5\n", + " 0.500000\n", " 1.0\n", " 0.666667\n", - " simple_context\n", + " html_context_k_2\n", " \n", " \n", " 7\n", " What is the default AUTOINDEX index and vector...\n", - " [Index parameters Index parameters dictate how...\n", + " [during searches or queries. Milvus provides s...\n", " The default AUTOINDEX index in Milvus is IVF_S...\n", " Index type = HNSW and distance metric=IP Inner...\n", - " 0.0\n", + " 0.000000\n", + " 1.0\n", + " 0.000000\n", + " html_context_k_2\n", + " \n", + " \n", + " 8\n", + " What do the parameters for HNSW mean?\n", + " [Search parameters ParameterDescriptionRangeDe...\n", + " In the context of HNSW (Hierarchical Navigabl...\n", + " * M: maximum degree, or number of connections ...\n", + " 1.000000\n", + " 1.0\n", + " 1.000000\n", + " parent_context_k1\n", + " \n", + " \n", + " 9\n", + " What are good default values for HNSW paramete...\n", + " [between the target vector and the clustering ...\n", + " - M = 32 
- efConstruction = 100\n", + " M=16, efConstruction=32, 
ef=32\n", + " 0.000000\n", + " 1.0\n", + " 0.000000\n", + " parent_context_k1\n", + " \n", + " \n", + " 10\n", + " What does nlist vs nprobe mean in ivf_flat?\n", + " [FLAT index and IVF_FLAT index? IVF_FLAT index...\n", + " - nlist in IVF_FLAT refers to the number of cl...\n", + " # nlist: controls how the vector data is part...\n", + " 0.500000\n", + " 1.0\n", + " 0.666667\n", + " parent_context_k1\n", + " \n", + " \n", + " 11\n", + " What is the default AUTOINDEX index and vector...\n", + " [during searches or queries. Milvus provides s...\n", + " The default AUTOINDEX index in Milvus is IVF_S...\n", + " Index type = HNSW and distance metric=IP Inner...\n", + " 0.000000\n", + " 1.0\n", + " 0.000000\n", + " parent_context_k1\n", + " \n", + " \n", + " 12\n", + " What do the parameters for HNSW mean?\n", + " [* `M`: defines maximum number of outgoing con...\n", + " In the context of HNSW (Hierarchical Navigabl...\n", + " * M: maximum degree, or number of connections ...\n", + " 1.000000\n", + " 1.0\n", + " 1.000000\n", + " html_context_k_2_summary\n", + " \n", + " \n", + " 13\n", + " What are good default values for HNSW paramete...\n", + " [* `nlist`: Set to a moderate value such as 12...\n", + " - M = 32 
- efConstruction = 100\n", + " M=16, efConstruction=32, 
ef=32\n", + " 0.000000\n", + " 1.0\n", + " 0.000000\n", + " html_context_k_2_summary\n", + " \n", + " \n", + " 14\n", + " What does nlist vs nprobe mean in ivf_flat?\n", + " [* `NLIST`: controls how many initial \"buckets...\n", + " - nlist in IVF_FLAT refers to the number of cl...\n", + " # nlist: controls how the vector data is part...\n", + " 0.500000\n", " 1.0\n", + " 0.666667\n", + " html_context_k_2_summary\n", + " \n", + " \n", + " 15\n", + " What is the default AUTOINDEX index and vector...\n", + " [According to the provided text, there are no ...\n", + " The default AUTOINDEX index in Milvus is IVF_S...\n", + " Index type = HNSW and distance metric=IP Inner...\n", + " 0.000000\n", + " 1.0\n", + " 0.000000\n", + " html_context_k_2_summary\n", + " \n", + " \n", + " 16\n", + " What do the parameters for HNSW mean?\n", + " [The three main parameters for Hierarchical Na...\n", + " In the context of HNSW (Hierarchical Navigabl...\n", + " * M: maximum degree, or number of connections ...\n", + " 1.000000\n", + " 1.0\n", + " 1.000000\n", + " semantic_context_k2_summary\n", + " \n", + " \n", + " 17\n", + " What are good default values for HNSW paramete...\n", + " [For an index built on top of a dataset with a...\n", + " - M = 32 
- efConstruction = 100\n", + " M=16, efConstruction=32, 
ef=32\n", + " 0.000000\n", + " 1.0\n", + " 0.000000\n", + " semantic_context_k2_summary\n", + " \n", + " \n", + " 18\n", + " What does nlist vs nprobe mean in ivf_flat?\n", + " [In IVF_FLAT, `nlist` (short for \"number of li...\n", + " - nlist in IVF_FLAT refers to the number of cl...\n", + " # nlist: controls how the vector data is part...\n", + " 0.500000\n", + " 1.0\n", + " 0.666667\n", + " semantic_context_k2_summary\n", + " \n", + " \n", + " 19\n", + " What is the default AUTOINDEX index and vector...\n", + " [According to the provided text, there isn't a...\n", + " The default AUTOINDEX index in Milvus is IVF_S...\n", + " Index type = HNSW and distance metric=IP Inner...\n", " 0.000000\n", - " simple_context\n", + " 0.0\n", + " 0.000000\n", + " semantic_context_k2_summary\n", " \n", " \n", "\n", "" ], "text/plain": [ - " question \\\n", - "0 What do the parameters for HNSW mean? \n", - "1 What are good default values for HNSW paramete... \n", - "2 What does nlist vs nprobe mean in ivf_flat? \n", - "3 What is the default AUTOINDEX index and vector... \n", - "4 What do the parameters for HNSW mean? \n", - "5 What are good default values for HNSW paramete... \n", - "6 What does nlist vs nprobe mean in ivf_flat? \n", - "7 What is the default AUTOINDEX index and vector... \n", + " question \\\n", + "0 What do the parameters for HNSW mean? \n", + "1 What are good default values for HNSW paramete... \n", + "2 What does nlist vs nprobe mean in ivf_flat? \n", + "3 What is the default AUTOINDEX index and vector... \n", + "4 What do the parameters for HNSW mean? \n", + "5 What are good default values for HNSW paramete... \n", + "6 What does nlist vs nprobe mean in ivf_flat? \n", + "7 What is the default AUTOINDEX index and vector... \n", + "8 What do the parameters for HNSW mean? \n", + "9 What are good default values for HNSW paramete... \n", + "10 What does nlist vs nprobe mean in ivf_flat? \n", + "11 What is the default AUTOINDEX index and vector... 
\n", + "12 What do the parameters for HNSW mean? \n", + "13 What are good default values for HNSW paramete... \n", + "14 What does nlist vs nprobe mean in ivf_flat? \n", + "15 What is the default AUTOINDEX index and vector... \n", + "16 What do the parameters for HNSW mean? \n", + "17 What are good default values for HNSW paramete... \n", + "18 What does nlist vs nprobe mean in ivf_flat? \n", + "19 What is the default AUTOINDEX index and vector... \n", "\n", - " contexts \\\n", - "0 [HNSW (Hierarchical Navigable Small World Grap... \n", - "1 [HNSW (Hierarchical Navigable Small World Grap... \n", - "2 [IVF_FLAT divides vector data into nlist clust... \n", - "3 [\"AUTOINDEX\", metric_type: \"COSINE\", ... \n", - "4 [In order to improve performance, HNSW limits ... \n", - "5 [Why Milvus Docs Tutorials Tools Blog Communit... \n", - "6 [`nlist` in IVF-Flat represents the number of ... \n", - "7 [Index parameters Index parameters dictate how... \n", + " contexts \\\n", + "0 [In order to improve performance, HNSW limits ... \n", + "1 [When conducting searches, note that you can s... \n", + "2 [By adjusting nprobe, an ideal balance between... \n", + "3 [Leveraging the metadata stored in an index fi... \n", + "4 [the node closest to the target in this layer,... \n", + "5 [first. When conducting searches, note that yo... \n", + "6 [FLAT index and IVF_FLAT index? IVF_FLAT index... \n", + "7 [during searches or queries. Milvus provides s... \n", + "8 [Search parameters ParameterDescriptionRangeDe... \n", + "9 [between the target vector and the clustering ... \n", + "10 [FLAT index and IVF_FLAT index? IVF_FLAT index... \n", + "11 [during searches or queries. Milvus provides s... \n", + "12 [* `M`: defines maximum number of outgoing con... \n", + "13 [* `nlist`: Set to a moderate value such as 12... \n", + "14 [* `NLIST`: controls how many initial \"buckets... \n", + "15 [According to the provided text, there are no ... 
\n", + "16 [The three main parameters for Hierarchical Na... \n", + "17 [For an index built on top of a dataset with a... \n", + "18 [In IVF_FLAT, `nlist` (short for \"number of li... \n", + "19 [According to the provided text, there isn't a... \n", "\n", - " answer \\\n", - "0 - M defines the maximum number of outgoing con... \n", - "1 - M = 32 \n", + " answer \\\n", + "0 In the context of HNSW (Hierarchical Navigabl... \n", + "1 - M = 32 \n", "- efConstruction = 100 \n", - "2 - nlist in IVF_FLAT refers to the number of cl... \n", - "3 The default AUTOINDEX index in Milvus is IVF_S... \n", - "4 - M defines the maximum number of outgoing con... \n", - "5 - M = 32 \n", + "2 - nlist in IVF_FLAT refers to the number of cl... \n", + "3 The default AUTOINDEX index in Milvus is IVF_S... \n", + "4 In the context of HNSW (Hierarchical Navigabl... \n", + "5 - M = 32 \n", + "- efConstruction = 100 \n", + "6 - nlist in IVF_FLAT refers to the number of cl... \n", + "7 The default AUTOINDEX index in Milvus is IVF_S... \n", + "8 In the context of HNSW (Hierarchical Navigabl... \n", + "9 - M = 32 \n", + "- efConstruction = 100 \n", + "10 - nlist in IVF_FLAT refers to the number of cl... \n", + "11 The default AUTOINDEX index in Milvus is IVF_S... \n", + "12 In the context of HNSW (Hierarchical Navigabl... \n", + "13 - M = 32 \n", + "- efConstruction = 100 \n", + "14 - nlist in IVF_FLAT refers to the number of cl... \n", + "15 The default AUTOINDEX index in Milvus is IVF_S... \n", + "16 In the context of HNSW (Hierarchical Navigabl... \n", + "17 - M = 32 \n", "- efConstruction = 100 \n", - "6 - nlist in IVF_FLAT refers to the number of cl... \n", - "7 The default AUTOINDEX index in Milvus is IVF_S... \n", + "18 - nlist in IVF_FLAT refers to the number of cl... \n", + "19 The default AUTOINDEX index in Milvus is IVF_S... \n", "\n", - " ground_truth context_recall \\\n", - "0 * M: maximum degree, or number of connections ... 
1.0 \n", - "1 M=16, efConstruction=32, \n", - "ef=32 1.0 \n", - "2 # nlist: controls how the vector data is part... 0.5 \n", - "3 Index type = HNSW and distance metric=IP Inner... 0.0 \n", - "4 * M: maximum degree, or number of connections ... 1.0 \n", - "5 M=16, efConstruction=32, \n", - "ef=32 0.0 \n", - "6 # nlist: controls how the vector data is part... 0.5 \n", - "7 Index type = HNSW and distance metric=IP Inner... 0.0 \n", + " ground_truth context_recall \\\n", + "0 * M: maximum degree, or number of connections ... 1.000000 \n", + "1 M=16, efConstruction=32, \n", + "ef=32 0.333333 \n", + "2 # nlist: controls how the vector data is part... 0.666667 \n", + "3 Index type = HNSW and distance metric=IP Inner... 0.000000 \n", + "4 * M: maximum degree, or number of connections ... 1.000000 \n", + "5 M=16, efConstruction=32, \n", + "ef=32 0.333333 \n", + "6 # nlist: controls how the vector data is part... 0.500000 \n", + "7 Index type = HNSW and distance metric=IP Inner... 0.000000 \n", + "8 * M: maximum degree, or number of connections ... 1.000000 \n", + "9 M=16, efConstruction=32, \n", + "ef=32 0.000000 \n", + "10 # nlist: controls how the vector data is part... 0.500000 \n", + "11 Index type = HNSW and distance metric=IP Inner... 0.000000 \n", + "12 * M: maximum degree, or number of connections ... 1.000000 \n", + "13 M=16, efConstruction=32, \n", + "ef=32 0.000000 \n", + "14 # nlist: controls how the vector data is part... 0.500000 \n", + "15 Index type = HNSW and distance metric=IP Inner... 0.000000 \n", + "16 * M: maximum degree, or number of connections ... 1.000000 \n", + "17 M=16, efConstruction=32, \n", + "ef=32 0.000000 \n", + "18 # nlist: controls how the vector data is part... 0.500000 \n", + "19 Index type = HNSW and distance metric=IP Inner... 
0.000000 \n", "\n", - " context_precision context_f1 evaluated \n", - "0 1.0 1.000000 Custom_RAG_context \n", - "1 1.0 1.000000 Custom_RAG_context \n", - "2 1.0 0.666667 Custom_RAG_context \n", - "3 0.0 0.000000 Custom_RAG_context \n", - "4 1.0 1.000000 simple_context \n", - "5 1.0 0.000000 simple_context \n", - "6 1.0 0.666667 simple_context \n", - "7 1.0 0.000000 simple_context " + " context_precision context_f1 evaluated \n", + "0 1.0 1.000000 naive_context_k_2 \n", + "1 1.0 0.500000 naive_context_k_2 \n", + "2 1.0 0.800000 naive_context_k_2 \n", + "3 1.0 0.000000 naive_context_k_2 \n", + "4 1.0 1.000000 html_context_k_2 \n", + "5 1.0 0.500000 html_context_k_2 \n", + "6 1.0 0.666667 html_context_k_2 \n", + "7 1.0 0.000000 html_context_k_2 \n", + "8 1.0 1.000000 parent_context_k1 \n", + "9 1.0 0.000000 parent_context_k1 \n", + "10 1.0 0.666667 parent_context_k1 \n", + "11 1.0 0.000000 parent_context_k1 \n", + "12 1.0 1.000000 html_context_k_2_summary \n", + "13 1.0 0.000000 html_context_k_2_summary \n", + "14 1.0 0.666667 html_context_k_2_summary \n", + "15 1.0 0.000000 html_context_k_2_summary \n", + "16 1.0 1.000000 semantic_context_k2_summary \n", + "17 1.0 0.000000 semantic_context_k2_summary \n", + "18 1.0 0.666667 semantic_context_k2_summary \n", + "19 0.0 0.000000 semantic_context_k2_summary " ] }, "metadata": {}, @@ -2708,9 +3022,11 @@ "\n", "elif EVALUATE_WHAT == 'CONTEXTS':\n", " pprint.pprint(scores)\n", - " percent_better = (scores[0]['Custom_RAG_context'] - scores[1]['simple_context']) \\\n", - " / scores[1]['simple_context'] * 100\n", - " print(f\"HTML chunking {np.round(percent_better,0)}% improvement over Simple chunking.\")\n", + " percent_better = (scores[0]['naive_context_k_2'] - scores[3]['html_context_k_2_summary']) \\\n", + " / scores[3]['html_context_k_2_summary'] * 100\n", + " # percent_better = (scores[0]['parent_context_k1'] - scores[1]['parent_context_k1_text-embedding-3-small']) \\\n", + " # / 
scores[1]['parent_context_k1_text-embedding-3-small'] * 100\n", + " print(f\"Chunking {np.round(percent_better,0)}% improvement.\")\n", "\n", "# Display the evaluation details.\n", "display(ragas_result)" @@ -2718,17 +3034,25 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "id": "7c408624", "metadata": {}, "outputs": [], "source": [ - "####################################################\n", - "# Avg Context Precision htmlsplitter score = 0.67 (46% improvement)\n", - "# Avg Context Precision simple score = 0.46\n", + "########### CHANGE THE CHUNKING STRATEGY ###########\n", + "# F1-Score 'naive_context_k_2': 0.57 (36% improvement)\n", + "# F1-Score 'html_context_k_2': 0.54\n", + "# F1-Score 'parent_context_k1': 0.42\n", + "# F1-Score 'html_context_k_2_summary': 0.42\n", + "# F1-Score 'semantic_context_k2_summary': 0.42\n", "####################################################\n", "\n", + "########### CHANGE THE EMBEDDING MODEL #############\n", + "# F1-Score 'naive_context_k_2': 0.57 (27% improvement)\n", + "# F1-Score 'pnaive_context_k_2_text-embedding-3-small': 0.45\n", "####################################################\n", + "\n", + "############## CHANGE THE LLM ######################\n", "# Avg mistralai mixtral_8x7b_instruct score = 0.7031 (6% improvement over gpt-3.5-turbo)\n", "# Avg llama3_70b_anyscale_chat score = 0.6888\n", "# Avg llama3_70b_groq_instruct score = 0.6867\n", @@ -2740,47 +3064,21 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "id": "d0e81e68", "metadata": {}, "outputs": [], "source": [ - "# Drop collection\n", - "# utility.drop_collection(COLLECTION_NAME)\n", - "mc.drop_collection(COLLECTION_NAME)" + "# Delete the Milvus collection and doc store.\n", + "del vectorstore, retriever, store" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": null, "id": "c777937e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - 
"text": [ - "Author: Christy Bergman\n", - "\n", - "Python implementation: CPython\n", - "Python version : 3.11.8\n", - "IPython version : 8.22.2\n", - "\n", - "unstructured: 0.14.4\n", - "lxml : 5.1.0\n", - "torch : 2.3.0\n", - "pymilvus : 2.4.4\n", - "langchain : 0.2.2\n", - "ollama : 0.1.8\n", - "octoai : 1.0.2\n", - "groq : 0.8.0\n", - "openai : 1.35.0\n", - "\n", - "conda environment: py311-unum\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Props to Sebastian Raschka for this handy watermark.\n", "# !pip install watermark\n",