Add connect notebook for Milvus Lite (#1357)

Signed-off-by: Christy Bergman <[email protected]>
milvus-io · Jun 12, 2024 · 2a6fe60 · 2a6fe60
1 parent f24e840
commit 2a6fe60
Showing 1 changed file with 111 additions and 24 deletions.
diff --git a/bootcamp/milvus_connect.ipynb b/bootcamp/milvus_connect.ipynb
@@ -464,7 +464,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "length doc: 11016\n",
+      "('Why MilvusDocsTutorialsToolsBlogCommunityStars0Try Managed Milvus '\n",
+      " 'FREESearchHomev2.4.xAbout MilvusGe')\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Inspect the first document.\n",
+    "import pprint\n",
+    "print(f\"length doc: {len(docs[0].page_content)}\")\n",
+    "pprint.pprint(docs[0].page_content.replace('\\n', '')[:100])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -480,7 +502,7 @@
      "output_type": "stream",
      "text": [
       "EMBEDDING_DIM: 1024\n",
-      "Created Milvus collection from 22 docs in 7.64 seconds\n"
+      "Created Milvus collection from 427 docs in 33.14 seconds\n"
      ]
     }
    ],
@@ -502,18 +524,17 @@
     "EMBEDDING_DIM = embed_model.dict()['client'].get_sentence_embedding_dimension()\n",
     "print(f\"EMBEDDING_DIM: {EMBEDDING_DIM}\")\n",
     "\n",
-    "# # Chunking\n",
-    "# text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=51)\n",
+    "# Chunking\n",
+    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=51)\n",
     "\n",
     "# Create a Milvus collection from the documents and embeddings.\n",
     "start_time = time.time()\n",
-    "# docs = text_splitter.split_documents(docs)\n",
+    "docs = text_splitter.split_documents(docs)\n",
     "vectorstore = Milvus.from_documents(\n",
     "    documents=docs,\n",
     "    embedding=embed_model,\n",
     "    connection_args={\n",
-    "        \"uri\": \"./milvus_demo.db\",\n",
-    "    },\n",
+    "        \"uri\": \"./milvus_demo.db\"},\n",
     "    # Override LangChain default values for Milvus.\n",
     "    consistency_level=\"Eventually\",\n",
     "    drop_old=True,\n",
@@ -528,7 +549,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -557,7 +578,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -589,16 +610,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# !python -m pip install -U llama-index llama-index-embeddings-huggingface llama-index-vector-stores-milvus"
+    "# !python -m pip install -U --no-cache-dir llama-index llama-index-embeddings-huggingface llama-index-vector-stores-milvus"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [
     {
@@ -616,23 +637,54 @@
     "from llama_index.core import SimpleDirectoryReader\n",
     "\n",
     "# Load HTML files from a local directory\n",
+    "# https://docs.llamaindex.ai/en/stable/api_reference/readers/simple_directory_reader\n",
+    "# SimpleDirectoryReader is supposed to automatically parse files based on their extension.\n",
     "path = \"RAG/rtdocs_new/\"\n",
-    "docs = SimpleDirectoryReader(path).load_data()\n",
+    "loader = SimpleDirectoryReader(\n",
+    "        input_dir=path, \n",
+    "        required_exts=[\".html\"],\n",
+    "        recursive=True # Recursively search subdirectories\n",
+    "    )\n",
+    "lli_docs = loader.load_data()\n",
     "\n",
-    "num_documents = len(docs)\n",
+    "num_documents = len(lli_docs)\n",
     "print(f\"loaded {num_documents} documents\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "length doc: 663373\n",
+      "('<!DOCTYPE html><html lang=\"en\"><head><meta charSet=\"utf-8\"/><meta '\n",
+      " 'http-equiv=\"x-ua-compatible\" conte')\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Inspect the first document.\n",
+    "import pprint\n",
+    "\n",
+    "# The HTML docs were not parsed by SimpleDirectoryReader; the text still contains raw HTML markup.\n",
+    "print(f\"length doc: {len(lli_docs[0].text)}\")\n",
+    "pprint.pprint(lli_docs[0].text[:100])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/var/folders/vn/4v5_m9mx69x3h7jcl1chb7nr0000gn/T/ipykernel_28726/1337788918.py:12: DeprecationWarning: Call to deprecated class method from_defaults. (ServiceContext is deprecated, please use `llama_index.settings.Settings` instead.) -- Deprecated since version 0.10.0.\n",
+      "/var/folders/vn/4v5_m9mx69x3h7jcl1chb7nr0000gn/T/ipykernel_4999/3447014088.py:12: DeprecationWarning: Call to deprecated class method from_defaults. (ServiceContext is deprecated, please use `llama_index.settings.Settings` instead.) -- Deprecated since version 0.10.0.\n",
       "  service_context = ServiceContext.from_defaults(\n",
       "/opt/miniconda3/envs/py311-unum/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
       "  warnings.warn(\n"
@@ -654,7 +706,7 @@
       " 'text_instruction': None}\n",
       "\n",
       "Start chunking, embedding, inserting...\n",
-      "Created LlamaIndex collection from 1 docs in 98.19 seconds\n"
+      "Created LlamaIndex collection from 1 docs in 101.56 seconds\n"
      ]
     }
    ],
@@ -702,18 +754,18 @@
     "start_time = time.time()\n",
     "llamaindex = VectorStoreIndex.from_documents(\n",
     "    # Too slow!  Just use one document.\n",
-    "    docs[:1], \n",
+    "    lli_docs[:1], \n",
     "    storage_context=storage_context, \n",
     "    service_context=service_context\n",
     ")\n",
     "end_time = time.time()\n",
-    "print(f\"Created LlamaIndex collection from {len(docs[:1])} docs in {end_time - start_time:.2f} seconds\")\n",
+    "print(f\"Created LlamaIndex collection from {len(lli_docs[:1])} docs in {end_time - start_time:.2f} seconds\")\n",
     "# Created LlamaIndex collection from 1 docs in 106.32 seconds"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [
     {
@@ -739,7 +791,7 @@
     }
    ],
    "source": [
-    "# Describe the collection.\n",
+    "# Describe the collection. It looks like not all of the Milvus overrides took effect.\n",
     "temp = llamaindex.storage_context.vector_store.to_dict()\n",
     "first_15_keys = list(temp.keys())[:15]\n",
     "for key in first_15_keys:\n",
@@ -748,7 +800,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -758,21 +810,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "The watermark extension is already loaded. To reload it, use:\n",
+      "  %reload_ext watermark\n",
       "Author: Christy Bergman\n",
       "\n",
       "Python implementation: CPython\n",
       "Python version       : 3.11.8\n",
       "IPython version      : 8.22.2\n",
       "\n",
       "pymilvus    : 2.4.3\n",
-      "llama_index : 0.10.43\n",
+      "llama_index : 0.10.44\n",
       "langchain   : 0.2.2\n",
       "unstructured: 0.14.4\n",
       "\n",
@@ -788,6 +842,39 @@
     "%load_ext watermark\n",
     "%watermark -a 'Christy Bergman' -v -p pymilvus,llama_index,langchain,unstructured --conda"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "llama-index                             0.10.44\n",
+      "llama-index-agent-openai                0.2.7\n",
+      "llama-index-cli                         0.1.12\n",
+      "llama-index-core                        0.10.44\n",
+      "llama-index-embeddings-huggingface      0.2.1\n",
+      "llama-index-embeddings-openai           0.1.10\n",
+      "llama-index-indices-managed-llama-cloud 0.1.6\n",
+      "llama-index-legacy                      0.9.48\n",
+      "llama-index-llms-ollama                 0.1.5\n",
+      "llama-index-llms-openai                 0.1.22\n",
+      "llama-index-multi-modal-llms-openai     0.1.6\n",
+      "llama-index-program-openai              0.1.6\n",
+      "llama-index-question-gen-openai         0.1.3\n",
+      "llama-index-readers-file                0.1.23\n",
+      "llama-index-readers-llama-parse         0.1.4\n",
+      "llama-index-vector-stores-milvus        0.1.17\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check the installed llama-index package versions to make sure they are the latest.\n",
+    "!pip list | grep llama-index"
+   ]
   }
  ],
  "metadata": {