Skip to content

Commit

Permalink
Add connect notebook for Milvus Lite (#1357)
Browse files Browse the repository at this point in the history
Signed-off-by: Christy Bergman <[email protected]>
  • Loading branch information
christy committed Jun 12, 2024
1 parent f24e840 commit 2a6fe60
Showing 1 changed file with 111 additions and 24 deletions.
135 changes: 111 additions & 24 deletions bootcamp/milvus_connect.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,29 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"length doc: 11016\n",
"('Why MilvusDocsTutorialsToolsBlogCommunityStars0Try Managed Milvus '\n",
" 'FREESearchHomev2.4.xAbout MilvusGe')\n"
]
}
],
"source": [
"# Inspect the first document.\n",
"import pprint\n",
"print(f\"length doc: {len(docs[0].page_content)}\")\n",
"pprint.pprint(docs[0].page_content.replace('\\n', '')[:100])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
Expand All @@ -480,7 +502,7 @@
"output_type": "stream",
"text": [
"EMBEDDING_DIM: 1024\n",
"Created Milvus collection from 22 docs in 7.64 seconds\n"
"Created Milvus collection from 427 docs in 33.14 seconds\n"
]
}
],
Expand All @@ -502,18 +524,17 @@
"EMBEDDING_DIM = embed_model.dict()['client'].get_sentence_embedding_dimension()\n",
"print(f\"EMBEDDING_DIM: {EMBEDDING_DIM}\")\n",
"\n",
"# # Chunking\n",
"# text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=51)\n",
"# Chunking\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=51)\n",
"\n",
"# Create a Milvus collection from the documents and embeddings.\n",
"start_time = time.time()\n",
"# docs = text_splitter.split_documents(docs)\n",
"docs = text_splitter.split_documents(docs)\n",
"vectorstore = Milvus.from_documents(\n",
" documents=docs,\n",
" embedding=embed_model,\n",
" connection_args={\n",
" \"uri\": \"./milvus_demo.db\",\n",
" },\n",
" \"uri\": \"./milvus_demo.db\"},\n",
" # Override LangChain default values for Milvus.\n",
" consistency_level=\"Eventually\",\n",
" drop_old=True,\n",
Expand All @@ -528,7 +549,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 18,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -557,7 +578,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -589,16 +610,16 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# !python -m pip install -U llama-index llama-index-embeddings-huggingface llama-index-vector-stores-milvus"
"# !python -m pip install -U --no-cache-dir llama-index llama-index-embeddings-huggingface llama-index-vector-stores-milvus"
]
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 33,
"metadata": {},
"outputs": [
{
Expand All @@ -616,23 +637,54 @@
"from llama_index.core import SimpleDirectoryReader\n",
"\n",
"# Load HTML files from a local directory\n",
"# https://docs.llamaindex.ai/en/stable/api_reference/readers/simple_directory_reader\n",
"# Supposed to automatically parse files based on their extension.\n",
"path = \"RAG/rtdocs_new/\"\n",
"docs = SimpleDirectoryReader(path).load_data()\n",
"loader = SimpleDirectoryReader(\n",
" input_dir=path, \n",
" required_exts=[\".html\"],\n",
" recursive=True # Recursively search subdirectories\n",
" )\n",
"lli_docs = loader.load_data()\n",
"\n",
"num_documents = len(docs)\n",
"num_documents = len(lli_docs)\n",
"print(f\"loaded {num_documents} documents\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"length doc: 663373\n",
"('<!DOCTYPE html><html lang=\"en\"><head><meta charSet=\"utf-8\"/><meta '\n",
" 'http-equiv=\"x-ua-compatible\" conte')\n"
]
}
],
"source": [
"# Inspect the first document.\n",
"import pprint\n",
"\n",
"# html docs were not parsed by SimpleDirectoryReader.\n",
"print(f\"length doc: {len(lli_docs[0].text)}\")\n",
"pprint.pprint(lli_docs[0].text[:100])"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/vn/4v5_m9mx69x3h7jcl1chb7nr0000gn/T/ipykernel_28726/1337788918.py:12: DeprecationWarning: Call to deprecated class method from_defaults. (ServiceContext is deprecated, please use `llama_index.settings.Settings` instead.) -- Deprecated since version 0.10.0.\n",
"/var/folders/vn/4v5_m9mx69x3h7jcl1chb7nr0000gn/T/ipykernel_4999/3447014088.py:12: DeprecationWarning: Call to deprecated class method from_defaults. (ServiceContext is deprecated, please use `llama_index.settings.Settings` instead.) -- Deprecated since version 0.10.0.\n",
" service_context = ServiceContext.from_defaults(\n",
"/opt/miniconda3/envs/py311-unum/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
Expand All @@ -654,7 +706,7 @@
" 'text_instruction': None}\n",
"\n",
"Start chunking, embedding, inserting...\n",
"Created LlamaIndex collection from 1 docs in 98.19 seconds\n"
"Created LlamaIndex collection from 1 docs in 101.56 seconds\n"
]
}
],
Expand Down Expand Up @@ -702,18 +754,18 @@
"start_time = time.time()\n",
"llamaindex = VectorStoreIndex.from_documents(\n",
" # Too slow! Just use one document.\n",
" docs[:1], \n",
" lli_docs[:1], \n",
" storage_context=storage_context, \n",
" service_context=service_context\n",
")\n",
"end_time = time.time()\n",
"print(f\"Created LlamaIndex collection from {len(docs[:1])} docs in {end_time - start_time:.2f} seconds\")\n",
"print(f\"Created LlamaIndex collection from {len(lli_docs[:1])} docs in {end_time - start_time:.2f} seconds\")\n",
"# Created LlamaIndex collection from 1 docs in 106.32 seconds"
]
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 36,
"metadata": {},
"outputs": [
{
Expand All @@ -739,7 +791,7 @@
}
],
"source": [
"# Describe the collection.\n",
"# Describe the collection, it looks like the Milvus overrides did not all work.\n",
"temp = llamaindex.storage_context.vector_store.to_dict()\n",
"first_15_keys = list(temp.keys())[:15]\n",
"for key in first_15_keys:\n",
Expand All @@ -748,7 +800,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -758,21 +810,23 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Christy Bergman\n",
"\n",
"Python implementation: CPython\n",
"Python version : 3.11.8\n",
"IPython version : 8.22.2\n",
"\n",
"pymilvus : 2.4.3\n",
"llama_index : 0.10.43\n",
"llama_index : 0.10.44\n",
"langchain : 0.2.2\n",
"unstructured: 0.14.4\n",
"\n",
Expand All @@ -788,6 +842,39 @@
"%load_ext watermark\n",
"%watermark -a 'Christy Bergman' -v -p pymilvus,llama_index,langchain,unstructured --conda"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"llama-index 0.10.44\n",
"llama-index-agent-openai 0.2.7\n",
"llama-index-cli 0.1.12\n",
"llama-index-core 0.10.44\n",
"llama-index-embeddings-huggingface 0.2.1\n",
"llama-index-embeddings-openai 0.1.10\n",
"llama-index-indices-managed-llama-cloud 0.1.6\n",
"llama-index-legacy 0.9.48\n",
"llama-index-llms-ollama 0.1.5\n",
"llama-index-llms-openai 0.1.22\n",
"llama-index-multi-modal-llms-openai 0.1.6\n",
"llama-index-program-openai 0.1.6\n",
"llama-index-question-gen-openai 0.1.3\n",
"llama-index-readers-file 0.1.23\n",
"llama-index-readers-llama-parse 0.1.4\n",
"llama-index-vector-stores-milvus 0.1.17\n"
]
}
],
"source": [
"# Check all llamaindex packages info, make sure they latest.\n",
"!pip list | grep llama-index"
]
}
],
"metadata": {
Expand Down

0 comments on commit 2a6fe60

Please sign in to comment.