{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Retrieving from a Multi-Source Chroma Vectorstore\n",
    "\n",
    "Builds (or re-opens) a persistent Chroma collection from several PDF books and compares retrieval strategies over it: similarity search, maximum marginal relevance (MMR), a metadata-aware self-query retriever, and LLM-based contextual compression."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports and Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "import os\n",
    "import sys\n",
    "import shutil\n",
    "\n",
    "import chromadb\n",
    "from chromadb.api.types import Documents, EmbeddingFunction\n",
    "from chromadb.api.types import Embeddings as ChromaEmbeddings\n",
    "import langchain\n",
    "from langchain.chains.query_constructor.base import AttributeInfo\n",
    "from langchain.retrievers import ContextualCompressionRetriever\n",
    "from langchain.retrievers.document_compressors import LLMChainExtractor\n",
    "from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter\n",
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "from langchain_community.llms import Ollama\n",
    "from langchain_community.vectorstores import Chroma\n",
    "from langchain_core.embeddings import Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings\n",
    "abscurdir = os.path.abspath(os.curdir)\n",
    "docsdir = os.path.join(os.path.dirname(abscurdir), 'docs')\n",
    "doc_fnames = [\n",
    "    \"The hundred-page machine learning book.pdf\",\n",
    "    \"the-big-book-of-mlops-v10-072023 - Databricks.pdf\",\n",
    "    \"How to Build a Career in AI - Andrew Ng.pdf\",\n",
    "]\n",
    "collection_name = 'MLbooks'\n",
    "create_new_db = False  # True: start a fresh, timestamped database directory\n",
    "existing_dbname = 'chroma_20241124_132314'  # opened when create_new_db is False\n",
    "\n",
    "# Passed to RecursiveCharacterTextSplitter in docs_to_splits\n",
    "chunk_size = 1000\n",
    "chunk_overlap = 200\n",
    "\n",
    "# embedding_strategy = 'Chroma2LC'\n",
    "embedding_strategy = 'LC2Chroma'\n",
    "\n",
    "# Ollama model used by the self-query and compression retrievers\n",
    "llm_model = \"llama3.2:1b\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create/Get Vector Store"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Embedding Function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adapters between the Chroma and LangChain embedding interfaces\n",
    "\n",
    "class ChromaEmbeddingsAdapter(Embeddings):\n",
    "    \"\"\"Expose a Chroma embedding function through the LangChain `Embeddings` interface.\"\"\"\n",
    "\n",
    "    def __init__(self, ef: EmbeddingFunction):\n",
    "        self.ef = ef\n",
    "\n",
    "    def embed_documents(self, texts):\n",
    "        # Chroma embedding functions may return numpy arrays (version dependent);\n",
    "        # LangChain expects plain lists of floats.\n",
    "        return [[float(x) for x in vector] for vector in self.ef(texts)]\n",
    "\n",
    "    def embed_query(self, query):\n",
    "        return self.embed_documents([query])[0]\n",
    "\n",
    "\n",
    "class LangChainEmbeddingAdapter(EmbeddingFunction[Documents]):\n",
    "    \"\"\"Expose LangChain `Embeddings` as a Chroma embedding function.\"\"\"\n",
    "\n",
    "    def __init__(self, ef: Embeddings):\n",
    "        self.ef = ef\n",
    "\n",
    "    def __call__(self, input: Documents) -> ChromaEmbeddings:\n",
    "        # LangChain embeddings also offer embed_query, but Chroma only calls the\n",
    "        # function with batches of documents, so embed_documents is used.\n",
    "        # TODO: better type checking\n",
    "        return self.ef.embed_documents(input)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strategy-specific imports are kept next to their use.\n",
    "if embedding_strategy == 'Chroma2LC':\n",
    "    # Embed with Chroma's function and wrap it for LangChain.\n",
    "    from chromadb.utils import embedding_functions\n",
    "    from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction\n",
    "    chroma_ef = embedding_functions.DefaultEmbeddingFunction()\n",
    "    # chroma_ef = SentenceTransformerEmbeddingFunction(model_name=\"all-MiniLM-L6-v2\")\n",
    "    lc_embeddings = ChromaEmbeddingsAdapter(chroma_ef)\n",
    "elif embedding_strategy == 'LC2Chroma':\n",
    "    # Embed with LangChain's model and wrap it for Chroma.\n",
    "    from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "    # from langchain_huggingface import HuggingFaceEmbeddings\n",
    "    lc_embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
    "    chroma_ef = LangChainEmbeddingAdapter(lc_embeddings)\n",
    "else:\n",
    "    # Without this branch the failure would surface later as a NameError.\n",
    "    raise ValueError(\n",
    "        f\"Unknown embedding_strategy {embedding_strategy!r}; expected 'Chroma2LC' or 'LC2Chroma'\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Creating/Getting a Collection from a Persistent Database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if create_new_db:\n",
    "    tstamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
    "    dbname = f'chroma_{tstamp}'\n",
    "    print(f'Creating new database at {os.path.join(docsdir, dbname)}')\n",
    "else:\n",
    "    dbname = existing_dbname\n",
    "    print(f'Using existing database at {os.path.join(docsdir, dbname)}')\n",
    "\n",
    "persist_directory = os.path.join(docsdir, dbname)\n",
    "if not create_new_db and not os.path.isdir(persist_directory):\n",
    "    # A mistyped existing_dbname would otherwise go unnoticed until every document is re-embedded.\n",
    "    print(f'WARNING: {persist_directory} does not exist; an empty database will be created there')\n",
    "\n",
    "client = chromadb.PersistentClient(path=persist_directory)\n",
    "print(f'Database has {client.count_collections()} collections')\n",
    "\n",
    "if create_new_db:\n",
    "    # Best effort: drop a stale collection of the same name. A failure\n",
    "    # (e.g. the collection does not exist yet) is only reported.\n",
    "    try:\n",
    "        client.delete_collection(name=collection_name)\n",
    "        print(f'Collection {collection_name} deleted')\n",
    "    except Exception as e:\n",
    "        print(f'While attempting to delete collection {collection_name}, following exception was raised: {e}')\n",
    "\n",
    "collection = client.get_or_create_collection(\n",
    "    name=collection_name,\n",
    "    embedding_function=chroma_ef,\n",
    ")\n",
    "print(f'Collection {collection_name} has {collection.count()} entries')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Splitting and adding documents to collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def docs_to_splits(doc_fname, chunk_size=chunk_size, chunk_overlap=chunk_overlap):\n",
    "    \"\"\"Load a PDF and split its pages into overlapping text chunks.\n",
    "\n",
    "    Args:\n",
    "        doc_fname: Path of the PDF file to load.\n",
    "        chunk_size: Chunk size passed to the splitter; defaults to the value in the settings cell.\n",
    "        chunk_overlap: Overlap passed to the splitter; defaults to the value in the settings cell.\n",
    "\n",
    "    Returns:\n",
    "        The list of chunk documents produced by the splitter.\n",
    "    \"\"\"\n",
    "    pages = PyPDFLoader(doc_fname).load()\n",
    "\n",
    "    text_splitter = RecursiveCharacterTextSplitter(\n",
    "        chunk_size=chunk_size,\n",
    "        chunk_overlap=chunk_overlap,\n",
    "    )\n",
    "    splits = text_splitter.split_documents(pages)\n",
    "    print(f\" len(pages): {len(pages)}, len(splits) = {len(splits)}\", end='..')\n",
    "\n",
    "    return splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'Adding documents to collection {collection_name}:')\n",
    "for doc_fname in doc_fnames:\n",
    "    doc_path = os.path.join(docsdir, doc_fname)\n",
    "    # A document counts as present when at least one chunk carries its path as `source`.\n",
    "    existing = collection.get(\n",
    "        where={\"source\": doc_path},\n",
    "        limit=1,\n",
    "        include=[\"metadatas\"],\n",
    "    )\n",
    "    if len(existing[\"ids\"]) > 0:\n",
    "        print(f' - {doc_fname}: already exists in collection')\n",
    "        continue\n",
    "\n",
    "    print(f' - {doc_fname}: splitting document..', end=' ')\n",
    "    splits = docs_to_splits(doc_path)\n",
    "    if not splits:\n",
    "        # Nothing to embed (e.g. a PDF without extractable text); skip the empty add.\n",
    "        print(' no text chunks found, skipped.')\n",
    "        continue\n",
    "\n",
    "    # IDs are derived from the file name, not from the position in doc_fnames,\n",
    "    # so reordering or extending the list cannot collide with stored chunks.\n",
    "    ids = [f'{doc_fname}#S{split_i}' for split_i in range(len(splits))]\n",
    "    print(' adding to collection..', end=' ')\n",
    "    collection.add(\n",
    "        ids=ids,\n",
    "        documents=[split.page_content for split in splits],\n",
    "        metadatas=[split.metadata for split in splits],\n",
    "    )\n",
    "    print(' done.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Creating Vector Store based on collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorstore = Chroma(\n",
    "    client=client,\n",
    "    collection_name=collection_name,\n",
    "    embedding_function=lc_embeddings,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The LangChain wrapper should see exactly the chunks stored in the collection.\n",
    "assert vectorstore._collection.count() == collection.count()\n",
    "print(vectorstore._collection.count())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# question = \"How does multi-class classification work?\"\n",
    "# question = \"How can I build skills in AI?\"\n",
    "# question = \"What can go wrong when deploying a model?\"\n",
    "question = \"How can machine learning models help a business?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_retrieval_results(docs):\n",
    "    \"\"\"Print the page content of every retrieved document, then every document's metadata.\"\"\"\n",
    "    print('page contents:')\n",
    "    for i, doc in enumerate(docs):\n",
    "        print(f\"doc-{i}: {doc.page_content}\")\n",
    "    print('\\nmeta_data:')\n",
    "    for i, doc in enumerate(docs):\n",
    "        print(f\"doc-{i}: {doc.metadata}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Similarity Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = vectorstore.similarity_search(question, k=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "display_retrieval_results(docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Maximum Marginal Relevance (MMR) Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs_mmr = vectorstore.max_marginal_relevance_search(question, k=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "display_retrieval_results(docs_mmr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Working with meta-data using self-query retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_metadatas = collection.get(include=[\"metadatas\"]).get('metadatas')\n",
    "# Sorted so that the attribute description built from it is identical on every run.\n",
    "sources = sorted({m['source'] for m in all_metadatas if m and m.get('source')})\n",
    "sources_fmt = [f\"`{source}`\" for source in sources]\n",
    "sources_str = \", \".join(sources_fmt)\n",
    "print(f\"Distinct sources in collection: {sources_str}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# These descriptions are shown to the LLM that builds the metadata filter.\n",
    "metadata_field_info = [\n",
    "    AttributeInfo(\n",
    "        name=\"source\",\n",
    "        description=f\"The PDF file the chunk is from, should be one of {sources_str}\",\n",
    "        type=\"string\",\n",
    "    ),\n",
    "    AttributeInfo(\n",
    "        name=\"page\",\n",
    "        description=\"The page of the PDF file the chunk is from\",\n",
    "        type=\"integer\",\n",
    "    ),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = Ollama(model=llm_model, temperature=0)\n",
    "\n",
    "document_content_description = \"ML books\"\n",
    "\n",
    "retriever = SelfQueryRetriever.from_llm(\n",
    "    llm,\n",
    "    vectorstore,\n",
    "    document_content_description,\n",
    "    metadata_field_info=metadata_field_info,\n",
    "    verbose=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"How can machine learning models help a business according to Andrew Ng's book?\"\n",
    "\n",
    "docs_self_query = retriever.invoke(question)\n",
    "display_retrieval_results(docs_self_query)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compression Retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pretty_print_docs(docs):\n",
    "    \"\"\"Print each document's content, numbered from 1 and separated by a dashed rule.\"\"\"\n",
    "    separator = f\"\\n{'-' * 100}\\n\"\n",
    "    print(separator.join(f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuses the `llm` created for the self-query retriever.\n",
    "compressor = LLMChainExtractor.from_llm(llm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "compression_retriever = ContextualCompressionRetriever(\n",
    "    base_compressor=compressor,\n",
    "    base_retriever=vectorstore.as_retriever(),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"How can machine learning models help a business?\"\n",
    "compressed_docs = compression_retriever.invoke(question)\n",
    "pretty_print_docs(compressed_docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combining MMR and Compression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mmr_compression_retriever = ContextualCompressionRetriever(\n",
    "    base_compressor=compressor,\n",
    "    base_retriever=vectorstore.as_retriever(search_type=\"mmr\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"How can machine learning models help a business?\"\n",
    "compressed_docs_mmr = mmr_compression_retriever.invoke(question)\n",
    "pretty_print_docs(compressed_docs_mmr)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "langchain_311",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}