{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Retrieving from a Multi-Source Chroma Vector Store"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports and Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "import os, sys\n",
    "import shutil\n",
    "import chromadb\n",
    "import langchain\n",
    "from langchain.document_loaders import PyPDFLoader\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter\n",
    "from langchain_community.vectorstores import Chroma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings\n",
    "abscurdir = os.path.abspath(os.curdir)\n",
    "docsdir = os.path.join(os.path.dirname(abscurdir), 'docs')\n",
    "doc_fnames = [\n",
    "    \"The hundred-page machine learning book.pdf\",\n",
    "    \"the-big-book-of-mlops-v10-072023 - Databricks.pdf\",\n",
    "    \"How to Build a Career in AI - Andrew Ng.pdf\"\n",
    "    ]\n",
    "collection_name = 'MLbooks'\n",
    "create_new_db = False\n",
    "existing_dbname = 'chroma_20241124_132314'\n",
    "\n",
    "chunk_size =1000\n",
    "chunk_overlap = 200\n",
    "\n",
    "# embedding_strategy = 'Chroma2LC'\n",
    "embedding_strategy = 'LC2Chroma'\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create/Get Vector Store"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Embedding Function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define Adapters:\n",
    "\n",
    "# For converting Chroma's embedding functions to LC embeddings:\n",
    "from langchain_core.embeddings import Embeddings\n",
    "from chromadb.api.types import EmbeddingFunction\n",
    "\n",
    "\n",
    "class ChromaEmbeddingsAdapter(Embeddings):\n",
    "    \"\"\"Expose a Chroma embedding function through LangChain's `Embeddings` interface.\"\"\"\n",
    "\n",
    "    def __init__(self, ef: EmbeddingFunction):\n",
    "        # Wrapped Chroma embedding function; it is called with a list of texts.\n",
    "        self.ef = ef\n",
    "\n",
    "    def embed_documents(self, texts):\n",
    "        \"\"\"Embed `texts` by passing them unchanged to the wrapped function.\"\"\"\n",
    "        embedded = self.ef(texts)\n",
    "        return embedded\n",
    "\n",
    "    def embed_query(self, query):\n",
    "        \"\"\"Embed one query: wrap it in a single-item list and return the first result.\"\"\"\n",
    "        batch = self.ef([query])\n",
    "        return batch[0]\n",
    "        \n",
    "# For converting LC embeddings to Chroma embedding functions:\n",
    "from langchain_core.embeddings import Embeddings\n",
    "from chromadb.api.types import EmbeddingFunction, Documents\n",
    "\n",
    "class LangChainEmbeddingAdapter(EmbeddingFunction[Documents]):\n",
    "    \"\"\"Expose a LangChain `Embeddings` object as a Chroma embedding function.\"\"\"\n",
    "\n",
    "    def __init__(self, ef: Embeddings):\n",
    "        # Wrapped LangChain embeddings object.\n",
    "        self.ef = ef\n",
    "\n",
    "    def __call__(self, input: Documents) -> list[list[float]]:\n",
    "        \"\"\"Embed the given documents with the wrapped object's `embed_documents`.\n",
    "\n",
    "        The return type is spelled out as a list of vectors: in this cell the name\n",
    "        `Embeddings` refers to the LangChain base class, not to a list of embeddings.\n",
    "        \"\"\"\n",
    "        # LC embeddings also offer embed_query, but this call receives a batch of\n",
    "        # documents, so embed_documents is used.\n",
    "        return self.ef.embed_documents(input)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build both views of the same embedding model: `chroma_ef` for the Chroma\n",
    "# collection and `lc_embeddings` for the LangChain vector store.\n",
    "if embedding_strategy == 'Chroma2LC':\n",
    "    # Start from a Chroma embedding function and adapt it for LangChain.\n",
    "    from chromadb.utils import embedding_functions\n",
    "    from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction\n",
    "    chroma_ef = embedding_functions.DefaultEmbeddingFunction()\n",
    "    # chroma_ef = SentenceTransformerEmbeddingFunction(model_name=\"all-MiniLM-L6-v2\")\n",
    "    lc_embeddings = ChromaEmbeddingsAdapter(chroma_ef)\n",
    "elif embedding_strategy == 'LC2Chroma':\n",
    "    # Start from LangChain embeddings and adapt them for Chroma.\n",
    "    from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "    # from langchain_huggingface import HuggingFaceEmbeddings\n",
    "    lc_embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
    "    chroma_ef = LangChainEmbeddingAdapter(lc_embeddings)\n",
    "else:\n",
    "    # Fail here instead of with a NameError on `chroma_ef`/`lc_embeddings` in a later cell.\n",
    "    raise ValueError(\n",
    "        f\"Unknown embedding_strategy {embedding_strategy!r}; expected 'Chroma2LC' or 'LC2Chroma'\"\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Creating/Getting a Collection from a Persistent Database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the database folder name: the configured existing one, or a fresh\n",
    "# timestamped one when `create_new_db` is set.\n",
    "if not create_new_db:\n",
    "    dbname = existing_dbname\n",
    "    print(f'Using existing database at {os.path.join(docsdir, dbname)}')\n",
    "else:\n",
    "    tstamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
    "    dbname = f'chroma_{tstamp}'\n",
    "    print(f'Creating new database at {os.path.join(docsdir, dbname)}')\n",
    "\n",
    "# Open the persistent client on that folder.\n",
    "persist_directory = os.path.join(docsdir, dbname)\n",
    "client = chromadb.PersistentClient(path=persist_directory)\n",
    "print(f'Database has {client.count_collections()} collections')\n",
    "\n",
    "if create_new_db:\n",
    "    # Best effort: try to drop a collection with this name; a failure is only reported.\n",
    "    try:\n",
    "        client.delete_collection(name=collection_name)\n",
    "        print(f'Collection {collection_name} deleted')\n",
    "    except Exception as e:\n",
    "        print(f'While attempting to delete collection {collection_name}, following exception was raised: {e}')\n",
    "\n",
    "# Fetch the collection (creating it if needed) with the Chroma-side embedding function.\n",
    "collection = client.get_or_create_collection(name=collection_name, embedding_function=chroma_ef)\n",
    "print(f'Collection {collection_name} has {collection.count()} entries')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Splitting and adding documents to collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def docs_to_splits(doc_fname, splitter=None):\n",
    "    \"\"\"Load a PDF and split its pages into chunks.\n",
    "\n",
    "    Args:\n",
    "        doc_fname: Path of the PDF file, passed to `PyPDFLoader`.\n",
    "        splitter: Optional object with a `split_documents(pages)` method. When\n",
    "            omitted, a `RecursiveCharacterTextSplitter` is built from the\n",
    "            notebook-level `chunk_size` and `chunk_overlap` settings.\n",
    "\n",
    "    Returns:\n",
    "        The chunks returned by the splitter's `split_documents`.\n",
    "    \"\"\"\n",
    "    pages = PyPDFLoader(doc_fname).load()\n",
    "\n",
    "    if splitter is None:\n",
    "        splitter = RecursiveCharacterTextSplitter(\n",
    "            chunk_size=chunk_size,\n",
    "            chunk_overlap=chunk_overlap,\n",
    "        )\n",
    "    splits = splitter.split_documents(pages)\n",
    "\n",
    "    # Progress report without a trailing newline, so the caller can continue the line.\n",
    "    print(f\" len(pages): {len(pages)}, len(splits) = {len(splits)}\", end='..')\n",
    "\n",
    "    return splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'Adding documents to collection {collection_name}:')\n",
    "for doc_fname in doc_fnames:\n",
    "    doc_path = os.path.join(docsdir, doc_fname)\n",
    "    # A document counts as present if at least one stored chunk has its path as\n",
    "    # `source` metadata, so a single matching entry is enough.\n",
    "    results = collection.get(\n",
    "        where={\"source\": doc_path},\n",
    "        limit=1,\n",
    "        include=[\"metadatas\"],\n",
    "    )\n",
    "    if len(results[\"ids\"]) > 0:\n",
    "        print(f' - {doc_fname}: already exists in collection')\n",
    "        continue\n",
    "\n",
    "    print(f' - {doc_fname}: splitting document..', end=' ')\n",
    "    splits = docs_to_splits(doc_path)\n",
    "    if not splits:\n",
    "        # No chunks were produced; skip instead of adding empty lists.\n",
    "        print(' no chunks produced, skipping.')\n",
    "        continue\n",
    "\n",
    "    # IDs are built from the file name rather than its position in `doc_fnames`,\n",
    "    # so reordering or editing that list cannot reuse IDs of another document.\n",
    "    ids = [f'{doc_fname}#S{split_i}' for split_i in range(len(splits))]\n",
    "    print(' adding to collection..', end=' ')\n",
    "    collection.add(\n",
    "        ids=ids,\n",
    "        documents=[split.page_content for split in splits],\n",
    "        metadatas=[split.metadata for split in splits],\n",
    "    )\n",
    "    print(' done.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Creating a Vector Store Based on the Collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# LangChain vector store over the same client and collection name, using the\n",
    "# LangChain-side embeddings object.\n",
    "vectorstore = Chroma(client=client, collection_name=collection_name, embedding_function=lc_embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: entry count of the collection held by the vector store\n",
    "# (read through its private `_collection` attribute).\n",
    "print(vectorstore._collection.count())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Question used by the retrieval cells below; the commented lines are alternatives to try.\n",
    "# question = \"How does multi-class classification work?\"\n",
    "# question = \"How can I build skills in AI?\"\n",
    "# question = \"What can go wrong when deploying a model?\"\n",
    "question = \"How can machine learning models help a business?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_retrieval_results(docs):\n",
    "    print('page contents:')\n",
    "    for i, doc in enumerate(docs):\n",
    "        print(f\"doc-{i}: {doc.page_content}\")\n",
    "    print('\\nmeta_data:')\n",
    "    for i, doc in enumerate(docs):\n",
    "       print(f\"doc-{i}: {doc.metadata}\")              "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Similarity Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Similarity search for the question, asking for 5 results.\n",
    "docs = vectorstore.similarity_search(question, k=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the text and metadata of the similarity-search results.\n",
    "display_retrieval_results(docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Maximum Marginal Relevance (MMR) Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Maximum-marginal-relevance search for the same question, asking for 3 results.\n",
    "docs_mmr = vectorstore.max_marginal_relevance_search(question, k=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the text and metadata of the MMR results.\n",
    "display_retrieval_results(docs_mmr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Working with Metadata Using a Self-Query Retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
    "from langchain.chains.query_constructor.base import AttributeInfo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the distinct `source` values stored in the collection's metadata.\n",
    "all_metadatas = collection.get(include=[\"metadatas\"]).get('metadatas') or []\n",
    "# Sorted so the text built from it (and reused in the field description below)\n",
    "# is the same on every run; entries without a `source` are left out.\n",
    "sources = sorted({\n",
    "    metadata['source']\n",
    "    for metadata in all_metadatas\n",
    "    if metadata and metadata.get('source') is not None\n",
    "})\n",
    "sources_fmt = [f\"`{source}`\" for source in sources]\n",
    "sources_str = \", \".join(sources_fmt)\n",
    "print(f\"Distinct sources in collection: {sources_str}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Descriptions of the metadata fields, handed to the self-query retriever.\n",
    "# The wording refers to books/PDF files because that is what the collection holds.\n",
    "metadata_field_info = [\n",
    "    AttributeInfo(\n",
    "        name=\"source\",\n",
    "        description=f\"The PDF file (book) the chunk is from, should be one of {sources_str}\",\n",
    "        type=\"string\",\n",
    "    ),\n",
    "    AttributeInfo(\n",
    "        name=\"page\",\n",
    "        description=\"The page of the book the chunk is from\",\n",
    "        type=\"integer\",\n",
    "    ),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative LLM (OpenAI), kept for reference:\n",
    "# from langchain.llms import OpenAI\n",
    "# llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)\n",
    "\n",
    "# LLM used by the self-query retriever: an Ollama model with temperature 0.\n",
    "from langchain.llms import Ollama\n",
    "llm = Ollama(model=\"llama3.2:1b\", temperature=0)\n",
    "\n",
    "# Short description of the stored documents, passed to the retriever.\n",
    "document_content_description = \"ML books\"\n",
    "\n",
    "# Build the retriever from the LLM, the vector store and the metadata field descriptions.\n",
    "retriever = SelfQueryRetriever.from_llm(\n",
    "    llm,\n",
    "    vectorstore,\n",
    "    document_content_description,\n",
    "    metadata_field_info=metadata_field_info,\n",
    "    verbose=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Question that names a specific book by its author.\n",
    "question = \"How can machine learning models help a business according to Andrew Ng's book?\"\n",
    "\n",
    "# The retriever's result is shown as the cell's output value.\n",
    "# To print it with the helper instead: display_retrieval_results(retriever.invoke(question))\n",
    "retriever.invoke(question)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compression Retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.retrievers import ContextualCompressionRetriever\n",
    "from langchain.retrievers.document_compressors import LLMChainExtractor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pretty_print_docs(docs):\n",
    "    print(f\"\\n{'-' * 100}\\n\".join([f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same Ollama model and temperature as used for the self-query retriever above.\n",
    "from langchain.llms import Ollama\n",
    "llm = Ollama(model=\"llama3.2:1b\", temperature=0)\n",
    "# Document compressor built from that LLM.\n",
    "compressor = LLMChainExtractor.from_llm(llm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair the vector store's default retriever with the LLM-based compressor.\n",
    "compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=vectorstore.as_retriever())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"How can machine learning models help a business?\"\n",
    "# `invoke` replaces the deprecated `get_relevant_documents` and matches the\n",
    "# call style used with the self-query retriever earlier in this notebook.\n",
    "compressed_docs = compression_retriever.invoke(question)\n",
    "pretty_print_docs(compressed_docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combining MMR and Compression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same compressor, but the base retriever is created with search_type \"mmr\".\n",
    "# Note: this rebinds `compression_retriever`.\n",
    "compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=vectorstore.as_retriever(search_type=\"mmr\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"How can machine learning models help a business?\"\n",
    "# `invoke` replaces the deprecated `get_relevant_documents` and matches the\n",
    "# call style used with the self-query retriever earlier in this notebook.\n",
    "compressed_docs = compression_retriever.invoke(question)\n",
    "pretty_print_docs(compressed_docs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "langchain_311",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}