# Retrieving from a Multi-Source Chroma Vectorstore 

## Imports and Settings

In [None]:
from datetime import datetime
import os, sys
import shutil
import chromadb
import langchain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [None]:
# Settings

# --- Locations -------------------------------------------------------------
# The PDFs and the persistent databases live in a `docs` folder that sits next
# to (not inside) the current working directory.
abscurdir = os.path.abspath(os.curdir)
docsdir = os.path.join(os.path.dirname(abscurdir), 'docs')

# File names (relative to `docsdir`) of the documents to index.
doc_fnames = [
    "The hundred-page machine learning book.pdf",
    "the-big-book-of-mlops-v10-072023 - Databricks.pdf",
    "How to Build a Career in AI - Andrew Ng.pdf",
]

# --- Database --------------------------------------------------------------
collection_name = 'MLbooks'
# When False, the database named by `existing_dbname` is opened instead of
# creating a new timestamped one.
create_new_db = False
existing_dbname = 'chroma_20241124_132314'

# --- Text splitting --------------------------------------------------------
chunk_size = 1000
chunk_overlap = 200

# --- Embeddings ------------------------------------------------------------
# Which side provides the embedding model and which side gets the adapter.
# embedding_strategy = 'Chroma2LC'
embedding_strategy = 'LC2Chroma'


## Create/Get Vector Store

### Embedding Function

In [None]:
# Define Adapters:

# For converting Chroma's embedding functions to LC embeddings:
from langchain_core.embeddings import Embeddings
from chromadb.api.types import EmbeddingFunction


class ChromaEmbeddingsAdapter(Embeddings):
    """Expose a Chroma embedding function through the LangChain ``Embeddings`` interface."""

    def __init__(self, ef: EmbeddingFunction) -> None:
        # Wrapped Chroma embedding function; it is always called with a list
        # of texts.
        self.ef = ef

    def embed_documents(self, texts):
        """Embed a batch of texts by delegating to the wrapped function."""
        embedded_batch = self.ef(texts)
        return embedded_batch

    def embed_query(self, query):
        """Embed a single query string.

        The wrapped function is batch-oriented, so the query is sent as a
        one-element batch and the first result is returned.
        """
        single_item_batch = self.ef([query])
        return single_item_batch[0]
 
# For converting LC embeddings to Chroma embedding functions:
from langchain_core.embeddings import Embeddings
from chromadb.api.types import EmbeddingFunction, Documents

class LangChainEmbeddingAdapter(EmbeddingFunction[Documents]):
    """Expose a LangChain ``Embeddings`` object as a Chroma embedding function."""

    def __init__(self, ef: Embeddings):
        # Wrapped LangChain embeddings object.
        self.ef = ef

    # The return annotation is a string on purpose: in this scope the name
    # `Embeddings` is the LangChain interface class, not a vector type, so it
    # must not be used to describe the returned value.
    def __call__(self, input: Documents) -> "list[list[float]]":
        """Embed a batch of documents for Chroma.

        Args:
            input: The texts to embed. The parameter name is kept as ``input``
                to match the signature of the Chroma base class.

        Returns:
            Whatever the wrapped object's ``embed_documents`` returns for
            ``input`` (per the LangChain interface, one vector per text).
        """
        # LC EFs also have embed_query but Chroma doesn't support that so we just use embed_documents
        # TODO: better type checking
        return self.ef.embed_documents(input)

In [None]:
# Build two views of one embedding model:
#   lc_embeddings - LangChain interface, passed to the LangChain vector store
#   chroma_ef     - Chroma interface, passed to the Chroma collection
# The strategy decides which library provides the model; the other view is
# produced by the matching adapter class.
if embedding_strategy == 'Chroma2LC':
    from chromadb.utils import embedding_functions
    # Imported before the commented alternative below so that line can be
    # enabled without further changes.
    from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
    chroma_ef = embedding_functions.DefaultEmbeddingFunction()
    # chroma_ef = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    lc_embeddings = ChromaEmbeddingsAdapter(chroma_ef)
elif embedding_strategy == 'LC2Chroma':
    from langchain_community.embeddings import HuggingFaceEmbeddings
    # from langchain_huggingface import HuggingFaceEmbeddings
    lc_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    chroma_ef = LangChainEmbeddingAdapter(lc_embeddings)
else:
    # Fail here with a clear message instead of a NameError further down when
    # `chroma_ef` / `lc_embeddings` are first used.
    raise ValueError(
        f"Unknown embedding_strategy {embedding_strategy!r}; "
        "expected 'Chroma2LC' or 'LC2Chroma'"
    )


### Creating/Getting a Collection from a Persistent Database

In [None]:
# Pick the database name: the configured existing one, or a fresh name
# stamped with the current local time.
if not create_new_db:
    dbname = existing_dbname
else:
    tstamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    dbname = f'chroma_{tstamp}'

# Databases are stored as sub-directories of `docsdir`.
persist_directory = os.path.join(docsdir, dbname)
if create_new_db:
    print(f'Creating new database at {persist_directory}')
else:
    print(f'Using existing database at {persist_directory}')

client = chromadb.PersistentClient(path=persist_directory)
print(f'Database has {client.count_collections()} collections')

if create_new_db:
    # Best-effort removal of a same-named collection: a failure is reported
    # and execution continues.
    try:
        client.delete_collection(name=collection_name)
    except Exception as exc:
        print(f'While attempting to delete collection {collection_name}, following exception was raised: {exc}')
    else:
        print(f'Collection {collection_name} deleted')
    # shutil.rmtree(persist_directory, ignore_errors=True)
    # print(f'Persistent database at {persist_directory} deleted')

# Open the collection, creating it if it does not exist yet, and attach the
# Chroma-side embedding function to it.
collection = client.get_or_create_collection(
    embedding_function=chroma_ef,
    name=collection_name,
)
print(f'Collection {collection_name} has {collection.count()} entries')

### Splitting and adding documents to collection

In [None]:
def docs_to_splits(doc_fname):
    """Load a PDF and split its pages into overlapping chunks.

    The chunk length and overlap come from the module-level ``chunk_size``
    and ``chunk_overlap`` settings. A short progress fragment with the page
    and chunk counts is printed without a trailing newline, so the caller can
    continue the same output line.

    Args:
        doc_fname: Path of the PDF file to load.

    Returns:
        The chunks produced by the splitter's ``split_documents`` for the
        loaded pages.
    """
    pages = PyPDFLoader(doc_fname).load()

    # The splitter's default separators are used; custom `separators=[...]`
    # lists were tried previously and left disabled.
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    splits = splitter.split_documents(pages)

    n_pages, n_splits = len(pages), len(splits)
    print(f" len(pages): {n_pages}, len(splits) = {n_splits}", end='..')

    return splits

In [None]:
# Index every configured document that is not in the collection yet.
print(f'Adding documents to collection {collection_name}:')
for doc_i, doc_fname in enumerate(doc_fnames):
    doc_path = os.path.join(docsdir, doc_fname)
    # A document counts as present when at least one stored chunk carries its
    # path as `source` metadata (presumably set by the PDF loader). One hit
    # is enough, so the lookup is limited to a single record.
    results = collection.get(
        where={"source": doc_path},
        include=["metadatas"],
        limit=1,
    )
    if results["ids"]:
        print(f' - {doc_fname}: already exists in collection')
        continue

    print(f' - {doc_fname}: splitting document..', end=' ')
    splits = docs_to_splits(doc_path)
    if not splits:
        # Nothing to store (e.g. no extractable text). Adding an empty batch
        # to the collection would fail, so report and move on.
        print(' no chunks produced, skipping.')
        continue

    # NOTE(review): ids encode the document's position in `doc_fnames`, so
    # reordering that list between runs against the same database can produce
    # ids that clash with already stored chunks - confirm before reordering.
    ids = [f'D{doc_i}S{split_i}' for split_i in range(len(splits))]
    print(' adding to collection..', end=' ')
    collection.add(
        ids=ids,
        documents=[split.page_content for split in splits],
        metadatas=[split.metadata for split in splits],
    )
    print(' done.')

### Creating Vector Store based on collection

In [None]:
# LangChain view of the collection opened above: same client and collection
# name, with the LangChain-side embeddings object.
vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=lc_embeddings,
    client=client,
)

In [None]:
# Print the entry count as seen through the vector store wrapper. This reads
# the wrapper's underscore-prefixed `_collection` attribute.
print(vectorstore._collection.count())

## Retrieval

In [None]:
# Question used by the similarity and MMR searches below. Alternative
# questions are kept commented out for quick switching.
# question = "How does multi-class classification work?"
# question = "How can I build skills in AI?"
# question = "What can go wrong when deploying a model?"
question = "How can machine learning models help a business?"

In [None]:
def display_retrieval_results(docs):
    """Print the text of every document, then the metadata of every document.

    Output is two sections, each with a header line followed by one
    ``doc-<index>: <value>`` line per document; the second header is preceded
    by a blank line.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``
            attributes. It is traversed once per section.
    """
    sections = (
        ('page contents:', lambda item: item.page_content),
        ('\nmeta_data:', lambda item: item.metadata),
    )
    for header, pick_field in sections:
        print(header)
        for position, item in enumerate(docs):
            print(f"doc-{position}: {pick_field(item)}")

### Similarity Search

In [None]:
# Plain similarity search: ask the vector store for k=5 results for the question.
docs = vectorstore.similarity_search(query=question, k=5)

In [None]:
# Print the page contents and metadata of the similarity-search results.
display_retrieval_results(docs)

### Maximum Marginal Relevance (MMR) Search

In [None]:
# Maximum-marginal-relevance search for the same question, asking for k=3 results.
docs_mmr = vectorstore.max_marginal_relevance_search(query=question, k=3)

In [None]:
# Print the page contents and metadata of the MMR search results.
display_retrieval_results(docs_mmr)

### Working with meta-data using self-query retriever 

In [None]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [None]:
# Collect the distinct `source` values stored in the collection. The result
# is interpolated into the text of `sources_str`, so it is sorted to keep
# that text identical from run to run (set iteration order is not stable
# across interpreter runs).
all_metadatas = collection.get(include=["metadatas"]).get('metadatas') or []
sources = sorted(
    {
        metadata['source']
        for metadata in all_metadatas
        # Skip entries with no metadata or no source rather than crashing or
        # listing a literal "None" as a source.
        if metadata and metadata.get('source')
    },
    key=str,
)
sources_fmt = [f"`{source}`" for source in sources]
sources_str = ", ".join(sources_fmt)
print(f"Distinct sources in collection: {sources_str}")


In [None]:
# Metadata fields the self-query retriever may filter on. The descriptions
# are handed to the retriever together with the LLM, so they describe the
# indexed material as it is: books (see `doc_fnames` and
# `document_content_description`), not lectures.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description=f"The book the chunk is from, should be one of {sources_str}",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the book",
        type="integer",
    ),
]

In [None]:
# from langchain.llms import OpenAI
# llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)

# Imported from `langchain_community` (already a dependency of this file)
# rather than the deprecated `langchain.llms` path.
from langchain_community.llms import Ollama
llm = Ollama(model="llama3.2:1b", temperature=0)

# Short description of what the indexed documents contain.
document_content_description = "ML books"

# Self-query retriever built from the LLM, the vector store, the content
# description and the filterable metadata fields.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=False,
)

In [None]:
# This question names a specific book - presumably to exercise the
# self-query retriever's filtering on the `source` metadata field.
question = "How can machine learning models help a business according to Andrew Ng's book?"

# docs = retriever.get_relevant_documents(question)
# display_retrieval_results(docs)
# The result is not assigned; it is left as the cell's final expression.
retriever.invoke(question)

## Compression Retriever

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [None]:
def pretty_print_docs(docs):
    """Print the text of each document under a numbered heading.

    Documents are numbered from 1 and separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects whose ``page_content`` attribute is a string.
    """
    divider = "\n" + "-" * 100 + "\n"
    rendered = [
        f"Document {number}:\n\n" + doc.page_content
        for number, doc in enumerate(docs, start=1)
    ]
    print(divider.join(rendered))

In [None]:
# Imported from `langchain_community` (already a dependency of this file)
# rather than the deprecated `langchain.llms` path. The LLM is created again
# here, presumably so this section can run without the self-query cells.
from langchain_community.llms import Ollama
llm = Ollama(model="llama3.2:1b", temperature=0)
# LLM-backed extractor, used below as the document compressor.
compressor = LLMChainExtractor.from_llm(llm)

In [None]:
# Wrap the vector store's default retriever so every retrieved document is
# passed through the LLM-based compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectorstore.as_retriever(),
    base_compressor=compressor,
)

In [None]:
question = "How can machine learning models help a business?"
# `invoke` is the current retriever entry point and is what this file
# already uses for the self-query retriever; `get_relevant_documents` is
# deprecated.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)

### Combining MMR and Compression

In [None]:
# Same compression wrapper as before, but the underlying retriever now uses
# the "mmr" search type. This rebinds `compression_retriever`.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectorstore.as_retriever(search_type="mmr"),
    base_compressor=compressor,
)

In [None]:
question = "How can machine learning models help a business?"
# `invoke` is the current retriever entry point and is what this file
# already uses for the self-query retriever; `get_relevant_documents` is
# deprecated.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)