OnurKerimoglu committed on
Commit
15895e5
·
1 Parent(s): 848c55f

added notebooks/similarity_search_with_chromadb.ipynb

Browse files
notebooks/similarity_search_with_chromadb.ipynb ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os, sys\n",
10
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
11
+ "import langchain\n",
12
+ "from langchain.document_loaders import PyPDFLoader\n",
13
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter\n",
14
+ "from langchain_community.vectorstores import Chroma\n",
15
+ "import shutil"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": null,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "# Settings\n",
25
+ "doc_fname = \"../docs/The hundred-page machine learning book.pdf\"\n",
26
+ "persist_directory = '../docs/chroma/'\n",
27
+ "collection_name = '100p_ML_book'\n",
28
+ "create_new_db = False\n",
29
+ "\n",
30
+ "chunk_size =1000\n",
31
+ "chunk_overlap = 200\n",
32
+ "embedding = HuggingFaceEmbeddings()"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "if create_new_db:\n",
42
+ " loader = PyPDFLoader(doc_fname)\n",
43
+ " pages = loader.load()\n",
44
+ "\n",
45
+ " r_text_splitter = RecursiveCharacterTextSplitter(\n",
46
+ " # separators=['\\n'],\n",
47
+ " # separators=[\"\\n\\n\"], #, \"\\n\", \"(?<=\\. )\"], # , \" \", \"\"],\n",
48
+ " # separators=[\"(?<=\\. )\"], # , \" \", \"\"],\n",
49
+ " chunk_size=chunk_size,\n",
50
+ " chunk_overlap=chunk_overlap,\n",
51
+ " #length_function=len\n",
52
+ " )\n",
53
+ " splits = r_text_splitter.split_documents(pages)\n",
54
+ " print(splits[1])\n",
55
+ " print(f\"len(pages): {len(pages)}, len(splits) = {len(splits)}\")"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "if create_new_db:\n",
65
+ " # remove any previously persisted store so the DB is rebuilt from scratch (like rm -rf on persist_directory)\n",
66
+ " shutil.rmtree(persist_directory, ignore_errors=True)\n",
67
+ " vectordb = Chroma.from_documents(\n",
68
+ " documents=splits,\n",
69
+ " embedding=embedding,\n",
70
+ " persist_directory=persist_directory,\n",
71
+ " collection_name=collection_name\n",
72
+ " )\n",
73
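+ " # NOTE(review): vectordb.persist() is left disabled; presumably Chroma writes to persist_directory on its own - confirm for the installed version\n",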
74
+ "else:\n",
75
+ " vectordb = Chroma(\n",
76
+ " collection_name=collection_name,\n",
77
+ " persist_directory=persist_directory,\n",
78
+ " embedding_function=embedding\n",
79
+ " )"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": null,
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "print(vectordb._collection.count())"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "markdown",
93
+ "metadata": {},
94
+ "source": [
95
+ "## Similarity Search"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "question = \"How does multi-label classification work?\""
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "docs = vectordb.similarity_search(question,k=3)"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": null,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "len(docs)"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "metadata": {},
129
+ "outputs": [],
130
+ "source": [
131
+ "docs[0].page_content"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": null,
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "for doc in docs:\n",
141
+ " print(doc.metadata)"
142
+ ]
143
+ }
144
+ ],
145
+ "metadata": {
146
+ "kernelspec": {
147
+ "display_name": "langchain_311",
148
+ "language": "python",
149
+ "name": "python3"
150
+ },
151
+ "language_info": {
152
+ "codemirror_mode": {
153
+ "name": "ipython",
154
+ "version": 3
155
+ },
156
+ "file_extension": ".py",
157
+ "mimetype": "text/x-python",
158
+ "name": "python",
159
+ "nbconvert_exporter": "python",
160
+ "pygments_lexer": "ipython3",
161
+ "version": "3.11.1"
162
+ }
163
+ },
164
+ "nbformat": 4,
165
+ "nbformat_minor": 2
166
+ }