adi-123 committed
Commit 7131533 · verified · 1 Parent(s): 1651e22

Update utils.py

Files changed (1): utils.py (+50 -166)
utils.py CHANGED
@@ -1,121 +1,44 @@
  import os
- from typing import List, IO, Tuple, Dict, Any
+ from typing import List, IO, Tuple
  from PyPDF2 import PdfReader
  from docx import Document
+ from dotenv import load_dotenv
+ import streamlit as st
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_google_genai import (
-     GoogleGenerativeAIEmbeddings,
-     ChatGoogleGenerativeAI,
- )
+ from langchain_together.embeddings import TogetherEmbeddings
+ from langchain_together.chat_models import ChatTogether
  from langchain_community.vectorstores import FAISS
  from langchain.prompts import PromptTemplate
  from langchain.schema import Document as LangchainDocument
- from dotenv import load_dotenv
- import google.generativeai as genai
- import streamlit as st
- import tempfile

- # Load environment variables and configure Google API
+ # Load environment variables
  load_dotenv()
- GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
- genai.configure(api_key=GOOGLE_API_KEY)
+ if not os.getenv("TOGETHER_API_KEY"):
+     os.environ["TOGETHER_API_KEY"] = input("Enter TOGETHER_API_KEY: ")

  def get_pdf_text(pdf_docs: List[IO[bytes]]) -> str:
-     """
-     Extract text content from a list of PDF files.
-
-     Args:
-         pdf_docs (List[IO[bytes]]): List of uploaded PDF files from Streamlit's file uploader.
-
-     Returns:
-         str: A single string containing concatenated text extracted from all PDFs.
-     """
-     text = ""
-     for pdf in pdf_docs:
-         pdf_reader = PdfReader(pdf)
-         for page in pdf_reader.pages:
-             page_text = page.extract_text()
-             if page_text:
-                 text += page_text + "\n"
-     return text
+     # unchanged...
+     ...

  def get_docx_text(docx_docs: List[IO[bytes]]) -> str:
-     """
-     Extract text content from a list of Word documents.
-
-     Args:
-         docx_docs (List[IO[bytes]]): List of uploaded Word files from Streamlit's file uploader.
-
-     Returns:
-         str: A single string containing concatenated text extracted from all Word documents.
-     """
-     text = ""
-     for docx in docx_docs:
-         # Create a temporary file to handle the uploaded file
-         with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
-             try:
-                 # Write the uploaded file content to the temporary file
-                 temp_file.write(docx.getvalue())
-                 temp_file.flush()
-
-                 # Open the document and extract text from paragraphs
-                 doc = Document(temp_file.name)
-                 doc_text = []
-                 for paragraph in doc.paragraphs:
-                     doc_text.append(paragraph.text)
-
-                 text += '\n'.join(doc_text) + "\n"
-
-             except Exception as e:
-                 st.warning(f"Warning: Could not process document {docx.name}: {str(e)}")
-                 continue
-             finally:
-                 # Clean up the temporary file
-                 try:
-                     os.unlink(temp_file.name)
-                 except Exception:
-                     pass
-
-     return text
+     # unchanged...
+     ...

  def get_text_chunks(text: str) -> List[str]:
-     """
-     Split text into manageable chunks for processing.
-
-     Args:
-         text (str): The raw text to split.
-
-     Returns:
-         List[str]: A list of text chunks.
-     """
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=1000,  # Reduced chunk size to match the working example
-         chunk_overlap=200  # Adjusted overlap to match the working example
-     )
-     return text_splitter.split_text(text)
+     return RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200
+     ).split_text(text)

  def get_vector_store(text_chunks: List[str]) -> None:
-     """
-     Create and save a FAISS vector store from text chunks.
-
-     Args:
-         text_chunks (List[str]): List of text chunks.
-
-     Returns:
-         None
-     """
-     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+     embeddings = TogetherEmbeddings(
+         model="togethercomputer/m2-bert-80M-8k-retrieval"
+     )
      documents = [LangchainDocument(page_content=chunk) for chunk in text_chunks]
      vector_store = FAISS.from_documents(documents, embedding=embeddings)
      vector_store.save_local("faiss_index")

- def get_conversational_chain() -> Tuple[ChatGoogleGenerativeAI, PromptTemplate]:
-     """
-     Initialize the conversational AI model and prompt template.
-
-     Returns:
-         Tuple[ChatGoogleGenerativeAI, PromptTemplate]: Model and prompt template.
-     """
+ def get_conversational_chain() -> Tuple[ChatTogether, PromptTemplate]:
      prompt_template = """
      As a professional assistant, provide a detailed and formally written answer to the question using the provided context.
      Ensure that the response is professionally formatted and avoids informal language.
@@ -128,103 +51,64 @@ def get_conversational_chain() -> Tuple[ChatGoogleGenerativeAI, PromptTemplate]:

      Answer:
      """
-
-     model = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.3)
+     llm = ChatTogether(
+         model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+         temperature=0.3,
+         max_tokens=None
+     )
      prompt = PromptTemplate(
          template=prompt_template,
          input_variables=["context", "question"]
      )
-     return model, prompt
-
+     return llm, prompt

  def self_assess(question: str) -> str:
-     """
-     Determine whether the AI can answer the question directly or needs to search the documents.
-
-     Args:
-         question (str): The user's question.
-
-     Returns:
-         str: The AI's response, which is either the direct answer or 'NEED_RETRIEVAL' if document search is needed.
-     """
      assessment_prompt = [
-         {
-             "role": "system",
-             "content": "You are an expert assistant who provides professionally formatted and formally written answers.",
-         },
-         {
-             "role": "user",
-             "content": f"""
-             If you are confident in answering the following question based on your existing knowledge,
-             please provide a detailed and formally written answer directly. If you are not confident or require additional information to answer accurately,
-             please respond with 'NEED_RETRIEVAL'.
-
-             Question: {question}
-             """,
-         },
-     ]
-     model = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.3)
-     response = model.invoke(assessment_prompt)
-     return response.content.strip()  # Removed .upper()
-
-
- def process_docs_for_query(docs: List[Document], question: str) -> str:
-     """
-     Process documents to generate an answer to the user's question.
-
-     Args:
-         docs (List[Document]): Relevant documents retrieved from the vector store.
-         question (str): The user's question.
-
-     Returns:
-         str: The AI-generated answer based on the documents.
-     """
+         {"role": "system", "content": "You are an expert assistant who provides professionally formatted and formally written answers."},
+         {"role": "user", "content": f"""If you are confident in answering the following question based on your existing knowledge, please provide a detailed and formally written answer directly. If you are not confident or require additional information to answer accurately, please respond with 'NEED_RETRIEVAL'.
+
+ Question: {question}"""}
+     ]
+     llm = ChatTogether(
+         model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+         temperature=0.3
+     )
+     response = llm.invoke(assessment_prompt)
+     return response.content.strip()
+
+ def process_docs_for_query(docs: List[LangchainDocument], question: str) -> str:
      if not docs:
          return "I apologize, but I couldn't find any relevant information in the provided documents to answer your question."

-     context = "\n\n".join([doc.page_content for doc in docs])
-     model, prompt = get_conversational_chain()
-     formatted_prompt = prompt.format(context=context, question=question)
-     response = model.invoke(formatted_prompt)
+     context = "\n\n".join(doc.page_content for doc in docs)
+     llm, prompt = get_conversational_chain()
+     formatted = prompt.format(context=context, question=question)
+     response = llm.invoke(formatted)
      return response.content

-
  def user_input(user_question: str) -> None:
-     """
-     Handle user input, decide whether to search documents or answer directly, and display the response.
-
-     Args:
-         user_question (str): The question entered by the user.
-
-     Returns:
-         None
-     """
      assessment = self_assess(user_question)

-     # Display source notification
      if assessment.strip().upper() == "NEED_RETRIEVAL":
          st.info("🔍 Searching through your uploaded documents for the answer...")
          need_retrieval = True
      else:
-         need_retrieval = False
          st.info("💡 Answering based on AI's built-in knowledge...")
+         need_retrieval = False

      try:
          if need_retrieval:
-             embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-             vector_store = FAISS.load_local(
-                 "faiss_index", embeddings, allow_dangerous_deserialization=True
+             embeddings = TogetherEmbeddings(
+                 model="togethercomputer/m2-bert-80M-8k-retrieval"
              )
+             vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
              docs = vector_store.similarity_search(user_question)
-             response = process_docs_for_query(docs, user_question)
+             answer = process_docs_for_query(docs, user_question)
          else:
-             response = assessment
+             answer = assessment

-         # Display the response
          st.markdown("### Answer")
-         st.markdown(f"{response}")
+         st.markdown(answer)

-     except Exception:
-         st.error(
-             "⚠️ An error occurred while processing your question. Please make sure you've uploaded and processed your documents first."
-         )
+     except Exception as e:
+         st.error(f"⚠️ An error occurred: {e}")
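For reference, a minimal sketch of a Streamlit driver that exercises the updated utils.py end to end, assuming the stubbed extraction helpers ("# unchanged...") keep their previous implementations. The app.py filename, widget labels, and upload-routing logic are illustrative assumptions, not part of this commit:

# app.py -- hypothetical driver for the updated utils.py (not part of this commit)
import streamlit as st

from utils import (
    get_pdf_text,
    get_docx_text,
    get_text_chunks,
    get_vector_store,
    user_input,
)

st.set_page_config(page_title="Chat with your documents")
st.header("Chat with your documents")

with st.sidebar:
    uploads = st.file_uploader(
        "Upload PDF or Word files", type=["pdf", "docx"], accept_multiple_files=True
    )
    if st.button("Process") and uploads:
        # Route each upload to the matching extractor, then chunk and index.
        pdfs = [f for f in uploads if f.name.lower().endswith(".pdf")]
        docxs = [f for f in uploads if f.name.lower().endswith(".docx")]
        raw_text = get_pdf_text(pdfs) + get_docx_text(docxs)
        chunks = get_text_chunks(raw_text)
        get_vector_store(chunks)  # writes the "faiss_index" folder that user_input() loads
        st.success("Documents indexed.")

question = st.text_input("Ask a question about your documents")
if question:
    # self_assess() runs first inside user_input(), which then either answers
    # directly or retrieves from the FAISS index and answers from the documents.
    user_input(question)

Since the new utils.py resolves TOGETHER_API_KEY at import time (from .env via load_dotenv(), else an interactive prompt), the driver needs no key handling of its own.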