Spaces:

adi-123
/

Docosphere

Running

App Files Files Community

adi-123 committed on Jul 25

Commit

e4a6388

verified ·

1 Parent(s): 44d3012

Update utils.py

Browse files

Files changed (1) hide show

utils.py +49 -21

utils.py CHANGED Viewed

@@ -2,22 +2,22 @@ import os
 import tempfile
 import streamlit as st
 from typing import List, IO, Tuple
 from PyPDF2 import PdfReader
 from docx import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import FAISS
-from langchain.prompts import PromptTemplate
 from langchain.schema import Document as LangchainDocument
-from langchain_together.embeddings import TogetherEmbeddings
 from langchain_together.chat_models import ChatTogether
-from dotenv import load_dotenv
-# Load from .env if available
 load_dotenv()
 def get_together_api_key() -> str:
     """
-    Retrieves the Together API key from environment or Streamlit secrets.
     """
     key = os.getenv("TOGETHER_API_KEY")
     if not key:
@@ -26,10 +26,13 @@ def get_together_api_key() -> str:
         except Exception:
             pass
     if not key:
-        raise EnvironmentError("TOGETHER_API_KEY not found. Set in env or Hugging Face secrets.")
     return key
 def get_pdf_text(pdf_docs: List[IO[bytes]]) -> str:
     text = ""
     for pdf in pdf_docs:
         pdf_reader = PdfReader(pdf)
@@ -40,15 +43,18 @@ def get_pdf_text(pdf_docs: List[IO[bytes]]) -> str:
     return text
 def get_docx_text(docx_docs: List[IO[bytes]]) -> str:
     text = ""
     for docx in docx_docs:
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
             try:
                 temp_file.write(docx.getvalue())
                 temp_file.flush()
                 doc = Document(temp_file.name)
                 doc_text = [p.text for p in doc.paragraphs]
-                text += '\n'.join(doc_text) + "\n"
             except Exception as e:
                 st.warning(f"Warning: Could not process document {docx.name}: {str(e)}")
             finally:
@@ -59,20 +65,34 @@ def get_docx_text(docx_docs: List[IO[bytes]]) -> str:
     return text
 def get_text_chunks(text: str) -> List[str]:
     """
     Break `text` into overlapping chunks.

     A RecursiveCharacterTextSplitter configured with chunk_size=1000 and
     chunk_overlap=200 does the work; its split_text() result is returned
     unchanged.
     """
     return RecursiveCharacterTextSplitter(
         chunk_size=1000,
         chunk_overlap=200,
     ).split_text(text)
 def get_vector_store(text_chunks: List[str]) -> None:
     """
     Index the given chunks in FAISS and save the index under "faiss_index".

     Each chunk is wrapped in a LangchainDocument, embedded through
     TogetherEmbeddings (model "togethercomputer/m2-bert-80M-8k-retrieval"),
     and the resulting store is written with save_local().
     """
     embedder = TogetherEmbeddings(
         model="togethercomputer/m2-bert-80M-8k-retrieval",
         api_key=get_together_api_key(),
     )
     wrapped_chunks = []
     for piece in text_chunks:
         wrapped_chunks.append(LangchainDocument(page_content=piece))
     FAISS.from_documents(wrapped_chunks, embedding=embedder).save_local("faiss_index")
 def get_conversational_chain() -> Tuple[ChatTogether, PromptTemplate]:
     api_key = get_together_api_key()
     llm = ChatTogether(
         model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
@@ -98,6 +118,9 @@ def get_conversational_chain() -> Tuple[ChatTogether, PromptTemplate]:
     return llm, prompt
 def self_assess(question: str) -> str:
     api_key = get_together_api_key()
     llm = ChatTogether(
         model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
@@ -124,15 +147,22 @@ def self_assess(question: str) -> str:
     return response.content.strip()
 def process_docs_for_query(docs: List[LangchainDocument], question: str) -> str:
     """
     Answer `question` from the retrieved documents.

     When `docs` is empty (or otherwise falsy) a fixed fallback sentence is
     returned. Otherwise the page contents are joined with blank lines, the
     prompt from get_conversational_chain() is formatted with that context
     and the question, and the content of the LLM response is returned.
     """
     no_match_reply = "I couldn't find relevant information in your uploaded documents to answer that question."
     if not docs:
         return no_match_reply
     passages = []
     for doc in docs:
         passages.append(doc.page_content)
     context = "\n\n".join(passages)
     llm, prompt = get_conversational_chain()
     return llm.invoke(prompt.format(context=context, question=question)).content
 def user_input(user_question: str) -> None:
     assessment = self_assess(user_question)
     if assessment.strip().upper() == "NEED_RETRIEVAL":
@@ -145,11 +175,8 @@ def user_input(user_question: str) -> None:
     try:
         if need_retrieval:
             api_key = get_together_api_key()
-            embeddings = TogetherEmbeddings(
-                model="togethercomputer/m2-bert-80M-8k-retrieval",
-                api_key=api_key
-            )
-            vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
             docs = vector_store.similarity_search(user_question)
             response = process_docs_for_query(docs, user_question)
         else:
@@ -157,5 +184,6 @@ def user_input(user_question: str) -> None:
         st.markdown("### Answer")
         st.markdown(response)
     except Exception as e:
-        st.error(f"⚠️ An error occurred: {e}")

 import tempfile
 import streamlit as st
 from typing import List, IO, Tuple
+from dotenv import load_dotenv
 from PyPDF2 import PdfReader
 from docx import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.schema import Document as LangchainDocument
+from langchain_community.vectorstores import FAISS
 from langchain_together.chat_models import ChatTogether
+from langchain.prompts import PromptTemplate
+from together import Together
+# Load environment variables
 load_dotenv()
 def get_together_api_key() -> str:
     """
+    Retrieves the Together API key from environment variables or Streamlit secrets.
     """
     key = os.getenv("TOGETHER_API_KEY")
     if not key:
         except Exception:
             pass
     if not key:
+        raise EnvironmentError("TOGETHER_API_KEY not found in env or Hugging Face secrets.")
     return key
 def get_pdf_text(pdf_docs: List[IO[bytes]]) -> str:
+    """
+    Extract text content from a list of PDF files.
+    """
     text = ""
     for pdf in pdf_docs:
         pdf_reader = PdfReader(pdf)
     return text
 def get_docx_text(docx_docs: List[IO[bytes]]) -> str:
+    """
+    Extract text content from a list of Word documents.
+    """
     text = ""
     for docx in docx_docs:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as temp_file:
             try:
                 temp_file.write(docx.getvalue())
                 temp_file.flush()
                 doc = Document(temp_file.name)
                 doc_text = [p.text for p in doc.paragraphs]
+                text += "\n".join(doc_text) + "\n"
             except Exception as e:
                 st.warning(f"Warning: Could not process document {docx.name}: {str(e)}")
             finally:
     return text
 def get_text_chunks(text: str) -> List[str]:
     """
     Divide the input text into overlapping pieces.

     The pieces come from a RecursiveCharacterTextSplitter set up with
     chunk_size=1000 and chunk_overlap=200; the list from split_text() is
     handed back as-is.
     """
     chunker = RecursiveCharacterTextSplitter(
         chunk_overlap=200,
         chunk_size=1000,
     )
     pieces = chunker.split_text(text)
     return pieces
 def get_vector_store(text_chunks: List[str]) -> None:
     """
     Embed the text chunks with Together AI and persist them as a FAISS index.

     The chunks are embedded in one call to the Together embeddings endpoint
     (model "BAAI/bge-base-en-v1.5"), paired with their vectors, loaded into a
     FAISS store and saved locally under "faiss_index".

     Raises:
         ValueError: if `text_chunks` is empty, or if the number of vectors
             returned does not match the number of chunks.
     """
     if not text_chunks:
         raise ValueError("No text chunks were provided to build the vector store.")

     api_key = get_together_api_key()
     client = Together(api_key=api_key)
     model_name = "BAAI/bge-base-en-v1.5"

     def embed(texts: List[str]) -> List[List[float]]:
         # Accept both dict-style and attribute-style response items so the
         # code does not depend on which shape the SDK returns.
         response = client.embeddings.create(model=model_name, input=texts)
         return [
             item["embedding"] if isinstance(item, dict) else item.embedding
             for item in response.data
         ]

     # NOTE(review): assumes response.data preserves the input order - confirm
     # against the Together API reference.
     vectors = embed(text_chunks)
     if len(vectors) != len(text_chunks):
         # zip() below would silently drop chunks on a mismatch.
         raise ValueError(
             f"Expected {len(text_chunks)} embeddings but received {len(vectors)}."
         )

     # from_embeddings() takes precomputed (text, vector) pairs directly, so no
     # stateful "pop the next vector" callable is needed. The `embedding`
     # argument is only used for later query embedding on this store object.
     vector_store = FAISS.from_embeddings(
         text_embeddings=list(zip(text_chunks, vectors)),
         embedding=lambda query: embed([query])[0],
     )
     vector_store.save_local("faiss_index")
 def get_conversational_chain() -> Tuple[ChatTogether, PromptTemplate]:
+    """
+    Initialize the LLM and prompt template for answering questions.
+    """
     api_key = get_together_api_key()
     llm = ChatTogether(
         model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
     return llm, prompt
 def self_assess(question: str) -> str:
+    """
+    Determine whether the AI can answer the question directly or needs document retrieval.
+    """
     api_key = get_together_api_key()
     llm = ChatTogether(
         model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
     return response.content.strip()
 def process_docs_for_query(docs: List[LangchainDocument], question: str) -> str:
     """
     Produce an answer to `question` from the retrieved documents.

     With no documents, a fixed fallback sentence is returned. Otherwise the
     documents' page contents are joined with blank lines, the prompt obtained
     from get_conversational_chain() is formatted with that context and the
     question, and the content of the LLM's response is returned.
     """
     if docs:
         merged_context = "\n\n".join([entry.page_content for entry in docs])
         chat_model, template = get_conversational_chain()
         answer = chat_model.invoke(
             template.format(context=merged_context, question=question)
         )
         return answer.content
     return "I couldn't find relevant information in your uploaded documents to answer that question."
 def user_input(user_question: str) -> None:
+    """
+    Process the user's question, decide on retrieval or not, and display the answer.
+    """
     assessment = self_assess(user_question)
     if assessment.strip().upper() == "NEED_RETRIEVAL":
     try:
         if need_retrieval:
             api_key = get_together_api_key()
+            client = Together(api_key=api_key)
+            vector_store = FAISS.load_local("faiss_index", embedding_function=lambda x: [0.0]*768, allow_dangerous_deserialization=True)
             docs = vector_store.similarity_search(user_question)
             response = process_docs_for_query(docs, user_question)
         else:
         st.markdown("### Answer")
         st.markdown(response)
     except Exception as e:
+        st.error(f"⚠️ An error occurred while processing your question: {e}")