import os
import streamlit as st
import PyPDF2  # PyPDF2 for PDF text extraction
from langchain_community.llms import Replicate
from langchain_community.vectorstores import FAISS  # FAISS vector store for similarity search
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

# Replicate API token -- replace the placeholder with your own token, ideally
# loaded from the environment or st.secrets rather than hardcoded in source
os.environ['REPLICATE_API_TOKEN'] = "YOUR_REPLICATE_API_TOKEN"

def extract_text_with_pypdf2(file):
    reader = PyPDF2.PdfReader(file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None on pages with no extractable text
        text += page.extract_text() or ""
    return text
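
# Note: PyPDF2 is no longer maintained under that name; development has moved
# to the pypdf package, so migrating to pypdf is worth considering.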

# Streamlit interface
st.title("PDF Chatbot by *** Tasrif Nur Himel ***")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file is not None:
    # Extract text from the PDF using PyPDF2
    extracted_text = extract_text_with_pypdf2(uploaded_file)

    # Split the text into smaller chunks for processing
    text_splitter = CharacterTextSplitter(
        separator=" ",  # Using space as a separator for more control
        chunk_size=800,
        chunk_overlap=200,
        length_function=len
    )
    texts = text_splitter.split_text(extracted_text)

    # Use HuggingFace embeddings to turn text chunks into numerical vectors
    # (defaults to the sentence-transformers/all-mpnet-base-v2 model)
    embeddings = HuggingFaceEmbeddings()

    # Create FAISS vector store from text and embeddings
    document_search = FAISS.from_texts(texts, embeddings)
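
    # Note: Streamlit re-runs this script on every interaction, so the chunks
    # are re-embedded and the index rebuilt after each button press. Moving
    # this step into a function decorated with @st.cache_resource would avoid
    # the repeated work.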

    # Initialize the Llama 2 13B chat model hosted on Replicate
    llm = Replicate(
        model="a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5",
        model_kwargs={"temperature": 0.75, "max_length": 3000}
    )

    # Retrieval-augmented QA: retrieve the most relevant chunks, then prompt the LLM
    def qa_chain(query):
        # Fetch the most similar chunks (similarity_search returns the top 4 by default)
        docs = document_search.similarity_search(query)
        # Combine the query and the retrieved chunks into a single prompt
        combined_prompt = f"Question: {query}\n\nRelevant Documents:\n"
        combined_prompt += "\n\n".join(doc.page_content for doc in docs)
        # Get the response from the Llama 2 model
        response = llm.invoke(combined_prompt)
        return response
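
    # Note: each query is answered independently; the chat history below is
    # only displayed, not fed back into the prompt, so follow-up questions
    # need to be self-contained.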

    st.write("PDF successfully uploaded and processed. You can now ask questions about its content.")

    # Initialize the chat history in session state on the first run
    if 'chat_history' not in st.session_state:
        st.session_state['chat_history'] = []

    query = st.text_input("Prompt:")
    if st.button("Send") and query.strip():
        if query.lower() in ["exit", "quit", "q"]:
            st.write('Exiting')
            st.stop()  # halt this script run cleanly instead of killing the server with sys.exit()
        result = qa_chain(query)
        st.write('Answer: ' + result)
        st.session_state['chat_history'].append((query, result))

    # Display the chat history
    st.write("### Chat History")
    for query, answer in st.session_state['chat_history']:
        st.write(f"**You:** {query}")
        st.write(f"**Bot:** {answer}")

else:
    st.write("Please upload a PDF file.")