Spaces:
Sleeping
Sleeping
Sanchay3011
committed on
Commit
·
4888ade
1
Parent(s):
01b1ec7
Added AI Data Scientist Agent app
Browse files- app.py +131 -0
- chatbot.py +45 -0
- pages/01_π_Upload_and_Schema.py +21 -0
- pages/02_π§Ή_Clean_Data.py +57 -0
- pages/03_π_Data_Visualization.py +101 -0
- pages/04_π€_Modeling_and_Evaluation.py +137 -0
- pages/05_π_Report.py +139 -0
- requirements.txt +2 -3
app.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
|
| 3 |
+
st.set_page_config(page_title="AI Data Scientist Agent", page_icon="π€", layout="wide")
|
| 4 |
+
|
| 5 |
+
# ===== CSS =====
|
| 6 |
+
st.markdown("""
|
| 7 |
+
<style>
|
| 8 |
+
.stApp {
|
| 9 |
+
background: linear-gradient(135deg, #89f7fe, #66a6ff);
|
| 10 |
+
}
|
| 11 |
+
.main-title {
|
| 12 |
+
text-align: center;
|
| 13 |
+
font-size: 3em;
|
| 14 |
+
color: white;
|
| 15 |
+
font-weight: bold;
|
| 16 |
+
margin-bottom: 0.3em;
|
| 17 |
+
}
|
| 18 |
+
.subtitle {
|
| 19 |
+
text-align: center;
|
| 20 |
+
font-size: 1.2em;
|
| 21 |
+
color: #f0f0f0;
|
| 22 |
+
margin-bottom: 2em;
|
| 23 |
+
}
|
| 24 |
+
.cards-container {
|
| 25 |
+
display: flex;
|
| 26 |
+
justify-content: center;
|
| 27 |
+
flex-wrap: wrap;
|
| 28 |
+
gap: 25px;
|
| 29 |
+
margin-top: 30px;
|
| 30 |
+
}
|
| 31 |
+
.card {
|
| 32 |
+
background-color: white;
|
| 33 |
+
width: 260px;
|
| 34 |
+
height: 180px;
|
| 35 |
+
border-radius: 20px;
|
| 36 |
+
box-shadow: 0px 6px 20px rgba(0,0,0,0.15);
|
| 37 |
+
text-align: center;
|
| 38 |
+
padding: 20px;
|
| 39 |
+
transition: transform 0.2s ease-in-out, box-shadow 0.2s ease-in-out;
|
| 40 |
+
cursor: pointer;
|
| 41 |
+
text-decoration: none;
|
| 42 |
+
display: flex;
|
| 43 |
+
flex-direction: column;
|
| 44 |
+
justify-content: center;
|
| 45 |
+
}
|
| 46 |
+
.card:hover {
|
| 47 |
+
transform: translateY(-6px);
|
| 48 |
+
box-shadow: 0px 10px 30px rgba(0,0,0,0.25);
|
| 49 |
+
}
|
| 50 |
+
.card-icon {
|
| 51 |
+
font-size: 2.5em;
|
| 52 |
+
margin-bottom: 12px;
|
| 53 |
+
}
|
| 54 |
+
.card-title {
|
| 55 |
+
font-size: 1.2em;
|
| 56 |
+
font-weight: bold;
|
| 57 |
+
margin-bottom: 6px;
|
| 58 |
+
color: #333;
|
| 59 |
+
}
|
| 60 |
+
.card-desc {
|
| 61 |
+
font-size: 0.9em;
|
| 62 |
+
color: #666;
|
| 63 |
+
}
|
| 64 |
+
.center-btn {
|
| 65 |
+
text-align: center;
|
| 66 |
+
margin-top: 40px;
|
| 67 |
+
}
|
| 68 |
+
.get-started-btn {
|
| 69 |
+
background-color: #ff6b6b;
|
| 70 |
+
color: white;
|
| 71 |
+
padding: 14px 40px;
|
| 72 |
+
border-radius: 30px;
|
| 73 |
+
font-size: 1.2em;
|
| 74 |
+
font-weight: bold;
|
| 75 |
+
text-decoration: none;
|
| 76 |
+
transition: background 0.3s ease-in-out;
|
| 77 |
+
}
|
| 78 |
+
.get-started-btn:hover {
|
| 79 |
+
background-color: #ff4757;
|
| 80 |
+
}
|
| 81 |
+
</style>
|
| 82 |
+
""", unsafe_allow_html=True)
|
| 83 |
+
|
| 84 |
+
# ===== Title =====
|
| 85 |
+
st.markdown("<h1 class='main-title'>π AI Data Scientist Agent</h1>", unsafe_allow_html=True)
|
| 86 |
+
st.markdown("<p class='subtitle'>Your end-to-end assistant for Data Cleaning, Analysis, Modeling, and Reporting π</p>", unsafe_allow_html=True)
|
| 87 |
+
|
| 88 |
+
# ===== Cards =====
|
| 89 |
+
cards_html = """
|
| 90 |
+
<div class="cards-container">
|
| 91 |
+
<a href="?page=01_π_Upload_and_Schema" class="card">
|
| 92 |
+
<div class="card-icon">π</div>
|
| 93 |
+
<div class="card-title">Step-1: Upload & Schema</div>
|
| 94 |
+
<div class="card-desc">Upload your dataset & explore its structure</div>
|
| 95 |
+
</a>
|
| 96 |
+
<a href="?page=02_π§Ή_Clean_Data" class="card">
|
| 97 |
+
<div class="card-icon">π§Ή</div>
|
| 98 |
+
<div class="card-title">Step-2: Clean Data</div>
|
| 99 |
+
<div class="card-desc">Handle missing values, duplicates & outliers</div>
|
| 100 |
+
</a>
|
| 101 |
+
<a href="?page=03_π_Data_Visualization" class="card">
|
| 102 |
+
<div class="card-icon">π</div>
|
| 103 |
+
<div class="card-title">Step-3: Visualize Data</div>
|
| 104 |
+
<div class="card-desc">Generate interactive charts & correlations</div>
|
| 105 |
+
</a>
|
| 106 |
+
<a href="?page=04_π€_Modeling_and_Evaluation" class="card">
|
| 107 |
+
<div class="card-icon">π€</div>
|
| 108 |
+
<div class="card-title">Step-4: Modeling</div>
|
| 109 |
+
<div class="card-desc">Train ML models & pick the best one</div>
|
| 110 |
+
</a>
|
| 111 |
+
<a href="?page=05_π_Report" class="card">
|
| 112 |
+
<div class="card-icon">π</div>
|
| 113 |
+
<div class="card-title">Step-5: Report</div>
|
| 114 |
+
<div class="card-desc">Download automated PDF reports with insights</div>
|
| 115 |
+
</a>
|
| 116 |
+
</div>
|
| 117 |
+
"""
|
| 118 |
+
|
| 119 |
+
st.markdown(cards_html, unsafe_allow_html=True)
|
| 120 |
+
|
| 121 |
+
# ===== Get Started Button =====
|
| 122 |
+
st.markdown("""
|
| 123 |
+
<div class="center-btn">
|
| 124 |
+
<a href="?page=01_π_Upload_and_Schema" class="get-started-btn">β¨ Get Started</a>
|
| 125 |
+
</div>
|
| 126 |
+
""", unsafe_allow_html=True)
|
| 127 |
+
|
| 128 |
+
# ===== Handle Navigation =====
# The cards and the "Get Started" button link back to this script with
# ``?page=<script name>``. That value comes from the URL and can be edited by
# hand, so it is only forwarded to ``st.switch_page`` when a script with that
# exact name exists in ``pages/``; anything else produces a warning instead of
# aborting the landing page.
from pathlib import Path

requested_page = st.query_params.get("page")
if requested_page:
    pages_dir = Path(__file__).resolve().parent / "pages"
    known_pages = {script.stem for script in pages_dir.glob("*.py")}
    if requested_page in known_pages:
        st.switch_page(f"pages/{requested_page}.py")
    else:
        st.warning("Unknown page requested. Please pick one of the steps above.")
|
chatbot.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from langchain_groq import ChatGroq
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
def chatbot_sidebar():
    """Render the dataset question-and-answer chatbot in the sidebar.

    Reads the DataFrame stored under ``st.session_state["dataset"]``. When the
    user has typed a question, a prompt made of the first five rows, the
    column dtypes and the question is sent to the ``llama-3.1-8b-instant``
    model through ``ChatGroq`` and the reply is written to the sidebar.

    Returns early (after showing a sidebar message) when no dataset has been
    stored yet, when the question box is empty, or when ``GROQ_API_KEY`` is
    not set in the environment.
    """
    st.sidebar.markdown("## π€ Chat with AI Data Scientist!")

    if "dataset" not in st.session_state:
        st.sidebar.warning("β οΈ Please upload a dataset first.")
        return

    df = st.session_state["dataset"]

    user_input = st.sidebar.text_area("π¬ Ask me about your dataset:")
    if not user_input:
        # Nothing to answer: skip building a client on every rerun.
        return

    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        st.sidebar.error("GROQ_API_KEY is not set; the chatbot is unavailable.")
        return

    try:
        # The client is created inside the try so that configuration problems
        # are reported in the sidebar rather than interrupting the host page.
        llm = ChatGroq(
            model="llama-3.1-8b-instant",
            api_key=api_key,
            temperature=0,
        )

        # NOTE(review): the prompt asks for answers "based on the full
        # dataset" but only the 5-row preview and the dtypes are included.
        prompt = f"""
You are a professional data scientist. Analyze the DataFrame `df` below:

Preview:
{df.head(5).to_string()}

Schema:
{df.dtypes.to_string()}

Now answer the userβs question in plain English, based on the full datasetβnot code experiments.

User: {user_input}
"""

        response = llm.invoke(prompt)
        st.sidebar.write("π€:", response.content.strip())

    except Exception as e:
        # Sidebar boundary: any failure is shown to the user, not raised.
        st.sidebar.error(f"β οΈ Error: {e}")
|
pages/01_π_Upload_and_Schema.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
from chatbot import chatbot_sidebar
|
| 5 |
+
|
| 6 |
+
# Upload page: reads a CSV chosen by the user and stores it in session state
# under "dataset" (the DataFrame) and "uploaded_filename" (the original file
# name) so the other pages can pick it up.
st.title("π Upload & Schema")

uploaded_file = st.file_uploader("Upload a CSV file and open the sidebar from the Top-Left corner (>>) to interact with the specialized AI Data Scientist!", type=["csv"])

if uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # An empty, malformed or non-UTF-8 file is reported on the page; the
        # previously stored dataset (if any) is left untouched.
        st.error(f"Could not read {uploaded_file.name}: {exc}")
    else:
        # Store in session state
        st.session_state["dataset"] = df
        st.session_state["uploaded_filename"] = uploaded_file.name

        st.success(f"✅ Uploaded: {uploaded_file.name}")
        st.dataframe(df.head())


chatbot_sidebar()
|
pages/02_π§Ή_Clean_Data.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
from chatbot import chatbot_sidebar
|
| 5 |
+
|
| 6 |
+
# Cleaning page: applies the selected cleaning steps to a working copy of the
# stored dataset. The stored dataset is only replaced when the user presses
# the save button; the download button always offers the working copy.
st.session_state["page_name"] = "Clean"

st.title("π§Ή Data Cleaning")

# Check if dataset exists in session_state
if "dataset" not in st.session_state:
    st.warning("β οΈ Please upload a dataset first in the Upload & Schema page.")
    st.stop()

# Load dataset. Every step below rebinds ``df`` to a new frame, so the object
# held in session state is never modified before the save button is pressed.
df = st.session_state["dataset"]

st.subheader("Current Data Preview")
st.write(df.head())

# -------------------------
# Cleaning Options
# -------------------------
st.subheader("Cleaning Options")

if st.checkbox("Remove Missing Values"):
    df = df.dropna()

if st.checkbox("Remove Duplicates"):
    df = df.drop_duplicates()

if st.checkbox("Standardize Column Names"):
    # rename() returns a new frame (assigning to df.columns would alter the
    # stored dataset in place); str() lets non-string labels through.
    df = df.rename(columns=lambda col: str(col).strip().lower().replace(" ", "_"))

# -------------------------
# Save Cleaned Data
# -------------------------
if st.button("πΎ Save Cleaned Dataset"):
    st.session_state["dataset"] = df  # replace original dataset
    st.success("✅ Cleaned dataset saved! This version will be used in the next steps.")

st.subheader("Preview of Cleaned Data")
st.write(df.head())

# -------------------------
# Download Option
# -------------------------
csv = df.to_csv(index=False).encode("utf-8")
st.download_button(
    label="π₯ Download as CSV",
    data=csv,
    file_name="cleaned_dataset.csv",
    mime="text/csv",
)


chatbot_sidebar()
|
pages/03_π_Data_Visualization.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
|
| 5 |
+
from chatbot import chatbot_sidebar
|
| 6 |
+
|
| 7 |
+
# EDA page: shows a preview, summary statistics and five matplotlib charts
# (histogram, boxplot, scatter, category bar chart, correlation heatmap) for
# the dataset held in session state.
st.session_state["page_name"] = "Data_Visualisation"

st.title("π Exploratory Data Analysis (EDA)")


def _show_figure(fig):
    """Render *fig* on the page, then close it.

    Figures created through ``plt.subplots`` stay registered with pyplot
    until they are closed; this script runs again on every interaction, so
    leaving them open would accumulate figures for the life of the process.
    """
    st.pyplot(fig)
    plt.close(fig)


# -------------------------
# Load Dataset
# -------------------------
if "dataset" not in st.session_state:
    st.warning("β οΈ Please upload and clean your dataset first.")
    st.stop()

df = st.session_state["dataset"]
numeric_columns = df.select_dtypes(include="number").columns

st.subheader("Data Preview")
st.write(df.head())

# -------------------------
# Summary Statistics
# -------------------------
st.subheader("π Summary Statistics")
st.write(df.describe(include="all"))

# -------------------------
# 1. Histogram
# -------------------------
st.subheader("π Histogram")
column = st.selectbox("Select a column", df.columns, key="hist")
if column:
    fig, ax = plt.subplots()
    df[column].hist(ax=ax, bins=20, color="skyblue", edgecolor="black")
    ax.set_title(f"Histogram of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    _show_figure(fig)

# -------------------------
# 2. Boxplot
# -------------------------
st.subheader("π¦ Boxplot (Detect Outliers)")
box_col = st.selectbox("Select numeric column", numeric_columns, key="box")
if box_col:
    fig, ax = plt.subplots()
    ax.boxplot(df[box_col].dropna())
    ax.set_title(f"Boxplot of {box_col}")
    ax.set_ylabel(box_col)
    _show_figure(fig)

# -------------------------
# 3. Scatter Plot
# -------------------------
st.subheader("βοΈ Scatter Plot (Relationship)")
col_x = st.selectbox("X-axis (Numeric)", numeric_columns, key="scatter_x")
col_y = st.selectbox("Y-axis (Numeric)", numeric_columns, key="scatter_y")
if col_x and col_y:
    fig, ax = plt.subplots()
    ax.scatter(df[col_x], df[col_y], alpha=0.6, color="purple")
    ax.set_xlabel(col_x)
    ax.set_ylabel(col_y)
    ax.set_title(f"{col_x} vs {col_y}")
    _show_figure(fig)

# -------------------------
# 4. Bar Chart (Categorical Count)
# -------------------------
st.subheader("π Bar Chart (Category Counts)")
cat_col = st.selectbox("Select categorical column", df.select_dtypes(exclude="number").columns, key="bar")
if cat_col:
    counts = df[cat_col].value_counts()
    fig, ax = plt.subplots()
    counts.plot(kind="bar", ax=ax, color="orange", edgecolor="black")
    ax.set_title(f"Count of {cat_col}")
    ax.set_xlabel(cat_col)
    ax.set_ylabel("Count")
    _show_figure(fig)

# -------------------------
# 5. Correlation Heatmap
# -------------------------
st.subheader("π₯ Correlation Heatmap")
if len(numeric_columns) > 1:
    fig, ax = plt.subplots(figsize=(6, 4))
    corr = df.corr(numeric_only=True)
    im = ax.imshow(corr, cmap="coolwarm", aspect="auto")
    ax.set_xticks(range(len(corr)))
    ax.set_yticks(range(len(corr)))
    ax.set_xticklabels(corr.columns, rotation=45, ha="right")
    ax.set_yticklabels(corr.columns)
    # Attach the colorbar to the heatmap's own axes explicitly.
    fig.colorbar(im, ax=ax)
    ax.set_title("Correlation Heatmap")
    _show_figure(fig)
else:
    st.info("Need at least 2 numeric columns for correlation heatmap.")


chatbot_sidebar()
|
pages/04_π€_Modeling_and_Evaluation.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from sklearn.model_selection import train_test_split
|
| 4 |
+
from sklearn.preprocessing import LabelEncoder
|
| 5 |
+
from sklearn.linear_model import LinearRegression, LogisticRegression
|
| 6 |
+
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
| 7 |
+
from sklearn.metrics import accuracy_score, mean_squared_error
|
| 8 |
+
|
| 9 |
+
from chatbot import chatbot_sidebar
|
| 10 |
+
|
| 11 |
+
# Modeling page: lets the user confirm a target column, trains two baseline
# models for it, shows their scores and stores the winner in session state
# under "best_model_name", "best_score" and "problem_type".
st.session_state["page_name"] = "Modeling and Evaluation"

st.title("π€ Modeling & Evaluation")

# -------------------------
# Load dataset
# -------------------------
if "dataset" not in st.session_state:
    st.warning("β οΈ Please upload a dataset first.")
    st.stop()

df = st.session_state["dataset"]

# -------------------------
# Target Column Selection
# -------------------------
st.markdown("### π― Select Target Column")

target_col = st.selectbox("Choose the target column:", df.columns, key="target_select")

if st.button("Confirm Target"):
    st.session_state["target_col"] = target_col
    st.session_state["run_modeling"] = False  # reset before running
    st.success(f"✅ Target column set to **{target_col}**")

# -------------------------
# Run Modeling Button
# -------------------------
if "target_col" in st.session_state:
    if st.button("π Run Modeling"):
        st.session_state["run_modeling"] = True

# -------------------------
# Modeling Logic
# -------------------------
if st.session_state.get("run_modeling", False):
    results = []

    with st.spinner("β³ Training models... Please wait."):
        target_col = st.session_state["target_col"]

        # Rows without a target value cannot be used for training or scoring,
        # and would break label encoding / model fitting below.
        labelled = df.dropna(subset=[target_col])

        # Split X and y; one-hot encode categorical features
        X = pd.get_dummies(labelled.drop(columns=[target_col]), drop_first=True)
        y = labelled[target_col]

        # Decide the problem type from the ORIGINAL target dtype. A text or
        # categorical target is always classification, however many classes
        # it has; deciding after label encoding would send a target with more
        # than 10 classes to regression on arbitrary integer codes.
        is_categorical_target = y.dtype == "object" or y.dtype.name == "category"
        if is_categorical_target:
            y = LabelEncoder().fit_transform(y)

        has_few_distinct_values = len(pd.Series(y).unique()) <= 10
        if is_categorical_target or has_few_distinct_values:
            problem_type = "classification"
        else:
            problem_type = "regression"

        # Train/Test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # (name shown in results, name shown in failure message, estimator).
        # The forests are seeded like the split so reruns compare like with like.
        if problem_type == "classification":
            score_fn = accuracy_score
            candidates = [
                ("Logistic Regression", "Logistic Regression", LogisticRegression(max_iter=1000)),
                ("Random Forest Classifier", "Random Forest", RandomForestClassifier(random_state=42)),
            ]
        else:
            score_fn = mean_squared_error
            candidates = [
                ("Linear Regression", "Linear Regression", LinearRegression()),
                ("Random Forest Regressor", "Random Forest", RandomForestRegressor(random_state=42)),
            ]

        for model_name, failure_label, estimator in candidates:
            try:
                estimator.fit(X_train, y_train)
                score = score_fn(y_test, estimator.predict(X_test))
            except Exception as e:
                # One model failing must not prevent the other from running.
                st.error(f"β {failure_label} failed: {e}")
            else:
                results.append((model_name, score))

    # -------------------------
    # Show Results
    # -------------------------
    if results:
        st.markdown("### π Model Results")

        for model, score in results:
            if problem_type == "classification":
                st.write(f"✅ **{model} Accuracy:** {score:.4f}")
            else:
                st.write(f"✅ **{model} MSE:** {score:.2f}")

        # Pick best model: highest accuracy, or lowest mean squared error
        if problem_type == "classification":
            best_model = max(results, key=lambda x: x[1])
            st.success(f"π Best Model: **{best_model[0]}** with Accuracy = {best_model[1]:.4f}")
        else:
            best_model = min(results, key=lambda x: x[1])
            st.success(f"π Best Model: **{best_model[0]}** with MSE = {best_model[1]:.2f}")

        # Save best model to session
        st.session_state["best_model_name"] = best_model[0]
        st.session_state["best_score"] = best_model[1]
        st.session_state["problem_type"] = problem_type

    else:
        st.error("β No models could be trained. Please check your dataset.")

chatbot_sidebar()
|
pages/05_π_Report.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import seaborn as sns
|
| 5 |
+
import io
|
| 6 |
+
from reportlab.lib.pagesizes import letter
|
| 7 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle
|
| 8 |
+
from reportlab.lib.styles import getSampleStyleSheet
|
| 9 |
+
from reportlab.lib import colors
|
| 10 |
+
|
| 11 |
+
from chatbot import chatbot_sidebar
|
| 12 |
+
|
| 13 |
+
# Report page: builds a PDF (overview, descriptive statistics, best-model
# summary, correlation heatmap) for the dataset in session state and offers
# it as a download.
st.session_state["page_name"] = "Report"

st.title("π Generate Report")

# -------------------------
# Load dataset from session_state
# -------------------------
if "dataset" in st.session_state and "uploaded_filename" in st.session_state:
    df = st.session_state["dataset"]
    # Drop only the final extension, so "sales.2024.csv" stays "sales.2024".
    dataset_name = st.session_state["uploaded_filename"].rsplit(".", 1)[0]

    def generate_pdf():
        """Build the PDF report for ``df`` and return it as a ``BytesIO``.

        Uses ``df`` and ``dataset_name`` from the enclosing scope and the
        ``best_model_name`` / ``best_score`` / ``problem_type`` entries of
        ``st.session_state`` when they are present. The returned buffer is
        rewound to position 0.
        """
        buffer = io.BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter)
        styles = getSampleStyleSheet()
        elements = []

        # -------------------------
        # Title
        # -------------------------
        elements.append(Paragraph(f"{dataset_name} Report", styles['Title']))
        elements.append(Spacer(1, 12))

        # -------------------------
        # Dataset Overview
        # -------------------------
        elements.append(Paragraph("π Dataset Overview", styles['Heading2']))
        elements.append(Paragraph(f"Rows: {df.shape[0]}", styles['Normal']))
        elements.append(Paragraph(f"Columns: {df.shape[1]}", styles['Normal']))
        elements.append(Paragraph(f"Missing Values: {df.isnull().sum().sum()}", styles['Normal']))
        elements.append(Spacer(1, 12))

        # -------------------------
        # Descriptive Statistics (Split into chunks so it fits PDF)
        # -------------------------
        elements.append(Paragraph("π Descriptive Statistics", styles['Heading2']))

        stats_df = df.describe().round(2).reset_index()

        # Format numbers with commas + 2 decimals
        def format_value(x):
            if isinstance(x, (int, float)):
                return f"{x:,.2f}"
            return str(x)

        # Column-wise Series.map instead of the deprecated DataFrame.applymap.
        stats_df = stats_df.apply(lambda column: column.map(format_value))

        chunk_size = 6  # number of columns per table
        for start in range(0, stats_df.shape[1], chunk_size):
            subset = stats_df.iloc[:, start:start + chunk_size]
            table_data = [subset.columns.tolist()] + subset.values.tolist()

            stats_table = Table(table_data, repeatRows=1)
            style_commands = [
                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#4CAF50")),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                ('FONTSIZE', (0, 0), (-1, -1), 7),  # smaller font
            ]
            # Alternating row background
            for i in range(1, len(table_data)):
                bg_color = colors.whitesmoke if i % 2 == 0 else colors.lightgrey
                style_commands.append(('BACKGROUND', (0, i), (-1, i), bg_color))

            stats_table.setStyle(TableStyle(style_commands))
            elements.append(stats_table)
            elements.append(Spacer(1, 12))

        # -------------------------
        # Best Model Summary
        # -------------------------
        # The modeling page stores its winner under "best_model_name",
        # "best_score" and "problem_type"; test for a key it actually writes.
        if "best_model_name" in st.session_state:
            elements.append(Paragraph("π€ Best Model Summary", styles['Heading2']))
            model_table_data = [
                ["Model", "Score", "Type"],
                [
                    st.session_state["best_model_name"],
                    f"{st.session_state['best_score']:.4f}",
                    st.session_state["problem_type"]
                ]
            ]
            model_table = Table(model_table_data)
            model_table.setStyle(TableStyle([
                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#2196F3")),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
            ]))
            elements.append(model_table)
            elements.append(Spacer(1, 12))

        # -------------------------
        # Correlation Heatmap (if numeric)
        # -------------------------
        # "number" matches the selection used on the visualization page and
        # also covers numeric dtypes other than int64/float64.
        num_cols = df.select_dtypes(include="number").columns
        if len(num_cols) > 1:
            fig, ax = plt.subplots(figsize=(5, 4))
            sns.heatmap(df[num_cols].corr(), annot=True, cmap="coolwarm", ax=ax)
            img_buffer = io.BytesIO()
            # Save this figure explicitly rather than pyplot's current one.
            fig.savefig(img_buffer, format='png')
            plt.close(fig)
            img_buffer.seek(0)
            elements.append(Paragraph("π Correlation Heatmap", styles['Heading2']))
            elements.append(Image(img_buffer, width=400, height=300))
            elements.append(Spacer(1, 12))

        doc.build(elements)
        buffer.seek(0)
        return buffer

    pdf_buffer = generate_pdf()

    st.download_button(
        label="π₯ Download Detailed Report (PDF)",
        data=pdf_buffer,
        file_name=f"{dataset_name}_report.pdf",
        mime="application/pdf"
    )

else:
    st.warning("β οΈ Please upload and process a dataset first.")

# Chatbot
chatbot_sidebar()
|
requirements.txt
CHANGED
|
@@ -1,8 +1,7 @@
|
|
| 1 |
streamlit
|
| 2 |
pandas
|
|
|
|
|
|
|
| 3 |
matplotlib
|
| 4 |
seaborn
|
| 5 |
-
scikit-learn
|
| 6 |
reportlab
|
| 7 |
-
python-dotenv
|
| 8 |
-
groq
|
|
|
|
| 1 |
streamlit
|
| 2 |
pandas
|
| 3 |
+
numpy
|
| 4 |
+
scikit-learn
|
| 5 |
matplotlib
|
| 6 |
seaborn
|
|
|
|
| 7 |
reportlab
|
|
|
|
|
|