Sanchay3011 committed on
Commit
4888ade
·
1 Parent(s): 01b1ec7

Added AI Data Scientist Agent app

Browse files
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
# Landing page of the app: configures the page, injects the stylesheet and
# renders the title, one navigation card per workflow step and a
# "Get Started" link. Every link targets this same script with
# ``?page=<page module name>``; the name must match a file in ``pages/``.
st.set_page_config(page_title="AI Data Scientist Agent", page_icon="🤖", layout="wide")

# ===== CSS =====
st.markdown("""
<style>
.stApp {
    background: linear-gradient(135deg, #89f7fe, #66a6ff);
}
.main-title {
    text-align: center;
    font-size: 3em;
    color: white;
    font-weight: bold;
    margin-bottom: 0.3em;
}
.subtitle {
    text-align: center;
    font-size: 1.2em;
    color: #f0f0f0;
    margin-bottom: 2em;
}
.cards-container {
    display: flex;
    justify-content: center;
    flex-wrap: wrap;
    gap: 25px;
    margin-top: 30px;
}
.card {
    background-color: white;
    width: 260px;
    height: 180px;
    border-radius: 20px;
    box-shadow: 0px 6px 20px rgba(0,0,0,0.15);
    text-align: center;
    padding: 20px;
    transition: transform 0.2s ease-in-out, box-shadow 0.2s ease-in-out;
    cursor: pointer;
    text-decoration: none;
    display: flex;
    flex-direction: column;
    justify-content: center;
}
.card:hover {
    transform: translateY(-6px);
    box-shadow: 0px 10px 30px rgba(0,0,0,0.25);
}
.card-icon {
    font-size: 2.5em;
    margin-bottom: 12px;
}
.card-title {
    font-size: 1.2em;
    font-weight: bold;
    margin-bottom: 6px;
    color: #333;
}
.card-desc {
    font-size: 0.9em;
    color: #666;
}
.center-btn {
    text-align: center;
    margin-top: 40px;
}
.get-started-btn {
    background-color: #ff6b6b;
    color: white;
    padding: 14px 40px;
    border-radius: 30px;
    font-size: 1.2em;
    font-weight: bold;
    text-decoration: none;
    transition: background 0.3s ease-in-out;
}
.get-started-btn:hover {
    background-color: #ff4757;
}
</style>
""", unsafe_allow_html=True)

# ===== Title =====
st.markdown("<h1 class='main-title'>🚀 AI Data Scientist Agent</h1>", unsafe_allow_html=True)
st.markdown("<p class='subtitle'>Your end-to-end assistant for Data Cleaning, Analysis, Modeling, and Reporting 📊</p>", unsafe_allow_html=True)

# ===== Cards =====
# Nested tags are indented by two spaces only and the block contains no blank
# lines, so the Markdown renderer keeps treating it as one raw HTML block.
cards_html = """
<div class="cards-container">
  <a href="?page=01_📂_Upload_and_Schema" class="card">
    <div class="card-icon">📂</div>
    <div class="card-title">Step-1: Upload & Schema</div>
    <div class="card-desc">Upload your dataset & explore its structure</div>
  </a>
  <a href="?page=02_🧹_Clean_Data" class="card">
    <div class="card-icon">🧹</div>
    <div class="card-title">Step-2: Clean Data</div>
    <div class="card-desc">Handle missing values, duplicates & outliers</div>
  </a>
  <a href="?page=03_📊_Data_Visualization" class="card">
    <div class="card-icon">📊</div>
    <div class="card-title">Step-3: Visualize Data</div>
    <div class="card-desc">Generate interactive charts & correlations</div>
  </a>
  <a href="?page=04_🤖_Modeling_and_Evaluation" class="card">
    <div class="card-icon">🤖</div>
    <div class="card-title">Step-4: Modeling</div>
    <div class="card-desc">Train ML models & pick the best one</div>
  </a>
  <a href="?page=05_📑_Report" class="card">
    <div class="card-icon">📑</div>
    <div class="card-title">Step-5: Report</div>
    <div class="card-desc">Download automated PDF reports with insights</div>
  </a>
</div>
"""

st.markdown(cards_html, unsafe_allow_html=True)

# ===== Get Started Button =====
st.markdown("""
<div class="center-btn">
  <a href="?page=01_📂_Upload_and_Schema" class="get-started-btn">✨ Get Started</a>
</div>
""", unsafe_allow_html=True)
127
+
128
# ===== Handle Navigation =====
# The landing-page links reload this script with ``?page=<page module name>``.
# The value comes straight from the URL, so it is checked against the files
# that actually exist in ``pages/`` before it is handed to ``st.switch_page``;
# an empty, unknown or path-like value would otherwise raise on the home page.
from pathlib import Path

requested_page = st.query_params.get("page")
if requested_page:
    pages_dir = Path(__file__).resolve().parent / "pages"
    page_file = pages_dir / f"{requested_page}.py"
    # ``Path(...).name == value`` rejects anything containing a path separator.
    is_plain_name = Path(requested_page).name == requested_page
    if is_plain_name and page_file.is_file():
        st.switch_page(f"pages/{requested_page}.py")
    else:
        st.warning(f"⚠️ Unknown page: {requested_page}")
chatbot.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from dotenv import load_dotenv
4
+ from langchain_groq import ChatGroq
5
+
6
+ load_dotenv()
7
+
8
def chatbot_sidebar():
    """Render the dataset question-answering chatbot in the Streamlit sidebar.

    Reads the DataFrame stored under ``st.session_state["dataset"]``. When no
    dataset has been uploaded yet, a warning is shown and nothing else happens.
    When the user has typed a question, a prompt containing the dataset's
    shape, first five rows, dtypes and summary statistics is sent to the Groq
    chat model and the answer is written to the sidebar.

    The API key is read from the ``GROQ_API_KEY`` environment variable. The
    client is created only when a question is actually asked, so a missing key
    produces a sidebar error instead of breaking the page that called this
    function.

    Returns:
        None. All output goes to ``st.sidebar``.
    """
    st.sidebar.markdown("## 🤖 Chat with AI Data Scientist!")

    if "dataset" not in st.session_state:
        st.sidebar.warning("⚠️ Please upload a dataset first.")
        return

    df = st.session_state["dataset"]

    user_input = st.sidebar.text_area("💬 Ask me about your dataset:")
    if not user_input or not user_input.strip():
        return

    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        st.sidebar.error("⚠️ Error: GROQ_API_KEY is not set.")
        return

    # Deliberately broad: this is the UI boundary, and any client, network or
    # formatting failure should surface as a sidebar message, not a traceback.
    try:
        llm = ChatGroq(
            model="llama-3.1-8b-instant",
            api_key=api_key,
            temperature=0,
        )

        # The model never sees the DataFrame itself, only this text. Shape and
        # summary statistics are included so answers about the whole dataset
        # are grounded in more than the five preview rows.
        prompt = "\n".join([
            "You are a professional data scientist. Analyze the DataFrame `df` below:",
            "",
            f"Shape: {df.shape[0]} rows x {df.shape[1]} columns",
            "",
            "Preview:",
            df.head(5).to_string(),
            "",
            "Schema:",
            df.dtypes.to_string(),
            "",
            "Summary statistics:",
            df.describe(include="all").to_string(),
            "",
            "Now answer the user’s question in plain English, based on the full dataset—not code experiments.",
            "Only the information shown above is available; say so if it is not enough to answer.",
            "",
            f"User: {user_input}",
        ])

        response = llm.invoke(prompt)
        st.sidebar.write("🤖:", response.content.strip())

    except Exception as e:
        st.sidebar.error(f"⚠️ Error: {e}")
pages/01_📂_Upload_and_Schema.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
+ from chatbot import chatbot_sidebar
5
+
6
# Upload page: reads a CSV chosen by the user and stores it, together with its
# file name, in session state so the other pages can use it.
st.title("📂 Upload & Schema")

uploaded_file = st.file_uploader("Upload a CSV file and open the sidebar from the Top-Left corner (>>) to interact with the specialized AI Data Scientist!", type=["csv"])

if uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # An empty, malformed or non-UTF-8 file is a user error, not a crash:
        # report it and leave any previously stored dataset untouched.
        st.error(f"❌ Could not read {uploaded_file.name}: {exc}")
    else:
        # Store in session state
        st.session_state["dataset"] = df
        st.session_state["uploaded_filename"] = uploaded_file.name

        st.success(f"✅ Uploaded: {uploaded_file.name}")
        st.dataframe(df.head())


chatbot_sidebar()
pages/02_🧹_Clean_Data.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
+ from chatbot import chatbot_sidebar
5
+
6
# Cleaning page: applies the selected cleaning steps to a working copy of the
# dataset and writes the result back to session state only when the user
# presses the save button.
st.session_state["page_name"] = "Clean"

st.title("🧹 Data Cleaning")

# Check if dataset exists in session_state
if "dataset" not in st.session_state:
    st.warning("⚠️ Please upload a dataset first in the Upload & Schema page.")
    st.stop()

# Load dataset
df = st.session_state["dataset"]

st.subheader("Current Data Preview")
st.write(df.head())

# -------------------------
# Cleaning Options
# -------------------------
# Every step rebinds ``df`` to a NEW DataFrame, so the object held in session
# state is never modified until the save button is pressed.
st.subheader("Cleaning Options")

if st.checkbox("Remove Missing Values"):
    df = df.dropna()

if st.checkbox("Remove Duplicates"):
    df = df.drop_duplicates()

if st.checkbox("Standardize Column Names"):
    # ``rename`` returns a copy (assigning to ``df.columns`` would rename the
    # stored dataset in place); ``str()`` keeps non-string labels from failing.
    df = df.rename(columns=lambda col: str(col).strip().lower().replace(" ", "_"))

# -------------------------
# Save Cleaned Data
# -------------------------
if st.button("💾 Save Cleaned Dataset"):
    st.session_state["dataset"] = df  # replace original dataset
    st.success("✅ Cleaned dataset saved! This version will be used in the next steps.")

    st.subheader("Preview of Cleaned Data")
    st.write(df.head())

# -------------------------
# Download Option
# -------------------------
csv = df.to_csv(index=False).encode("utf-8")
st.download_button(
    label="📥 Download as CSV",
    data=csv,
    file_name="cleaned_dataset.csv",
    mime="text/csv",
)


chatbot_sidebar()
pages/03_📊_Data_Visualization.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+
5
+ from chatbot import chatbot_sidebar
6
+
7
# EDA page: shows a preview, summary statistics and five matplotlib charts for
# the dataset held in session state.
st.session_state["page_name"] = "Data_Visualisation"

st.title("📊 Exploratory Data Analysis (EDA)")


def _show_figure(fig):
    """Render *fig* in the app, then close it.

    Figures created through ``plt.subplots`` stay registered with pyplot until
    they are closed; this script re-runs on every interaction, so without the
    explicit close they would pile up in memory.
    """
    st.pyplot(fig)
    plt.close(fig)


# -------------------------
# Load Dataset
# -------------------------
if "dataset" not in st.session_state:
    st.warning("⚠️ Please upload and clean your dataset first.")
    st.stop()

df = st.session_state["dataset"]
numeric_columns = df.select_dtypes(include="number").columns
categorical_columns = df.select_dtypes(exclude="number").columns

st.subheader("Data Preview")
st.write(df.head())

# -------------------------
# Summary Statistics
# -------------------------
st.subheader("📌 Summary Statistics")
st.write(df.describe(include="all"))

# -------------------------
# 1. Histogram
# -------------------------
# Selections are compared against None rather than tested for truthiness so
# that a falsy column label (e.g. 0 or "") is still plotted.
st.subheader("📈 Histogram")
column = st.selectbox("Select a column", df.columns, key="hist")
if column is not None:
    fig, ax = plt.subplots()
    df[column].hist(ax=ax, bins=20, color="skyblue", edgecolor="black")
    ax.set_title(f"Histogram of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    _show_figure(fig)

# -------------------------
# 2. Boxplot
# -------------------------
st.subheader("📦 Boxplot (Detect Outliers)")
box_col = st.selectbox("Select numeric column", numeric_columns, key="box")
if box_col is not None:
    fig, ax = plt.subplots()
    ax.boxplot(df[box_col].dropna())
    ax.set_title(f"Boxplot of {box_col}")
    ax.set_ylabel(box_col)
    _show_figure(fig)

# -------------------------
# 3. Scatter Plot
# -------------------------
st.subheader("⚖️ Scatter Plot (Relationship)")
col_x = st.selectbox("X-axis (Numeric)", numeric_columns, key="scatter_x")
col_y = st.selectbox("Y-axis (Numeric)", numeric_columns, key="scatter_y")
if col_x is not None and col_y is not None:
    fig, ax = plt.subplots()
    ax.scatter(df[col_x], df[col_y], alpha=0.6, color="purple")
    ax.set_xlabel(col_x)
    ax.set_ylabel(col_y)
    ax.set_title(f"{col_x} vs {col_y}")
    _show_figure(fig)

# -------------------------
# 4. Bar Chart (Categorical Count)
# -------------------------
st.subheader("📊 Bar Chart (Category Counts)")
cat_col = st.selectbox("Select categorical column", categorical_columns, key="bar")
if cat_col is not None:
    counts = df[cat_col].value_counts()
    fig, ax = plt.subplots()
    counts.plot(kind="bar", ax=ax, color="orange", edgecolor="black")
    ax.set_title(f"Count of {cat_col}")
    ax.set_xlabel(cat_col)
    ax.set_ylabel("Count")
    _show_figure(fig)

# -------------------------
# 5. Correlation Heatmap
# -------------------------
st.subheader("🔥 Correlation Heatmap")
if len(numeric_columns) > 1:
    fig, ax = plt.subplots(figsize=(6, 4))
    corr = df.corr(numeric_only=True)
    im = ax.imshow(corr, cmap="coolwarm", aspect="auto")
    ax.set_xticks(range(len(corr)))
    ax.set_yticks(range(len(corr)))
    ax.set_xticklabels(corr.columns, rotation=45, ha="right")
    ax.set_yticklabels(corr.columns)
    # Bind the colorbar to this axes explicitly instead of the implicit one.
    fig.colorbar(im, ax=ax)
    ax.set_title("Correlation Heatmap")
    _show_figure(fig)
else:
    st.info("Need at least 2 numeric columns for correlation heatmap.")


chatbot_sidebar()
pages/04_🤖_Modeling_and_Evaluation.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.preprocessing import LabelEncoder
5
+ from sklearn.linear_model import LinearRegression, LogisticRegression
6
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
7
+ from sklearn.metrics import accuracy_score, mean_squared_error
8
+
9
+ from chatbot import chatbot_sidebar
10
+
11
# Modeling page: lets the user pick a target column, trains two baseline
# models for it and stores the best one's name, score and problem type in
# session state.
st.session_state["page_name"] = "Modeling and Evaluation"

st.title("🤖 Modeling & Evaluation")

# Fixed seed shared by the split and the forests. This script re-runs (and
# re-trains) on every interaction, so unseeded forests could report a
# different "best" model each time.
RANDOM_STATE = 42


def _fit_and_score(candidates, score_fn, X_train, X_test, y_train, y_test):
    """Fit every candidate and score it on the test split.

    Args:
        candidates: Iterable of ``(result name, failure label, estimator)``.
        score_fn: Callable ``(y_true, y_pred) -> float``.
        X_train, X_test, y_train, y_test: Output of ``train_test_split``.

    Returns:
        List of ``(result name, score)`` for the estimators that trained. A
        failure is reported with ``st.error`` and that estimator is skipped.
    """
    scored = []
    for name, failure_label, estimator in candidates:
        # Broad on purpose: one estimator failing must not prevent the other
        # from being evaluated.
        try:
            estimator.fit(X_train, y_train)
            scored.append((name, score_fn(y_test, estimator.predict(X_test))))
        except Exception as e:
            st.error(f"❌ {failure_label} failed: {e}")
    return scored


def _train_models(data, target):
    """Train the baseline models for *target*.

    Args:
        data: DataFrame with no missing values in *target*.
        target: Name of the target column.

    Returns:
        ``(problem_type, results)`` where ``problem_type`` is
        ``"classification"`` or ``"regression"`` and ``results`` is a list of
        ``(model name, score)`` (accuracy or MSE respectively).
    """
    # Split X and y
    X = data.drop(columns=[target])
    y = data[target]

    # Encode categorical features
    X = pd.get_dummies(X, drop_first=True)

    # Encode target if categorical
    is_categorical_target = y.dtype == "object" or y.dtype.name == "category"
    if is_categorical_target:
        y = LabelEncoder().fit_transform(y)

    # Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )

    # A label-encoded target is always classification: the integer codes are
    # arbitrary, so regressing on them is meaningless however many classes
    # there are. Numeric targets keep the <= 10 distinct values heuristic.
    if is_categorical_target or pd.Series(y).nunique() <= 10:
        candidates = [
            ("Logistic Regression", "Logistic Regression",
             LogisticRegression(max_iter=1000)),
            ("Random Forest Classifier", "Random Forest",
             RandomForestClassifier(random_state=RANDOM_STATE)),
        ]
        return "classification", _fit_and_score(
            candidates, accuracy_score, X_train, X_test, y_train, y_test
        )

    candidates = [
        ("Linear Regression", "Linear Regression", LinearRegression()),
        ("Random Forest Regressor", "Random Forest",
         RandomForestRegressor(random_state=RANDOM_STATE)),
    ]
    return "regression", _fit_and_score(
        candidates, mean_squared_error, X_train, X_test, y_train, y_test
    )


def _show_results(problem_type, results):
    """Display every score, announce the best model and store it in session state."""
    if not results:
        st.error("❌ No models could be trained. Please check your dataset.")
        return

    st.markdown("### 📊 Model Results")

    is_classification = problem_type == "classification"
    for model, score in results:
        if is_classification:
            st.write(f"✅ **{model} Accuracy:** {score:.4f}")
        else:
            st.write(f"✅ **{model} MSE:** {score:.2f}")

    # Pick best model: highest accuracy, or lowest MSE.
    if is_classification:
        best_model = max(results, key=lambda x: x[1])
        st.success(f"🏆 Best Model: **{best_model[0]}** with Accuracy = {best_model[1]:.4f}")
    else:
        best_model = min(results, key=lambda x: x[1])
        st.success(f"🏆 Best Model: **{best_model[0]}** with MSE = {best_model[1]:.2f}")

    # Save best model to session
    st.session_state["best_model_name"] = best_model[0]
    st.session_state["best_score"] = best_model[1]
    st.session_state["problem_type"] = problem_type


# -------------------------
# Load dataset
# -------------------------
if "dataset" not in st.session_state:
    st.warning("⚠️ Please upload a dataset first.")
    st.stop()

df = st.session_state["dataset"]

# -------------------------
# Target Column Selection
# -------------------------
st.markdown("### 🎯 Select Target Column")

target_col = st.selectbox("Choose the target column:", df.columns, key="target_select")

if st.button("Confirm Target"):
    st.session_state["target_col"] = target_col
    st.session_state["run_modeling"] = False  # reset before running
    st.success(f"✅ Target column set to **{target_col}**")

# -------------------------
# Run Modeling Button
# -------------------------
if "target_col" in st.session_state:
    if st.button("🚀 Run Modeling"):
        st.session_state["run_modeling"] = True

# -------------------------
# Modeling Logic
# -------------------------
if st.session_state.get("run_modeling", False):
    target_col = st.session_state["target_col"]

    if target_col not in df.columns:
        # The stored target can be stale, e.g. after the dataset was replaced
        # or its columns were renamed on the cleaning page.
        st.session_state["run_modeling"] = False
        st.warning(f"⚠️ Target column **{target_col}** is not in the current dataset. Please confirm a target again.")
    else:
        # Rows without a target value cannot be used for training or scoring.
        model_df = df.dropna(subset=[target_col])
        dropped_rows = len(df) - len(model_df)
        if dropped_rows:
            st.info(f"Ignored {dropped_rows} row(s) with a missing target value.")

        if len(model_df) < 2:
            # Too few rows to form both a training and a test split.
            st.error("❌ No models could be trained. Please check your dataset.")
        else:
            with st.spinner("⏳ Training models... Please wait."):
                problem_type, results = _train_models(model_df, target_col)

            # -------------------------
            # Show Results
            # -------------------------
            _show_results(problem_type, results)

chatbot_sidebar()
pages/05_📑_Report.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import io
6
+ from reportlab.lib.pagesizes import letter
7
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle
8
+ from reportlab.lib.styles import getSampleStyleSheet
9
+ from reportlab.lib import colors
10
+
11
+ from chatbot import chatbot_sidebar
12
+
13
# Report page: builds a PDF summary of the current dataset (overview,
# descriptive statistics, best model, correlation heatmap) and offers it for
# download.
st.session_state["page_name"] = "Report"

st.title("📑 Generate Report")

# Keys the modeling page writes after a successful run.
_MODEL_SUMMARY_KEYS = ("best_model_name", "best_score", "problem_type")

# -------------------------
# Load dataset from session_state
# -------------------------
if "dataset" in st.session_state and "uploaded_filename" in st.session_state:
    df = st.session_state["dataset"]
    # Strip only the final extension so "sales.2024.csv" stays "sales.2024".
    dataset_name = st.session_state["uploaded_filename"].rsplit(".", 1)[0]

    def generate_pdf():
        """Build the PDF report for ``df`` and return it as an in-memory buffer.

        Reads ``df`` and ``dataset_name`` from the enclosing script and the
        best-model entries from ``st.session_state`` when they are present.

        Returns:
            io.BytesIO: Buffer positioned at the start of the PDF bytes.
        """
        # Local import: only needed to escape the user-supplied file name.
        from xml.sax.saxutils import escape

        buffer = io.BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter)
        styles = getSampleStyleSheet()
        elements = []

        # NOTE: headings in the PDF carry no emoji. The sample style sheet
        # uses the built-in Helvetica font, which presumably has no glyphs
        # for them - confirm before re-adding.

        # -------------------------
        # Title
        # -------------------------
        # Paragraph text is parsed as markup, so the file name is escaped to
        # keep characters such as "&" or "<" from breaking the build.
        elements.append(Paragraph(f"{escape(dataset_name)} Report", styles["Title"]))
        elements.append(Spacer(1, 12))

        # -------------------------
        # Dataset Overview
        # -------------------------
        elements.append(Paragraph("Dataset Overview", styles["Heading2"]))
        elements.append(Paragraph(f"Rows: {df.shape[0]}", styles["Normal"]))
        elements.append(Paragraph(f"Columns: {df.shape[1]}", styles["Normal"]))
        elements.append(Paragraph(f"Missing Values: {df.isnull().sum().sum()}", styles["Normal"]))
        elements.append(Spacer(1, 12))

        # -------------------------
        # Descriptive Statistics (Split into chunks so it fits PDF)
        # -------------------------
        elements.append(Paragraph("Descriptive Statistics", styles["Heading2"]))

        stats_df = df.describe().round(2).reset_index()

        # Format numbers with commas + 2 decimals
        def format_value(x):
            if isinstance(x, (int, float)):
                return f"{x:,.2f}"
            return str(x)

        # ``DataFrame.map`` replaces the deprecated ``applymap``; fall back to
        # the old name on pandas versions that do not have ``map`` yet.
        apply_cellwise = stats_df.map if hasattr(stats_df, "map") else stats_df.applymap
        stats_df = apply_cellwise(format_value)

        chunk_size = 6  # number of columns per table
        for start in range(0, stats_df.shape[1], chunk_size):
            subset = stats_df.iloc[:, start:start + chunk_size]
            table_data = [subset.columns.tolist()] + subset.values.tolist()

            stats_table = Table(table_data, repeatRows=1)
            style_commands = [
                ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#4CAF50")),
                ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
                ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
                ("ALIGN", (0, 0), (-1, -1), "CENTER"),
                ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
                ("FONTSIZE", (0, 0), (-1, -1), 7),  # smaller font
            ]
            # Alternating row background
            for i in range(1, len(table_data)):
                bg_color = colors.whitesmoke if i % 2 == 0 else colors.lightgrey
                style_commands.append(("BACKGROUND", (0, i), (-1, i), bg_color))

            stats_table.setStyle(TableStyle(style_commands))
            elements.append(stats_table)
            elements.append(Spacer(1, 12))

        # -------------------------
        # Best Model Summary
        # -------------------------
        # Check the keys that are read below; a "best_model" key is never
        # written, so testing for it would always skip this section.
        if all(key in st.session_state for key in _MODEL_SUMMARY_KEYS):
            elements.append(Paragraph("Best Model Summary", styles["Heading2"]))
            model_table_data = [
                ["Model", "Score", "Type"],
                [
                    st.session_state["best_model_name"],
                    f"{st.session_state['best_score']:.4f}",
                    st.session_state["problem_type"],
                ],
            ]
            model_table = Table(model_table_data)
            model_table.setStyle(TableStyle([
                ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#2196F3")),
                ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
                ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
                ("ALIGN", (0, 0), (-1, -1), "CENTER"),
                ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
            ]))
            elements.append(model_table)
            elements.append(Spacer(1, 12))

        # -------------------------
        # Correlation Heatmap (if numeric)
        # -------------------------
        # "number" covers every numeric dtype, not only int64/float64.
        num_cols = df.select_dtypes(include="number").columns
        if len(num_cols) > 1:
            fig, ax = plt.subplots(figsize=(5, 4))
            sns.heatmap(df[num_cols].corr(), annot=True, cmap="coolwarm", ax=ax)
            img_buffer = io.BytesIO()
            # Save this figure explicitly, not whichever one pyplot considers
            # current.
            fig.savefig(img_buffer, format="png")
            plt.close(fig)
            img_buffer.seek(0)
            elements.append(Paragraph("Correlation Heatmap", styles["Heading2"]))
            elements.append(Image(img_buffer, width=400, height=300))
            elements.append(Spacer(1, 12))

        doc.build(elements)
        buffer.seek(0)
        return buffer

    pdf_buffer = generate_pdf()

    st.download_button(
        label="📥 Download Detailed Report (PDF)",
        data=pdf_buffer,
        file_name=f"{dataset_name}_report.pdf",
        mime="application/pdf",
    )

else:
    st.warning("⚠️ Please upload and process a dataset first.")

# Chatbot
chatbot_sidebar()
requirements.txt CHANGED
@@ -1,8 +1,7 @@
1
  streamlit
2
  pandas
 
 
3
  matplotlib
4
  seaborn
5
- scikit-learn
6
  reportlab
7
- python-dotenv
8
- groq
 
1
  streamlit
2
  pandas
3
+ numpy
4
+ scikit-learn
5
  matplotlib
6
  seaborn
 
7
  reportlab