Spaces:

Sanchay3011
/

ai-ds-agent

Sleeping

App Files Files Community

Sanchay3011 committed on Aug 29

Commit

4888ade

1 Parent(s): 01b1ec7

Added AI Data Scientist Agent app

Browse files

Files changed (8) hide show

app.py +131 -0
chatbot.py +45 -0
pages/01_📂_Upload_and_Schema.py +21 -0
pages/02_🧹_Clean_Data.py +57 -0
pages/03_📊_Data_Visualization.py +101 -0
pages/04_🤖_Modeling_and_Evaluation.py +137 -0
pages/05_📑_Report.py +139 -0
requirements.txt +2 -3

app.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import streamlit as st
+st.set_page_config(page_title="AI Data Scientist Agent", page_icon="🤖", layout="wide")
+# ===== CSS =====
+st.markdown("""
+    <style>
+    .stApp {
+        background: linear-gradient(135deg, #89f7fe, #66a6ff);
+    }
+    .main-title {
+        text-align: center;
+        font-size: 3em;
+        color: white;
+        font-weight: bold;
+        margin-bottom: 0.3em;
+    }
+    .subtitle {
+        text-align: center;
+        font-size: 1.2em;
+        color: #f0f0f0;
+        margin-bottom: 2em;
+    }
+    .cards-container {
+        display: flex;
+        justify-content: center;
+        flex-wrap: wrap;
+        gap: 25px;
+        margin-top: 30px;
+    }
+    .card {
+        background-color: white;
+        width: 260px;
+        height: 180px;
+        border-radius: 20px;
+        box-shadow: 0px 6px 20px rgba(0,0,0,0.15);
+        text-align: center;
+        padding: 20px;
+        transition: transform 0.2s ease-in-out, box-shadow 0.2s ease-in-out;
+        cursor: pointer;
+        text-decoration: none;
+        display: flex;
+        flex-direction: column;
+        justify-content: center;
+    }
+    .card:hover {
+        transform: translateY(-6px);
+        box-shadow: 0px 10px 30px rgba(0,0,0,0.25);
+    }
+    .card-icon {
+        font-size: 2.5em;
+        margin-bottom: 12px;
+    }
+    .card-title {
+        font-size: 1.2em;
+        font-weight: bold;
+        margin-bottom: 6px;
+        color: #333;
+    }
+    .card-desc {
+        font-size: 0.9em;
+        color: #666;
+    }
+    .center-btn {
+        text-align: center;
+        margin-top: 40px;
+    }
+    .get-started-btn {
+        background-color: #ff6b6b;
+        color: white;
+        padding: 14px 40px;
+        border-radius: 30px;
+        font-size: 1.2em;
+        font-weight: bold;
+        text-decoration: none;
+        transition: background 0.3s ease-in-out;
+    }
+    .get-started-btn:hover {
+        background-color: #ff4757;
+    }
+    </style>
+""", unsafe_allow_html=True)
+# ===== Title =====
+st.markdown("<h1 class='main-title'>🚀 AI Data Scientist Agent</h1>", unsafe_allow_html=True)
+st.markdown("<p class='subtitle'>Your end-to-end assistant for Data Cleaning, Analysis, Modeling, and Reporting 📊</p>", unsafe_allow_html=True)
+# ===== Cards =====
+cards_html = """
+<div class="cards-container">
+    <a href="?page=01_📂_Upload_and_Schema" class="card">
+        <div class="card-icon">📂</div>
+        <div class="card-title">Step-1: Upload & Schema</div>
+        <div class="card-desc">Upload your dataset & explore its structure</div>
+    </a>
+    <a href="?page=02_🧹_Clean_Data" class="card">
+        <div class="card-icon">🧹</div>
+        <div class="card-title">Step-2: Clean Data</div>
+        <div class="card-desc">Handle missing values, duplicates & outliers</div>
+    </a>
+    <a href="?page=03_📊_Data_Visualization" class="card">
+        <div class="card-icon">📊</div>
+        <div class="card-title">Step-3: Visualize Data</div>
+        <div class="card-desc">Generate interactive charts & correlations</div>
+    </a>
+    <a href="?page=04_🤖_Modeling_and_Evaluation" class="card">
+        <div class="card-icon">🤖</div>
+        <div class="card-title">Step-4: Modeling</div>
+        <div class="card-desc">Train ML models & pick the best one</div>
+    </a>
+    <a href="?page=05_📑_Report" class="card">
+        <div class="card-icon">📑</div>
+        <div class="card-title">Step-5: Report</div>
+        <div class="card-desc">Download automated PDF reports with insights</div>
+    </a>
+</div>
+"""
+st.markdown(cards_html, unsafe_allow_html=True)
+# ===== Get Started Button =====
+st.markdown("""
+<div class="center-btn">
+    <a href="?page=01_📂_Upload_and_Schema" class="get-started-btn">✨ Get Started</a>
+</div>
+""", unsafe_allow_html=True)
+# ===== Handle Navigation =====
+query_params = st.query_params
+if "page" in query_params:
+    st.switch_page(f"pages/{query_params['page']}.py")

chatbot.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import os
+import streamlit as st
+from dotenv import load_dotenv
+from langchain_groq import ChatGroq
+load_dotenv()
+def chatbot_sidebar():
+    st.sidebar.markdown("## 🤖 Chat with AI Data Scientist!")
+    if "dataset" not in st.session_state:
+        st.sidebar.warning("⚠️ Please upload a dataset first.")
+        return
+    df = st.session_state["dataset"]
+    # Use the working model
+    llm = ChatGroq(
+        model="llama-3.1-8b-instant",
+        api_key=os.getenv("GROQ_API_KEY"),
+        temperature=0
+    )
+    user_input = st.sidebar.text_area("💬 Ask me about your dataset:")
+    if user_input:
+        try:
+            prompt = f"""
+            You are a professional data scientist. Analyze the DataFrame `df` below:
+            Preview:
+            {df.head(5).to_string()}
+            Schema:
+            {df.dtypes.to_string()}
+            Now answer the user’s question in plain English, based on the full dataset—not code experiments.
+            User: {user_input}
+            """
+            response = llm.invoke(prompt)
+            st.sidebar.write("🤖:", response.content.strip())
+        except Exception as e:
+            st.sidebar.error(f"⚠️ Error: {e}")

pages/01_📂_Upload_and_Schema.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import streamlit as st
+import pandas as pd
+from chatbot import chatbot_sidebar
+st.title("📂 Upload & Schema")
+uploaded_file = st.file_uploader("Upload a CSV file and open the sidebar from the Top-Left corner (>>) to interact with the specialized AI Data Scientist!", type=["csv"])
+if uploaded_file is not None:
+    df = pd.read_csv(uploaded_file)
+    # Store in session state
+    st.session_state["dataset"] = df
+    st.session_state["uploaded_filename"] = uploaded_file.name
+    st.success(f"✅ Uploaded: {uploaded_file.name}")
+    st.dataframe(df.head())
+chatbot_sidebar()

pages/02_🧹_Clean_Data.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import streamlit as st
+import pandas as pd
+from chatbot import chatbot_sidebar
+st.session_state["page_name"] = "Clean"
+st.title("🧹 Data Cleaning")
+# Check if dataset exists in session_state
+if "dataset" not in st.session_state:
+    st.warning("⚠️ Please upload a dataset first in the Upload & Schema page.")
+    st.stop()
+# Load dataset
+df = st.session_state["dataset"]
+st.subheader("Current Data Preview")
+st.write(df.head())
+# -------------------------
+# Cleaning Options
+# -------------------------
+st.subheader("Cleaning Options")
+if st.checkbox("Remove Missing Values"):
+    df = df.dropna()
+if st.checkbox("Remove Duplicates"):
+    df = df.drop_duplicates()
+if st.checkbox("Standardize Column Names"):
+    df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]
+# -------------------------
+# Save Cleaned Data
+# -------------------------
+if st.button("💾 Save Cleaned Dataset"):
+    st.session_state["dataset"] = df  # replace original dataset
+    st.success("✅ Cleaned dataset saved! This version will be used in the next steps.")
+st.subheader("Preview of Cleaned Data")
+st.write(df.head())
+# -------------------------
+# Download Option
+# -------------------------
+csv = df.to_csv(index=False).encode("utf-8")
+st.download_button(
+    label="📥 Download as CSV",
+    data=csv,
+    file_name="cleaned_dataset.csv",
+    mime="text/csv",
+)
+chatbot_sidebar()

pages/03_📊_Data_Visualization.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+from chatbot import chatbot_sidebar
+st.session_state["page_name"] = "Data_Visualisation"
+st.title("📊 Exploratory Data Analysis (EDA)")
+# -------------------------
+# Load Dataset
+# -------------------------
+if "dataset" not in st.session_state:
+    st.warning("⚠️ Please upload and clean your dataset first.")
+    st.stop()
+df = st.session_state["dataset"]
+st.subheader("Data Preview")
+st.write(df.head())
+# -------------------------
+# Summary Statistics
+# -------------------------
+st.subheader("📌 Summary Statistics")
+st.write(df.describe(include="all"))
+# -------------------------
+# 1. Histogram
+# -------------------------
+st.subheader("📈 Histogram")
+column = st.selectbox("Select a column", df.columns, key="hist")
+if column:
+    fig, ax = plt.subplots()
+    df[column].hist(ax=ax, bins=20, color="skyblue", edgecolor="black")
+    ax.set_title(f"Histogram of {column}")
+    ax.set_xlabel(column)
+    ax.set_ylabel("Frequency")
+    st.pyplot(fig)
+# -------------------------
+# 2. Boxplot
+# -------------------------
+st.subheader("📦 Boxplot (Detect Outliers)")
+box_col = st.selectbox("Select numeric column", df.select_dtypes(include="number").columns, key="box")
+if box_col:
+    fig, ax = plt.subplots()
+    ax.boxplot(df[box_col].dropna())
+    ax.set_title(f"Boxplot of {box_col}")
+    ax.set_ylabel(box_col)
+    st.pyplot(fig)
+# -------------------------
+# 3. Scatter Plot
+# -------------------------
+st.subheader("⚖️ Scatter Plot (Relationship)")
+col_x = st.selectbox("X-axis (Numeric)", df.select_dtypes(include="number").columns, key="scatter_x")
+col_y = st.selectbox("Y-axis (Numeric)", df.select_dtypes(include="number").columns, key="scatter_y")
+if col_x and col_y:
+    fig, ax = plt.subplots()
+    ax.scatter(df[col_x], df[col_y], alpha=0.6, color="purple")
+    ax.set_xlabel(col_x)
+    ax.set_ylabel(col_y)
+    ax.set_title(f"{col_x} vs {col_y}")
+    st.pyplot(fig)
+# -------------------------
+# 4. Bar Chart (Categorical Count)
+# -------------------------
+st.subheader("📊 Bar Chart (Category Counts)")
+cat_col = st.selectbox("Select categorical column", df.select_dtypes(exclude="number").columns, key="bar")
+if cat_col:
+    counts = df[cat_col].value_counts()
+    fig, ax = plt.subplots()
+    counts.plot(kind="bar", ax=ax, color="orange", edgecolor="black")
+    ax.set_title(f"Count of {cat_col}")
+    ax.set_xlabel(cat_col)
+    ax.set_ylabel("Count")
+    st.pyplot(fig)
+# -------------------------
+# 5. Correlation Heatmap
+# -------------------------
+st.subheader("🔥 Correlation Heatmap")
+if len(df.select_dtypes(include="number").columns) > 1:
+    fig, ax = plt.subplots(figsize=(6, 4))
+    corr = df.corr(numeric_only=True)
+    im = ax.imshow(corr, cmap="coolwarm", aspect="auto")
+    ax.set_xticks(range(len(corr)))
+    ax.set_yticks(range(len(corr)))
+    ax.set_xticklabels(corr.columns, rotation=45, ha="right")
+    ax.set_yticklabels(corr.columns)
+    fig.colorbar(im)
+    ax.set_title("Correlation Heatmap")
+    st.pyplot(fig)
+else:
+    st.info("Need at least 2 numeric columns for correlation heatmap.")
+chatbot_sidebar()

pages/04_🤖_Modeling_and_Evaluation.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import streamlit as st
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.metrics import accuracy_score, mean_squared_error
+from chatbot import chatbot_sidebar
+st.session_state["page_name"] = "Modeling and Evaluation"
+st.title("🤖 Modeling & Evaluation")
+# -------------------------
+# Load dataset
+# -------------------------
+if "dataset" not in st.session_state:
+    st.warning("⚠️ Please upload a dataset first.")
+    st.stop()
+df = st.session_state["dataset"]
+# -------------------------
+# Target Column Selection
+# -------------------------
+# Two-step flow driven by session state: the user first confirms a target
+# column, which stores it under "target_col" and clears the "run_modeling"
+# flag; the run button below only appears once a target has been stored.
+st.markdown("### 🎯 Select Target Column")
+target_col = st.selectbox("Choose the target column:", df.columns, key="target_select")
+if st.button("Confirm Target"):
+    st.session_state["target_col"] = target_col
+    st.session_state["run_modeling"] = False  # reset before running
+    st.success(f"✅ Target column set to **{target_col}**")
+# -------------------------
+# Run Modeling Button
+# -------------------------
+# Pressing the button only sets the "run_modeling" flag; the training code
+# further down reads that flag from session state.
+if "target_col" in st.session_state:
+    if st.button("🚀 Run Modeling"):
+        st.session_state["run_modeling"] = True
+# -------------------------
+# Modeling Logic
+# -------------------------
+if st.session_state.get("run_modeling", False):
+    with st.spinner("⏳ Training models... Please wait."):
+        target_col = st.session_state["target_col"]
+        # Split X and y
+        X = df.drop(columns=[target_col])
+        y = df[target_col]
+        # Encode categorical features
+        X = pd.get_dummies(X, drop_first=True)
+        # Encode target if categorical
+        if y.dtype == "object" or y.dtype.name == "category":
+            le = LabelEncoder()
+            y = le.fit_transform(y)
+        # Train/Test split
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+        results = []
+        # Classification vs Regression detection
+        problem_type = "classification" if len(pd.Series(y).unique()) <= 10 else "regression"
+        if problem_type == "classification":
+            try:
+                clf = LogisticRegression(max_iter=1000)
+                clf.fit(X_train, y_train)
+                preds = clf.predict(X_test)
+                acc = accuracy_score(y_test, preds)
+                results.append(("Logistic Regression", acc))
+            except Exception as e:
+                st.error(f"❌ Logistic Regression failed: {e}")
+            try:
+                rf_clf = RandomForestClassifier()
+                rf_clf.fit(X_train, y_train)
+                preds = rf_clf.predict(X_test)
+                acc = accuracy_score(y_test, preds)
+                results.append(("Random Forest Classifier", acc))
+            except Exception as e:
+                st.error(f"❌ Random Forest failed: {e}")
+        else:  # Regression
+            try:
+                lr = LinearRegression()
+                lr.fit(X_train, y_train)
+                preds = lr.predict(X_test)
+                mse = mean_squared_error(y_test, preds)
+                results.append(("Linear Regression", mse))
+            except Exception as e:
+                st.error(f"❌ Linear Regression failed: {e}")
+            try:
+                rf_reg = RandomForestRegressor()
+                rf_reg.fit(X_train, y_train)
+                preds = rf_reg.predict(X_test)
+                mse = mean_squared_error(y_test, preds)
+                results.append(("Random Forest Regressor", mse))
+            except Exception as e:
+                st.error(f"❌ Random Forest failed: {e}")
+        # -------------------------
+        # Show Results
+        # -------------------------
+        if results:
+            st.markdown("### 📊 Model Results")
+            for model, score in results:
+                if problem_type == "classification":
+                    st.write(f"✅ **{model} Accuracy:** {score:.4f}")
+                else:
+                    st.write(f"✅ **{model} MSE:** {score:.2f}")
+            # Pick best model
+            if problem_type == "classification":
+                best_model = max(results, key=lambda x: x[1])
+                st.success(f"🏆 Best Model: **{best_model[0]}** with Accuracy = {best_model[1]:.4f}")
+            else:
+                best_model = min(results, key=lambda x: x[1])
+                st.success(f"🏆 Best Model: **{best_model[0]}** with MSE = {best_model[1]:.2f}")
+            # Save best model to session
+            st.session_state["best_model_name"] = best_model[0]
+            st.session_state["best_score"] = best_model[1]
+            st.session_state["problem_type"] = problem_type
+        else:
+            st.error("❌ No models could be trained. Please check your dataset.")
+chatbot_sidebar()

pages/05_📑_Report.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import io
+from reportlab.lib.pagesizes import letter
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.lib import colors
+from chatbot import chatbot_sidebar
+st.session_state["page_name"] = "Report"
+st.title("📑 Generate Report")
+# -------------------------
+# Load dataset from session_state
+# -------------------------
+if "dataset" in st.session_state and "uploaded_filename" in st.session_state:
+    df = st.session_state["dataset"]
+    dataset_name = st.session_state["uploaded_filename"].split(".")[0]
+    def generate_pdf():
+        buffer = io.BytesIO()
+        doc = SimpleDocTemplate(buffer, pagesize=letter)
+        styles = getSampleStyleSheet()
+        elements = []
+        # -------------------------
+        # Title
+        # -------------------------
+        elements.append(Paragraph(f"{dataset_name} Report", styles['Title']))
+        elements.append(Spacer(1, 12))
+        # -------------------------
+        # Dataset Overview
+        # -------------------------
+        elements.append(Paragraph("📊 Dataset Overview", styles['Heading2']))
+        elements.append(Paragraph(f"Rows: {df.shape[0]}", styles['Normal']))
+        elements.append(Paragraph(f"Columns: {df.shape[1]}", styles['Normal']))
+        elements.append(Paragraph(f"Missing Values: {df.isnull().sum().sum()}", styles['Normal']))
+        elements.append(Spacer(1, 12))
+        # -------------------------
+        # Descriptive Statistics (Split into chunks so it fits PDF)
+        # -------------------------
+        elements.append(Paragraph("📈 Descriptive Statistics", styles['Heading2']))
+        stats_df = df.describe().round(2).reset_index()
+        # Format numbers with commas + 2 decimals
+        def format_value(x):
+            if isinstance(x, (int, float)):
+                return f"{x:,.2f}"
+            return str(x)
+        stats_df = stats_df.applymap(format_value)
+        chunk_size = 6  # number of columns per table
+        for start in range(0, stats_df.shape[1], chunk_size):
+            subset = stats_df.iloc[:, start:start + chunk_size]
+            table_data = [subset.columns.tolist()] + subset.values.tolist()
+            stats_table = Table(table_data, repeatRows=1)
+            style_commands = [
+                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#4CAF50")),
+                ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
+                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
+                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
+                ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
+                ('FONTSIZE', (0, 0), (-1, -1), 7),  # smaller font
+            ]
+            # Alternating row background
+            for i in range(1, len(table_data)):
+                bg_color = colors.whitesmoke if i % 2 == 0 else colors.lightgrey
+                style_commands.append(('BACKGROUND', (0, i), (-1, i), bg_color))
+            stats_table.setStyle(TableStyle(style_commands))
+            elements.append(stats_table)
+            elements.append(Spacer(1, 12))
+        # -------------------------
+        # Best Model Summary
+        # -------------------------
+        if "best_model" in st.session_state:
+            elements.append(Paragraph("🤖 Best Model Summary", styles['Heading2']))
+            model_table_data = [
+                ["Model", "Score", "Type"],
+                [
+                    st.session_state["best_model_name"],
+                    f"{st.session_state['best_score']:.4f}",
+                    st.session_state["problem_type"]
+                ]
+            ]
+            model_table = Table(model_table_data)
+            model_table.setStyle(TableStyle([
+                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#2196F3")),
+                ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
+                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
+                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
+                ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
+            ]))
+            elements.append(model_table)
+            elements.append(Spacer(1, 12))
+        # -------------------------
+        # Correlation Heatmap (if numeric)
+        # -------------------------
+        num_cols = df.select_dtypes(include=["int64", "float64"]).columns
+        if len(num_cols) > 1:
+            fig, ax = plt.subplots(figsize=(5, 4))
+            sns.heatmap(df[num_cols].corr(), annot=True, cmap="coolwarm", ax=ax)
+            img_buffer = io.BytesIO()
+            plt.savefig(img_buffer, format='png')
+            plt.close(fig)
+            img_buffer.seek(0)
+            elements.append(Paragraph("📌 Correlation Heatmap", styles['Heading2']))
+            elements.append(Image(img_buffer, width=400, height=300))
+            elements.append(Spacer(1, 12))
+        doc.build(elements)
+        buffer.seek(0)
+        return buffer
+    pdf_buffer = generate_pdf()
+    st.download_button(
+        label="📥 Download Detailed Report (PDF)",
+        data=pdf_buffer,
+        file_name=f"{dataset_name}_report.pdf",
+        mime="application/pdf"
+    )
+else:
+    st.warning("⚠️ Please upload and process a dataset first.")
+# Chatbot
+chatbot_sidebar()

requirements.txt CHANGED Viewed

@@ -1,8 +1,7 @@
 streamlit
 pandas
 matplotlib
 seaborn
-scikit-learn
 reportlab
-python-dotenv
-groq

 streamlit
 pandas
+numpy
+scikit-learn
 matplotlib
 seaborn
 reportlab