import pandas as pd
import numpy as np

from scoring import METRIC_MAP, PRESET_CONFIGS
from config import POCKET_MODEL_THRESHOLD


class DevSuite:
    """Self-diagnostic suite for a scored model-leaderboard DataFrame.

    Runs a battery of sanity checks (normalization bounds, parameter
    scaling, badge assignment, preset weight sums, score ranges),
    collects per-column statistics, and renders everything as a
    markdown report.
    """

    def __init__(self, df):
        # df: the scored leaderboard DataFrame under test (may be None or empty).
        self.df = df
        self.report = {
            "summary": {"critical": 0, "medium": 0, "low": 0, "tests_passed": 0},
            "critical_issues": [],
            "medium_issues": [],
            "low_issues": [],
            "anomalies": [],
            "statistics": {},
        }

    def run_all_tests(self):
        """Run every check and return the markdown report string."""
        if self.df is None or self.df.empty:
            self._add_issue("critical", "DataFrame is empty or None.")
            return self._generate_markdown_report()
        self._test_normalization_bounds()
        self._test_parameter_scaling()
        self._test_badges_logic()
        self._test_weight_sums()
        self._test_score_ranges()
        self._collect_normalization_stats()
        return self._generate_markdown_report()

    def get_anomalies_df(self):
        """Return recorded anomalies as a DataFrame (empty if none)."""
        anomalies = self.report["anomalies"]
        return pd.DataFrame(anomalies) if anomalies else pd.DataFrame()

    def get_statistics_df(self):
        """Return per-column normalization statistics as a DataFrame."""
        stats = self.report["statistics"]
        return pd.DataFrame(stats).T if stats else pd.DataFrame()

    def _normalized_columns(self):
        """Return the normalized metric columns present in the DataFrame.

        METRIC_MAP values are tuples whose first element is the normalized
        column name; keep only those that actually exist in ``self.df``.
        (Shared by the bounds test and the statistics collector.)
        """
        return [v[0] for v in METRIC_MAP.values() if v[0] in self.df.columns]

    def _test_normalization_bounds(self):
        """Check that every normalized metric lies within [0, 1]."""
        for col in self._normalized_columns():
            # We no longer use sentinel values, so just drop NaNs.
            values = self.df[col].dropna()
            if values.empty:
                continue
            # Allow for tiny floating point errors (-1e-6 to 1.0+1e-6).
            if values.min() < -1e-6 or values.max() > 1.0 + 1e-6:
                self._add_issue(
                    "critical",
                    f"Normalization bounds broken in '{col}': "
                    f"Range [{values.min():.3f}, {values.max():.3f}]",
                )
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_parameter_scaling(self):
        """Check that parameters look scaled correctly (billion scale).

        If the largest model is < 100 we might have a scaling issue
        (e.g. Llama 405B missing or scaled down to millions).
        """
        if 'Total Parameters' not in self.df.columns:
            return
        max_params = self.df['Total Parameters'].max()
        if pd.isna(max_params) or max_params == 0:
            self._add_issue("critical", "Parameter column appears empty or zeroed.")
        elif max_params < 100:
            self._add_issue("critical", f"Parameter scaling suspicious: Max parameter found is {max_params}. Expected > 100 for large models.")
        else:
            self.report["summary"]["tests_passed"] += 1

    def _test_badges_logic(self):
        """Check that badges are assigned consistent with config thresholds."""
        if 'Badges' not in self.df.columns or 'Total Parameters' not in self.df.columns:
            return
        # Pocket badge: models marked 🤏 must sit under POCKET_MODEL_THRESHOLD.
        pocket_models = self.df[self.df['Badges'].astype(str).str.contains("🤏", na=False)]
        if pocket_models.empty:
            self.report["summary"]["tests_passed"] += 1
            return
        max_pocket_params = pocket_models['Total Parameters'].max()
        # Allow a small buffer for float comparison.
        if max_pocket_params > POCKET_MODEL_THRESHOLD + 0.1:
            self._add_issue("medium", f"Pocket badge assigned to model with {max_pocket_params}B params (Threshold: {POCKET_MODEL_THRESHOLD}B).")
        else:
            self.report["summary"]["tests_passed"] += 1

    def _test_weight_sums(self):
        """Check that each preset's weights sum to 1.0."""
        for preset, weights in PRESET_CONFIGS.items():
            # Skip non-dict presets and special (non-weighted) configs.
            if not isinstance(weights, dict) or 'special_type' in weights:
                continue
            total_weight = sum(weights.values())
            if abs(total_weight - 1.0) > 1e-4:
                self._add_issue("medium", f"Preset '{preset}' weights sum to {total_weight:.2f} (expected 1.0)")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_score_ranges(self):
        """Check that final scores lie within reasonable bounds [0, 1.1]."""
        score_cols = [c for c in self.df.columns if c.startswith("Score_")]
        for col in score_cols:
            if 'Efficiency' in col:
                continue  # Efficiency score is not normalized to 0-1
            vals = self.df[col].dropna()
            if not vals.empty and (vals.min() < 0 or vals.max() > 1.1):
                self._add_issue("medium", f"Score out of range in {col}: [{vals.min():.2f}, {vals.max():.2f}]")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _collect_normalization_stats(self):
        """Collect min/max/mean/std statistics for normalized columns."""
        for col in self._normalized_columns():
            values = self.df[col].dropna()
            if values.empty:
                self.report["statistics"][col] = {"min": 0, "max": 0, "mean": 0, "std": 0}
                continue
            self.report["statistics"][col] = {
                "min": float(values.min()),
                "max": float(values.max()),
                "mean": float(values.mean()),
                # pandas sample std (ddof=1) is NaN for a single value;
                # report 0 instead so the stats stay finite/serializable.
                "std": float(values.std()) if len(values) > 1 else 0.0,
            }

    def _add_issue(self, level, message):
        """Record an issue and bump its severity counter.

        level must be one of 'critical', 'medium', 'low' (matches both the
        summary counter keys and the '<level>_issues' list keys).
        """
        self.report["summary"][level] += 1
        self.report[f"{level}_issues"].append(message)

    def _generate_markdown_report(self):
        """Render the collected report as a markdown string."""
        r = self.report
        md = [
            "## Executive Summary",
            f"- **Tests Passed**: {r['summary']['tests_passed']}",
            f"- **Critical Issues**: {r['summary']['critical']}",
            f"- **Medium Issues**: {r['summary']['medium']}",
            # Low issues were listed below but previously missing from the
            # summary counts; include them for consistency.
            f"- **Low Issues**: {r['summary']['low']}",
        ]
        if r['critical_issues']:
            md.append("\n### 🔴 Critical Issues")
            md.extend([f"- {i}" for i in r['critical_issues']])
        if r['medium_issues']:
            md.append("\n### 🟠 Medium Issues")
            md.extend([f"- {i}" for i in r['medium_issues']])
        if r['low_issues']:
            md.append("\n### 🟡 Low Issues")
            md.extend([f"- {i}" for i in r['low_issues']])
        if not r['critical_issues'] and not r['medium_issues']:
            md.append("\n### ✅ System Status: Healthy")
        return "\n".join(md)