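"""Self-diagnostic suite for the scored model DataFrame.

Runs sanity checks (normalization bounds, parameter scaling, badge logic,
preset weight sums, score ranges) and renders the results as a Markdown
report.
"""
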
import pandas as pd
import numpy as np
from scoring import METRIC_MAP, PRESET_CONFIGS
from config import POCKET_MODEL_THRESHOLD

class DevSuite:
    def __init__(self, df):
        self.df = df
        self.report = {
            "summary": {"critical": 0, "medium": 0, "low": 0, "tests_passed": 0},
            "critical_issues": [],
            "medium_issues": [],
            "low_issues": [],
            "anomalies": [],
            "statistics": {}
        }

    def run_all_tests(self):
        if self.df is None or self.df.empty:
            self._add_issue("critical", "DataFrame is empty or None.")
            return self._generate_markdown_report()
        self._test_normalization_bounds()
        self._test_parameter_scaling()
        self._test_badges_logic()
        self._test_weight_sums()
        self._test_score_ranges()
        self._collect_normalization_stats()
        return self._generate_markdown_report()

    def get_anomalies_df(self):
        return pd.DataFrame(self.report["anomalies"]) if self.report["anomalies"] else pd.DataFrame()

    def get_statistics_df(self):
        return pd.DataFrame(self.report["statistics"]).T if self.report["statistics"] else pd.DataFrame()

    def _test_normalization_bounds(self):
        """Check if normalized metrics are within [0, 1]."""
        norm_cols = [v[0] for v in METRIC_MAP.values() if v[0] in self.df.columns]
        for col in norm_cols:
            # We no longer use sentinel values, so just drop NaNs
            values = self.df[col].dropna()
            if values.empty:
                continue
            # Allow for tiny floating point errors (-1e-6 to 1.0+1e-6)
            if values.min() < -1e-6 or values.max() > 1.0 + 1e-6:
                self._add_issue("critical", f"Normalization bounds broken in '{col}': Range [{values.min():.3f}, {values.max():.3f}]")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_parameter_scaling(self):
        """Check if parameters look scaled correctly (Billion scale, not Million or arbitrary)."""
        if 'Total Parameters' in self.df.columns:
            # Check maximum parameter value in the dataset.
            # If the largest model is < 100, we might have a scaling issue
            # (e.g., Llama 405B missing or scaled down).
            max_params = self.df['Total Parameters'].max()
            if pd.isna(max_params) or max_params == 0:
                self._add_issue("critical", "Parameter column appears empty or zeroed.")
            elif max_params < 100:
                self._add_issue("critical", f"Parameter scaling suspicious: Max parameter found is {max_params}. Expected > 100 for large models.")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_badges_logic(self):
        """Check if badges are assigned consistently with config thresholds."""
        if 'Badges' in self.df.columns and 'Total Parameters' in self.df.columns:
            # Check Pocket Badge
            pocket_models = self.df[self.df['Badges'].astype(str).str.contains("🤏", na=False)]
            if not pocket_models.empty:
                max_pocket_params = pocket_models['Total Parameters'].max()
                # Allow small buffer for float comparison
                if max_pocket_params > POCKET_MODEL_THRESHOLD + 0.1:
                    self._add_issue("medium", f"Pocket badge assigned to model with {max_pocket_params}B params (Threshold: {POCKET_MODEL_THRESHOLD}B).")
                else:
                    self.report["summary"]["tests_passed"] += 1
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_weight_sums(self):
        """Check if preset weights sum to 1.0."""
        for preset, weights in PRESET_CONFIGS.items():
            if isinstance(weights, dict) and 'special_type' not in weights:
                total_weight = sum(weights.values())
                if abs(total_weight - 1.0) > 1e-4:
                    self._add_issue("medium", f"Preset '{preset}' weights sum to {total_weight:.2f} (expected 1.0)")
                else:
                    self.report["summary"]["tests_passed"] += 1

    def _test_score_ranges(self):
        """Check if final scores are within reasonable bounds [0, 1.1]."""
        score_cols = [c for c in self.df.columns if c.startswith("Score_")]
        for col in score_cols:
            if 'Efficiency' in col:
                continue  # Efficiency score is not normalized to 0-1
            vals = self.df[col].dropna()
            if not vals.empty and (vals.min() < 0 or vals.max() > 1.1):
                self._add_issue("medium", f"Score out of range in {col}: [{vals.min():.2f}, {vals.max():.2f}]")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _collect_normalization_stats(self):
        """Collect statistics for normalized columns."""
        norm_cols = [v[0] for v in METRIC_MAP.values() if v[0] in self.df.columns]
        for col in norm_cols:
            values = self.df[col].dropna()
            self.report["statistics"][col] = {
                "min": float(values.min()) if not values.empty else 0,
                "max": float(values.max()) if not values.empty else 0,
                "mean": float(values.mean()) if not values.empty else 0,
                "std": float(values.std()) if not values.empty else 0
            }

    def _add_issue(self, level, message):
        self.report["summary"][level] += 1
        self.report[f"{level}_issues"].append(message)

    def _generate_markdown_report(self):
        r = self.report
        md = [
            "## Executive Summary",
            f"- **Tests Passed**: {r['summary']['tests_passed']}",
            f"- **Critical Issues**: {r['summary']['critical']}",
            f"- **Medium Issues**: {r['summary']['medium']}",
            f"- **Low Issues**: {r['summary']['low']}"
        ]
        if r['critical_issues']:
            md.append("\n### 🔴 Critical Issues")
            md.extend([f"- {i}" for i in r['critical_issues']])
        if r['medium_issues']:
            md.append("\n### 🟠 Medium Issues")
            md.extend([f"- {i}" for i in r['medium_issues']])
        if r['low_issues']:
            md.append("\n### 🟡 Low Issues")
            md.extend([f"- {i}" for i in r['low_issues']])
        if not r['critical_issues'] and not r['medium_issues']:
            md.append("\n### ✅ System Status: Healthy")
        return "\n".join(md)