File size: 6,759 Bytes
a597782
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c249d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import pandas as pd
import numpy as np
from scoring import METRIC_MAP, PRESET_CONFIGS
from config import POCKET_MODEL_THRESHOLD

class DevSuite:
    """Diagnostic self-test suite for a scored model DataFrame.

    Each ``_test_*`` method either records an issue (bucketed by severity:
    critical / medium / low) or increments the passed-test counter.
    ``run_all_tests()`` executes the whole battery and renders the
    accumulated results as a Markdown report string.
    """

    def __init__(self, df):
        # df: the scored leaderboard DataFrame under test (may be None).
        self.df = df
        # Severity counters plus per-severity message buckets; "statistics"
        # maps normalized column name -> {min, max, mean, std}.
        self.report = {
            "summary": {"critical": 0, "medium": 0, "low": 0, "tests_passed": 0},
            "critical_issues": [], 
            "medium_issues": [], 
            "low_issues": [], 
            "anomalies": [], 
            "statistics": {}
        }

    def run_all_tests(self):
        """Run every check and return the Markdown report.

        Returns early (with a critical issue) when the DataFrame is
        missing or empty, since no other check is meaningful then.
        """
        if self.df is None or self.df.empty:
            self._add_issue("critical", "DataFrame is empty or None.")
            return self._generate_markdown_report()

        self._test_normalization_bounds()
        self._test_parameter_scaling()
        self._test_badges_logic()
        self._test_weight_sums()
        self._test_score_ranges()
        self._collect_normalization_stats()

        return self._generate_markdown_report()

    def get_anomalies_df(self):
        """Return recorded anomalies as a DataFrame (empty if none)."""
        return pd.DataFrame(self.report["anomalies"]) if self.report["anomalies"] else pd.DataFrame()

    def get_statistics_df(self):
        """Return collected statistics as a DataFrame, one row per column."""
        return pd.DataFrame(self.report["statistics"]).T if self.report["statistics"] else pd.DataFrame()

    def _test_normalization_bounds(self):
        """Check if normalized metrics are within [0, 1]."""
        norm_cols = [v[0] for v in METRIC_MAP.values() if v[0] in self.df.columns]

        for col in norm_cols:
            # We no longer use sentinel values, so just drop NaNs
            values = self.df[col].dropna()

            if values.empty:
                continue

            # Compute bounds once; reused in both the check and the message.
            vmin, vmax = values.min(), values.max()

            # Allow for tiny floating point errors (-1e-6 to 1.0+1e-6)
            if vmin < -1e-6 or vmax > 1.0 + 1e-6:
                self._add_issue("critical", f"Normalization bounds broken in '{col}': Range [{vmin:.3f}, {vmax:.3f}]")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_parameter_scaling(self):
        """Check if parameters look scaled correctly (Billion scale, not Million or arbitrary)."""
        if 'Total Parameters' in self.df.columns:
            # Check maximum parameter value in the dataset
            # If the largest model is < 100, we might have a scaling issue (e.g., Llama 405B missing or scaled down)
            max_params = self.df['Total Parameters'].max()

            if pd.isna(max_params) or max_params == 0:
                self._add_issue("critical", "Parameter column appears empty or zeroed.")
            elif max_params < 100:
                self._add_issue("critical", f"Parameter scaling suspicious: Max parameter found is {max_params}. Expected > 100 for large models.")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_badges_logic(self):
        """Check if badges are assigned consistent with config thresholds."""
        if 'Badges' in self.df.columns and 'Total Parameters' in self.df.columns:
            # Check Pocket Badge: models flagged 🀏 must be under the size threshold.
            pocket_models = self.df[self.df['Badges'].astype(str).str.contains("🀏", na=False)]

            if not pocket_models.empty:
                max_pocket_params = pocket_models['Total Parameters'].max()
                # Allow small buffer for float comparison
                if max_pocket_params > POCKET_MODEL_THRESHOLD + 0.1:
                    self._add_issue("medium", f"Pocket badge assigned to model with {max_pocket_params}B params (Threshold: {POCKET_MODEL_THRESHOLD}B).")
                else:
                    self.report["summary"]["tests_passed"] += 1
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_weight_sums(self):
        """Check if preset weights sum to 1.0."""
        for preset, weights in PRESET_CONFIGS.items():
            # Skip non-dict entries and special-typed presets that don't carry plain weights.
            if isinstance(weights, dict) and 'special_type' not in weights:
                total_weight = sum(weights.values())
                if abs(total_weight - 1.0) > 1e-4:
                    self._add_issue("medium", f"Preset '{preset}' weights sum to {total_weight:.2f} (expected 1.0)")
                else:
                    self.report["summary"]["tests_passed"] += 1

    def _test_score_ranges(self):
        """Check if final scores are within reasonable bounds [0, 1.1]."""
        score_cols = [c for c in self.df.columns if c.startswith("Score_")]
        for col in score_cols:
            if 'Efficiency' in col:
                continue  # Efficiency score is not normalized to 0-1

            vals = self.df[col].dropna()
            if not vals.empty:
                # Compute bounds once; reused in both the check and the message.
                vmin, vmax = vals.min(), vals.max()
                if vmin < 0 or vmax > 1.1:
                    self._add_issue("medium", f"Score out of range in {col}: [{vmin:.2f}, {vmax:.2f}]")
                    continue
            # Empty or in-range column counts as a pass.
            self.report["summary"]["tests_passed"] += 1

    def _collect_normalization_stats(self):
        """Collect min/max/mean/std statistics for normalized columns."""
        norm_cols = [v[0] for v in METRIC_MAP.values() if v[0] in self.df.columns]
        for col in norm_cols:
            values = self.df[col].dropna()

            if values.empty:
                self.report["statistics"][col] = {"min": 0, "max": 0, "mean": 0, "std": 0}
                continue

            # pandas std() uses ddof=1 and yields NaN for a single sample;
            # fall back to 0.0 so downstream display stays numeric.
            std = values.std()
            self.report["statistics"][col] = {
                "min": float(values.min()),
                "max": float(values.max()),
                "mean": float(values.mean()),
                "std": float(std) if pd.notna(std) else 0.0
            }

    def _add_issue(self, level, message):
        """Record *message* under severity *level* ('critical'|'medium'|'low')."""
        self.report["summary"][level] += 1
        self.report[f"{level}_issues"].append(message)

    def _generate_markdown_report(self):
        """Render the accumulated report as a Markdown string."""
        r = self.report
        md = [
            f"## Executive Summary",
            f"- **Tests Passed**: {r['summary']['tests_passed']}",
            f"- **Critical Issues**: {r['summary']['critical']}",
            f"- **Medium Issues**: {r['summary']['medium']}",
            # Low issues were counted and rendered below but missing from
            # the summary block; include them for consistency.
            f"- **Low Issues**: {r['summary']['low']}"
        ]

        if r['critical_issues']:
            md.append("\n### πŸ”΄ Critical Issues")
            md.extend([f"- {i}" for i in r['critical_issues']])

        if r['medium_issues']:
            md.append("\n### 🟠 Medium Issues")
            md.extend([f"- {i}" for i in r['medium_issues']])

        if r['low_issues']:
            md.append("\n### 🟑 Low Issues")
            md.extend([f"- {i}" for i in r['low_issues']])

        # Low-severity issues alone do not flip the healthy status.
        if not r['critical_issues'] and not r['medium_issues']:
            md.append("\n### βœ… System Status: Healthy")

        return "\n".join(md)