Spaces:

VOIDER
/

UGI-Leaderboard-Presets

Running

File size: 11,022 Bytes

1c249d8

import pandas as pd
import numpy as np
import os, time, json
from datetime import datetime, timedelta
from config import *

class DataLoader:
    """Fetch, post-process and cache the leaderboard CSV as a DataFrame.

    The class relies on names provided by ``from config import *``:
    ``CSV_URL``, ``CACHE_FILE``, ``META_FILE``, ``CACHE_DURATION``,
    ``REPETITION_BASE``, ``THINKING_THRESHOLD``, ``THINKING_PENALTY_POWER``,
    the ``GAUSSIAN_*`` targets/sigmas, ``MIN_STD_THRESHOLD`` and the
    ``ROBUST_QUANTILE_*`` bounds.

    Attributes:
        df: The processed DataFrame, or ``None`` before ``load_data`` ran.
        last_updated: ``"%Y-%m-%d %H:%M"`` string of the cache timestamp,
            or ``"Unknown"`` when no timestamp could be determined.
    """

    _TIME_FORMAT = "%Y-%m-%d %H:%M"

    # group name -> (columns, divisor). Columns are coerced to numeric and
    # divided by the divisor when it is > 1; 'already_norm' columns are also
    # clipped to [0, 1].
    _COLUMN_GROUPS = {
        'percentage': (['Textbook', 'Pop Culture', 'Dialogue_Percentage', 'Verb_to_Noun_Ratio',
                        'Show Rec Correlation', 'avg_length_error_pct'], 100.0),
        'already_norm': (['avg_writing_style_score', 'originality_score', 'internal_semantic_redundancy',
                          'lexical_stuckness', 'wm_recipe_percent_error_score', 'wm_geoguessr_mae_score',
                          'wm_weight_percent_error_score', 'wm_music_mae_score'], 1.0),
        'numeric': (['Total Parameters', 'Active Parameters', 'Repetition Interrupts', 'Avg Thinking Chars'], 1.0),
        'scale_10': (['avg_nsfw_score', 'avg_dark_score', 'Hazardous', 'Entertainment',
                      'SocPol', 'W/10-Direct', 'W/10-Adherence'], 10.0),
    }

    # destination column -> (source column, mode)
    _NORMALIZATION_PLAN = {
        # Direct normalization (higher source value = better)
        'norm_Textbook': ('Textbook', 'direct'),
        'norm_PopCulture': ('Pop Culture', 'direct'),
        'norm_ShowRec': ('Show Rec Correlation', 'direct'),
        'norm_Style': ('avg_writing_style_score', 'direct'),
        'norm_Originality': ('originality_score', 'direct'),
        'norm_NSFW': ('avg_nsfw_score', 'direct'),
        'norm_Dark': ('avg_dark_score', 'direct'),
        'norm_Hazardous': ('Hazardous', 'direct'),
        'norm_Entertainment': ('Entertainment', 'direct'),
        'norm_Instruction': ('W/10-Adherence', 'direct'),
        'norm_Unbound_Direct': ('W/10-Direct', 'direct'),
        # World model (direct)
        'norm_Recipe': ('wm_recipe_percent_error_score', 'direct'),
        'norm_Geo': ('wm_geoguessr_mae_score', 'direct'),
        'norm_Weight': ('wm_weight_percent_error_score', 'direct'),
        'norm_Music': ('wm_music_mae_score', 'direct'),
        # Inverse normalization (higher source value = worse)
        'inv_Semantic': ('internal_semantic_redundancy', 'inverse'),
        'inv_Lexical': ('lexical_stuckness', 'inverse'),
        'inv_LengthErr': ('avg_length_error_pct', 'inverse'),
    }

    # composite column -> columns averaged row-wise (NaN if any part is NaN)
    _COMPOSITES = {
        'Composite_WorldModel': ['norm_Recipe', 'norm_Geo', 'norm_Weight', 'norm_Music'],
        'Composite_Unbound': ['norm_Unbound_Direct', 'norm_Entertainment', 'norm_Hazardous'],
        'Composite_Redundancy': ['inv_Semantic', 'inv_Lexical'],
    }

    # Columns whose missing values are replaced by -99999.
    _FILL_LOW_COLUMNS = [
        'Show Rec Correlation', 'norm_Textbook', 'norm_PopCulture', 'norm_ShowRec',
        'norm_Style', 'norm_Originality', 'Composite_WorldModel', 'Composite_Unbound',
        'norm_Recipe', 'norm_Geo', 'norm_Weight', 'norm_Music',
    ]

    # Columns whose missing values are replaced by +99999.
    # NOTE(review): the inv_* columns are produced by _inverse_normalize, where
    # a higher value corresponds to a lower (better) source value, yet they
    # receive the "lower is better" sentinel. Kept as-is because consumers of
    # these columns are not visible here - confirm the intended sort order.
    _FILL_HIGH_COLUMNS = [
        'avg_length_error_pct', 'internal_semantic_redundancy', 'lexical_stuckness',
        'inv_Semantic', 'inv_Lexical', 'inv_LengthErr',
    ]

    def __init__(self):
        self.df, self.last_updated = None, "Unknown"

    # ------------------------------------------------------------------
    # Loading and caching
    # ------------------------------------------------------------------
    def load_data(self, force_refresh=False):
        """Load the leaderboard data, optionally forcing a refresh.

        Args:
            force_refresh: When true, skip the cache freshness check and
                fetch the CSV again.

        Returns:
            The processed DataFrame (also stored on ``self.df``). If fetching
            or processing fails, the cached frame is returned instead, or an
            empty DataFrame when no usable cache exists.
        """
        if force_refresh or self._needs_update():
            print("🔄 Cache expired or missing. Fetching fresh data...")
            try:
                fresh = self._process_data(pd.read_csv(CSV_URL, on_bad_lines='skip'))
            except Exception as e:
                print(f"⚠️ Error fetching data: {e}")
                self.df = self._read_cache_or_empty()
                self._load_meta()
            else:
                self.df = fresh
                # Persisting is best-effort: a failed cache write must not
                # throw away data that was fetched and processed successfully.
                try:
                    self._save_cache()
                except Exception as e:
                    print(f"⚠️ Failed to write cache: {e}")
                    self.last_updated = self._format_timestamp(time.time())
                print(f"✅ Data processed. Rows: {len(self.df)}")
        else:
            print("⚡ Loading from cache.")
            self.df = pd.read_parquet(CACHE_FILE)
            self._load_meta()
        return self.df

    def _read_cache_or_empty(self):
        """Return the cached frame, or an empty DataFrame if it is unusable."""
        if not os.path.exists(CACHE_FILE):
            return pd.DataFrame()
        try:
            return pd.read_parquet(CACHE_FILE)
        except Exception as e:
            print(f"⚠️ Failed to read cache {CACHE_FILE}: {e}")
            return pd.DataFrame()

    def _needs_update(self):
        """Return True when the cache is missing, unreadable or expired."""
        if not (os.path.exists(CACHE_FILE) and os.path.exists(META_FILE)):
            return True
        try:
            with open(META_FILE) as f:
                stamp = json.load(f).get('timestamp', 0)
            return (time.time() - stamp) > CACHE_DURATION
        except (OSError, ValueError, TypeError, AttributeError):
            # Unreadable / malformed metadata is treated as an expired cache.
            return True

    def clear_cache(self):
        """Delete the cache and metadata files.

        Returns:
            The list of paths that were actually removed.
        """
        deleted = []
        for path in (CACHE_FILE, META_FILE):
            if not os.path.exists(path):
                continue
            try:
                os.remove(path)
            except Exception as e:
                print(f"⚠️ Failed to delete {path}: {e}")
            else:
                deleted.append(path)
        if deleted:
            print(f"🗑️ Cleared cache: {', '.join(deleted)}")
        return deleted

    def _save_cache(self):
        """Write ``self.df`` and a timestamp file, then update ``last_updated``."""
        self.df.to_parquet(CACHE_FILE)
        # One clock reading so the stored timestamp and last_updated agree.
        now = time.time()
        with open(META_FILE, 'w') as f:
            json.dump({'timestamp': now}, f)
        self.last_updated = self._format_timestamp(now)

    def _load_meta(self):
        """Set ``last_updated`` from the metadata file, if it can be read."""
        try:
            with open(META_FILE) as f:
                stamp = json.load(f)['timestamp']
            self.last_updated = self._format_timestamp(stamp)
        except (OSError, ValueError, TypeError, KeyError, OverflowError):
            # Best effort: keep the previous last_updated value.
            pass

    @classmethod
    def _format_timestamp(cls, stamp):
        """Format a POSIX timestamp as local ``YYYY-MM-DD HH:MM``."""
        return datetime.fromtimestamp(stamp).strftime(cls._TIME_FORMAT)

    # ------------------------------------------------------------------
    # Per-value helpers
    # ------------------------------------------------------------------
    def _clean_column(self, series, scale=1.0):
        """Coerce a column to numeric and optionally scale it down.

        String/object columns have trailing ``%`` characters removed first.
        Values that cannot be parsed become NaN. The result is divided by
        ``scale`` only when ``scale`` is greater than 1.
        """
        if pd.api.types.is_string_dtype(series) or series.dtype == 'object':
            series = series.astype(str).str.rstrip('%')
        series = pd.to_numeric(series, errors='coerce')
        return series / scale if scale > 1 else series

    @staticmethod
    def _as_bool(value):
        """Interpret a cell as a flag.

        Strings are true only when they equal ``TRUE`` (case-insensitive,
        surrounding whitespace ignored); missing values (None/NaN) are false;
        anything else uses its ordinary truthiness.
        """
        if isinstance(value, str):
            return value.strip().upper() == 'TRUE'
        if pd.isna(value):
            # NaN is truthy in Python, so it must be rejected explicitly.
            return False
        return bool(value)

    def _get_model_type(self, row):
        """Classify a row for sorting.

        Args:
            row: A mapping-like row (anything with ``.get``), e.g. a Series.

        Returns:
            A ``(sort_value, short_code, full_name)`` tuple.
        """
        total_params = row.get('Total Parameters')
        if pd.isna(total_params) or total_params <= 0:
            return (3, 'P', 'Proprietary')

        is_foundation = self._as_bool(row.get('Is Foundation', False))
        is_merged = self._as_bool(row.get('Is Merged', False))

        if is_foundation and not is_merged:
            return (0, 'B', 'Base')
        if is_merged:
            return (2, 'M', 'Merge')
        if self._as_bool(row.get('Is Finetuned', False)):
            return (1, 'F', 'Finetune')

        return (4, '', 'Unknown')

    # ------------------------------------------------------------------
    # Processing pipeline
    # ------------------------------------------------------------------
    def _process_data(self, df):
        """Run the full processing pipeline on ``df`` in place and return it.

        The stages run in a fixed order because later stages read columns
        created by earlier ones.
        """
        print("⚙️ Processing pipeline started...")
        df.columns = df.columns.str.strip()

        self._coerce_numeric_columns(df)      # 1. column groups
        self._coerce_flags_and_strings(df)    # 2. booleans & strings
        self._assign_model_types(df)          # 3. model types
        self._assign_dates(df)                #    ... and dates
        self._assign_penalties(df)            # 4. penalties

        # 5. Gaussian scores
        df['gauss_Dialogue'] = self._gaussian_score(df['Dialogue_Percentage'], GAUSSIAN_DIALOGUE_TARGET, GAUSSIAN_DIALOGUE_SIGMA)
        df['gauss_VerbNoun'] = self._gaussian_score(df['Verb_to_Noun_Ratio'], GAUSSIAN_VERBNOUN_TARGET, GAUSSIAN_VERBNOUN_SIGMA)

        self._assign_normalized_scores(df)    # 6. normalization
        self._assign_composites(df)           # 7. composites

        # 8. Sentinel filling must come last so composites still see NaN.
        print("🔧 Applying smart NA handling for sorting...")
        self._fill_sort_sentinels(df)

        print("✅ Processing complete!")
        return df

    def _coerce_numeric_columns(self, df):
        """Make every known metric column numeric; create missing ones as NaN."""
        for group, (cols, scale) in self._COLUMN_GROUPS.items():
            for col in cols:
                if col not in df.columns:
                    df[col] = np.nan
                    continue
                df[col] = self._clean_column(df[col], scale)
                if group == 'already_norm':
                    df[col] = df[col].clip(0, 1.0)

    def _coerce_flags_and_strings(self, df):
        """Normalize 'Is Thinking Model' to bool and 'Architecture' to text."""
        if 'Is Thinking Model' in df.columns:
            df['Is Thinking Model'] = (
                df['Is Thinking Model'].astype(str).fillna('FALSE').str.strip().str.upper() == 'TRUE'
            )
        else:
            df['Is Thinking Model'] = False

        # df.get(..., 'Unknown') would hand back a plain str when the column
        # is absent, so the missing-column case is handled explicitly.
        if 'Architecture' in df.columns:
            df['Architecture'] = df['Architecture'].fillna('Unknown').replace('null', 'Unknown')
        else:
            df['Architecture'] = 'Unknown'

    def _assign_model_types(self, df):
        """Add '_type_sort', 'Type_Code' and 'Type_Name' from _get_model_type."""
        type_data = df.apply(self._get_model_type, axis=1)
        for position, column in enumerate(('_type_sort', 'Type_Code', 'Type_Name')):
            df[column] = type_data.apply(lambda info, i=position: info[i])

    def _assign_dates(self, df):
        """Add 'Is_New' (tested within the last 7 days) and reformat 'Test Date'."""
        if 'Test Date' not in df.columns:
            df['Is_New'] = False
            return
        dates = pd.to_datetime(df['Test Date'], format='%m/%d/%Y', errors='coerce')
        week_ago = datetime.now() - timedelta(days=7)
        # Unparseable dates are NaT, and NaT >= anything is False.
        df['Is_New'] = dates >= week_ago
        df['Test Date'] = dates.dt.strftime('%Y-%m-%d')

    def _assign_penalties(self, df):
        """Add 'penalty_repetition' and 'penalty_thinking' multipliers."""
        df['penalty_repetition'] = REPETITION_BASE ** df['Repetition Interrupts'].fillna(0)

        chars = df['Avg Thinking Chars'].fillna(0)
        df['penalty_thinking'] = np.where(
            df['Is Thinking Model'] & (chars > THINKING_THRESHOLD),
            # The small epsilon avoids dividing by zero for chars == 0.
            np.power(THINKING_THRESHOLD / (chars + 1e-6), THINKING_PENALTY_POWER).clip(upper=1.0),
            1.0
        )

    def _assign_normalized_scores(self, df):
        """Add the norm_* / inv_* columns described by _NORMALIZATION_PLAN."""
        for dest, (src, mode) in self._NORMALIZATION_PLAN.items():
            if src not in df.columns:
                df[dest] = np.nan
            elif mode == 'inverse':
                df[dest] = self._inverse_normalize(df[src])
            else:
                df[dest] = self._robust_normalize(df[src])

    def _assign_composites(self, df):
        """Add composite columns as row means; NaN if any component is NaN."""
        for comp, cols in self._COMPOSITES.items():
            df[comp] = df[cols].mean(axis=1, skipna=False)

    def _fill_sort_sentinels(self, df):
        """Replace NaN with -99999 / +99999 sentinels in the configured columns."""
        for col in self._FILL_LOW_COLUMNS:
            if col in df.columns:
                df[col] = df[col].fillna(-99999)
        for col in self._FILL_HIGH_COLUMNS:
            if col in df.columns:
                df[col] = df[col].fillna(99999)

    # ------------------------------------------------------------------
    # Scoring primitives
    # ------------------------------------------------------------------
    def _quantile_clip(self, series):
        """Clip ``series`` to its configured low/high quantiles.

        Returns:
            ``(clipped, low, high)``, or ``None`` when the column is empty,
            (near-)constant, or its quantile range is below
            ``MIN_STD_THRESHOLD`` - i.e. when dividing by the range would be
            meaningless.
        """
        valid = series.dropna()
        if valid.empty or valid.std() < MIN_STD_THRESHOLD:
            return None
        low, high = valid.quantile(ROBUST_QUANTILE_LOW), valid.quantile(ROBUST_QUANTILE_HIGH)
        # Also covers a single valid value, whose std is NaN and therefore
        # slips past the comparison above.
        if abs(high - low) < MIN_STD_THRESHOLD:
            return None
        return series.clip(low, high), low, high

    def _robust_normalize(self, series):
        """Scale to [0, 1] between the low/high quantiles (higher stays higher).

        Returns an all-NaN Series when the column is degenerate.
        """
        bounds = self._quantile_clip(series)
        if bounds is None:
            return pd.Series(np.nan, index=series.index)
        clipped, low, high = bounds
        return (clipped - low) / (high - low)

    def _inverse_normalize(self, series):
        """Scale to [0, 1] between the quantiles with the direction flipped.

        The lowest source values map to 1 and the highest to 0. Returns an
        all-NaN Series when the column is degenerate.
        """
        bounds = self._quantile_clip(series)
        if bounds is None:
            return pd.Series(np.nan, index=series.index)
        clipped, low, high = bounds
        return (high - clipped) / (high - low)

    def _gaussian_score(self, series, target, sigma):
        """Return exp(-(x - target)^2 / (2 * sigma^2)) for each value."""
        return np.exp(-((series - target) ** 2) / (2 * sigma ** 2))

# Module-level DataLoader instance created at import time. Constructing it
# performs no I/O: df starts as None and data is only read by load_data().
loader = DataLoader()