# -*- coding: utf-8 -*-
"""
Accuracy Benchmark - Speed and accuracy measurement for sentiment analysis

Measures:
- Detection speed (ms per text)
- Accuracy per emotion
- Overall accuracy by polarity
- Confusion matrix
"""

import time
from typing import Dict, List, Tuple
from dataclasses import dataclass, field
from collections import defaultdict


@dataclass
class EmotionResult:
    """Result for a single emotion test"""
    emotion: str
    text: str
    expected_polarity: str
    detected_label: str
    detected_polarity: str
    is_correct: bool
    inference_time_ms: float
    emoji: str


@dataclass
class BenchmarkResults:
    """Aggregated benchmark results"""
    total_tests: int = 0
    correct_tests: int = 0
    accuracy: float = 0.0
    avg_inference_time_ms: float = 0.0
    min_inference_time_ms: float = 0.0
    max_inference_time_ms: float = 0.0
    median_inference_time_ms: float = 0.0
    emotion_accuracy: Dict[str, float] = field(default_factory=dict)
    emotion_results: Dict[str, List[EmotionResult]] = field(default_factory=dict)
    failed_emotions: List[str] = field(default_factory=list)
    confusion_matrix: Dict[str, Dict[str, int]] = field(default_factory=dict)


class AccuracyBenchmark:
    """
    Benchmark sentiment analysis accuracy and speed

    Tests all emotions and generates accuracy metrics
    """

    # Map transformer output to polarity
    LABEL_TO_POLARITY = {
        "happiness": "positive",
        "sadness": "negative",
        "positive": "positive",
        "negative": "negative",
        "neutral": "neutral",
        "joy": "positive",
        "anger": "negative",
        "fear": "negative",
        "surprise": "positive",
        "disgust": "negative",
    }

    def __init__(self, analyzer, emoji_mapper):
        """
        Initialize benchmark with analyzer and mapper

        Args:
            analyzer: SentimentAnalyzer instance
            emoji_mapper: EmojiMapper instance
        """
        self.analyzer = analyzer
        self.emoji_mapper = emoji_mapper

    def _get_polarity(self, label: str) -> str:
        """Map emotion label to polarity (positive/negative/neutral)"""
        return self.LABEL_TO_POLARITY.get(label.lower(), "neutral")

    def run_single_test(self, text: str, expected_polarity: str, emotion: str) -> EmotionResult:
        """Run a single test and return the result"""
        # Time the inference
        start_time = time.perf_counter()
        result = self.analyzer.analyze(text)
        end_time = time.perf_counter()
        inference_time_ms = (end_time - start_time) * 1000

        detected_label = result.get("label", "neutral")

        # Use polarity from result if available, otherwise infer from label
        if "polarity" in result:
            detected_polarity = result["polarity"]
        else:
            detected_polarity = self._get_polarity(detected_label)

        # Get emoji
        emoji = self.emoji_mapper.get_emoji(detected_label)

        # Check correctness (polarity match)
        is_correct = detected_polarity == expected_polarity

        return EmotionResult(
            emotion=emotion,
            text=text,
            expected_polarity=expected_polarity,
            detected_label=detected_label,
            detected_polarity=detected_polarity,
            is_correct=is_correct,
            inference_time_ms=inference_time_ms,
            emoji=emoji,
        )
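
    # Note on timing: if the underlying analyzer loads its model lazily, the
    # first analyze() call can dominate max/avg inference times. Whether that
    # happens depends on the SentimentAnalyzer implementation (an assumption
    # here, not something this module guarantees); a throwaway warm-up call
    # before run_benchmark() avoids the skew, e.g.:
    #
    #     benchmark.analyzer.analyze("warm up")  # discard the result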

    def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> BenchmarkResults:
        """
        Run full benchmark on test data

        Args:
            test_data: Dict mapping emotion -> [(text, expected_polarity), ...]

        Returns:
            BenchmarkResults with all metrics
        """
        results = BenchmarkResults()
        results.emotion_results = defaultdict(list)
        results.confusion_matrix = defaultdict(lambda: defaultdict(int))
        all_times = []

        for emotion, test_cases in test_data.items():
            emotion_correct = 0
            emotion_total = 0

            for text, expected_polarity in test_cases:
                result = self.run_single_test(text, expected_polarity, emotion)
                results.emotion_results[emotion].append(result)
                results.total_tests += 1
                emotion_total += 1
                all_times.append(result.inference_time_ms)

                # Update confusion matrix
                results.confusion_matrix[expected_polarity][result.detected_polarity] += 1

                if result.is_correct:
                    results.correct_tests += 1
                    emotion_correct += 1

            # Calculate per-emotion accuracy
            if emotion_total > 0:
                emotion_acc = emotion_correct / emotion_total
                results.emotion_accuracy[emotion] = emotion_acc
                if emotion_acc < 0.5:  # Less than 50% accuracy
                    results.failed_emotions.append(emotion)

        # Calculate overall metrics
        if results.total_tests > 0:
            results.accuracy = results.correct_tests / results.total_tests

        if all_times:
            all_times_sorted = sorted(all_times)
            results.avg_inference_time_ms = sum(all_times) / len(all_times)
            results.min_inference_time_ms = min(all_times)
            results.max_inference_time_ms = max(all_times)
            mid = len(all_times_sorted) // 2
            if len(all_times_sorted) % 2 == 0:
                results.median_inference_time_ms = (
                    all_times_sorted[mid - 1] + all_times_sorted[mid]
                ) / 2
            else:
                results.median_inference_time_ms = all_times_sorted[mid]

        return results

    def get_accuracy_report(self, results: BenchmarkResults) -> str:
        """Generate a human-readable accuracy report"""
        lines = [
            "=" * 70,
            "SENTIMENT ANALYSIS ACCURACY BENCHMARK",
            "=" * 70,
            "",
            f"Total Tests: {results.total_tests}",
            f"Correct: {results.correct_tests}",
            f"Overall Accuracy: {results.accuracy:.1%}",
            f"Avg Inference Time: {results.avg_inference_time_ms:.2f} ms",
            "",
            "-" * 70,
            "ACCURACY BY EMOTION (sorted by accuracy)",
            "-" * 70,
        ]

        # Sort emotions by accuracy, best first
        sorted_emotions = sorted(
            results.emotion_accuracy.items(),
            key=lambda x: x[1],
            reverse=True,
        )
        for emotion, acc in sorted_emotions:
            status = "✓" if acc >= 0.5 else "✗"
            lines.append(f"{status} {emotion:25} {acc:6.1%}")

        lines.extend([
            "",
            "-" * 70,
            "CONFUSION MATRIX (expected → detected)",
            "-" * 70,
        ])

        # Print confusion matrix; the header's leading padding matches the
        # 8-wide row label plus one space so the columns line up
        polarities = ["positive", "negative", "neutral"]
        header = " " * 9 + " ".join(f"{p:>10}" for p in polarities)
        lines.append(header)
        for expected in polarities:
            row = f"{expected:>8} "
            for detected in polarities:
                count = results.confusion_matrix[expected][detected]
                row += f"{count:>10} "
            lines.append(row)

        if results.failed_emotions:
            lines.extend([
                "",
                "-" * 70,
                f"FAILED EMOTIONS (< 50% accuracy): {len(results.failed_emotions)}",
                "-" * 70,
            ])
            for em in results.failed_emotions:
                lines.append(f" ✗ {em}")

        lines.append("=" * 70)
        return "\n".join(lines)


if __name__ == "__main__":
    # Quick test
    from avatar import SentimentAnalyzer, EmojiMapper

    analyzer = SentimentAnalyzer()
    mapper = EmojiMapper()
    benchmark = AccuracyBenchmark(analyzer, mapper)

    # Mini test
    test_data = {
        "happiness": [
            ("I am happy", "positive"),
            ("I am good", "positive"),
        ],
        "sadness": [
            ("I am sad", "negative"),
            ("I feel terrible", "negative"),
        ],
    }

    results = benchmark.run_benchmark(test_data)
    print(benchmark.get_accuracy_report(results))
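
    # A hedged extension of the quick test: derive a rough p95 latency from
    # the raw EmotionResult records, to show how emotion_results can be
    # post-processed. Illustrative only; not part of AccuracyBenchmark.
    times = sorted(
        r.inference_time_ms
        for per_emotion in results.emotion_results.values()
        for r in per_emotion
    )
    if times:
        p95_index = min(len(times) - 1, int(0.95 * len(times)))  # nearest-rank
        print(f"p95 inference time: {times[p95_index]:.2f} ms")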