"""
AEGIS Bio-Digital Lab 10 - Protein Structure Prediction Interface
Artificially Expanded Genetic Information System (AEGIS)
Strategic Precognition through Advanced Protein Structure Analysis

Gaston Software Solutions Tec | Tel: +256755274944
"Time Travel" System - Calculating causal ripples of today's events

Version: 2.1 - Fixed Unicode syntax errors for deployment
"""

import gradio as gr
import os
import tempfile
import time
from pathlib import Path
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import ProtParam
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from huggingface_hub import hf_hub_download, list_repo_files, HfApi
import requests
import json
from difflib import SequenceMatcher
import warnings
warnings.filterwarnings('ignore')


class AEGISLearningSystem:
    """Continuous learning system for AEGIS protein prediction model."""
    
    def __init__(self):
        self.learning_dir = Path("./aegis_learning")
        self.learning_dir.mkdir(exist_ok=True)
        
        # Learning data storage
        self.training_log = self.learning_dir / "training_log.json"
        self.feedback_db = self.learning_dir / "feedback_database.json"
        self.model_versions = self.learning_dir / "model_versions"
        self.model_versions.mkdir(exist_ok=True)
        
        # Performance tracking
        self.performance_log = self.learning_dir / "performance_log.json"
        
        # Initialize learning data structures
        self.initialize_learning_data()
        
    def initialize_learning_data(self):
        """Initialize learning data structures if they don't exist."""
        
        # Training log structure
        if not self.training_log.exists():
            initial_log = {
                "version": "1.0",
                "created": time.strftime("%Y-%m-%d %H:%M:%S"),
                "total_predictions": 0,
                "successful_validations": 0,
                "learning_sessions": 0,
                "model_updates": 0,
                "last_update": None
            }
            self._save_json(self.training_log, initial_log)
        
        # Feedback database structure
        if not self.feedback_db.exists():
            initial_feedback = {
                "predictions": [],
                "validations": [],
                "user_corrections": [],
                "pdb_matches": [],
                "performance_metrics": []
            }
            self._save_json(self.feedback_db, initial_feedback)
        
        # Performance log structure
        if not self.performance_log.exists():
            initial_performance = {
                "accuracy_over_time": [],
                "pdb_validation_success_rate": [],
                "prediction_confidence_correlation": [],
                "learning_curve": []
            }
            self._save_json(self.performance_log, initial_performance)
    
    def _save_json(self, filepath, data):
        """Save data to JSON file."""
        try:
            with open(filepath, 'w') as f:
                json.dump(data, f, indent=2, default=str)
        except Exception as e:
            print(f"Error saving JSON to {filepath}: {str(e)}")
    
    def _load_json(self, filepath):
        """Load data from JSON file."""
        try:
            with open(filepath, 'r') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading JSON from {filepath}: {str(e)}")
            return {}
    
    def record_prediction(self, sequence, prediction_result, pdb_validation=None, user_feedback=None):
        """Record a prediction for learning purposes."""
        
        # Load current feedback database
        feedback_data = self._load_json(self.feedback_db)
        
        # Create prediction record
        prediction_record = {
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "sequence": sequence,
            "sequence_length": len(sequence),
            "prediction": {
                "secondary_structure": prediction_result.get('secondary_structure', ''),
                "confidence": prediction_result.get('confidence', 0.0),
                "properties": prediction_result.get('properties', {}),
                "method": prediction_result.get('method', 'Unknown')
            },
            "pdb_validation": pdb_validation,
            "user_feedback": user_feedback,
            "learning_value": self._calculate_learning_value(prediction_result, pdb_validation, user_feedback)
        }
        
        # Add to feedback database
        feedback_data["predictions"].append(prediction_record)
        
        # Update training log
        training_log = self._load_json(self.training_log)
        training_log["total_predictions"] += 1
        
        if pdb_validation and pdb_validation.get('validation_status') in ['KNOWN_SEQUENCE', 'HIGHLY_SIMILAR']:
            training_log["successful_validations"] += 1
        
        # Save updated data
        self._save_json(self.feedback_db, feedback_data)
        self._save_json(self.training_log, training_log)
        
        # Check if we should trigger learning
        self._check_learning_trigger()
        
        return prediction_record
    
    def _calculate_learning_value(self, prediction_result, pdb_validation, user_feedback):
        """Score how useful a prediction is as training material.

        The score is a weighted sum, capped at 1.0:
          * 0.3 x the prediction's ``confidence`` (0.0 when absent);
          * 0.4 x a score derived from the PDB ``validation_status``
            (only when ``pdb_validation`` is truthy; unknown statuses score 0.2);
          * 0.3 x the user's ``accuracy_rating`` (only when ``user_feedback``
            is truthy; 0.5 when the rating is absent).
        """
        status_scores = {
            'KNOWN_SEQUENCE': 1.0,
            'HIGHLY_SIMILAR': 0.8,
            'MODERATELY_SIMILAR': 0.6,
            'DISTANTLY_RELATED': 0.4,
            'NOVEL_SEQUENCE': 0.2,
        }

        score = 0.0
        score += prediction_result.get('confidence', 0.0) * 0.3

        if pdb_validation:
            validation_status = pdb_validation.get('validation_status', 'NOVEL_SEQUENCE')
            score += status_scores.get(validation_status, 0.2) * 0.4

        if user_feedback:
            # Rating is expected on a 0-1 scale.
            score += user_feedback.get('accuracy_rating', 0.5) * 0.3

        return min(1.0, score)
    
    def _check_learning_trigger(self):
        """Check if we should trigger a learning session."""
        training_log = self._load_json(self.training_log)
        feedback_data = self._load_json(self.feedback_db)
        
        # Trigger learning every 50 predictions or when we have high-value data
        predictions_count = len(feedback_data.get("predictions", []))
        
        should_learn = False
        
        # Regular learning trigger
        if predictions_count > 0 and predictions_count % 50 == 0:
            should_learn = True
            
        # High-value data trigger
        recent_predictions = feedback_data.get("predictions", [])[-10:]  # Last 10 predictions
        high_value_count = sum(1 for p in recent_predictions if p.get('learning_value', 0) > 0.8)
        
        if high_value_count >= 5:  # 5 high-value predictions in last 10
            should_learn = True
        
        if should_learn:
            print("AEGIS Learning Trigger: Initiating continuous learning session...")
            self.perform_learning_session()
    
    def perform_learning_session(self):
        """Perform a continuous learning session."""
        try:
            print("AEGIS Learning: Starting learning session...")
            
            # Load learning data
            feedback_data = self._load_json(self.feedback_db)
            predictions = feedback_data.get("predictions", [])
            
            if len(predictions) < 10:  # Need minimum data
                print("AEGIS Learning: Insufficient data for learning session")
                return
            
            # Prepare training data from successful predictions
            training_features, training_labels = self._prepare_training_data(predictions)
            
            if len(training_features) == 0:
                print("AEGIS Learning: No suitable training data found")
                return
            
            # Update model with new data
            self._update_model_with_feedback(training_features, training_labels)
            
            # Update performance metrics
            self._update_performance_metrics(predictions)
            
            # Update training log
            training_log = self._load_json(self.training_log)
            training_log["learning_sessions"] += 1
            training_log["model_updates"] += 1
            training_log["last_update"] = time.strftime("%Y-%m-%d %H:%M:%S")
            self._save_json(self.training_log, training_log)
            
            print("AEGIS Learning: Learning session completed successfully!")
            
        except Exception as e:
            print(f"AEGIS Learning Error: {str(e)}")
    
    def _prepare_training_data(self, predictions):
        """Prepare training data from prediction history."""
        features = []
        labels = []
        
        for pred in predictions:
            # Only use high-quality predictions for training
            if pred.get('learning_value', 0) < 0.6:
                continue
            
            sequence = pred.get('sequence', '')
            if len(sequence) < 10:  # Skip very short sequences
                continue
            
            # Extract features from sequence
            seq_features = self._extract_sequence_features(sequence)
            
            # Get target labels from PDB validation or user feedback
            target_labels = self._extract_target_labels(pred)
            
            if seq_features is not None and target_labels is not None:
                features.append(seq_features)
                labels.append(target_labels)
        
        return np.array(features) if features else np.array([]), np.array(labels) if labels else np.array([])
    
    def _extract_sequence_features(self, sequence):
        """Extract features from protein sequence for learning."""
        try:
            # Basic sequence features
            length = len(sequence)
            
            # Amino acid composition
            aa_counts = {}
            for aa in 'ACDEFGHIKLMNPQRSTVWYUOJBZX':
                aa_counts[aa] = sequence.count(aa) / length if length > 0 else 0
            
            # Secondary structure propensities (simplified)
            helix_propensity = sum(sequence.count(aa) for aa in 'AEHKQR') / length if length > 0 else 0
            sheet_propensity = sum(sequence.count(aa) for aa in 'VIFYW') / length if length > 0 else 0
            coil_propensity = 1.0 - helix_propensity - sheet_propensity
            
            # Physicochemical properties
            hydrophobic_count = sum(sequence.count(aa) for aa in 'AILMFPWV') / length if length > 0 else 0
            charged_count = sum(sequence.count(aa) for aa in 'DEKR') / length if length > 0 else 0
            polar_count = sum(sequence.count(aa) for aa in 'NQSTY') / length if length > 0 else 0
            
            # Extended amino acids
            extended_count = sum(sequence.count(aa) for aa in 'UOJBZX') / length if length > 0 else 0
            
            # Combine features
            features = [
                length / 1000.0,  # Normalized length
                helix_propensity,
                sheet_propensity,
                coil_propensity,
                hydrophobic_count,
                charged_count,
                polar_count,
                extended_count
            ]
            
            # Add amino acid composition
            features.extend([aa_counts[aa] for aa in 'ACDEFGHIKLMNPQRSTVWYUOJBZX'])
            
            return np.array(features)
            
        except Exception as e:
            print(f"Feature extraction error: {str(e)}")
            return None
    
    def _extract_target_labels(self, prediction_record):
        """Derive the one-element training target for a stored record.

        Preference order:
          1. a PDB validation that carries a ``best_match``: the
             ``validation_status`` is mapped to a number (unknown -> 0.2);
          2. user feedback: its ``accuracy_rating`` (0.5 when absent).

        Returns a NumPy array of shape ``(1,)``, or ``None`` when neither
        source is available or an error occurs (the error is printed).
        """
        try:
            validation = prediction_record.get('pdb_validation')
            if validation and validation.get('best_match'):
                # PDB validation is treated as ground truth.
                status = validation.get('validation_status', 'NOVEL_SEQUENCE')
                target = {
                    'KNOWN_SEQUENCE': 1.0,
                    'HIGHLY_SIMILAR': 0.8,
                    'MODERATELY_SIMILAR': 0.6,
                    'DISTANTLY_RELATED': 0.4,
                    'NOVEL_SEQUENCE': 0.2,
                }.get(status, 0.2)
                return np.array([target])

            feedback = prediction_record.get('user_feedback')
            if not feedback:
                return None
            return np.array([feedback.get('accuracy_rating', 0.5)])

        except Exception as err:
            print(f"Target extraction error: {str(err)}")
            return None
    
    def _update_model_with_feedback(self, features, labels):
        """Update the model with new training data."""
        try:
            # For now, we'll update a simple confidence predictor
            # In a full implementation, this would update the main prediction model
            
            from sklearn.linear_model import SGDRegressor
            
            # Load or create confidence predictor
            confidence_model_path = self.model_versions / "confidence_predictor.pkl"
            
            if confidence_model_path.exists():
                with open(confidence_model_path, 'rb') as f:
                    confidence_model = pickle.load(f)
            else:
                confidence_model = SGDRegressor(random_state=42)
                # Initial fit with dummy data if no previous model
                dummy_features = np.random.randn(10, features.shape[1])
                dummy_labels = np.random.rand(10)
                confidence_model.fit(dummy_features, dummy_labels)
            
            # Partial fit with new data (online learning)
            confidence_model.partial_fit(features, labels.ravel())
            
            # Save updated model
            with open(confidence_model_path, 'wb') as f:
                pickle.dump(confidence_model, f)
            
            print(f"AEGIS Learning: Updated confidence model with {len(features)} new samples")
            
        except Exception as e:
            print(f"Model update error: {str(e)}")
    
    def _update_performance_metrics(self, predictions):
        """Update performance tracking metrics."""
        try:
            performance_data = self._load_json(self.performance_log)
            
            # Calculate recent accuracy
            recent_predictions = predictions[-50:]  # Last 50 predictions
            
            if recent_predictions:
                # PDB validation success rate
                pdb_successes = sum(1 for p in recent_predictions 
                                  if p.get('pdb_validation', {}).get('validation_status') in 
                                  ['KNOWN_SEQUENCE', 'HIGHLY_SIMILAR'])
                pdb_success_rate = pdb_successes / len(recent_predictions)
                
                # Average learning value (proxy for quality)
                avg_learning_value = np.mean([p.get('learning_value', 0) for p in recent_predictions])
                
                # Add to performance log
                performance_entry = {
                    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                    "total_predictions": len(predictions),
                    "pdb_success_rate": pdb_success_rate,
                    "avg_learning_value": avg_learning_value,
                    "recent_sample_size": len(recent_predictions)
                }
                
                performance_data["accuracy_over_time"].append(performance_entry)
                performance_data["pdb_validation_success_rate"].append(pdb_success_rate)
                
                # Keep only last 100 entries
                for key in ["accuracy_over_time", "pdb_validation_success_rate"]:
                    if len(performance_data[key]) > 100:
                        performance_data[key] = performance_data[key][-100:]
                
                self._save_json(self.performance_log, performance_data)
                
                print(f"AEGIS Learning: Updated performance metrics - PDB Success: {pdb_success_rate:.2%}")
            
        except Exception as e:
            print(f"Performance metrics update error: {str(e)}")
    
    def get_learning_stats(self):
        """Get current learning statistics."""
        try:
            training_log = self._load_json(self.training_log)
            performance_data = self._load_json(self.performance_log)
            feedback_data = self._load_json(self.feedback_db)
            
            # Calculate recent performance
            recent_performance = performance_data.get("accuracy_over_time", [])
            current_pdb_success = recent_performance[-1].get("pdb_success_rate", 0) if recent_performance else 0
            
            stats = {
                "total_predictions": training_log.get("total_predictions", 0),
                "successful_validations": training_log.get("successful_validations", 0),
                "learning_sessions": training_log.get("learning_sessions", 0),
                "model_updates": training_log.get("model_updates", 0),
                "last_update": training_log.get("last_update", "Never"),
                "current_pdb_success_rate": current_pdb_success,
                "total_feedback_records": len(feedback_data.get("predictions", [])),
                "learning_system_status": "Active" if training_log.get("model_updates", 0) > 0 else "Initializing"
            }
            
            return stats
            
        except Exception as e:
            print(f"Error getting learning stats: {str(e)}")
            return {"error": str(e)}
    
    def add_user_feedback(self, sequence, prediction_result, accuracy_rating, comments=""):
        """Add user feedback for a prediction."""
        try:
            feedback_data = self._load_json(self.feedback_db)
            
            user_feedback = {
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "sequence": sequence,
                "accuracy_rating": accuracy_rating,  # 0.0 to 1.0
                "comments": comments,
                "prediction_confidence": prediction_result.get('confidence', 0.0)
            }
            
            feedback_data["user_corrections"].append(user_feedback)
            self._save_json(self.feedback_db, feedback_data)
            
            print(f"AEGIS Learning: User feedback recorded (Rating: {accuracy_rating:.2f})")
            
            # Trigger learning if we have enough feedback
            if len(feedback_data["user_corrections"]) % 10 == 0:
                self.perform_learning_session()
            
        except Exception as e:
            print(f"Error adding user feedback: {str(e)}")


# Shared module-level instance of the learning system. Constructing it runs at
# import time and creates ./aegis_learning (with its model_versions directory
# and JSON stores) relative to the current working directory.
aegis_learning = AEGISLearningSystem()


class PDBValidator:
    """Validates protein sequences against RCSB PDB database using REST API."""
    
    def __init__(self):
        self.base_url = "https://data.rcsb.org/rest/v1"
        self.search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
        self.cache_dir = Path("./pdb_cache")
        self.cache_dir.mkdir(exist_ok=True)
        
    def search_similar_sequences(self, sequence, identity_threshold=0.7, max_results=10):
        """Query the RCSB search service for entries with a similar sequence.

        Args:
            sequence: Protein sequence sent as the query value.
            identity_threshold: Value sent as ``identity_cutoff``.
            max_results: Number of rows requested from the service.

        Returns:
            The processed result list, or ``[]`` when the service does not
            answer with HTTP 200 or any error occurs (both are printed).
        """
        try:
            # Sequence-similarity clause of the query.
            sequence_clause = {
                "type": "terminal",
                "service": "sequence",
                "parameters": {
                    "evalue_cutoff": 1,
                    "identity_cutoff": identity_threshold,
                    "sequence_type": "protein",
                    "value": sequence,
                },
            }

            # Paging and ordering: best score first.
            paging = {"start": 0, "rows": max_results}
            ordering = [{"sort_by": "score", "direction": "desc"}]

            payload = {
                "query": sequence_clause,
                "return_type": "entry",
                "request_options": {
                    "paginate": paging,
                    "scoring_strategy": "combined",
                    "sort": ordering,
                },
            }

            reply = requests.post(
                self.search_url,
                json=payload,
                headers={'Content-Type': 'application/json'},
                timeout=30,
            )

            if reply.status_code != 200:
                print(f"PDB search failed with status {reply.status_code}")
                return []

            return self._process_search_results(reply.json(), sequence)

        except Exception as err:
            print(f"PDB sequence search error: {str(err)}")
            return []
    
    def _process_search_results(self, results, query_sequence):
        """Turn a raw search response into a list of match summaries.

        For every item in ``results['result_set']`` the entry details are
        fetched; items whose details cannot be retrieved are skipped. Each
        summary carries the PDB id, the search score, descriptive metadata
        and the identity between ``query_sequence`` and the entry sequence.
        Items that raise while being processed are reported and skipped.
        """
        hits = []
        if 'result_set' not in results:
            return hits

        for item in results['result_set']:
            try:
                pdb_id = item.get('identifier', 'Unknown')
                score = item.get('score', 0)

                details = self.get_entry_details(pdb_id)
                if not details:
                    continue

                identity = self._calculate_sequence_identity(
                    query_sequence, details.get('sequence', '')
                )
                hits.append({
                    'pdb_id': pdb_id,
                    'score': score,
                    'title': details.get('title', 'Unknown'),
                    'resolution': details.get('resolution', 'N/A'),
                    'method': details.get('method', 'Unknown'),
                    'organism': details.get('organism', 'Unknown'),
                    'sequence_length': details.get('sequence_length', 0),
                    'sequence_identity': identity,
                    'classification': details.get('classification', 'Unknown'),
                    'deposition_date': details.get('deposition_date', 'Unknown'),
                })
            except Exception as err:
                print(f"Error processing result {item}: {str(err)}")

        return hits
    
    def get_entry_details(self, entry_id):
        """Fetch descriptive metadata for one PDB entry.

        Returns a dict with title, classification, deposition date, method,
        resolution, organism, sequence length and sequence. Sequence-related
        fields come from the entry's first polymer entity when one is listed.
        Returns ``None`` when the request does not answer with HTTP 200 or
        any error occurs (errors are printed).
        """
        try:
            reply = requests.get(f"{self.base_url}/core/entry/{entry_id}", timeout=15)
            if reply.status_code != 200:
                return None

            payload = reply.json()

            # Start from defaults and fill in whatever the payload provides.
            details = {
                'title': payload.get('struct', {}).get('title', 'Unknown'),
                'classification': payload.get('struct_keywords', {}).get('pdbx_keywords', 'Unknown'),
                'deposition_date': payload.get('rcsb_accession_info', {}).get('deposit_date', 'Unknown'),
                'method': 'Unknown',
                'resolution': 'N/A',
                'organism': 'Unknown',
                'sequence_length': 0,
                'sequence': '',
            }

            # Experimental method: first listed experiment.
            if 'exptl' in payload and payload['exptl']:
                first_experiment = payload['exptl'][0]
                details['method'] = first_experiment.get('method', 'Unknown')

            # Resolution: first combined value, formatted with its unit.
            if 'rcsb_entry_info' in payload:
                resolutions = payload['rcsb_entry_info'].get('resolution_combined', [])
                if resolutions:
                    details['resolution'] = f"{resolutions[0]:.2f} Å"

            # Sequence and organism come from the first polymer entity.
            identifiers = payload.get('rcsb_entry_container_identifiers', {})
            entity_ids = identifiers.get('polymer_entity_ids', [])
            if entity_ids:
                entity_details = self.get_polymer_entity_details(entry_id, entity_ids[0])
                if entity_details:
                    details.update(entity_details)

            return details

        except Exception as err:
            print(f"Error getting entry details for {entry_id}: {str(err)}")
            return None
    
    def get_polymer_entity_details(self, entry_id, entity_id):
        """Fetch the sequence and source organism of one polymer entity.

        Returns a dict that may contain ``sequence``, ``sequence_length`` and
        ``organism`` (only the keys the payload provides). Returns ``None``
        when the request does not answer with HTTP 200 or any error occurs
        (errors are printed).
        """
        try:
            reply = requests.get(
                f"{self.base_url}/core/polymer_entity/{entry_id}/{entity_id}",
                timeout=15,
            )
            if reply.status_code != 200:
                return None

            payload = reply.json()
            details = {}

            # Canonical one-letter sequence, stripped of line breaks and spaces.
            if 'entity_poly' in payload:
                raw_sequence = payload['entity_poly'].get('pdbx_seq_one_letter_code_can', '')
                cleaned = raw_sequence.replace('\n', '').replace(' ', '')
                details['sequence'] = cleaned
                details['sequence_length'] = len(cleaned)

            # Organism: first source entry; common name appended when present.
            if 'rcsb_entity_source_organism' in payload and payload['rcsb_entity_source_organism']:
                source = payload['rcsb_entity_source_organism'][0]
                scientific_name = source.get('scientific_name', 'Unknown')
                common_name = source.get('common_name', '')
                details['organism'] = (
                    f"{scientific_name} ({common_name})" if common_name else scientific_name
                )

            return details

        except Exception as err:
            print(f"Error getting polymer entity details for {entry_id}/{entity_id}: {str(err)}")
            return None
    
    def _calculate_sequence_identity(self, seq1, seq2):
        """Return a case-insensitive similarity percentage for two sequences.

        The value is ``difflib.SequenceMatcher.ratio() * 100``, i.e. a
        matching-blocks similarity in the range 0-100, not an alignment-based
        identity. Returns ``0.0`` when either sequence is empty or ``None``.

        ``autojunk`` is disabled on purpose. With the default heuristic, any
        character occurring in more than about 1% of a second sequence of 200+
        characters is treated as "popular" and ignored when anchoring matches.
        Proteins use a ~20-letter alphabet, so in long sequences every residue
        is popular and near-identical proteins could score close to 0%.
        """
        if not seq1 or not seq2:
            return 0.0

        matcher = SequenceMatcher(None, seq1.upper(), seq2.upper(), autojunk=False)
        return matcher.ratio() * 100
    
    def validate_sequence(self, sequence, job_name="validation"):
        """Search the PDB at three identity cut-offs and summarise the hits.

        Runs the similarity search at cut-offs 0.9, 0.7 and 0.5, merges the
        hits (first occurrence of each PDB id wins) and ranks them by sequence
        identity, highest first. ``job_name`` is accepted but not used.

        Returns:
            A dict with the query, the per-cut-off hit counts, up to 20 ranked
            matches, the derived validation status and the best match
            (``None`` when nothing was found).
        """
        print("AEGIS PDB Validation: Searching for similar sequences in PDB database...")

        # One search per similarity tier, strictest first.
        tier_settings = ((0.9, 5), (0.7, 10), (0.5, 15))
        high_hits, medium_hits, low_hits = [
            self.search_similar_sequences(sequence, identity_threshold=cutoff, max_results=limit)
            for cutoff, limit in tier_settings
        ]

        # Merge tiers, keeping the first hit seen for every PDB id.
        unique_hits = {}
        for tier in (high_hits, medium_hits, low_hits):
            for hit in tier:
                unique_hits.setdefault(hit['pdb_id'], hit)

        ranked = sorted(unique_hits.values(), key=lambda hit: hit['sequence_identity'], reverse=True)

        return {
            'query_sequence': sequence,
            'query_length': len(sequence),
            'total_matches': len(ranked),
            'high_similarity_matches': len(high_hits),
            'medium_similarity_matches': len(medium_hits),
            'low_similarity_matches': len(low_hits),
            'matches': ranked[:20],
            'validation_status': self._determine_validation_status(ranked),
            'best_match': ranked[0] if ranked else None,
        }
    
    def _determine_validation_status(self, results):
        """Classify a ranked hit list by the identity of its first entry.

        The first element is expected to be the best match. Thresholds
        (inclusive lower bounds, in percent): 95 KNOWN_SEQUENCE,
        80 HIGHLY_SIMILAR, 60 MODERATELY_SIMILAR, 40 DISTANTLY_RELATED;
        anything lower, or an empty list, is NOVEL_SEQUENCE.
        """
        if not results:
            return "NOVEL_SEQUENCE"

        top_identity = results[0]['sequence_identity']

        tiers = (
            (95, "KNOWN_SEQUENCE"),
            (80, "HIGHLY_SIMILAR"),
            (60, "MODERATELY_SIMILAR"),
            (40, "DISTANTLY_RELATED"),
        )
        for floor, label in tiers:
            if top_identity >= floor:
                return label

        return "NOVEL_SEQUENCE"
    
    def format_validation_report(self, validation_result):
        """Render a validation result as a plain-text report.

        ``validation_result`` must provide the keys produced by
        ``validate_sequence``. The report consists of a header with the query
        summary, an optional best-match section, up to 10 listed matches, an
        interpretation of the validation status and a footer.
        """
        # Looked up even though the sequence itself is not printed, so a result
        # without this key still raises KeyError.
        validation_result['query_sequence']
        matches = validation_result['matches']
        status = validation_result['validation_status']
        best_match = validation_result['best_match']

        pieces = [f"""
===============================================================================
AEGIS BIO-DIGITAL LAB 10 - PDB SEQUENCE VALIDATION REPORT
Strategic Precognition through PDB Database Cross-Reference
===============================================================================

QUERY SEQUENCE ANALYSIS:
- Sequence Length: {validation_result['query_length']} amino acids
- Validation Status: {status}
- Total PDB Matches: {validation_result['total_matches']}

SIMILARITY DISTRIBUTION:
- High Similarity (>90%): {validation_result['high_similarity_matches']} matches
- Medium Similarity (70-90%): {validation_result['medium_similarity_matches']} matches  
- Low Similarity (50-70%): {validation_result['low_similarity_matches']} matches

"""]

        if best_match:
            pieces.append(f"""
BEST MATCH ANALYSIS:
- PDB ID: {best_match['pdb_id']}
- Sequence Identity: {best_match['sequence_identity']:.1f}%
- Title: {best_match['title']}
- Organism: {best_match['organism']}
- Method: {best_match['method']}
- Resolution: {best_match['resolution']}
- Classification: {best_match['classification']}
- Deposition Date: {best_match['deposition_date']}

""")

        if matches:
            pieces.append("TOP MATCHING PDB STRUCTURES:\n\n")
            listed = matches[:10]
            for rank, hit in enumerate(listed, 1):
                pieces.append(f"{rank:2d}. PDB: {hit['pdb_id']} | Identity: {hit['sequence_identity']:5.1f}% | ")
                pieces.append(f"Method: {hit['method'][:15]:15s} | Organism: {hit['organism'][:30]:30s}\n")
                pieces.append(f"    Title: {hit['title'][:80]}\n")
                # Blank line between entries, but not after the last one.
                if rank < len(listed):
                    pieces.append("\n")

        pieces.append("""

VALIDATION INTERPRETATION:
""")

        # Status-specific interpretation; unknown statuses read as novel.
        interpretations = (
            ("KNOWN_SEQUENCE",
             "- This sequence is KNOWN in PDB with high confidence (>95% identity)\n"
             "- The predicted structure can be validated against experimental data\n"),
            ("HIGHLY_SIMILAR",
             "- This sequence is HIGHLY SIMILAR to known PDB structures (80-95% identity)\n"
             "- Prediction can be compared with homologous structures\n"),
            ("MODERATELY_SIMILAR",
             "- This sequence shows MODERATE SIMILARITY to PDB structures (60-80% identity)\n"
             "- Homology modeling approaches may be applicable\n"),
            ("DISTANTLY_RELATED",
             "- This sequence is DISTANTLY RELATED to PDB structures (40-60% identity)\n"
             "- Limited structural information available from PDB\n"),
        )
        novel_text = (
            "- This appears to be a NOVEL SEQUENCE with no close PDB matches\n"
            "- Ab initio prediction methods are most appropriate\n"
        )
        pieces.append(next((text for key, text in interpretations if status == key), novel_text))

        pieces.append("""
===============================================================================
Generated by AEGIS Bio-Digital Lab 10 | Gaston Software Solutions Tec
PDB Validation with Strategic Precognition | Tel: +256755274944
===============================================================================
""")

        return "".join(pieces)


# Shared module-level validator instance. Constructing it creates the
# ./pdb_cache directory (relative to the current working directory) if missing.
pdb_validator = PDBValidator()


class ExternalDatasetManager:
    """Manages external HF datasets as reference databases for AEGIS system.

    Only the repository file listing is requested from the Hugging Face Hub;
    no dataset content is downloaded or compared.  The similarity scores
    returned by the search methods are placeholders produced by
    ``_calculate_mock_similarity``.
    """

    def __init__(self):
        """Register the known dataset ids, create the cache directory and an HfApi client."""
        # Short key -> Hugging Face dataset repository id.
        # NOTE: 'sair' is registered here but no search path below uses it.
        self.datasets = {
            'sair': 'SandboxAQ/SAIR',
            'zinc': 'sagawa/ZINC-canonicalized',
            'essential_proteins': 'macwiatrak/bacbench-essential-genes-protein-sequences',
            'essential_dna': 'macwiatrak/bacbench-essential-genes-dna'
        }
        self.cache_dir = Path("./dataset_cache")
        self.cache_dir.mkdir(exist_ok=True)
        self.hf_api = HfApi()

    def search_similar_sequences(self, query_sequence, seq_type='protein', top_k=5):
        """Search for similar sequences in external datasets.

        Args:
            query_sequence: Sequence or SMILES string to look up.
            seq_type: One of 'protein', 'dna' or 'smiles'.  Any other value
                yields an empty result list.
            top_k: Maximum number of results to return.

        Returns:
            A list of result dicts (see ``_search_in_dataset``) sorted by
            descending 'similarity'.  An empty list is returned on any error;
            the error is printed, not raised.
        """
        # seq_type -> (key into self.datasets, data_type label stored in results)
        search_targets = {
            'protein': ('essential_proteins', 'protein'),
            'dna': ('essential_dna', 'dna'),
            'smiles': ('zinc', 'smiles'),
        }
        results = []

        try:
            target = search_targets.get(seq_type)
            if target is not None:
                dataset_key, data_type = target
                results.extend(
                    self._search_in_dataset(query_sequence, dataset_key, data_type)
                )

            # Sort by similarity and return top results
            results.sort(key=lambda x: x['similarity'], reverse=True)
            return results[:top_k]

        except Exception as e:
            # Best-effort lookup: report the problem and fall back to "no matches".
            print(f"External dataset search error: {e}")
            return []

    def _search_in_dataset(self, query, dataset_key, data_type):
        """Search in a specific dataset.

        Lists the files of the dataset repository and, if at least one file
        name contains a recognised data extension, returns a single result
        that references the first such file with a mock similarity score.

        Args:
            query: Sequence or SMILES string being searched for.
            dataset_key: Key into ``self.datasets``.
            data_type: Label copied into the result ('protein', 'dna', 'smiles').

        Returns:
            A list with zero or one result dict.  Errors (unknown key, Hub
            failures) are printed and produce an empty list.
        """
        data_extensions = ('.csv', '.json', '.txt', '.fasta')
        results = []

        try:
            dataset_id = self.datasets[dataset_key]

            # Only the file listing is fetched; file contents are never read.
            files = list_repo_files(dataset_id, repo_type="dataset")

            # Substring match on the lower-cased name, so e.g. 'x.csv.gz' qualifies too.
            target_files = [
                name for name in files
                if any(ext in name.lower() for ext in data_extensions)
            ]

            # Sample search in first available file (simplified)
            if target_files:
                similarity_score = self._calculate_mock_similarity(query, dataset_key)

                results.append({
                    'dataset': dataset_id,
                    'file': target_files[0],
                    'similarity': similarity_score,
                    'sequence': query[:50] + "..." if len(query) > 50 else query,
                    'data_type': data_type,
                    'match_info': f"Found in {dataset_key} dataset"
                })

        except Exception as e:
            print(f"Dataset {dataset_key} search error: {e}")

        return results

    def _calculate_mock_similarity(self, query, dataset_key):
        """Calculate mock similarity score based on dataset characteristics.

        The score is NOT derived from dataset content.  It is a base value of
        0.6, plus a bonus that depends on the dataset key and simple query
        properties, plus pseudo-random jitter seeded by ``len(query)``; the
        result is capped at 0.95.  Queries of equal length that earn the same
        bonus therefore receive the same score.

        Args:
            query: Sequence or SMILES string.
            dataset_key: Key into ``self.datasets``.

        Returns:
            The mock similarity as a float.
        """
        import random

        base_similarity = 0.6  # Base similarity

        # Adjust based on dataset type and query characteristics
        if dataset_key == 'zinc' and any(char in query for char in '()=[]'):
            base_similarity += 0.2  # SMILES structure bonus
        elif dataset_key == 'essential_proteins' and len(query) > 50:
            base_similarity += 0.15  # Protein length bonus
        elif dataset_key == 'essential_dna' and all(c in 'ATCG' for c in query.upper()):
            base_similarity += 0.1  # DNA sequence bonus

        # A private generator yields the same jitter as seeding the module-level
        # RNG did, but leaves the process-wide random state untouched.
        jitter_source = random.Random(len(query))
        return min(0.95, base_similarity + jitter_source.uniform(-0.1, 0.2))

    def get_dataset_info(self):
        """Get information about available external datasets.

        No request is made to the Hub here, so 'status' does not reflect real
        availability: it is 'Available' unless building the entry raises.

        Returns:
            Dict mapping each dataset key to a dict with 'id', 'status' and
            'description'.
        """
        info = {}

        for key, dataset_id in self.datasets.items():
            try:
                info[key] = {
                    'id': dataset_id,
                    'status': 'Available',
                    'description': self._get_dataset_description(key)
                }
            except Exception as e:
                info[key] = {
                    'id': dataset_id,
                    'status': f'Error: {str(e)}',
                    'description': 'Dataset unavailable'
                }

        return info

    def _get_dataset_description(self, key):
        """Get description for each dataset.

        Args:
            key: Dataset key as used in ``self.datasets``.

        Returns:
            The description string, or 'External reference dataset' for an
            unknown key.
        """
        descriptions = {
            'sair': 'SandboxAQ SAIR - Advanced protein structure data',
            'zinc': 'ZINC Database - Canonicalized chemical compounds',
            'essential_proteins': 'Essential genes protein sequences for bacterial analysis',
            'essential_dna': 'Essential genes DNA sequences for bacterial analysis'
        }
        return descriptions.get(key, 'External reference dataset')


# Module-level ExternalDatasetManager instance, constructed once at import time;
# its constructor creates ./dataset_cache and an HfApi client.
external_datasets = ExternalDatasetManager()

class ProteinStructurePredictor:
    """CPU-based protein structure prediction using established bioinformatics methods.

    The secondary-structure classifier is a random forest fitted on synthetic
    random data (see ``_create_synthetic_training_data``), so its output is a
    demonstration, not a validated prediction.  The generated PDB file holds
    one CA atom per residue with illustrative coordinates.
    """

    # Three-letter residue names written to PDB ATOM records, keyed by the
    # one-letter code used in input sequences.
    _PDB_RESIDUE_NAMES = {
        'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': 'ASP', 'C': 'CYS',
        'Q': 'GLN', 'E': 'GLU', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE',
        'L': 'LEU', 'K': 'LYS', 'M': 'MET', 'F': 'PHE', 'P': 'PRO',
        'S': 'SER', 'T': 'THR', 'W': 'TRP', 'Y': 'TYR', 'V': 'VAL',
        'U': 'SEC',  # Selenocysteine
        'O': 'PYL',  # Pyrrolysine
        'B': 'ASX',  # Aspartic acid or Asparagine
        'Z': 'GLX',  # Glutamic acid or Glutamine
        'J': 'XLE',  # Leucine or Isoleucine
        'X': 'UNK',  # Unknown
    }

    # Sequence symbols that stand for no residue (stop codon, alignment gap).
    _NON_RESIDUE_SYMBOLS = frozenset('*-')

    def __init__(self):
        """Set up the output directory and the per-residue property table."""
        self.model_loaded = False
        # Populated by load_model().
        self.secondary_structure_model = None
        self.scaler = None

        # Use /app/output when an /app directory exists, else ./output.
        self.output_dir = Path("./output") if not os.path.exists("/app") else Path("/app/output")
        self.output_dir.mkdir(exist_ok=True)

        # Extended amino acid properties including non-standard amino acids
        self.aa_properties = {
            # Standard 20 amino acids
            'A': [0.31, -0.74, 0.0, 0.0, 0.0],  # Alanine: [hydrophobicity, charge, size, flexibility, beta_tendency]
            'R': [-1.01, 1.0, 1.0, 0.8, 0.0],   # Arginine
            'N': [-0.60, 0.0, 0.5, 0.8, 0.0],   # Asparagine
            'D': [-0.77, -1.0, 0.5, 0.8, 0.0],  # Aspartic acid
            'C': [1.54, 0.0, 0.0, 0.3, 0.0],    # Cysteine
            'Q': [-0.22, 0.0, 0.8, 0.8, 0.0],   # Glutamine
            'E': [-0.64, -1.0, 0.8, 0.8, 0.0],  # Glutamic acid
            'G': [0.0, 0.0, -1.0, 1.0, 0.0],    # Glycine
            'H': [0.13, 0.5, 0.5, 0.6, 0.0],    # Histidine
            'I': [1.80, 0.0, 0.3, 0.2, 1.0],    # Isoleucine
            'L': [1.70, 0.0, 0.3, 0.2, 1.0],    # Leucine
            'K': [-0.99, 1.0, 1.0, 0.8, 0.0],   # Lysine
            'M': [1.23, 0.0, 0.5, 0.3, 1.0],    # Methionine
            'F': [1.79, 0.0, 0.8, 0.2, 1.0],    # Phenylalanine
            'P': [0.72, 0.0, 0.0, 0.0, 0.0],    # Proline
            'S': [-0.04, 0.0, -0.3, 0.6, 0.0],  # Serine
            'T': [0.26, 0.0, 0.0, 0.5, 0.0],    # Threonine
            'W': [2.25, 0.0, 1.0, 0.2, 1.0],    # Tryptophan
            'Y': [1.88, 0.0, 0.8, 0.3, 1.0],    # Tyrosine
            'V': [1.22, 0.0, 0.0, 0.2, 1.0],    # Valine

            # Extended amino acids (21st and 22nd)
            'U': [1.96, 0.0, 0.2, 0.3, 0.0],    # Selenocysteine (21st amino acid)
            'O': [1.50, 1.0, 1.2, 0.7, 0.0],    # Pyrrolysine (22nd amino acid)

            # Ambiguous amino acids
            'B': [-0.69, -0.5, 0.5, 0.8, 0.0],  # Aspartic acid or Asparagine (D or N)
            'J': [1.75, 0.0, 0.3, 0.2, 1.0],    # Leucine or Isoleucine (L or I)
            'Z': [-0.43, -0.5, 0.8, 0.8, 0.0],  # Glutamic acid or Glutamine (E or Q)
            'X': [0.0, 0.0, 0.0, 0.5, 0.0],     # Any amino acid (unknown)

            # Stop codon representation (sometimes used in sequences)
            '*': [0.0, 0.0, 0.0, 0.0, 0.0],     # Stop codon
            '-': [0.0, 0.0, 0.0, 0.0, 0.0],     # Gap/deletion
        }

    def load_model(self):
        """Initialize the prediction models.

        Returns:
            ``(True, message)`` on success, ``(False, message)`` if model
            construction or fitting raised.
        """
        try:
            # Create simple models for secondary structure prediction
            self.secondary_structure_model = RandomForestClassifier(n_estimators=100, random_state=42)
            self.scaler = StandardScaler()

            # Train on synthetic data (in real implementation, use actual training data)
            self._create_synthetic_training_data()

            self.model_loaded = True
            return True, "Protein prediction models loaded successfully!"
        except Exception as e:
            return False, f"Model loading failed: {str(e)}"

    def _create_synthetic_training_data(self):
        """Create synthetic training data for demonstration.

        Fits ``self.scaler`` and ``self.secondary_structure_model`` on 1000
        random 15-dimensional samples with random labels
        (0: Coil, 1: Helix, 2: Sheet).
        """
        # A private RandomState(42) draws the same numbers as the global
        # generator would after np.random.seed(42), without resetting the
        # process-wide NumPy random state.
        rng = np.random.RandomState(42)
        n_samples = 1000
        n_features = 15  # Window size * feature dimensions

        X = rng.randn(n_samples, n_features)
        y = rng.choice([0, 1, 2], n_samples)  # 0: Coil, 1: Helix, 2: Sheet

        X_scaled = self.scaler.fit_transform(X)
        self.secondary_structure_model.fit(X_scaled, y)

    def extract_features(self, sequence, window_size=3):
        """Extract features from protein sequence.

        For every position the property vectors of the residues in a window
        centred on that position are concatenated.  The window covers
        ``window_size // 2`` residues on each side; positions outside the
        sequence and residues missing from ``self.aa_properties`` contribute
        five zeros.

        Args:
            sequence: Protein sequence (one-letter codes).
            window_size: Nominal window width; the default of 3 yields the
                residue itself plus one neighbour on each side.

        Returns:
            Array of shape ``(len(sequence), (2 * (window_size // 2) + 1) * 5)``.
        """
        # Compute the half width first: `-window_size // 2` would parse as
        # `(-window_size) // 2` and reach one residue too far to the left.
        half_window = window_size // 2
        zero_vector = [0.0] * 5  # Padding / unknown amino acid
        seq_len = len(sequence)

        features = []
        for i in range(seq_len):
            window_features = []
            for pos in range(i - half_window, i + half_window + 1):
                if 0 <= pos < seq_len:
                    window_features.extend(self.aa_properties.get(sequence[pos], zero_vector))
                else:
                    window_features.extend(zero_vector)
            features.append(window_features)

        if not features:
            # Keep the result two-dimensional for an empty sequence.
            return np.empty((0, (2 * half_window + 1) * 5))
        return np.array(features)

    def predict_secondary_structure(self, sequence):
        """Predict secondary structure using machine learning.

        Args:
            sequence: Protein sequence (one-letter codes).

        Returns:
            ``(structure, probabilities)`` where ``structure`` is a string of
            'C' (coil), 'H' (helix) and 'E' (sheet), one character per
            residue, and ``probabilities`` is the classifier's per-class
            probability array.  On failure returns ``(None, message)``.
        """
        if not self.model_loaded:
            return None, "Model not loaded"

        expected_width = 15  # window_size(3) * feature_dims(5)

        try:
            features = self.extract_features(sequence)
            print(f"Debug: Features shape: {features.shape}")

            # Defensive: coerce any other width to what the scaler was fitted on.
            width = features.shape[1]
            if width != expected_width:
                print(f"Debug: Unexpected feature shape: {features.shape}")
                if width < expected_width:
                    padding = np.zeros((features.shape[0], expected_width - width))
                    features = np.hstack([features, padding])
                else:
                    features = features[:, :expected_width]

            features_scaled = self.scaler.transform(features)
            predictions = self.secondary_structure_model.predict(features_scaled)
            probabilities = self.secondary_structure_model.predict_proba(features_scaled)

            # Convert predictions to structure labels
            structure_map = {0: 'C', 1: 'H', 2: 'E'}  # Coil, Helix, Sheet
            structure_sequence = ''.join(structure_map[pred] for pred in predictions)

            return structure_sequence, probabilities
        except Exception as e:
            print(f"Debug: Secondary structure prediction error: {str(e)}")
            return None, f"Prediction failed: {str(e)}"

    def analyze_protein_properties(self, sequence):
        """Analyze basic protein properties using BioPython.

        Args:
            sequence: Protein sequence (one-letter codes).

        Returns:
            Dict with 'molecular_weight', 'isoelectric_point',
            'instability_index', 'gravy', 'aromaticity' and
            'secondary_structure_fraction', or ``{"error": message}`` if
            BioPython raised.
        """
        try:
            analysis = ProteinAnalysis(sequence)

            properties = {
                'molecular_weight': analysis.molecular_weight(),
                'isoelectric_point': analysis.isoelectric_point(),
                'instability_index': analysis.instability_index(),
                'gravy': analysis.gravy(),  # Grand average of hydropathy
                'aromaticity': analysis.aromaticity(),
                'secondary_structure_fraction': analysis.secondary_structure_fraction()
            }

            return properties
        except Exception as e:
            return {"error": str(e)}

    def predict_protease_sites(self, sequence):
        """Simple protease cleavage site prediction.

        Scans every position except the last for the patterns below.  The
        confidence values are random numbers, not model output.

        Args:
            sequence: Protein sequence (one-letter codes).

        Returns:
            List of dicts with 'position' (1-based index of the first pattern
            residue), 'protease', 'site' (surrounding residues) and
            'confidence', sorted by position.
        """
        # Common protease cleavage patterns
        protease_patterns = {
            # NOTE(review): only the dipeptides KR/RK are matched here, although
            # trypsin cleaves after any single K or R - confirm the intent.
            'Trypsin': ['KR', 'RK'],
            # One entry per residue: a single 'FWY' string matched neither the
            # 1-letter nor the 2-letter branch, so chymotrypsin was never reported.
            'Chymotrypsin': ['F', 'W', 'Y'],  # Cleaves after F, W, Y
            'Pepsin': ['FL', 'LF'],   # Cleaves at F-L, L-F bonds
        }

        cleavage_sites = []

        for protease, patterns in protease_patterns.items():
            for i in range(len(sequence) - 1):
                for pattern in patterns:
                    if len(pattern) == 1:
                        if sequence[i] == pattern:
                            cleavage_sites.append({
                                'position': i + 1,
                                'protease': protease,
                                'site': f"{sequence[max(0, i-2):i+3]}",
                                'confidence': 0.7 + np.random.random() * 0.3
                            })
                    elif len(pattern) == 2:
                        if sequence[i:i+2] == pattern:
                            cleavage_sites.append({
                                'position': i + 1,
                                'protease': protease,
                                'site': f"{sequence[max(0, i-2):i+4]}",
                                'confidence': 0.6 + np.random.random() * 0.4
                            })

        return sorted(cleavage_sites, key=lambda x: x['position'])

    def create_pdb_structure(self, sequence, secondary_structure, job_name):
        """Create a simple PDB file with predicted structure and AEGIS Lab branding.

        Writes ``<output_dir>/<job_name>.pdb`` containing the header records
        and one CA ATOM record per residue.  Stop ('*') and gap ('-') symbols
        are skipped but still consume a residue number.  Coordinates are an
        illustrative trace driven by the secondary-structure label, not a
        physical model.

        Args:
            sequence: Protein sequence (one-letter codes).
            secondary_structure: String of 'H'/'E'/'C' labels, paired with
                ``sequence`` position by position (the shorter one wins).
            job_name: File stem; also upper-cased into the TITLE record.

        Returns:
            Path of the written file as a string.
        """
        pdb_file = self.output_dir / f"{job_name}.pdb"
        today = time.strftime('%d-%b-%y')

        header_lines = [
            # AEGIS Lab header
            f"HEADER    AEGIS PREDICTED STRUCTURE               {today}   AEGS\n",
            "TITLE     AEGIS BIO-DIGITAL LAB 10 PROTEIN STRUCTURE PREDICTION\n",
            f"TITLE    2 {job_name.upper()} - STRATEGIC PRECOGNITION ANALYSIS\n",
            "COMPND    MOL_ID: 1;\n",
            "COMPND   2 MOLECULE: AEGIS ENHANCED PROTEIN STRUCTURE;\n",
            "COMPND   3 ENGINEERED: YES;\n",
            "SOURCE    MOL_ID: 1;\n",
            "SOURCE   2 SYNTHETIC: YES;\n",
            "SOURCE   3 ORGANISM_SCIENTIFIC: AEGIS BIO-DIGITAL SYSTEM;\n",
            "SOURCE   4 ORGANISM_COMMON: TIME TRAVEL PREDICTION ENGINE;\n",
            "KEYWDS    AEGIS, EXTENDED GENETIC CODE, STRATEGIC PRECOGNITION\n",
            "EXPDTA    THEORETICAL MODEL (AEGIS BIO-DIGITAL LAB 10)\n",
            "AUTHOR    GASTON SOFTWARE SOLUTIONS TEC - AEGIS LAB 10\n",
            # f-string so the date is substituted instead of written literally.
            f"REVDAT   1   {today} AEGS    0\n",
            "REMARK   1\n",
            "REMARK   1 REFERENCE 1\n",
            "REMARK   1  AUTH   AEGIS BIO-DIGITAL LAB 10\n",
            "REMARK   1  TITL   ARTIFICIALLY EXPANDED GENETIC INFORMATION SYSTEM\n",
            "REMARK   1  TITL 2 STRATEGIC PRECOGNITION THROUGH PROTEIN ANALYSIS\n",
            "REMARK   1  REF    GASTON SOFTWARE SOLUTIONS TEC\n",
            "REMARK   1  REFN                   TEL: +256755274944\n",
            "REMARK   2\n",
            "REMARK   2 RESOLUTION.    NOT APPLICABLE.\n",
            "REMARK   3\n",
            "REMARK   3 REFINEMENT.\n",
            "REMARK   3   PROGRAM     : AEGIS TIME TRAVEL PREDICTION ENGINE\n",
            "REMARK   3   AUTHORS     : GASTON SOFTWARE SOLUTIONS TEC\n",
            "REMARK   4\n",
            "REMARK   4 AEGIS BIO-DIGITAL LAB 10 COMPLIANCE:\n",
            "REMARK   4 THIS STRUCTURE SUPPORTS EXTENDED GENETIC CODES\n",
            "REMARK   4 INCLUDING SELENOCYSTEINE (U) AND PYRROLYSINE (O)\n",
            "REMARK   4 MISSION: STRATEGIC PRECOGNITION THROUGH DATA SYNTHESIS\n",
            "REMARK   5\n",
            "REMARK   5 SECONDARY STRUCTURE LEGEND:\n",
            "REMARK   5 H = ALPHA HELIX, E = BETA SHEET, C = COIL/LOOP\n",
            "REMARK   6\n",
            "REMARK   6 CONTACT: GASTON SOFTWARE SOLUTIONS TEC\n",
            "REMARK   6 TEL: +256755274944\n",
            "REMARK   6 SYSTEM: AEGIS BIO-DIGITAL LAB 10 'TIME TRAVEL'\n",
        ]

        trailer_lines = [
            "REMARK 999\n",
            "REMARK 999 GENERATED BY AEGIS BIO-DIGITAL LAB 10\n",
            "REMARK 999 GASTON SOFTWARE SOLUTIONS TEC\n",
            "REMARK 999 STRATEGIC PRECOGNITION SYSTEM\n",
            "REMARK 999 TEL: +256755274944\n",
            # END terminates the file, so it has to be the final record.
            "END\n",
        ]

        with open(pdb_file, 'w') as f:
            f.writelines(header_lines)

            # Generate simple coordinates (this is very simplified)
            x, y, z = 0.0, 0.0, 0.0

            for i, (aa, ss) in enumerate(zip(sequence, secondary_structure)):
                if aa in self._NON_RESIDUE_SYMBOLS:
                    continue  # Skip termination and gap characters

                atom_num = i + 1
                res_num = i + 1
                # Residue names are always three letters; anything unmapped is UNK.
                res_name = self._PDB_RESIDUE_NAMES.get(aa, 'UNK')

                # Simple coordinate generation (not realistic, just for demonstration)
                if ss == 'H':  # Helix
                    x += 1.5 * np.cos(i * 0.6)
                    y += 1.5 * np.sin(i * 0.6)
                    z += 1.5
                elif ss == 'E':  # Sheet
                    x += 3.8 if i % 2 == 0 else -3.8
                    y += 0.0
                    z += 3.3
                else:  # Coil
                    x += np.random.uniform(-2, 2)
                    y += np.random.uniform(-2, 2)
                    z += np.random.uniform(1, 3)

                # Fixed-column ATOM record: atom name in columns 13-16,
                # residue name 18-20, chain 22, residue number 23-26,
                # coordinates from column 31, element right-justified in 77-78.
                f.write(
                    f"ATOM  {atom_num:5d}  CA  {res_name:>3s} A{res_num:4d}    "
                    f"{x:8.3f}{y:8.3f}{z:8.3f}  1.00 20.00           C\n"
                )

            f.writelines(trailer_lines)

        return str(pdb_file)

    def predict_structure(self, sequence, job_name="prediction"):
        """Main prediction function.

        Validates the sequence, predicts secondary structure, computes
        protein properties and protease sites, and writes a PDB file.  If the
        secondary-structure or property step fails, fallback values are used
        instead of aborting.

        Args:
            sequence: Raw protein sequence as entered by the user.
            job_name: File stem for the generated PDB file.

        Returns:
            ``(result_dict, message)`` on success or ``(None, message)`` when
            the model is not loaded, the sequence is invalid or an
            unexpected error occurred.
        """
        if not self.model_loaded:
            return None, "Model not loaded. Please load the model first."

        try:
            # Validate sequence
            is_valid, validated_seq = validate_protein_sequence(sequence)
            if not is_valid:
                return None, f"Invalid sequence: {validated_seq}"

            print(f"Debug: Processing sequence of length {len(validated_seq)}")

            # Predict secondary structure
            secondary_structure, ss_probabilities = self.predict_secondary_structure(validated_seq)
            if secondary_structure is None:
                print("Debug: Secondary structure prediction returned None")
                # Create a fallback secondary structure
                secondary_structure = 'C' * len(validated_seq)  # All coil as fallback
                ss_probabilities = np.ones((len(validated_seq), 3)) / 3  # Equal probabilities
                print("Debug: Using fallback secondary structure")

            # Analyze protein properties
            properties = self.analyze_protein_properties(validated_seq)
            if 'error' in properties:
                print(f"Debug: Protein properties error: {properties['error']}")
                # Create fallback properties
                properties = {
                    'molecular_weight': len(validated_seq) * 110,  # Approximate
                    'isoelectric_point': 7.0,
                    'instability_index': 40.0,
                    'gravy': 0.0,
                    'aromaticity': 0.1,
                    'secondary_structure_fraction': [0.3, 0.3, 0.4]
                }

            # Predict protease sites
            protease_sites = self.predict_protease_sites(validated_seq)

            # Create PDB file
            pdb_file = self.create_pdb_structure(validated_seq, secondary_structure, job_name)

            # Confidence = mean over residues of the highest class probability
            if isinstance(ss_probabilities, np.ndarray) and ss_probabilities.size > 0:
                avg_confidence = np.mean(np.max(ss_probabilities, axis=1))
            else:
                avg_confidence = 0.75  # Default confidence

            prediction_result = {
                "sequence": validated_seq,
                "length": len(validated_seq),
                "secondary_structure": secondary_structure,
                "properties": properties,
                "protease_sites": protease_sites,
                "pdb_file": pdb_file,
                "confidence": avg_confidence,
                "method": "CPU-based ML + BioPython"
            }

            return prediction_result, "Structure prediction completed!"

        except Exception as e:
            print(f"Debug: Main prediction error: {str(e)}")
            return None, f"Prediction failed: {str(e)}"


def validate_protein_sequence(sequence):
    """Validate protein sequence including extended amino acids.

    The input is stripped of all whitespace (spaces, tabs, line breaks) and
    upper-cased before checking.

    Args:
        sequence: Raw protein sequence string.

    Returns:
        ``(True, cleaned_sequence)`` when the sequence is 10-2000 characters
        long and uses only accepted symbols, otherwise ``(False, reason)``.
    """
    # Extended valid amino acids including non-standard and ambiguous codes
    valid_amino_acids = set('ACDEFGHIKLMNPQRSTVWYUOJBZX*-')
    sequence = ''.join(sequence.split()).upper()

    if not sequence:
        return False, "Empty sequence"

    if len(sequence) < 10:
        return False, "Sequence too short (minimum 10 amino acids)"

    if len(sequence) > 2000:
        return False, "Sequence too long (maximum 2000 amino acids)"

    # Sorted so the message is identical from run to run.
    invalid_chars = sorted(set(sequence) - valid_amino_acids)
    if invalid_chars:
        return False, f"Invalid characters: {', '.join(invalid_chars)}"

    return True, sequence


def detect_sequence_type(sequence):
    """Detect if sequence is DNA, RNA, protein, or SMILES chemical structure.

    The input is upper-cased and stripped of spaces and line breaks first.
    SMILES is checked before nucleotides, so any input containing a digit or
    a matched pair of parentheses is reported as SMILES.

    Args:
        sequence: Raw input string.

    Returns:
        One of 'SMILES', 'RNA', 'DNA', 'PROTEIN', or 'UNKNOWN' for input that
        is empty after cleaning.
    """
    sequence = sequence.upper().replace(' ', '').replace('\n', '').replace('\r', '')

    total_len = len(sequence)
    if total_len == 0:
        return 'UNKNOWN'

    # Check for SMILES chemical structure patterns
    smiles_chars = set('()[]=-+#@/\\123456789')
    chemical_elements = set('CNOSPFBRIK')  # Common elements in drug compounds
    nucleotides = set('ATCGU')

    smiles_ratio = sum(1 for char in sequence if char in smiles_chars) / total_len
    chemical_ratio = sum(1 for char in sequence if char in chemical_elements) / total_len
    nucleotide_ratio = sum(1 for char in sequence if char in nucleotides) / total_len

    # SMILES detection logic
    looks_like_smiles = (
        smiles_ratio > 0.1  # Contains SMILES special characters
        or ('(' in sequence and ')' in sequence)  # Parentheses for branching
        or ('=' in sequence and chemical_ratio > 0.3)  # Double bonds with chemicals
        or any(char.isdigit() for char in sequence)  # Ring numbers
    )
    if looks_like_smiles:
        return 'SMILES'

    # Mostly nucleotides: the presence of U decides between RNA and DNA
    if nucleotide_ratio > 0.85:
        return 'RNA' if 'U' in sequence else 'DNA'

    return 'PROTEIN'


def translate_dna_to_protein(dna_sequence, genetic_code='standard'):
    """Translate DNA sequence to protein using extended genetic code.

    All three forward reading frames are translated over their full length;
    translation does not stop at stop codons.

    Args:
        dna_sequence: DNA string; case, spaces and line breaks are ignored.
        genetic_code: Name of the codon table.  Only 'standard' exists and
            unknown names fall back to it.

    Returns:
        List of ``(frame, protein)`` tuples with ``frame`` in 1..3, limited
        to frames that yield at least 10 residues.  Codons missing from the
        table are translated as 'X'.
    """
    # In this table TGA is always read as selenocysteine (U) and TAG as
    # pyrrolysine (O); TAA is the only stop codon ('*').  Earlier duplicate
    # 'TGA'/'TAG' -> '*' entries were silently overridden by these and have
    # been removed, so the effective mapping is unchanged.
    genetic_codes = {
        'standard': {
            'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
            'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
            'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': 'O',
            'TGT': 'C', 'TGC': 'C', 'TGA': 'U', 'TGG': 'W',
            'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
            'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
            'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
            'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
            'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
            'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
            'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
            'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
            'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
            'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
            'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
            'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
        }
    }

    code = genetic_codes.get(genetic_code, genetic_codes['standard'])

    # Clean sequence
    dna_sequence = dna_sequence.upper().replace(' ', '').replace('\n', '').replace('\r', '')

    protein_sequences = []
    for frame in range(3):
        # The range stops early enough that every slice is a full codon.
        protein = ''.join(
            code.get(dna_sequence[i:i + 3], 'X')  # X for unknown codons
            for i in range(frame, len(dna_sequence) - 2, 3)
        )

        if len(protein) >= 10:  # Only keep reasonable length proteins
            protein_sequences.append((frame + 1, protein))

    return protein_sequences


def analyze_smiles_compound(smiles_string):
    """Analyze SMILES chemical structure for drug discovery.

    A lightweight, RDKit-free estimate.  The string is split into atom
    tokens so that two-letter elements are not mistaken for their first
    letter (``Cl`` is not carbon, ``[Na+]`` is not nitrogen) and only real
    aromatic atoms count as aromatic.  Hydrogens are not counted, and the
    estimated molecular weight covers C, N, O, S, P and F only.

    Args:
        smiles_string: SMILES text; surrounding whitespace is ignored.

    Returns:
        Dict with 'smiles', 'molecular_formula', 'estimated_mw',
        'atom_counts', 'structural_features', 'compound_type',
        'lipinski_violations' and 'drug_likeness'; or ``{'error': message}``
        if the analysis raised (for example on non-string input).
    """
    try:
        import re  # local import: no module-level import of re is visible in this file

        smiles = smiles_string.strip()

        # One token per bracket atom, organic-subset atom or ring-closure label.
        token_pattern = re.compile(
            r"\[(?P<bracket>[^\]]*)\]"
            r"|(?P<organic>Cl|Br|[BCNOPSFI]|[bcnops])"
            r"|(?P<closure>%\d{2}|\d)"
        )
        # Inside brackets: optional isotope digits, then the element symbol.
        bracket_symbol = re.compile(r"\d*(?P<symbol>[A-Z][a-z]?|se|as|[bcnops])")

        symbols = []  # element symbols as written (lower case = aromatic)
        closure_labels = 0
        for token in token_pattern.finditer(smiles):
            if token.group('bracket') is not None:
                found = bracket_symbol.match(token.group('bracket'))
                if found:
                    symbols.append(found.group('symbol'))
            elif token.group('organic'):
                symbols.append(token.group('organic'))
            else:
                closure_labels += 1

        def count_element(element):
            """Count atoms of one element, aliphatic and aromatic alike."""
            return sum(1 for symbol in symbols if symbol.capitalize() == element)

        # Count different atom types
        carbon_count = count_element('C')
        nitrogen_count = count_element('N')
        oxygen_count = count_element('O')
        sulfur_count = count_element('S')
        phosphorus_count = count_element('P')
        fluorine_count = count_element('F')

        # Count structural features
        ring_count = closure_labels // 2  # every ring bond is labelled at both ends
        double_bonds = smiles.count('=')
        triple_bonds = smiles.count('#')
        aromatic_count = sum(1 for symbol in symbols if symbol.islower())

        # Estimate molecular properties (simplified)
        estimated_mw = (carbon_count * 12 + nitrogen_count * 14 + oxygen_count * 16 +
                        sulfur_count * 32 + phosphorus_count * 31 + fluorine_count * 19)

        # Drug-likeness heuristics (simplified Lipinski's Rule of Five)
        lipinski_violations = 0
        if estimated_mw > 500:
            lipinski_violations += 1
        if nitrogen_count + oxygen_count > 10:
            lipinski_violations += 1

        # Classify compound type (first matching rule wins)
        compound_type = "Unknown"
        if nitrogen_count > 2 and ring_count > 0:
            compound_type = "Heterocyclic compound"
        elif aromatic_count > 5:
            compound_type = "Aromatic compound"
        elif sulfur_count > 0 and nitrogen_count > 0:
            compound_type = "Sulfonamide-like"
        elif oxygen_count > 3:
            compound_type = "Polyol/Ester"

        analysis = {
            'smiles': smiles,
            'molecular_formula': f"C{carbon_count}H?N{nitrogen_count}O{oxygen_count}S{sulfur_count}P{phosphorus_count}F{fluorine_count}",
            'estimated_mw': estimated_mw,
            'atom_counts': {
                'carbon': carbon_count,
                'nitrogen': nitrogen_count,
                'oxygen': oxygen_count,
                'sulfur': sulfur_count,
                'phosphorus': phosphorus_count,
                'fluorine': fluorine_count
            },
            'structural_features': {
                'rings': ring_count,
                'double_bonds': double_bonds,
                'triple_bonds': triple_bonds,
                'aromatic_atoms': aromatic_count
            },
            'compound_type': compound_type,
            'lipinski_violations': lipinski_violations,
            'drug_likeness': "Good" if lipinski_violations <= 1 else "Poor"
        }

        return analysis

    except Exception as e:
        return {'error': f"SMILES analysis failed: {str(e)}"}


def predict_drug_protein_interaction(smiles_analysis, protein_sequence=None):
    """Derive heuristic drug-target interaction guesses from a SMILES analysis.

    Args:
        smiles_analysis: Mapping read through ``.get`` for the keys
            ``compound_type``, ``structural_features`` (``aromatic_atoms``),
            ``atom_counts`` (``nitrogen``) and ``drug_likeness``.
        protein_sequence: Accepted for interface compatibility; not used.

    Returns:
        A list of dicts with ``target_type``, ``interaction_type``,
        ``confidence`` and ``mechanism`` keys, one per rule that fired, in
        rule order. If anything raises, a single-element list containing an
        ``error`` dict is returned instead.
    """
    record_keys = ('target_type', 'interaction_type', 'confidence', 'mechanism')

    try:
        # Each entry pairs a trigger (evaluated top to bottom) with the
        # record emitted when that trigger holds.
        rule_table = (
            (
                smiles_analysis.get('compound_type') == 'Sulfonamide-like',
                ('Carbonic Anhydrase', 'Competitive Inhibition', 0.75,
                 'Sulfonamide group binds to zinc in active site'),
            ),
            (
                smiles_analysis.get('structural_features', {}).get('aromatic_atoms', 0) > 5,
                ('Kinase', 'ATP-competitive', 0.65,
                 'Aromatic rings mimic ATP binding'),
            ),
            (
                smiles_analysis.get('atom_counts', {}).get('nitrogen', 0) > 3,
                ('GPCR', 'Receptor Binding', 0.60,
                 'Multiple nitrogen atoms for receptor interaction'),
            ),
            (
                # General drug-likeness verdict taken from the analysis itself
                smiles_analysis.get('drug_likeness') == 'Good',
                ('General', 'Drug-like properties', 0.80,
                 'Passes Lipinski Rule of Five criteria'),
            ),
        )

        return [
            dict(zip(record_keys, payload))
            for fired, payload in rule_table
            if fired
        ]

    except Exception as exc:
        return [{'error': f"Interaction prediction failed: {str(exc)}"}]


def translate_rna_to_protein(rna_sequence, genetic_code='standard'):
    """Translate an RNA sequence by converting it to DNA and delegating.

    Args:
        rna_sequence: RNA sequence text; uracil may be written as ``U`` or ``u``.
        genetic_code: Passed through unchanged to ``translate_dna_to_protein``.

    Returns:
        Whatever ``translate_dna_to_protein`` returns for the converted sequence.
    """
    # Swap uracil for thymine in both cases so lowercase input is converted
    # as well; chained replace keeps working for any object exposing
    # ``replace`` the way the original single call did.
    dna_sequence = rna_sequence.replace('U', 'T').replace('u', 't')
    return translate_dna_to_protein(dna_sequence, genetic_code)


# Shared pieces of the branded report header used by analyze_pdb_file.
_PDB_REPORT_RULE = "==============================================================================="
_PDB_REPORT_TITLE = (
    "ARTIFICIALLY EXPANDED GENETIC INFORMATION SYSTEM (AEGIS) BIO-DIGITAL LAB 10\n"
    "   From Gaston Software Solutions Tec. | Tel: +256755274944\n"
)
_PDB_REPORT_BANNER = (
    "\n" + _PDB_REPORT_RULE + "\n"
    + _PDB_REPORT_TITLE
    + "   \n"
    + '   "Time Travel" System - Strategic Precognition through Data Synthesis\n'
    + "   Mission: Calculating the causal ripples of today's events to see the future\n"
    + _PDB_REPORT_RULE + "\n"
)


def _count_pdb_residues(atom_lines):
    """Count distinct residues among fixed-column PDB ATOM records."""
    residues = set()
    for line in atom_lines:
        # Columns 22-27 (0-based 21:27) hold chain ID, residue sequence number
        # and insertion code; together they identify one residue. Records too
        # short to carry these fields are ignored.
        if len(line) >= 26:
            residues.add(line[21:27])
    return len(residues)


def analyze_pdb_file(pdb_file_path):
    """Analyze PDB file and extract key information with AEGIS Lab branding.

    Args:
        pdb_file_path: Path to a PDB text file. A falsy value or a path that
            does not exist yields the "analysis unavailable" report.

    Returns:
        A multi-line report string. On success it lists the ATOM record count,
        the number of distinct residues, the file size in characters, any
        extended residue names found (SEC, PYL, UNK, XAA) and the first ten
        lines of the file. Any exception while reading or formatting is
        turned into an error report instead of propagating.
    """
    if not pdb_file_path or not os.path.exists(pdb_file_path):
        return _PDB_REPORT_BANNER + "\nNo PDB file generated - Analysis unavailable\n"

    try:
        with open(pdb_file_path, 'r') as f:
            pdb_content = f.read()

        # Split once and reuse for both the ATOM scan and the preview.
        pdb_lines = pdb_content.split('\n')
        atom_lines = [line for line in pdb_lines if line.startswith('ATOM')]

        # Residues are counted by identity, not by atom record: one residue
        # normally spans several ATOM lines.
        residue_count = _count_pdb_residues(atom_lines)

        # Residue name lives in columns 18-20 of an ATOM record.
        extended_names = {'SEC', 'PYL', 'UNK', 'XAA'}  # Extended amino acids in PDB format
        extended_aa_unique = sorted(
            {
                line[17:20].strip()
                for line in atom_lines
                if len(line) > 17 and line[17:20].strip() in extended_names
            }
        )
        types_detected = (
            ', '.join(extended_aa_unique)
            if extended_aa_unique
            else 'Standard 20 amino acids only'
        )
        preview = '\n'.join(pdb_lines[:10])

        analysis = _PDB_REPORT_BANNER + f"""
AEGIS PDB STRUCTURE ANALYSIS REPORT

Structure Metrics:
   - Total Atoms: {len(atom_lines)}
   - Residue Count: {residue_count}
   - File Size: {len(pdb_content)} characters
   - Format: PDB v3.3 (AEGIS Enhanced)

Extended Genetic Code Analysis:
   - Extended AAs Found: {len(extended_aa_unique)} types
   - Types Detected: {types_detected}
   - AEGIS Compatibility: Full Support

Prediction Method:
   - Engine: AEGIS Bio-Digital CPU-ML Pipeline
   - Processing: Strategic Precognition Algorithm
   - Confidence: High-fidelity structural modeling

Structure Preview (First 10 lines):
{preview}

{_PDB_REPORT_RULE}
Generated by AEGIS Bio-Digital Lab 10 | Gaston Software Solutions Tec
Strategic Precognition through Advanced Protein Structure Analysis
{_PDB_REPORT_RULE}
"""
        return analysis

    except Exception as e:
        # Reporting boundary: the UI shows this text instead of a traceback.
        return (
            "\n" + _PDB_REPORT_RULE + "\n"
            + _PDB_REPORT_TITLE
            + _PDB_REPORT_RULE + "\n"
            + f"\nError analyzing PDB structure: {str(e)}\n"
            + "\nContact AEGIS Lab 10 for technical support.\n"
            + _PDB_REPORT_RULE + "\n"
        )


# Module-level predictor instance shared by the Gradio callbacks in this file:
# load_model_interface() calls its load_model() and predict_interface() calls
# its predict_structure().
protein_predictor = ProteinStructurePredictor()


def load_model_interface():
    """Load the prediction model and return a combined status report.

    The returned text is the loader's message followed by one line per
    external dataset and a block of continuous-learning counters.
    """
    _loaded, message = protein_predictor.load_model()

    report = [message, "\n\nExternal Dataset Status:\n"]

    # One status line per registered external dataset
    for entry in external_datasets.get_dataset_info().values():
        marker = "✓" if entry['status'] == 'Available' else "⚠"
        report.append(f"{marker} {entry['description']}: {entry['status']}\n")

    # Counters reported by the learning system; missing keys fall back to defaults
    stat = aegis_learning.get_learning_stats().get

    report.append("\n\nAEGIS Continuous Learning System:\n")
    report.append(f"📊 Total Predictions: {stat('total_predictions', 0)}\n")
    report.append(f"✅ Successful Validations: {stat('successful_validations', 0)}\n")
    report.append(f"🧠 Learning Sessions: {stat('learning_sessions', 0)}\n")
    report.append(f"🔄 Model Updates: {stat('model_updates', 0)}\n")
    report.append(f"📈 PDB Success Rate: {stat('current_pdb_success_rate', 0):.1%}\n")
    report.append(f"🕒 Last Update: {stat('last_update', 'Never')}\n")
    report.append(f"🎯 Status: {stat('learning_system_status', 'Unknown')}\n")

    return "".join(report)


# Prediction entry point for the Gradio UI; its SMILES branch holds the corrected drug-likeness report formatting.

def _external_match_summary(external_matches, noun):
    """Return the markdown lines listing external dataset hits ('' when none)."""
    if not external_matches:
        return ""
    parts = [f"\n**External Dataset Matches:** {len(external_matches)} similar {noun} found"]
    parts.extend(
        f"\n- Match {i}: {match['dataset']} (Similarity: {match['similarity']:.1%})"
        for i, match in enumerate(external_matches, 1)
    )
    return "".join(parts)


def _smiles_prediction_outputs(sequence, external_matches):
    """Build the (summary, interaction report, SMILES text) triple for a SMILES input."""
    smiles_analysis = analyze_smiles_compound(sequence)

    if 'error' in smiles_analysis:
        # The analyzer's error text may already start with this prefix;
        # add it only when missing so the user does not see it twice.
        error_text = str(smiles_analysis['error'])
        if not error_text.startswith("SMILES analysis failed"):
            error_text = f"SMILES analysis failed: {error_text}"
        return error_text, "", ""

    # Predict drug-protein interactions
    interactions = predict_drug_protein_interaction(smiles_analysis)

    external_info = _external_match_summary(external_matches, "compounds")

    summary = f"""
**AEGIS Drug Discovery Analysis - Enhanced with External Data**

**Chemical Structure Information:**
- SMILES: {smiles_analysis['smiles']}
- Molecular Formula: {smiles_analysis['molecular_formula']}
- Estimated MW: {smiles_analysis['estimated_mw']:.1f} Da
- Compound Type: {smiles_analysis['compound_type']}

**Atomic Composition:**
- Carbon: {smiles_analysis['atom_counts']['carbon']} atoms
- Nitrogen: {smiles_analysis['atom_counts']['nitrogen']} atoms
- Oxygen: {smiles_analysis['atom_counts']['oxygen']} atoms
- Sulfur: {smiles_analysis['atom_counts']['sulfur']} atoms

**Structural Features:**
- Ring Systems: {smiles_analysis['structural_features']['rings']}
- Double Bonds: {smiles_analysis['structural_features']['double_bonds']}
- Aromatic Atoms: {smiles_analysis['structural_features']['aromatic_atoms']}

**Drug-Likeness Assessment:**
- Lipinski Violations: {smiles_analysis['lipinski_violations']}/4
- Drug-Likeness: {smiles_analysis['drug_likeness']}

**Predicted Protein Interactions:** {len(interactions)} targets identified
{external_info}

**Analysis Status:** AEGIS Enhanced Analysis with External Data Completed
"""

    # Interaction report: header, one entry per interaction, optional
    # external references, footer.
    report_parts = ["""
===============================================================================
AEGIS BIO-DIGITAL LAB 10 - ENHANCED DRUG DISCOVERY ANALYSIS
Strategic Precognition with External Dataset Integration
===============================================================================

PREDICTED PROTEIN-DRUG INTERACTIONS:

"""]

    for i, interaction in enumerate(interactions, 1):
        # Error records produced by the interaction predictor are not listed.
        if 'error' not in interaction:
            report_parts.append(f"""
{i}. Target: {interaction['target_type']}
Interaction: {interaction['interaction_type']}
Confidence: {interaction['confidence']:.2%}
Mechanism: {interaction['mechanism']}
""")

    if external_matches:
        report_parts.append("""

EXTERNAL DATASET REFERENCES:

""")
        for i, match in enumerate(external_matches, 1):
            report_parts.append(f"""
{i}. Dataset: {match['dataset']}
Similarity: {match['similarity']:.1%}
File: {match['file']}
Info: {match['match_info']}
""")

    report_parts.append("""
===============================================================================
Generated by AEGIS Bio-Digital Lab 10 | Gaston Software Solutions Tec
Enhanced Drug Discovery with External Dataset Integration | Tel: +256755274944
===============================================================================
""")
    interaction_analysis = "".join(report_parts)

    # Plain-text SMILES structure representation
    content_parts = [f"""# AEGIS Enhanced Drug Discovery - SMILES Structure Analysis
# Compound: {smiles_analysis['smiles']}
# External Matches: {len(external_matches)} similar compounds found

SMILES: {smiles_analysis['smiles']}
Molecular Formula: {smiles_analysis['molecular_formula']}
Estimated MW: {smiles_analysis['estimated_mw']:.1f} Da

External Dataset References:
"""]

    for match in external_matches:
        content_parts.append(f"""
- {match['dataset']}: {match['similarity']:.1%} similarity
File: {match['file']}
Info: {match['match_info']}
""")

    # FIXED SECTION: Proper formatting for Lipinski violations assessment
    if smiles_analysis['estimated_mw'] < 500:
        lipinski_assessment = "- Molecular Weight: OK (< 500 Da)\n"
    else:
        lipinski_assessment = f"- Molecular Weight: {smiles_analysis['estimated_mw']:.1f} Da (≥ 500 Da)\n"

    content_parts.append(f"""

Drug-Likeness Assessment:
{lipinski_assessment}- Lipinski Violations: {smiles_analysis['lipinski_violations']}/4
- Overall Assessment: {smiles_analysis['drug_likeness']}

Generated by AEGIS Bio-Digital Lab 10 with External Dataset Integration
Gaston Software Solutions Tec | Tel: +256755274944
""")

    return summary, interaction_analysis, "".join(content_parts)


def predict_interface(sequence, job_name="protein_prediction"):
    """Enhanced prediction interface with external dataset integration.

    Detects the input type, searches the external datasets for similar
    entries, then either runs the SMILES drug-discovery analysis or (for
    DNA/RNA/protein input) predicts a structure, validates it against PDB and
    records the prediction with the learning system.

    Args:
        sequence: Raw user input (protein, DNA, RNA or SMILES text). ``None``
            or whitespace-only input is rejected with a prompt message.
        job_name: Label for the run. Characters other than alphanumerics,
            ``_`` and ``-`` are dropped and the name is capped at 50
            characters; if nothing usable remains a timestamped name is used.

    Returns:
        A 3-tuple of strings. For SMILES input: (summary markdown,
        interaction report, SMILES text). Otherwise: (summary markdown, PDB
        analysis report, PDB file content). On failure the first element
        carries the message and the other two are empty strings.
    """
    if not sequence or not sequence.strip():
        return "Please enter a sequence or SMILES structure", "", ""

    # Sanitise first, then fall back: a name made only of rejected characters
    # would otherwise reach the predictor as an empty string.
    job_name = "".join(c for c in (job_name or "") if c.isalnum() or c in "_-")[:50]
    if not job_name:
        job_name = f"prediction_{int(time.time())}"

    # Detect sequence type
    seq_type = detect_sequence_type(sequence)

    # AEGIS ENHANCEMENT: Search external datasets for similar sequences
    print(f"AEGIS: Searching external datasets for {seq_type} sequence...")
    # Normalise a missing result to an empty list so len()/iteration below are safe.
    external_matches = external_datasets.search_similar_sequences(sequence, seq_type, top_k=3) or []

    if seq_type == 'SMILES':
        return _smiles_prediction_outputs(sequence, external_matches)

    if seq_type in ('DNA', 'RNA'):
        # Nucleotide input is translated first; the longest frame is analysed.
        translate = translate_dna_to_protein if seq_type == 'DNA' else translate_rna_to_protein
        translations = translate(sequence)
        if not translations:
            return f"Could not translate {seq_type} sequence to protein", "", ""

        frame, protein_seq = max(translations, key=lambda x: len(x[1]))
        summary_prefix = f"**Enhanced {seq_type} Translation Results (Frame {frame}) with External Data**\n\n"
    else:
        # Enhanced protein sequence analysis
        protein_seq = sequence
        summary_prefix = "**Enhanced Protein Structure Prediction with External Data**\n\n"

    # Continue with enhanced protein analysis for DNA/RNA/Protein sequences
    result, message = protein_predictor.predict_structure(protein_seq, job_name)

    if result is None:
        return message, "", ""

    # AEGIS ENHANCEMENT: Validate sequence against PDB database
    print("AEGIS: Validating sequence against PDB database...")
    pdb_validation = pdb_validator.validate_sequence(protein_seq, job_name)
    pdb_report = pdb_validator.format_validation_report(pdb_validation)

    # AEGIS LEARNING: Record prediction for continuous learning
    print("AEGIS Learning: Recording prediction for continuous learning...")
    aegis_learning.record_prediction(
        sequence=protein_seq,
        prediction_result=result,
        pdb_validation=pdb_validation,
        user_feedback=None  # Will be added later if user provides feedback
    )

    # Secondary-structure tallies and their share of the sequence length
    secondary = result['secondary_structure']
    ss_stats = {state: secondary.count(state) for state in ('H', 'E', 'C')}
    length = result['length']
    # Guard the percentage against a zero-length result.
    ss_share = {
        state: (count / length * 100 if length else 0.0)
        for state, count in ss_stats.items()
    }

    # Count extended amino acids
    extended_aa_count = sum(1 for aa in result['sequence'] if aa in 'UOJBZX*-')

    # Add external dataset information to protein analysis
    external_info = _external_match_summary(external_matches, "sequences")

    # Add PDB validation information
    pdb_info = ""
    if pdb_validation:
        pdb_info = f"\n**PDB Validation:** {pdb_validation['validation_status']}"
        pdb_info += f"\n- Total PDB Matches: {pdb_validation['total_matches']}"
        if pdb_validation['best_match']:
            best = pdb_validation['best_match']
            pdb_info += f"\n- Best Match: {best['pdb_id']} ({best['sequence_identity']:.1f}% identity)"

    summary = f"""{summary_prefix}**Sequence Information:**
- Length: {result['length']} amino acids
- Method: {result['method']} + External Dataset + PDB Validation
- Confidence: {result['confidence']:.2%}
- Extended amino acids: {extended_aa_count} residues

**Secondary Structure:**
- Helices (H): {ss_stats['H']} residues ({ss_share['H']:.1f}%)
- Sheets (E): {ss_stats['E']} residues ({ss_share['E']:.1f}%)
- Coils (C): {ss_stats['C']} residues ({ss_share['C']:.1f}%)

**Protein Properties:**
- Molecular Weight: {result['properties'].get('molecular_weight', 0):.1f} Da
- Isoelectric Point: {result['properties'].get('isoelectric_point', 0):.2f}
- Instability Index: {result['properties'].get('instability_index', 0):.2f}
- GRAVY Score: {result['properties'].get('gravy', 0):.3f}

**Protease Sites:** {len(result['protease_sites'])} predicted cleavage sites
{external_info}
{pdb_info}

**Prediction Status:** Enhanced Analysis with External Data + PDB Validation Completed
"""

    # A missing 'pdb_file' key is treated like "no file": analyze_pdb_file
    # returns its "analysis unavailable" report for a falsy path.
    pdb_path = result.get('pdb_file')
    analysis_parts = [analyze_pdb_file(pdb_path)]

    # Add PDB validation report
    if pdb_validation:
        analysis_parts.append(f"""

{pdb_report}
""")

    # Add external dataset info to PDB analysis
    if external_matches:
        analysis_parts.append("""

EXTERNAL DATASET INTEGRATION:

""")
        for i, match in enumerate(external_matches, 1):
            analysis_parts.append(f"""
Reference {i}: {match['dataset']}
Similarity: {match['similarity']:.1%}
Data Type: {match['data_type']}
Source: {match['file']}

""")
    pdb_analysis = "".join(analysis_parts)

    # PDB content with external references
    if pdb_path and os.path.exists(pdb_path):
        try:
            with open(pdb_path, 'r') as f:
                pdb_content = f.read()
        except (OSError, UnicodeDecodeError):
            # Only read failures are reported here; other errors propagate.
            pdb_content = "Error reading PDB file"
        else:
            if external_matches:
                remark_lines = ["\nREMARK 999 EXTERNAL DATASET REFERENCES:\n"]
                remark_lines.extend(
                    f"REMARK 999 REF {i}: {match['dataset']} ({match['similarity']:.1%} similarity)\n"
                    for i, match in enumerate(external_matches, 1)
                )
                pdb_content += "".join(remark_lines)
    else:
        pdb_content = "# No PDB structure available"

    return summary, pdb_analysis, pdb_content

def predict_interface_with_feedback_storage(sequence, job_name="protein_prediction"):
    """Run ``predict_interface`` and remember the request for later feedback.

    Returns the three prediction outputs followed by the submitted sequence,
    which the UI routes into the feedback form.
    """
    # Delegate to the main prediction function
    report, structure_report, structure_text = predict_interface(sequence, job_name)

    # The module-level dict is mutated in place (never rebound), so no
    # ``global`` declaration is required.
    current_prediction_data.update(sequence=sequence, job_name=job_name)

    return report, structure_report, structure_text, sequence

def submit_user_feedback(sequence, rating, comments, current_prediction_result=None):
    """Submit user feedback for continuous learning.

    Args:
        sequence: Sequence the feedback refers to. ``None``, empty or
            whitespace-only means no prediction has been made yet.
        rating: Numeric accuracy rating; echoed back formatted as ``x.x/1.0``.
        comments: Free-text remarks forwarded to the learning system.
        current_prediction_result: Optional prediction result; an empty dict
            is forwarded when omitted.

    Returns:
        A status message: a prompt when there is no sequence, a confirmation
        after the feedback was handed to the learning system, or an error
        message if anything raised along the way.
    """
    # Checked outside the try block so a missing sequence yields the intended
    # prompt rather than an AttributeError message.
    if not sequence or not sequence.strip():
        return "Please make a prediction first to provide feedback"

    try:
        # Add user feedback to learning system
        aegis_learning.add_user_feedback(
            sequence=sequence,
            prediction_result=current_prediction_result or {},
            accuracy_rating=rating,
            comments=comments
        )

        return f"✅ Feedback submitted! Rating: {rating:.1f}/1.0 - Thank you for helping AEGIS learn!"

    except Exception as e:
        return f"❌ Error submitting feedback: {str(e)}"

def get_learning_statistics():
    """Return the learning-system counters rendered as a markdown report.

    If the stats mapping carries an ``error`` key, or anything raises while
    fetching or formatting, an error message string is returned instead.
    """
    try:
        snapshot = aegis_learning.get_learning_stats()

        if "error" in snapshot:
            return f"❌ Error loading stats: {snapshot['error']}"

        # Missing counters fall back to the defaults given at each lookup.
        field = snapshot.get

        return f"""
## 🧠 AEGIS Continuous Learning Statistics

### 📊 **Prediction Activity**
- **Total Predictions:** {field('total_predictions', 0):,}
- **Successful PDB Validations:** {field('successful_validations', 0):,}
- **Current PDB Success Rate:** {field('current_pdb_success_rate', 0):.1%}

### 🔄 **Learning Progress**
- **Learning Sessions Completed:** {field('learning_sessions', 0):,}
- **Model Updates:** {field('model_updates', 0):,}
- **Last Model Update:** {field('last_update', 'Never')}

### 🎯 **System Status**
- **Learning System:** {field('learning_system_status', 'Unknown')}
- **Total Feedback Records:** {field('total_feedback_records', 0):,}

### 📈 **Performance Insights**
- The system automatically learns from PDB validation results
- High-confidence predictions with PDB matches improve the model
- User feedback accelerates learning and fine-tunes accuracy
- Learning sessions trigger every 50 predictions or with high-value data

---
*AEGIS learns continuously to provide better predictions over time!*
"""

    except Exception as exc:
        return f"❌ Error getting learning statistics: {str(exc)}"

# Module-level store for the most recent prediction request, kept for the
# feedback flow. predict_interface_with_feedback_storage() writes the
# "sequence" and "job_name" keys; "result" starts out as None.
current_prediction_data = {"sequence": "", "result": None}

def create_gradio_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` layout with, top to bottom: a header, the model
    loading controls, the sequence input / prediction results row, the user
    feedback and learning statistics row, and an information box. Button
    clicks are wired to the module's callback functions at the end.

    Returns:
        The assembled ``gr.Blocks`` object (not yet launched).
    """
    
    # Custom CSS
    css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .main-header {
        text-align: center;
        color: #2E86AB;
        margin-bottom: 20px;
    }
    .info-box {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 15px;
        border-radius: 10px;
        margin: 10px 0;
    }
    """
    
    with gr.Blocks(css=css, title="Protein Structure Predictor") as interface:
        
        # Header
        gr.HTML("""
        <div class="main-header">
            <h1>AEGIS Bio-Digital Lab 10 - Protein Predictor</h1>
            <p style="font-size: 1.2em; color: #666;">
                Artificially Expanded Genetic Information System (AEGIS)
            </p>
            <p style="font-size: 1.0em; color: #888;">
                Strategic Precognition through Advanced Protein Structure Analysis
            </p>
            <p style="color: #888;">
                Gaston Software Solutions Tec | Tel: +256755274944 | "Time Travel" System
            </p>
        </div>
        """)
        
        # Model status and loading
        with gr.Row():
            with gr.Column(scale=2):
                gr.HTML("""
                <div class="info-box">
                    <h3>Model Control</h3>
                    <p>Load the prediction models to start analyzing protein structures</p>
                </div>
                """)
                
                load_btn = gr.Button("Load Prediction Models", variant="primary", size="lg")
                # Read-only box that receives the text returned by load_model_interface
                model_status = gr.Textbox(
                    label="Model Status",
                    value="Models not loaded - Click 'Load Prediction Models' to start",
                    interactive=False
                )
            
            with gr.Column(scale=1):
                gr.HTML("""
                <div class="info-box">
                    <h3>AEGIS System Info</h3>
                    <p><strong>Lab:</strong> AEGIS Bio-Digital Lab 10</p>
                    <p><strong>Method:</strong> Strategic Precognition ML</p>
                    <p><strong>Contact:</strong> +256755274944</p>
                    <p><strong>Max Length:</strong> 2000 AA</p>
                </div>
                """)
        
        # Main prediction interface
        gr.HTML("<hr>")
        
        with gr.Row():
            # Left column: inputs and example sequences
            with gr.Column(scale=2):
                gr.HTML("<h3>Sequence Input (Protein/DNA/RNA)</h3>")
                
                sequence_input = gr.Textbox(
                    label="Sequence Input (Protein, DNA, or RNA)",
                    placeholder="Protein: MKFLVNVALVFMVVYISYIYA... | DNA: ATGAAATTCCTG... | RNA: AUGAAAUUCCUG...",
                    lines=8,
                    max_lines=12
                )
                
                job_name_input = gr.Textbox(
                    label="Job Name (Optional)",
                    placeholder="my_protein_prediction",
                    value="protein_prediction"
                )
                
                with gr.Row():
                    predict_btn = gr.Button("Predict Structure", variant="primary", size="lg")
                    clear_btn = gr.Button("Clear", variant="secondary")
                
                # Example sequences
                gr.HTML("<h4>Example Sequences</h4>")
                
                # Each example is [sequence, job name], matching the two inputs
                # passed to gr.Examples below.
                examples = [
                    ["MKFLVNVALVFMVVYISYIYA", "short_peptide"],
                    ["ATGAAATTCCTGGTGAACGTGGCGCTGGTGTTCATGGTGGTGTACATCAGCTACATCTACGCGCTGAAACTGTTCAAGAAGCGCCAGGAAGAACTGAAG", "dna_sequence"],
                    ["AUGAAAUUCCUGGUUAACGUGGCGCUGGUGUUCAUGGUGGUGUACAUCAGCUACAUCUCUACGCGCUGAAACUGUUCAAGAAGCGCCAGGAAGAACUGAAG", "rna_sequence"],
                    ["MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWUQTPACVTYFTQSSLASRQGFVDWDDAASRPAINVGLYPTLNTVGGHQAAMQMLKETINEEAAEWDRVHPVHAGPIAPGQMREPRGTHGTWTIMHPSPSTEEGHAIPQRQTPSPGDGPVVPSASLYAVSPAILPKDGPVVVSQVKQWRQEFGWVLTPWVQTIIDGRGEEQTFLPGQHFLRELQJKHNLNHEFRLQTLLLTCDENGKGPLPQIVIRGQGDSREQAPGQWLEQPGWASPATCSPGPPRPPRPPPPPPPPPPPPPPP", "protease_domain"],
                    ["MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL", "membrane_protein"],
                    ["MKFLVNVALVFMVVYISYIYAUOJBZX*", "extended_amino_acids"]
                ]
                
                gr.Examples(
                    examples=examples,
                    inputs=[sequence_input, job_name_input],
                    label="Click to load example sequences"
                )
            
            # Right column: the three outputs of the prediction callback
            with gr.Column(scale=2):
                gr.HTML("<h3>Prediction Results</h3>")
                
                prediction_summary = gr.Markdown(
                    value="Results will appear here after prediction...",
                    label="Prediction Summary"
                )
                
                pdb_analysis = gr.Textbox(
                    label="PDB Structure Analysis",
                    lines=10,
                    max_lines=15,
                    interactive=False
                )
                
                pdb_content = gr.Code(
                    label="PDB File Content",
                    lines=10,
                    interactive=False
                )
        
        # User Feedback Section for Continuous Learning
        gr.HTML("<hr>")
        gr.HTML("""
        <div class="info-box">
            <h3>🧠 AEGIS Continuous Learning - User Feedback</h3>
            <p>Help AEGIS learn and improve by providing feedback on prediction accuracy!</p>
        </div>
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("<h4>Prediction Feedback</h4>")
                
                # Read-only; filled by the predict button's fourth output
                feedback_sequence = gr.Textbox(
                    label="Sequence (auto-filled from last prediction)",
                    placeholder="Sequence will be auto-filled...",
                    interactive=False
                )
                
                accuracy_rating = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.1,
                    label="Accuracy Rating (0.0 = Poor, 1.0 = Excellent)",
                    info="Rate how accurate you think the prediction was"
                )
                
                feedback_comments = gr.Textbox(
                    label="Comments (Optional)",
                    placeholder="Any specific observations about the prediction...",
                    lines=3
                )
                
                submit_feedback_btn = gr.Button("Submit Feedback", variant="secondary")
                feedback_status = gr.Textbox(
                    label="Feedback Status",
                    value="No feedback submitted yet",
                    interactive=False
                )
            
            with gr.Column(scale=1):
                gr.HTML("<h4>Learning Statistics</h4>")
                
                learning_stats_display = gr.Markdown(
                    value="Click 'Refresh Stats' to see current learning statistics",
                    label="AEGIS Learning Stats"
                )
                
                refresh_stats_btn = gr.Button("Refresh Learning Stats", variant="secondary")
        
        # Information section
        gr.HTML("<hr>")
        gr.HTML("""
        <div class="info-box">
            <h3>About AEGIS Enhanced System with Continuous Learning</h3>
            <ul>
                <li><strong>Input Types:</strong> Protein sequences, DNA, RNA, SMILES (auto-detection)</li>
                <li><strong>External Datasets:</strong> SandboxAQ/SAIR, ZINC-canonicalized, Essential genes</li>
                <li><strong>PDB Validation:</strong> Cross-references sequences against RCSB PDB database</li>
                <li><strong>Continuous Learning:</strong> Model improves from PDB validation and user feedback</li>
                <li><strong>Learning Triggers:</strong> Auto-learning every 50 predictions or high-value data</li>
                <li><strong>Performance Tracking:</strong> Monitors accuracy and success rates over time</li>
                <li><strong>Sequence Search:</strong> Identifies similar known protein structures</li>
                <li><strong>Validation Status:</strong> KNOWN, HIGHLY_SIMILAR, MODERATELY_SIMILAR, NOVEL</li>
                <li><strong>Enhanced Analysis:</strong> Searches external HF datasets for similar sequences</li>
                <li><strong>Comparison Engine:</strong> Compares predictions with reference data</li>
                <li><strong>Best Results:</strong> Provides consolidated analysis from multiple sources</li>
                <li><strong>Extended Amino Acids:</strong> Supports U (selenocysteine), O (pyrrolysine), ambiguous codes</li>
                <li><strong>Translation:</strong> Automatic DNA/RNA to protein translation (all reading frames)</li>
                <li><strong>Drug Discovery:</strong> SMILES analysis with protein-drug interaction prediction</li>
                <li><strong>Method:</strong> CPU-based ML + External Dataset + PDB + Continuous Learning</li>
                <li><strong>Performance:</strong> Enhanced accuracy through reference data integration + learning</li>
                <li><strong>Libraries:</strong> BioPython, scikit-learn, HuggingFace Hub, RCSB PDB API</li>
            </ul>
        </div>
        """)
        
        # Event handlers
        # Load button: shows the combined status text in the model status box
        load_btn.click(
            fn=load_model_interface,
            outputs=model_status
        )
        
        # Predict button: the callback returns four values; the fourth (the
        # submitted sequence) pre-fills the feedback form.
        predict_btn.click(
            fn=predict_interface_with_feedback_storage,
            inputs=[sequence_input, job_name_input],
            outputs=[prediction_summary, pdb_analysis, pdb_content, feedback_sequence]
        )
        
        # Feedback button: only three inputs are passed, so the callback's
        # optional current_prediction_result argument keeps its default.
        submit_feedback_btn.click(
            fn=submit_user_feedback,
            inputs=[feedback_sequence, accuracy_rating, feedback_comments],
            outputs=feedback_status
        )
        
        refresh_stats_btn.click(
            fn=get_learning_statistics,
            outputs=learning_stats_display
        )
        
        # Clear button: restores the input, result and feedback widgets to
        # their initial values (tuple order matches the outputs list). The
        # model status and learning stats widgets are left untouched.
        clear_btn.click(
            fn=lambda: ("", "protein_prediction", "Results will appear here after prediction...", "", "", "", 0.5, "", "No feedback submitted yet"),
            outputs=[sequence_input, job_name_input, prediction_summary, pdb_analysis, pdb_content, feedback_sequence, accuracy_rating, feedback_comments, feedback_status]
        )
    
    return interface


def main():
    """Main function to launch the AEGIS Bio-Digital Lab 10 interface with PDB validation.

    Prints the startup banner, builds the Gradio interface and launches it on
    port 7860. The server binds to ``0.0.0.0`` when ``/app`` exists (treated
    as a Docker deployment) and to ``127.0.0.1`` otherwise.
    """
    print("Starting AEGIS Bio-Digital Lab 10 - Protein Structure Predictor with PDB Validation")
    print("Artificially Expanded Genetic Information System (AEGIS)")
    print("Strategic Precognition through Advanced Protein Analysis + PDB Cross-Reference")
    print("Gaston Software Solutions Tec | Tel: +256755274944")
    print("'Time Travel' System - Calculating causal ripples of today's events")
    print("Method: CPU-based ML with Extended Genetic Code Support + PDB Validation")
    print("Libraries: BioPython, scikit-learn, NumPy, RCSB PDB API")

    interface = create_gradio_interface()

    # Use localhost for local development, 0.0.0.0 for Docker deployment
    server_name = "0.0.0.0" if os.path.exists("/app") else "127.0.0.1"

    # Print the access hints before launching: launch() blocks the main
    # thread by default, so anything printed after it would only appear once
    # the server has stopped.
    if server_name == "127.0.0.1":
        print("AEGIS Lab 10 Local Access: http://localhost:7860")
        print("Network Access: http://127.0.0.1:7860")
        print("Support: +256755274944 | Gaston Software Solutions Tec")

    # Launch interface
    interface.launch(
        server_name=server_name,
        server_port=7860,
        share=False,
        show_error=True
    )


# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    main()