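"""Wrapper around a StarCoder2-based coding model for a Hugging Face Space.

Loads bigcode/starcoder2-7b when possible, falls back to a smaller model on
failure, and exposes a thread-safe generate() API plus a global accessor.
"""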
import logging
import threading
from typing import Any, Dict, List

import spaces  # Hugging Face Spaces GPU decorator used by load_model below
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

logger = logging.getLogger(__name__)

class CodeModel:
    """5B Parameter coding model wrapper with optimized inference."""
    
    def __init__(self):
        self.model_name = "bigcode/starcoder2-7b"  # 7B model (closest to 5B with excellent coding)
        self.parameter_count = "7B"
        self.max_length = 16384
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self.is_loaded = False
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self._lock = threading.Lock()
    
    @spaces.GPU(duration=1200)  # Extended duration for model loading
    def load_model(self):
        """Load the model (called via spaces decorator for optimization)."""
        try:
            logger.info(f"Loading {self.model_name} model...")
            
            # Load tokenizer and model
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True,
                padding_side="left"
            )
            
            # Set pad token if not present
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            
            # Load model with optimization
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                use_cache=True
            )
            
            # Set model to evaluation mode
            self.model.eval()
            
            # Create pipeline for easier inference
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=0 if self.device == "cuda" else -1,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.1,
                max_new_tokens=2048,
                pad_token_id=self.tokenizer.eos_token_id
            )
            
            self.is_loaded = True
            logger.info(f"βœ… {self.model_name} loaded successfully on {self.device}")
            
        except Exception as e:
            logger.error(f"❌ Error loading model: {e}")
            self._fallback_model()
    
    def _fallback_model(self):
        """Fallback to a smaller model if the main model fails to load."""
        try:
            logger.info("Trying fallback model: microsoft/DialoGPT-medium")
            self.model_name = "microsoft/DialoGPT-medium"
            self.parameter_count = "345M"
            self.max_length = 1024
            
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None
            )
            
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=0 if self.device == "cuda" else -1,
                max_new_tokens=512,
                pad_token_id=self.tokenizer.eos_token_id
            )
            
            self.is_loaded = True
            logger.info(f"βœ… Fallback model loaded successfully")
            
        except Exception as e:
            logger.error(f"❌ Fallback model also failed: {e}")
            self.is_loaded = False
    
    def generate(
        self, 
        messages: List[Dict[str, str]], 
        temperature: float = 0.7,
        max_new_tokens: int = 2048,
        language: str = "python"
    ) -> str:
        """Generate response from the model."""
        
        if not self.is_loaded:
            return "I'm sorry, the model is not loaded yet. Please try again in a moment."
        
        try:
            with self._lock:  # Ensure thread-safe access
                # Convert chat-format messages into a single prompt string;
                # start from "" so an empty message list still yields a valid prompt
                conversation = ""
                for msg in messages:
                    role = msg["role"]
                    content = msg["content"]
                    if role == "system":
                        conversation += f"System: {content}\n\n"
                    elif role == "user":
                        conversation += f"Human: {content}\n"
                    elif role == "assistant":
                        conversation += f"Assistant: {content}\n"

                # Nudge the model toward clean code when the request looks like a coding task
                if "write" in conversation.lower() or "code" in conversation.lower():
                    conversation += (
                        f"\n\nPlease provide clean, well-commented {language} code "
                        "with proper syntax and best practices."
                    )

                conversation += "\nAssistant:"
                
                # Generate response
                with torch.no_grad():
                    if self.pipeline:
                        # Use pipeline for generation
                        outputs = self.pipeline(
                            conversation,
                            do_sample=True,
                            temperature=temperature,
                            top_p=0.95,
                            repetition_penalty=1.1,
                            max_new_tokens=max_new_tokens,
                            pad_token_id=self.tokenizer.eos_token_id,
                            eos_token_id=self.tokenizer.eos_token_id,
                            return_full_text=False,
                            clean_up_tokenization_spaces=True
                        )
                        
                        if outputs and len(outputs) > 0:
                            response = outputs[0]["generated_text"].strip()
                            return response
                    
                    # Fallback to direct model generation
                    inputs = self.tokenizer.encode(conversation, return_tensors="pt").to(self.device)
                    
                    with torch.no_grad():
                        outputs = self.model.generate(
                            inputs,
                            do_sample=True,
                            temperature=temperature,
                            top_p=0.95,
                            repetition_penalty=1.1,
                            max_new_tokens=max_new_tokens,
                            pad_token_id=self.tokenizer.eos_token_id,
                            eos_token_id=self.tokenizer.eos_token_id,
                            attention_mask=torch.ones_like(inputs)
                        )
                    
                    # Decode response
                    response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
                    return response.strip()
                    
        except Exception as e:
            logger.error(f"Generation error: {e}")
            return f"I apologize, but I encountered an error while generating the response: {str(e)}"
    
    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model."""
        return {
            "model_name": self.model_name,
            "parameter_count": self.parameter_count,
            "max_length": self.max_length,
            "device": self.device,
            "is_loaded": self.is_loaded,
            "vocab_size": len(self.tokenizer) if self.tokenizer else 0
        }

# Global model instance for the server
_global_model = None

def get_model():
    """Get or create the global model instance."""
    global _global_model
    if _global_model is None:
        _global_model = CodeModel()
        # Load model asynchronously
        threading.Thread(target=_global_model.load_model, daemon=True).start()
    return _global_model

def create_code_model() -> CodeModel:
    """Factory function for creating standalone CodeModel instances."""
    return CodeModel()
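
# Minimal usage sketch (assumptions: enough GPU/CPU memory is available to hold
# the model weights, and the example prompt below is illustrative only; nothing
# in it is required by the API above):
if __name__ == "__main__":
    import time

    model = get_model()

    # load_model runs in a background thread; poll with a timeout so a failed
    # load does not hang the demo forever
    deadline = time.time() + 600
    while not model.is_loaded and time.time() < deadline:
        time.sleep(5)

    if model.is_loaded:
        print(model.get_model_info())
        reply = model.generate(
            messages=[
                {"role": "system", "content": "You are a helpful coding assistant."},
                {"role": "user", "content": "Write a Python function that reverses a string."},
            ],
            temperature=0.2,
            max_new_tokens=256,
            language="python",
        )
        print(reply)
    else:
        print("Model failed to load within the timeout.")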