import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from typing import List, Dict, Any, Optional
import logging


class CodeModel:
    """5B+ parameter coding model wrapper with optimized inference."""

    def __init__(self):
        self.model_name = "bigcode/starcoder2-7b"  # 7B model (closest fit to the 5B target with strong coding performance)
        self.parameter_count = "7B"
        self.max_length = 16384
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self.is_loaded = False
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.setup_model()

    def setup_model(self):
        """Initialize and load the 5B+ parameter coding model."""
        try:
            print(f"Loading {self.model_name} model...")

            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True,
                padding_side="left"
            )

            # Set pad token if not present
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Load model with memory/precision optimizations
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None,
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )

            # Create pipeline for easier inference. With device_map="auto",
            # accelerate already places the model, so an explicit device is only
            # passed on CPU (some transformers versions reject setting both).
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=-1 if self.device == "cpu" else None,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.1,
                max_new_tokens=2048,
                pad_token_id=self.tokenizer.eos_token_id
            )

            self.is_loaded = True
            print(f"✓ {self.model_name} loaded successfully on {self.device}")
        except Exception as e:
            print(f"✗ Error loading model: {e}")
            self._fallback_model()

    def _fallback_model(self):
        """Fall back to a smaller model if the main model fails to load."""
        try:
            print("Trying fallback model: microsoft/DialoGPT-medium")
            self.model_name = "microsoft/DialoGPT-medium"
            self.parameter_count = "345M"
            self.max_length = 1024

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None
            )
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=-1 if self.device == "cpu" else None,
                max_new_tokens=512,
                pad_token_id=self.tokenizer.eos_token_id
            )
            self.is_loaded = True
            print("✓ Fallback model loaded successfully")
        except Exception as e:
            print(f"✗ Fallback model also failed: {e}")
            self.is_loaded = False

    def generate(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_new_tokens: int = 2048,
        language: str = "python"
    ) -> str:
        """Generate a response from the model."""
        if not self.is_loaded:
            return "I'm sorry, the model is not loaded yet. Please try again in a moment."

        try:
            # Convert the chat-style message list into a single prompt string
            conversation = ""
            for msg in messages:
                role = msg["role"]
                content = msg["content"]
                if role == "system":
                    conversation += f"System: {content}\n\n"
                elif role == "user":
                    conversation += f"Human: {content}\n"
                elif role == "assistant":
                    conversation += f"Assistant: {content}\n"

            # Add specific coding instructions when the request mentions code
            if "write" in conversation.lower() or "code" in conversation.lower():
                conversation += (
                    f"\n\nPlease provide clean, well-commented {language} code "
                    "with proper syntax and best practices."
                )
            conversation += "\nAssistant:"

            # Generate via the pipeline when it is available
            with torch.no_grad():
                if self.pipeline:
                    outputs = self.pipeline(
                        conversation,
                        do_sample=True,
                        temperature=temperature,
                        top_p=0.95,
                        repetition_penalty=1.1,
                        max_new_tokens=max_new_tokens,
                        pad_token_id=self.tokenizer.eos_token_id,
                        eos_token_id=self.tokenizer.eos_token_id,
                        return_full_text=False
                    )
                    if outputs and len(outputs) > 0:
                        return outputs[0]["generated_text"].strip()

            # Fall back to direct model generation
            inputs = self.tokenizer.encode(conversation, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    do_sample=True,
                    temperature=temperature,
                    top_p=0.95,
                    repetition_penalty=1.1,
                    max_new_tokens=max_new_tokens,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    attention_mask=torch.ones_like(inputs)
                )

            # Decode only the newly generated tokens
            response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
            return response.strip()
        except Exception as e:
            logging.error(f"Generation error: {e}")
            return f"I apologize, but I encountered an error while generating the response: {str(e)}"

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model."""
        return {
            "model_name": self.model_name,
            "parameter_count": self.parameter_count,
            "max_length": self.max_length,
            "device": self.device,
            "is_loaded": self.is_loaded,
            "vocab_size": len(self.tokenizer) if self.tokenizer else 0
        }
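

# Minimal usage sketch (illustrative, not part of the original Space): the
# message dicts below follow the role/content format that CodeModel.generate
# expects, and the prompt text is a made-up example.
if __name__ == "__main__":
    code_model = CodeModel()
    print(code_model.get_model_info())

    reply = code_model.generate(
        messages=[
            {"role": "system", "content": "You are a helpful coding assistant."},
            {"role": "user", "content": "Write a function that reverses a string."}
        ],
        temperature=0.2,
        max_new_tokens=256,
        language="python"
    )
    print(reply)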