# models.py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from typing import List, Dict, Any
import logging
class CodeModel:
"""5B Parameter coding model wrapper with optimized inference."""
def __init__(self):
        self.model_name = "bigcode/starcoder2-7b"  # ~7B coding model (closest strong option to the 5B target)
self.parameter_count = "7B"
self.max_length = 16384
self.tokenizer = None
self.model = None
self.pipeline = None
self.is_loaded = False
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.setup_model()
def setup_model(self):
"""Initialize and load the 5B+ parameter coding model."""
try:
print(f"Loading {self.model_name} model...")
# Load tokenizer and model
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name,
trust_remote_code=True,
padding_side="left"
)
# Set pad token if not present
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
# Load model with optimization
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
device_map="auto" if self.device == "cuda" else None,
trust_remote_code=True,
low_cpu_mem_usage=True
)
            # Create a text-generation pipeline for easier inference.
            # When the model was dispatched with device_map="auto", the pipeline
            # must not also receive an explicit GPU device index.
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=None if self.device == "cuda" else -1,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.1,
                max_new_tokens=2048,
                pad_token_id=self.tokenizer.eos_token_id
            )
self.is_loaded = True
print(f"βœ… {self.model_name} loaded successfully on {self.device}")
except Exception as e:
print(f"❌ Error loading model: {e}")
self._fallback_model()
def _fallback_model(self):
"""Fallback to a smaller model if the main model fails to load."""
try:
print("Trying fallback model: microsoft/DialoGPT-medium")
self.model_name = "microsoft/DialoGPT-medium"
self.parameter_count = "345M"
self.max_length = 1024
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
device_map="auto" if self.device == "cuda" else None
)
            # As above, avoid passing a GPU index when device_map="auto" is used.
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=None if self.device == "cuda" else -1,
                max_new_tokens=512,
                pad_token_id=self.tokenizer.eos_token_id
            )
self.is_loaded = True
print(f"βœ… Fallback model loaded successfully")
except Exception as e:
print(f"❌ Fallback model also failed: {e}")
self.is_loaded = False
def generate(
self,
messages: List[Dict[str, str]],
temperature: float = 0.7,
max_new_tokens: int = 2048,
language: str = "python"
) -> str:
"""Generate response from the model."""
if not self.is_loaded:
return "I'm sorry, the model is not loaded yet. Please try again in a moment."
try:
            # Flatten the chat history into a plain-text prompt
            if not messages:
                return "Please provide a prompt or message to respond to."
            conversation = ""
            for msg in messages:
                role = msg.get("role", "user")
                content = msg.get("content", "")
                if role == "system":
                    conversation += f"System: {content}\n\n"
                elif role == "user":
                    conversation += f"Human: {content}\n"
                elif role == "assistant":
                    conversation += f"Assistant: {content}\n"
            # Add explicit coding instructions when the request looks code-related
            if "write" in conversation.lower() or "code" in conversation.lower():
                conversation += f"\n\nPlease provide clean, well-commented {language} code with proper syntax and best practices."
            conversation += "\nAssistant:"
            # Prefer the pipeline when it is available
            if self.pipeline:
                outputs = self.pipeline(
                    conversation,
                    do_sample=True,
                    temperature=temperature,
                    top_p=0.95,
                    repetition_penalty=1.1,
                    max_new_tokens=max_new_tokens,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    return_full_text=False
                )
                if outputs and len(outputs) > 0:
                    return outputs[0]["generated_text"].strip()
            # Fall back to calling the model directly
            inputs = self.tokenizer(conversation, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    do_sample=True,
                    temperature=temperature,
                    top_p=0.95,
                    repetition_penalty=1.1,
                    max_new_tokens=max_new_tokens,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )
            # Decode only the newly generated tokens
            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True
            )
            return response.strip()
except Exception as e:
logging.error(f"Generation error: {e}")
return f"I apologize, but I encountered an error while generating the response: {str(e)}"
def get_model_info(self) -> Dict[str, Any]:
"""Get information about the loaded model."""
return {
"model_name": self.model_name,
"parameter_count": self.parameter_count,
"max_length": self.max_length,
"device": self.device,
"is_loaded": self.is_loaded,
"vocab_size": len(self.tokenizer) if self.tokenizer else 0
}
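

# Illustrative usage sketch, not part of the deployed Gradio app: it assumes the
# model weights can be downloaded in this environment and simply exercises
# CodeModel.generate() with a minimal chat history as a quick smoke test.
if __name__ == "__main__":
    model = CodeModel()
    print(model.get_model_info())
    demo_messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": "Write a Python function that reverses a string."},
    ]
    # A small max_new_tokens keeps the smoke test quick; other parameters use
    # the same values as the method defaults above.
    reply = model.generate(demo_messages, temperature=0.7, max_new_tokens=256, language="python")
    print(reply)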