import asyncio
import logging
import time
from typing import Any, AsyncIterable, Dict

from transformers import AutoTokenizer, AutoModelForCausalLM

from backends_base import ChatBackend, ImagesBackend
from config import settings

logger = logging.getLogger(__name__)

# `spaces` is only available inside a Hugging Face Space; fall back to
# plain local execution everywhere else.
try:
    import spaces
except ImportError:
    spaces = None


class TransformersChatBackend(ChatBackend):
    """
    Lightweight chat backend for Hugging Face Spaces (ZeroGPU).
    Reloads the model on every request using Transformers rather than vLLM.
    """

    async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
        messages = request.get("messages", [])
        # Only the most recent message is used as the prompt; no chat
        # template or earlier history is applied.
        prompt = messages[-1]["content"] if messages else "(empty)"

        # Resolve generation parameters from the request, falling back to
        # the configured defaults.
        model_id = request.get("model") or settings.LlmHFModelID
        temperature = float(request.get("temperature", settings.LlmTemp or 0.7))
        max_tokens = int(request.get("max_tokens", settings.LlmOpenAICtxSize or 512))

        rid = f"chatcmpl-transformers-{int(time.time())}"
        now = int(time.time())

        # Define the generation routine once and wrap it with the ZeroGPU
        # decorator only when running inside a Space; the two code paths
        # are otherwise identical.
        def _generate(prompt: str) -> str:
            # Reload the tokenizer and model on every call: ZeroGPU workers
            # are stateless, so nothing can be cached between requests.
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map="auto" if spaces else None,
            )

            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            # Fall back to greedy decoding when temperature is zero; sampling
            # with temperature=0 is invalid in Transformers.
            gen_kwargs: Dict[str, Any] = {
                "max_new_tokens": max_tokens,
                "do_sample": temperature > 0,
            }
            if temperature > 0:
                gen_kwargs["temperature"] = temperature
            outputs = model.generate(**inputs, **gen_kwargs)
            # Decode only the newly generated tokens so the prompt is not
            # echoed back in the completion.
            new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
            return tokenizer.decode(new_tokens, skip_special_tokens=True)

        run_once = spaces.GPU(duration=300)(_generate) if spaces else _generate
        try:
            # Run the blocking load-and-generate call off the event loop so
            # the server stays responsive.
            text = await asyncio.to_thread(run_once, prompt)
            # Emit a single OpenAI-style chunk carrying the whole completion;
            # this backend does not stream token by token.
            yield {
                "id": rid,
                "object": "chat.completion.chunk",
                "created": now,
                "model": model_id,
                "choices": [
                    {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
                ],
            }
        except Exception:
            logger.exception("Transformers inference failed")
            raise


class StubImagesBackend(ImagesBackend):
    """
    Image generation stub that returns a 1x1 transparent PNG placeholder.
    """

    async def generate_b64(self, request: Dict[str, Any]) -> str:
        logger.warning("Image generation not supported in Transformers backend.")
        # Base64-encoded 1x1 transparent PNG.
        return (
            "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP4BwQACfsD/etCJH0AAAAASUVORK5CYII="
        )
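
# Minimal usage sketch, illustrative only: drives both backends end to end
# with an OpenAI-style request payload. Assumes the configured default model
# (settings.LlmHFModelID) can be loaded locally.
if __name__ == "__main__":
    import base64

    async def _demo() -> None:
        chat = TransformersChatBackend()
        request = {
            "messages": [{"role": "user", "content": "Say hello."}],
            "max_tokens": 32,
        }
        async for chunk in chat.stream(request):
            print(chunk["choices"][0]["delta"]["content"])

        images = StubImagesBackend()
        png_b64 = await images.generate_b64({"prompt": "ignored"})
        print(f"placeholder PNG: {len(base64.b64decode(png_b64))} bytes")

    asyncio.run(_demo())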