#!/usr/bin/env python3
"""
ZW Kitten - Production FastAPI interface for KittenTTS
"""
import os
import tempfile
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, FileResponse, JSONResponse
import uvicorn
import numpy as np
import wave

# Import KittenTTS directly without demo fallback
from get_model import KittenTTS
from onnx_model import KittenTTS_1_Onnx


# Fixed version without arbitrary audio trimming
class FixedKittenTTS_1_Onnx(KittenTTS_1_Onnx):
    """Fixed version without arbitrary audio trimming"""

    def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
        """Generate without the truncation bug"""
        onnx_inputs = self._prepare_inputs(text, voice, speed)
        outputs = self.session.run(None, onnx_inputs)
        # FIXED: No arbitrary trimming - return the full audio buffer
        audio = outputs[0]
        return audio


class FixedKittenTTS:
    """KittenTTS with the truncation bug fixed"""

    def __init__(self):
        self.kitten = KittenTTS()
        # Apply the fix: rebind the patched generate() onto the loaded ONNX model instance
        self.kitten.model.generate = FixedKittenTTS_1_Onnx.generate.__get__(
            self.kitten.model, KittenTTS_1_Onnx
        )

    def generate(self, text, voice="expr-voice-2-m", speed=1.0):
        return self.kitten.generate(text, voice=voice, speed=speed)


app = FastAPI(title="ZW Kitten TTS")

# Initialize TTS
tts = FixedKittenTTS()
print("✅ Fixed KittenTTS initialized (no truncation bug)")


@app.post("/generate")
async def generate_speech(request: Request):
    """Generate speech from text"""
    data = await request.json()
    text = data.get('text', '').strip()
    character = data.get('character', 'claude')
    emotion = data.get('emotion', 'neutral')

    if not text:
        return JSONResponse({"error": "No text provided"}, status_code=400)

    # Character to voice mapping
    voice_map = {
        'claude': 'expr-voice-2-m',
        'keen': 'expr-voice-2-f',
        'tran': 'expr-voice-3-m',
        'isla': 'expr-voice-4-f',
        'system': 'expr-voice-5-m',
        'narrator': 'expr-voice-5-f'
    }

    # Emotion to speed mapping
    speed_map = {
        'neutral': 1.0,
        'cosmic_awareness': 0.8,
        'determined': 1.1,
        'whisper': 0.9,
        'urgent': 1.2,
        'calm': 0.9
    }

    voice = voice_map.get(character, 'expr-voice-2-m')
    speed = speed_map.get(emotion, 1.0)

    try:
        # Generate real audio with KittenTTS
        audio = tts.generate(text, voice=voice, speed=speed)

        # Save to a temporary WAV file (mono, 16-bit PCM)
        temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        # Close the handle so wave.open() can re-open the path (required on Windows)
        # and the descriptor is not leaked on every request.
        temp_file.close()

        a = np.asarray(audio, dtype=np.float32)
        # If the model returns shape [N, 1], flatten it to [N]
        if a.ndim > 1:
            a = a[:, 0]
        # Clip to [-1.0, 1.0] and convert float samples to 16-bit PCM
        a = np.clip(a, -1.0, 1.0)
        a_i16 = (a * 32767.0).astype(np.int16)

        with wave.open(temp_file.name, "wb") as w:
            w.setnchannels(1)      # mono
            w.setsampwidth(2)      # 16-bit
            w.setframerate(24000)  # Hz (use your model's sample rate if different)
            w.writeframes(a_i16.tobytes())

        return JSONResponse({
            "success": True,
            "audio_url": f"/audio/{os.path.basename(temp_file.name)}",
            "zw_block": f"""!zw/dialogue.intent:
  character: {character}
  emotion: {emotion}
  line: {text}""",
            "info": f"""🎭 Character: {character.title()}
😊 Emotion: {emotion.replace('_', ' ').title()} (speed: {speed:.1f}x)
🎵 Voice: {voice}
📝 Text: "{text}"
✅ Audio generated successfully!"""
        })

    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


@app.get("/audio/{filename}")
async def serve_audio(filename: str):
    """Serve generated audio files"""
    file_path = os.path.join(tempfile.gettempdir(), filename)
    if os.path.exists(file_path):
        return FileResponse(file_path, media_type="audio/wav")
    return JSONResponse({"error": "File not found"}, status_code=404)


@app.get("/", response_class=HTMLResponse)
def serve_interface():
    """Serve the web interface"""
return """