"""
Kokoro TTS Service - High-quality, local, commercially-free text-to-speech.

Uses the Kokoro TTS model (82M params, Apache 2.0 license).
"""

import hashlib
import logging
import os
import subprocess
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# Get the base directory (three levels above this file)
BASE_DIR = Path(__file__).parent.parent.parent

# Sample rate used when writing the synthesized audio to disk (Kokoro outputs 24kHz)
KOKORO_SAMPLE_RATE = 24000

# Audio bitrate passed to ffmpeg for the MP3 conversion
MP3_BITRATE = "192k"

# Available Kokoro voices with descriptions.
# "lang" is the language code handed to KPipeline(lang_code=...).
KOKORO_VOICES = {
    # American English voices
    "af_heart": {"name": "Heart (Female)", "lang": "a", "description": "Warm, friendly American female"},
    "af_bella": {"name": "Bella (Female)", "lang": "a", "description": "Professional American female"},
    "af_nicole": {"name": "Nicole (Female)", "lang": "a", "description": "Clear American female"},
    "af_sarah": {"name": "Sarah (Female)", "lang": "a", "description": "Natural American female"},
    "af_sky": {"name": "Sky (Female)", "lang": "a", "description": "Bright American female"},
    "am_adam": {"name": "Adam (Male)", "lang": "a", "description": "Professional American male"},
    "am_michael": {"name": "Michael (Male)", "lang": "a", "description": "Deep American male"},
    # British English voices
    "bf_emma": {"name": "Emma (British Female)", "lang": "b", "description": "Professional British female"},
    "bf_isabella": {"name": "Isabella (British Female)", "lang": "b", "description": "Warm British female"},
    "bm_george": {"name": "George (British Male)", "lang": "b", "description": "Professional British male"},
    "bm_lewis": {"name": "Lewis (British Male)", "lang": "b", "description": "Friendly British male"},
}

# Default voice
DEFAULT_VOICE = "af_heart"


class KokoroTTSService:
    """
    Text-to-speech service using Kokoro TTS.

    High-quality, local, commercially-free (Apache 2.0 license).
    Generated audio is cached on disk, keyed by the text and the voice.
    """

    def __init__(self, voice: str = DEFAULT_VOICE, cache_dir: Optional[Path] = None):
        """
        Initialize Kokoro TTS service.

        Args:
            voice: Voice ID (e.g., 'af_heart', 'am_adam'). Unknown IDs fall
                back to DEFAULT_VOICE.
            cache_dir: Optional custom cache directory. Defaults to
                BASE_DIR / "media" / "voiceover" / "kokoro". The directory is
                created if it does not exist.
        """
        self.voice = voice if voice in KOKORO_VOICES else DEFAULT_VOICE
        self.voice_info = KOKORO_VOICES.get(self.voice, KOKORO_VOICES[DEFAULT_VOICE])
        self.lang_code = self.voice_info["lang"]

        # Initialize cache directory
        if cache_dir:
            self.cache_dir = Path(cache_dir)
        else:
            self.cache_dir = BASE_DIR / "media" / "voiceover" / "kokoro"
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Lazy-load pipeline to avoid loading model until needed
        self._pipeline = None

        logger.info(f"KokoroTTS initialized with voice: {self.voice} ({self.voice_info['name']})")

    def _get_pipeline(self):
        """
        Lazy-load the Kokoro pipeline.

        Returns:
            The KPipeline instance, created on first use and reused afterwards.

        Raises:
            RuntimeError: If the ``kokoro`` package cannot be imported.
        """
        if self._pipeline is None:
            try:
                from kokoro import KPipeline

                self._pipeline = KPipeline(lang_code=self.lang_code)
                logger.info(f"Kokoro pipeline loaded for language: {self.lang_code}")
            except ImportError as e:
                logger.error("Kokoro not installed. Install with: pip install kokoro>=0.9.4 soundfile")
                raise RuntimeError("Kokoro TTS not installed") from e
        return self._pipeline

    @staticmethod
    def _convert_to_mp3(wav_path: Path, mp3_path: Path) -> bool:
        """
        Convert a WAV file to MP3 with ffmpeg.

        Args:
            wav_path: Existing WAV file to read.
            mp3_path: Destination MP3 file (overwritten if present).

        Returns:
            True if ffmpeg succeeded and left a non-empty MP3 at ``mp3_path``,
            False otherwise. On failure any file at ``mp3_path`` is removed so
            a partial output can never be mistaken for a cached result.
        """
        command = [
            "ffmpeg", "-y",
            "-i", str(wav_path),
            "-acodec", "libmp3lame",
            "-ab", MP3_BITRATE,
            str(mp3_path),
        ]
        try:
            result = subprocess.run(command, capture_output=True)
        except OSError as e:
            # ffmpeg is missing or not executable: report failure so the
            # caller can fall back to WAV instead of aborting.
            logger.warning("ffmpeg could not be executed: %s", e)
            converted = False
        else:
            converted = (
                result.returncode == 0
                and mp3_path.exists()
                and mp3_path.stat().st_size > 0
            )
            if not converted:
                stderr_tail = result.stderr.decode("utf-8", errors="replace")[-500:]
                logger.debug("ffmpeg exited with code %s: %s", result.returncode, stderr_tail)

        if not converted and mp3_path.exists():
            mp3_path.unlink()
        return converted

    def generate_from_text(self, text: str, **kwargs) -> Path:
        """
        Generate audio from text and return the path to the audio file.

        Args:
            text: Text to convert to speech
            **kwargs: Additional arguments (ignored for compatibility)

        Returns:
            Path to the generated audio file. This is an MP3 when the ffmpeg
            conversion succeeds (or a cached MP3 already exists); otherwise
            the intermediate WAV file is returned as a fallback.

        Raises:
            ValueError: If ``text`` is empty or only whitespace.
            RuntimeError: If synthesis or writing the audio fails for any
                reason (the original exception is chained as the cause).
        """
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        # Create cache key based on text and voice
        content_hash = hashlib.md5(f"{text}-{self.voice}".encode("utf-8")).hexdigest()
        output_path = self.cache_dir / f"{content_hash}.mp3"

        # Return cached file if exists
        if output_path.exists() and output_path.stat().st_size > 0:
            logger.info(f"Using cached Kokoro voiceover: {output_path.name}")
            return output_path

        logger.info(f"Generating Kokoro TTS ({self.voice}) for: {text[:50]}...")

        try:
            # Imported lazily so the module can be imported without the
            # optional audio dependencies installed.
            import soundfile as sf
            import numpy as np

            pipeline = self._get_pipeline()

            # Generate audio using Kokoro; only the audio element of each
            # yielded 3-tuple is used.
            audio_chunks = [audio for _, _, audio in pipeline(text, voice=self.voice, speed=1.0)]
            if not audio_chunks:
                raise RuntimeError("No audio generated")

            # Concatenate all chunks
            full_audio = np.concatenate(audio_chunks)

            # Save as WAV first (Kokoro outputs 24kHz)
            wav_path = self.cache_dir / f"{content_hash}.wav"
            sf.write(str(wav_path), full_audio, KOKORO_SAMPLE_RATE)

            # Convert to MP3 using ffmpeg for smaller file size
            if not self._convert_to_mp3(wav_path, output_path):
                # Fallback: keep the WAV that was just written and return it
                logger.warning("MP3 conversion failed, using WAV")
                return wav_path

            # Clean up the intermediate WAV file
            if wav_path.exists():
                wav_path.unlink()

            logger.info(f"✅ Kokoro TTS saved: {output_path} ({output_path.stat().st_size} bytes)")
            return output_path

        except Exception as e:
            logger.error(f"Kokoro TTS failed: {str(e)}")
            raise RuntimeError(f"Kokoro TTS generation failed: {str(e)}") from e


# Alias for backward compatibility
SimpleElevenLabsService = KokoroTTSService


def get_available_voices() -> dict:
    """Return dictionary of available voices with their info (the KOKORO_VOICES mapping itself)."""
    return KOKORO_VOICES


def get_voice_names() -> list:
    """Return list of (voice_id, display_name) tuples for UI."""
    return [(vid, info["name"]) for vid, info in KOKORO_VOICES.items()]