# Adityahulk
# integrating free voice
# 60e9dd0
"""
Kokoro TTS Service - High-quality, local, commercially-free text-to-speech.
Uses the Kokoro TTS model (82M params, Apache 2.0 license).
"""
import os
import hashlib
import logging
from pathlib import Path
from typing import Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Directory three levels above this file.
BASE_DIR = Path(__file__).parent.parent.parent

# One row per voice: (voice ID, display name, language code, description).
_VOICE_ROWS = (
    # American English voices (language code "a")
    ("af_heart", "Heart (Female)", "a", "Warm, friendly American female"),
    ("af_bella", "Bella (Female)", "a", "Professional American female"),
    ("af_nicole", "Nicole (Female)", "a", "Clear American female"),
    ("af_sarah", "Sarah (Female)", "a", "Natural American female"),
    ("af_sky", "Sky (Female)", "a", "Bright American female"),
    ("am_adam", "Adam (Male)", "a", "Professional American male"),
    ("am_michael", "Michael (Male)", "a", "Deep American male"),
    # British English voices (language code "b")
    ("bf_emma", "Emma (British Female)", "b", "Professional British female"),
    ("bf_isabella", "Isabella (British Female)", "b", "Warm British female"),
    ("bm_george", "George (British Male)", "b", "Professional British male"),
    ("bm_lewis", "Lewis (British Male)", "b", "Friendly British male"),
)

# Available Kokoro voices, keyed by voice ID. Each value carries the display
# name, the language code and a short description.
KOKORO_VOICES = {
    voice_id: {"name": display_name, "lang": lang_code, "description": blurb}
    for voice_id, display_name, lang_code, blurb in _VOICE_ROWS
}

# Default voice
DEFAULT_VOICE = "af_heart"
class KokoroTTSService:
    """
    Text-to-speech service using Kokoro TTS.
    High-quality, local, commercially-free (Apache 2.0 license).

    Generated audio is cached on disk in ``cache_dir``. The file name is the
    MD5 digest of ``"<text>-<voice>"``, so repeated requests for the same
    text and voice reuse the existing MP3 instead of running the model again.
    """

    # Sample rate used when writing the intermediate WAV (Kokoro outputs 24kHz).
    _SAMPLE_RATE = 24000

    def __init__(self, voice: str = DEFAULT_VOICE, cache_dir: Optional[Path] = None):
        """
        Initialize Kokoro TTS service.

        Args:
            voice: Voice ID (e.g., 'af_heart', 'am_adam'). An ID that is not
                in KOKORO_VOICES falls back to DEFAULT_VOICE (with a warning).
            cache_dir: Optional custom cache directory. When omitted, audio
                is stored under BASE_DIR / "media" / "voiceover" / "kokoro".
        """
        if voice not in KOKORO_VOICES:
            # Keep the lenient fallback, but make it visible in the logs.
            logger.warning(f"Unknown Kokoro voice '{voice}', falling back to '{DEFAULT_VOICE}'")
            voice = DEFAULT_VOICE
        self.voice = voice
        self.voice_info = KOKORO_VOICES[self.voice]
        self.lang_code = self.voice_info["lang"]

        # Initialize cache directory
        if cache_dir:
            self.cache_dir = Path(cache_dir)
        else:
            self.cache_dir = BASE_DIR / "media" / "voiceover" / "kokoro"
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Lazy-load pipeline to avoid loading model until needed
        self._pipeline = None
        logger.info(f"KokoroTTS initialized with voice: {self.voice} ({self.voice_info['name']})")

    def _get_pipeline(self):
        """
        Lazy-load the Kokoro pipeline.

        The pipeline is created on first use for this instance's language
        code and reused afterwards.

        Returns:
            The KPipeline instance.

        Raises:
            RuntimeError: If the kokoro package cannot be imported.
        """
        if self._pipeline is None:
            try:
                from kokoro import KPipeline
            except ImportError as e:
                logger.error("Kokoro not installed. Install with: pip install kokoro>=0.9.4 soundfile")
                raise RuntimeError("Kokoro TTS not installed") from e
            self._pipeline = KPipeline(lang_code=self.lang_code)
            logger.info(f"Kokoro pipeline loaded for language: {self.lang_code}")
        return self._pipeline

    def generate_from_text(self, text: str, **kwargs) -> Path:
        """
        Generate audio from text and return the path to the audio file.

        Args:
            text: Text to convert to speech
            **kwargs: Additional arguments (ignored for compatibility)

        Returns:
            Path to the generated audio file. This is an MP3 when the ffmpeg
            conversion succeeds; if ffmpeg is missing or fails, the
            intermediate WAV file is returned instead.

        Raises:
            ValueError: If ``text`` is empty or only whitespace.
            RuntimeError: If synthesis fails or no usable audio file is
                produced. The original exception is chained as the cause.
        """
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        # Create cache key based on text and voice (MD5 is a file name here,
        # not a security measure).
        content_hash = hashlib.md5(f"{text}-{self.voice}".encode("utf-8")).hexdigest()
        output_path = self.cache_dir / f"{content_hash}.mp3"
        wav_path = self.cache_dir / f"{content_hash}.wav"

        # Return cached file if exists
        if output_path.exists() and output_path.stat().st_size > 0:
            logger.info(f"Using cached Kokoro voiceover: {output_path.name}")
            return output_path

        logger.info(f"Generating Kokoro TTS ({self.voice}) for: {text[:50]}...")
        try:
            import subprocess

            import numpy as np
            import soundfile as sf

            pipeline = self._get_pipeline()

            # Generate audio using Kokoro. Each item unpacks to
            # (graphemes, phonemes, audio); only the audio is needed, and
            # chunks without audio are skipped so concatenation cannot
            # choke on None.
            generator = pipeline(text, voice=self.voice, speed=1.0)
            audio_chunks = [audio for _gs, _ps, audio in generator if audio is not None]
            if not audio_chunks:
                raise RuntimeError("No audio generated")

            # Concatenate all chunks and save as WAV first
            full_audio = np.concatenate(audio_chunks)
            sf.write(str(wav_path), full_audio, self._SAMPLE_RATE)

            # Convert to MP3 using ffmpeg for smaller file size. A missing
            # ffmpeg binary raises OSError (FileNotFoundError); treat that
            # like a failed conversion so the WAV fallback still applies.
            try:
                result = subprocess.run([
                    "ffmpeg", "-y", "-i", str(wav_path),
                    "-acodec", "libmp3lame", "-ab", "192k",
                    str(output_path)
                ], capture_output=True, text=True)
                converted = result.returncode == 0
                failure_detail = result.stderr
            except OSError as exc:
                converted = False
                failure_detail = str(exc)

            if not converted:
                # Drop any partial MP3 so a later call does not mistake it
                # for a valid cached file.
                if output_path.exists():
                    output_path.unlink()
                # Fallback: use WAV if MP3 conversion fails
                logger.warning("MP3 conversion failed, using WAV")
                logger.debug(f"ffmpeg failure detail: {failure_detail}")
                return wav_path

            # Conversion succeeded: the intermediate WAV is no longer needed.
            if wav_path.exists():
                wav_path.unlink()

            if output_path.exists() and output_path.stat().st_size > 0:
                logger.info(f"✅ Kokoro TTS saved: {output_path} ({output_path.stat().st_size} bytes)")
                return output_path
            raise RuntimeError("Audio file was not created or is empty")
        except Exception as e:
            logger.error(f"Kokoro TTS failed: {str(e)}")
            raise RuntimeError(f"Kokoro TTS generation failed: {str(e)}") from e
# Alias for backward compatibility: code that refers to SimpleElevenLabsService
# gets KokoroTTSService (same class object, not a subclass).
# NOTE(review): presumably the name of an earlier ElevenLabs-based service
# that this class replaced — confirm against callers.
SimpleElevenLabsService = KokoroTTSService
def get_available_voices() -> dict:
    """
    Return the catalogue of available voices.

    Returns:
        The module-level KOKORO_VOICES mapping itself (not a copy), keyed by
        voice ID.
    """
    voices = KOKORO_VOICES
    return voices
def get_voice_names() -> list:
    """
    Return (voice ID, display name) pairs for UI.

    Returns:
        A list of 2-tuples, one per entry in KOKORO_VOICES and in the same
        order, each holding the voice ID and that voice's "name" value.
    """
    pairs = []
    for voice_id, details in KOKORO_VOICES.items():
        pairs.append((voice_id, details["name"]))
    return pairs