Spaces:

aditya2001
/

VidSimplify

Sleeping

VidSimplify/manimator/services/voiceover.py

Adityahulk

integrating free voice

60e9dd0 12 days ago

6.82 kB

	"""
	Kokoro TTS Service - High-quality, local, commercially-free text-to-speech.
	Uses the Kokoro TTS model (82M params, Apache 2.0 license).
	"""

	import os
	import hashlib
	import logging
	from pathlib import Path
	from typing import Optional

	# Module-level logger, keyed by this module's __name__
	logger = logging.getLogger(__name__)

	# Directory three levels above this file; the service below uses it as the
	# root of its default cache location (BASE_DIR / "media" / "voiceover" / "kokoro").
	# NOTE(review): presumably the repository root - confirm against the package layout.
	BASE_DIR = Path(__file__).parent.parent.parent

	# Voice catalogue, one row per voice:
	#   (voice ID, display name, language code, short description)
	# Language code "a" marks the American English voices, "b" the British ones.
	_VOICE_ROWS = (
	    # American English voices
	    ("af_heart", "Heart (Female)", "a", "Warm, friendly American female"),
	    ("af_bella", "Bella (Female)", "a", "Professional American female"),
	    ("af_nicole", "Nicole (Female)", "a", "Clear American female"),
	    ("af_sarah", "Sarah (Female)", "a", "Natural American female"),
	    ("af_sky", "Sky (Female)", "a", "Bright American female"),
	    ("am_adam", "Adam (Male)", "a", "Professional American male"),
	    ("am_michael", "Michael (Male)", "a", "Deep American male"),
	    # British English voices
	    ("bf_emma", "Emma (British Female)", "b", "Professional British female"),
	    ("bf_isabella", "Isabella (British Female)", "b", "Warm British female"),
	    ("bm_george", "George (British Male)", "b", "Professional British male"),
	    ("bm_lewis", "Lewis (British Male)", "b", "Friendly British male"),
	)

	# Voice ID -> {"name", "lang", "description"}, in the order of the rows above
	KOKORO_VOICES = {
	    voice_id: {"name": display_name, "lang": lang_code, "description": blurb}
	    for voice_id, display_name, lang_code, blurb in _VOICE_ROWS
	}

	# Voice used when the caller does not pick one
	DEFAULT_VOICE = "af_heart"


	class KokoroTTSService:
	    """
	    Text-to-speech service using Kokoro TTS.
	    High-quality, local, commercially-free (Apache 2.0 license).

	    Generated audio is cached on disk in ``cache_dir`` under a name derived
	    from an MD5 digest of the text and the voice ID, so a repeated request
	    for the same text/voice pair is served from the cached MP3 without
	    running the model again.
	    """

	    # Sample rate handed to soundfile when writing Kokoro output (24 kHz).
	    SAMPLE_RATE = 24000

	    def __init__(self, voice: str = DEFAULT_VOICE, cache_dir: Optional[Path] = None):
	        """
	        Initialize Kokoro TTS service.

	        Args:
	            voice: Voice ID (e.g., 'af_heart', 'am_adam'). An ID that is not a
	                key of KOKORO_VOICES is replaced by DEFAULT_VOICE and a
	                warning is logged.
	            cache_dir: Optional custom cache directory. Defaults to
	                BASE_DIR / "media" / "voiceover" / "kokoro". The directory
	                is created if it does not exist.
	        """
	        if voice not in KOKORO_VOICES:
	            # Keep the lenient fallback, but make it visible in the logs.
	            logger.warning(f"Unknown Kokoro voice {voice!r}; falling back to {DEFAULT_VOICE}")
	            voice = DEFAULT_VOICE
	        self.voice = voice
	        self.voice_info = KOKORO_VOICES[self.voice]
	        self.lang_code = self.voice_info["lang"]

	        # Initialize cache directory
	        if cache_dir:
	            self.cache_dir = Path(cache_dir)
	        else:
	            self.cache_dir = BASE_DIR / "media" / "voiceover" / "kokoro"
	        self.cache_dir.mkdir(parents=True, exist_ok=True)

	        # Lazy-load pipeline to avoid loading model until needed
	        self._pipeline = None

	        logger.info(f"KokoroTTS initialized with voice: {self.voice} ({self.voice_info['name']})")

	    def _get_pipeline(self):
	        """
	        Return the Kokoro pipeline, creating it on first use.

	        Raises:
	            RuntimeError: If the ``kokoro`` package cannot be imported.
	        """
	        if self._pipeline is None:
	            try:
	                from kokoro import KPipeline
	                self._pipeline = KPipeline(lang_code=self.lang_code)
	                logger.info(f"Kokoro pipeline loaded for language: {self.lang_code}")
	            except ImportError as e:
	                logger.error("Kokoro not installed. Install with: pip install kokoro>=0.9.4 soundfile")
	                raise RuntimeError("Kokoro TTS not installed") from e
	        return self._pipeline

	    @staticmethod
	    def _remove_if_exists(path: Path) -> None:
	        """Delete ``path``; a file that is already gone is not an error."""
	        try:
	            path.unlink()
	        except FileNotFoundError:
	            pass

	    @staticmethod
	    def _convert_to_mp3(wav_path: Path, mp3_path: Path) -> bool:
	        """
	        Transcode ``wav_path`` to ``mp3_path`` with ffmpeg (libmp3lame, 192k).

	        Returns:
	            True when ffmpeg ran and exited with code 0, False when it exited
	            non-zero or could not be started at all.
	        """
	        import subprocess

	        try:
	            result = subprocess.run(
	                [
	                    "ffmpeg", "-y", "-i", str(wav_path),
	                    "-acodec", "libmp3lame", "-ab", "192k",
	                    str(mp3_path),
	                ],
	                capture_output=True,
	                text=True,
	                # ffmpeg diagnostics may contain bytes that are not valid in the
	                # locale encoding; never let decoding them abort generation.
	                errors="replace",
	            )
	        except OSError as e:
	            # Raised when the ffmpeg executable is missing or not runnable;
	            # treat it like any other conversion failure so the WAV fallback applies.
	            logger.warning(f"ffmpeg could not be executed: {e}")
	            return False

	        if result.returncode != 0:
	            logger.debug(f"ffmpeg exited with {result.returncode}: {result.stderr[-500:]}")
	            return False
	        return True

	    def generate_from_text(self, text: str, **kwargs) -> Path:
	        """
	        Generate audio from text and return the path to the audio file.

	        Args:
	            text: Text to convert to speech
	            **kwargs: Additional arguments (ignored for compatibility)

	        Returns:
	            Path to the generated audio file. Normally an MP3; when the MP3
	            conversion fails (ffmpeg missing or exiting non-zero) the
	            intermediate WAV file is returned instead.

	        Raises:
	            ValueError: If ``text`` is empty or only whitespace.
	            RuntimeError: If synthesis or writing the audio fails.
	        """
	        if not text or not text.strip():
	            raise ValueError("Text cannot be empty")

	        # Create cache key based on text and voice
	        content_hash = hashlib.md5(f"{text}-{self.voice}".encode("utf-8")).hexdigest()
	        output_path = self.cache_dir / f"{content_hash}.mp3"

	        # Return cached file if exists
	        if output_path.exists() and output_path.stat().st_size > 0:
	            logger.info(f"Using cached Kokoro voiceover: {output_path.name}")
	            return output_path

	        logger.info(f"Generating Kokoro TTS ({self.voice}) for: {text[:50]}...")

	        try:
	            import soundfile as sf
	            import numpy as np

	            pipeline = self._get_pipeline()

	            # The pipeline yields 3-item results; only the last item (the audio
	            # chunk) is needed here, the first two are discarded.
	            audio_chunks = [
	                audio for _, _, audio in pipeline(text, voice=self.voice, speed=1.0)
	            ]
	            if not audio_chunks:
	                raise Exception("No audio generated")

	            # Concatenate all chunks
	            full_audio = np.concatenate(audio_chunks)

	            # Save as WAV first (Kokoro outputs 24kHz)
	            wav_path = self.cache_dir / f"{content_hash}.wav"
	            sf.write(str(wav_path), full_audio, self.SAMPLE_RATE)

	            # Convert to MP3 using ffmpeg for smaller file size
	            if not self._convert_to_mp3(wav_path, output_path):
	                # Drop any partial MP3 so it cannot be mistaken for a valid
	                # cache entry on the next call, and hand back the WAV as-is.
	                self._remove_if_exists(output_path)
	                logger.warning("MP3 conversion failed, using WAV")
	                return wav_path

	            # Conversion succeeded: the intermediate WAV is no longer needed
	            self._remove_if_exists(wav_path)

	            if output_path.exists() and output_path.stat().st_size > 0:
	                logger.info(f"✅ Kokoro TTS saved: {output_path} ({output_path.stat().st_size} bytes)")
	                return output_path

	            # ffmpeg reported success but left nothing usable behind
	            self._remove_if_exists(output_path)
	            raise Exception("Audio file was not created or is empty")

	        except Exception as e:
	            logger.error(f"Kokoro TTS failed: {str(e)}")
	            raise RuntimeError(f"Kokoro TTS generation failed: {str(e)}") from e


	# Alias for backward compatibility: code that still refers to the
	# SimpleElevenLabsService name gets the Kokoro-based service instead.
	# NOTE(review): presumably the name of an earlier ElevenLabs-backed class - confirm
	# no remaining caller relies on ElevenLabs-specific behavior.
	SimpleElevenLabsService = KokoroTTSService


	def get_available_voices() -> dict:
	    """
	    Return the voice catalogue, keyed by voice ID.

	    Returns:
	        The module-level KOKORO_VOICES mapping itself (not a copy), so
	        callers should treat the result as read-only.
	    """
	    return KOKORO_VOICES


	def get_voice_names() -> list:
	    """
	    Return (voice ID, display name) pairs for UI.

	    Returns:
	        One ``(voice_id, name)`` tuple per entry of KOKORO_VOICES, in the
	        mapping's own order.
	    """
	    return [(voice_id, KOKORO_VOICES[voice_id]["name"]) for voice_id in KOKORO_VOICES]