Adityahulk committed on
Commit
60e9dd0
·
1 Parent(s): 3ccc955

integrating free voice

Browse files
Dockerfile CHANGED
@@ -13,6 +13,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
13
  pkg-config \
14
  python3-dev \
15
  sudo \
 
16
  && rm -rf /var/lib/apt/lists/*
17
 
18
  # Create a non-root user (Hugging Face requirement)
 
13
  pkg-config \
14
  python3-dev \
15
  sudo \
16
+ espeak-ng \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
  # Create a non-root user (Hugging Face requirement)
manimator/scene/voiceover_scene.py CHANGED
@@ -42,7 +42,7 @@ class VoiceoverScene(Scene):
42
  # Generate audio
43
  audio_path = self.speech_service.generate_from_text(text)
44
 
45
- # Convert to absolute path to ensure it works in containerized environments
46
  absolute_audio_path = Path(audio_path).resolve()
47
 
48
  # Verify the file exists
@@ -52,18 +52,10 @@ class VoiceoverScene(Scene):
52
 
53
  logger.info(f"Adding audio to scene: {absolute_audio_path} (size: {absolute_audio_path.stat().st_size} bytes)")
54
 
55
- # Add audio to scene - use absolute path string
56
  self.add_sound(str(absolute_audio_path))
57
 
58
- # Calculate duration (approximate or exact if we could read metadata)
59
- # For now, we rely on Manim's add_sound to handle playback.
60
- # But we need to know how long to wait.
61
-
62
- # We need to get the duration of the audio file.
63
- # Since we want to avoid heavy dependencies like pydub/sox if possible,
64
- # we can try a lightweight approach or just use mutagen if available.
65
- # Given the environment has 'manim', it likely has tools to read audio duration.
66
-
67
  duration = self._get_audio_duration(absolute_audio_path)
68
 
69
  return _VoiceoverContext(self, duration)
 
42
  # Generate audio
43
  audio_path = self.speech_service.generate_from_text(text)
44
 
45
+ # Convert to absolute path
46
  absolute_audio_path = Path(audio_path).resolve()
47
 
48
  # Verify the file exists
 
52
 
53
  logger.info(f"Adding audio to scene: {absolute_audio_path} (size: {absolute_audio_path.stat().st_size} bytes)")
54
 
55
+ # Add audio to scene
56
  self.add_sound(str(absolute_audio_path))
57
 
58
+ # Get audio duration
 
 
 
 
 
 
 
 
59
  duration = self._get_audio_duration(absolute_audio_path)
60
 
61
  return _VoiceoverContext(self, duration)
manimator/services/voiceover.py CHANGED
@@ -1,185 +1,170 @@
 
 
 
 
 
1
  import os
2
  import hashlib
3
- import json
4
  import logging
5
- import requests
6
- import asyncio
7
  from pathlib import Path
8
- from typing import Optional, Dict, Any
9
 
10
  logger = logging.getLogger(__name__)
11
 
12
- # Get the base directory (2 levels up from this file: manimator/services -> manimator -> texttoanimation)
13
  BASE_DIR = Path(__file__).parent.parent.parent
14
 
15
-
16
- # Edge-TTS Voice mapping - high quality neural voices
17
- EDGE_TTS_VOICES = {
18
- "Rachel": "en-US-JennyNeural", # Female, clear and professional
19
- "Adam": "en-US-GuyNeural", # Male, professional
20
- "Bella": "en-US-AriaNeural", # Female, warm and friendly
21
- "Josh": "en-US-ChristopherNeural", # Male, deep voice
22
- "Indian": "en-IN-NeerjaNeural", # Indian English female
23
- "British": "en-GB-SoniaNeural", # British female
24
- "Australian": "en-AU-NatashaNeural", # Australian female
 
 
 
 
 
25
  }
26
 
 
 
27
 
28
- class SimpleElevenLabsService:
 
29
  """
30
- A simple, robust service for generating voiceovers using ElevenLabs API.
31
- Falls back to Edge TTS (Microsoft neural voices) if ElevenLabs fails.
32
  """
33
 
34
- DEFAULT_VOICE_ID = "21m00Tcm4TlvDq8ikWAM" # Rachel
35
-
36
- VOICE_MAPPING = {
37
- "Rachel": "21m00Tcm4TlvDq8ikWAM",
38
- "Adam": "pNInz6obpgDQGcFmaJgB",
39
- "Bella": "EXAVITQu4vr4xnSDxMaL",
40
- "Josh": "TxGEqnHWrfWFTfGW9XjX"
41
- }
42
-
43
- BASE_URL = "https://api.elevenlabs.io/v1"
44
-
45
- def __init__(self, voice_id: str = DEFAULT_VOICE_ID, cache_dir: Optional[Path] = None):
46
- # Resolve voice ID if it's a name
47
- self.voice_id = self.VOICE_MAPPING.get(voice_id, voice_id)
48
- self.voice_name = voice_id # Store the voice name for edge-tts fallback
49
- self.api_key = os.getenv("ELEVENLABS_API_KEY")
50
- if not self.api_key:
51
- logger.warning("ELEVENLABS_API_KEY not set. Will use Edge TTS (free).")
52
-
53
- # Use provided cache_dir or default (use absolute path for containerized environments)
54
  if cache_dir:
55
- self.cache_dir = Path(cache_dir) if not isinstance(cache_dir, Path) else cache_dir
56
  else:
57
- self.cache_dir = BASE_DIR / "media" / "voiceover" / "elevenlabs"
58
-
59
- # Ensure cache directory exists
60
  self.cache_dir.mkdir(parents=True, exist_ok=True)
61
- logger.info(f"Voiceover cache directory: {self.cache_dir.resolve()}")
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def generate_from_text(self, text: str, **kwargs) -> Path:
64
  """
65
  Generate audio from text and return the path to the audio file.
66
- Uses a simple hash-based caching mechanism.
 
 
 
 
 
 
67
  """
68
- if not text:
69
  raise ValueError("Text cannot be empty")
70
-
71
- # Create a stable hash for the text and voice_id
72
- content_hash = hashlib.md5(f"{text}-{self.voice_id}".encode("utf-8")).hexdigest()
73
  output_path = self.cache_dir / f"{content_hash}.mp3"
74
 
75
- # Return cached file if it exists
76
  if output_path.exists() and output_path.stat().st_size > 0:
77
- logger.info(f"Using cached voiceover for hash {content_hash}")
78
  return output_path
79
-
80
- logger.info(f"Generating new voiceover for: {text[:50]}...")
81
 
82
  try:
83
- if not self.api_key:
84
- logger.warning("ELEVENLABS_API_KEY missing, using Edge TTS")
85
- return self._generate_with_edge_tts(text)
86
-
87
- # Call ElevenLabs API
88
- url = f"{self.BASE_URL}/text-to-speech/{self.voice_id}"
89
- headers = {
90
- "Accept": "audio/mpeg",
91
- "Content-Type": "application/json",
92
- "xi-api-key": self.api_key
93
- }
94
- data = {
95
- "text": text,
96
- "model_id": "eleven_monolingual_v1",
97
- "voice_settings": {
98
- "stability": 0.5,
99
- "similarity_boost": 0.75
100
- }
101
- }
102
-
103
- response = requests.post(url, json=data, headers=headers)
104
- response.raise_for_status()
105
-
106
- with open(output_path, "wb") as f:
107
- for chunk in response.iter_content(chunk_size=1024):
108
- if chunk:
109
- f.write(chunk)
110
 
111
- # Verify file was created successfully
112
- if output_path.exists() and output_path.stat().st_size > 0:
113
- logger.info(f"βœ… ElevenLabs voiceover saved: {output_path} ({output_path.stat().st_size} bytes)")
114
- else:
115
- raise Exception("Audio file was not created or is empty")
116
-
117
- return output_path
118
 
119
- except Exception as e:
120
- logger.error(f"ElevenLabs generation failed: {str(e)}. Falling back to Edge TTS.")
121
- return self._generate_with_edge_tts(text)
122
-
123
- def _generate_with_edge_tts(self, text: str) -> Path:
124
- """
125
- Fallback generation using Microsoft Edge TTS (free, high quality).
126
- Uses neural voices that sound natural and professional.
127
- """
128
- try:
129
- import edge_tts
130
 
131
- # Use absolute path for edge-tts cache (important for containerized environments)
132
- edge_cache_dir = BASE_DIR / "media" / "voiceover" / "edge_tts"
133
- edge_cache_dir.mkdir(parents=True, exist_ok=True)
 
134
 
135
- # Map the voice name to edge-tts voice
136
- edge_voice = EDGE_TTS_VOICES.get(self.voice_name, "en-US-JennyNeural")
137
 
138
- content_hash = hashlib.md5(f"{text}-{edge_voice}".encode("utf-8")).hexdigest()
139
- output_path = edge_cache_dir / f"{content_hash}.mp3"
140
 
141
- if output_path.exists() and output_path.stat().st_size > 0:
142
- logger.info(f"Using cached Edge TTS voiceover for hash {content_hash}")
143
- return output_path
144
-
145
- logger.info(f"Generating Edge TTS ({edge_voice}) for: {text[:30]}...")
146
 
147
- # Edge-tts is async, handle event loop properly for Streamlit/Flask contexts
148
- async def _generate():
149
- communicate = edge_tts.Communicate(text, edge_voice)
150
- await communicate.save(str(output_path))
 
 
 
151
 
152
- # Try to use nest_asyncio for Streamlit/Jupyter compatibility
153
- try:
154
- import nest_asyncio
155
- nest_asyncio.apply()
156
- except ImportError:
157
- pass # nest_asyncio not available, continue anyway
158
 
159
- # Run the async function with proper event loop handling
160
- try:
161
- # Try asyncio.run() first (Python 3.7+, creates new loop)
162
- asyncio.run(_generate())
163
- except RuntimeError as e:
164
- # If there's already an event loop running (e.g., in Streamlit/Jupyter)
165
- if "cannot be called from a running event loop" in str(e) or "There is no current event loop" in str(e):
166
- loop = asyncio.new_event_loop()
167
- asyncio.set_event_loop(loop)
168
- try:
169
- loop.run_until_complete(_generate())
170
- finally:
171
- loop.close()
172
- else:
173
- raise
174
 
175
- # Verify file was created successfully
176
  if output_path.exists() and output_path.stat().st_size > 0:
177
- logger.info(f"βœ… Edge TTS voiceover saved: {output_path} ({output_path.stat().st_size} bytes)")
 
178
  else:
179
- raise Exception("Edge TTS audio file was not created or is empty")
180
 
181
- return output_path
182
-
183
  except Exception as e:
184
- logger.error(f"Edge TTS failed: {str(e)}")
185
- raise RuntimeError(f"Edge TTS voiceover generation failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Kokoro TTS Service - High-quality, local, commercially-free text-to-speech.
Uses the Kokoro TTS model (82M params, Apache 2.0 license).
"""

import os
import hashlib
import logging
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# Get the base directory (manimator/services -> manimator -> project root)
BASE_DIR = Path(__file__).parent.parent.parent

# Available Kokoro voices. Keys are Kokoro voice IDs; "lang" is the Kokoro
# pipeline language code ("a" = American English, "b" = British English).
KOKORO_VOICES = {
    # American English voices
    "af_heart": {"name": "Heart (Female)", "lang": "a", "description": "Warm, friendly American female"},
    "af_bella": {"name": "Bella (Female)", "lang": "a", "description": "Professional American female"},
    "af_nicole": {"name": "Nicole (Female)", "lang": "a", "description": "Clear American female"},
    "af_sarah": {"name": "Sarah (Female)", "lang": "a", "description": "Natural American female"},
    "af_sky": {"name": "Sky (Female)", "lang": "a", "description": "Bright American female"},
    "am_adam": {"name": "Adam (Male)", "lang": "a", "description": "Professional American male"},
    "am_michael": {"name": "Michael (Male)", "lang": "a", "description": "Deep American male"},
    # British English voices
    "bf_emma": {"name": "Emma (British Female)", "lang": "b", "description": "Professional British female"},
    "bf_isabella": {"name": "Isabella (British Female)", "lang": "b", "description": "Warm British female"},
    "bm_george": {"name": "George (British Male)", "lang": "b", "description": "Professional British male"},
    "bm_lewis": {"name": "Lewis (British Male)", "lang": "b", "description": "Friendly British male"},
}

# Default voice
DEFAULT_VOICE = "af_heart"


class KokoroTTSService:
    """
    Text-to-speech service using Kokoro TTS.
    High-quality, local, commercially-free (Apache 2.0 license).

    Results are cached on disk keyed by an MD5 of (text, voice), so repeated
    requests for the same narration do not re-run synthesis.
    """

    def __init__(self, voice: str = DEFAULT_VOICE, cache_dir: Optional[Path] = None):
        """
        Initialize Kokoro TTS service.

        Args:
            voice: Voice ID (e.g., 'af_heart', 'am_adam'). Unknown IDs
                (including legacy ElevenLabs names like 'Rachel') silently
                fall back to DEFAULT_VOICE for backward compatibility.
            cache_dir: Optional custom cache directory. Defaults to
                <project>/media/voiceover/kokoro (absolute path, which
                matters in containerized environments).
        """
        self.voice = voice if voice in KOKORO_VOICES else DEFAULT_VOICE
        # self.voice is guaranteed to be a valid key at this point.
        self.voice_info = KOKORO_VOICES[self.voice]
        self.lang_code = self.voice_info["lang"]

        # Initialize cache directory
        if cache_dir:
            self.cache_dir = Path(cache_dir)
        else:
            self.cache_dir = BASE_DIR / "media" / "voiceover" / "kokoro"
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Lazy-load pipeline to avoid loading the model until needed
        self._pipeline = None

        logger.info(f"KokoroTTS initialized with voice: {self.voice} ({self.voice_info['name']})")

    def _get_pipeline(self):
        """Lazy-load and memoize the Kokoro pipeline for self.lang_code."""
        if self._pipeline is None:
            try:
                from kokoro import KPipeline
                self._pipeline = KPipeline(lang_code=self.lang_code)
                logger.info(f"Kokoro pipeline loaded for language: {self.lang_code}")
            except ImportError as e:
                logger.error("Kokoro not installed. Install with: pip install kokoro>=0.9.4 soundfile")
                raise RuntimeError("Kokoro TTS not installed") from e
        return self._pipeline

    def generate_from_text(self, text: str, **kwargs) -> Path:
        """
        Generate audio from text and return the path to the audio file.

        Args:
            text: Text to convert to speech
            **kwargs: Additional arguments (ignored for compatibility)

        Returns:
            Path to the generated audio file (MP3, or WAV when ffmpeg is
            unavailable and MP3 conversion could not be performed)

        Raises:
            ValueError: If text is empty or whitespace-only.
            RuntimeError: If Kokoro is not installed or synthesis fails.
        """
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        # Create cache key based on text and voice
        content_hash = hashlib.md5(f"{text}-{self.voice}".encode("utf-8")).hexdigest()
        output_path = self.cache_dir / f"{content_hash}.mp3"
        wav_path = self.cache_dir / f"{content_hash}.wav"

        # Return cached file if it exists. Also check for a cached WAV: a
        # previous run that fell back to WAV (ffmpeg missing/failed) would
        # otherwise never hit the cache and re-run synthesis every call.
        if output_path.exists() and output_path.stat().st_size > 0:
            logger.info(f"Using cached Kokoro voiceover: {output_path.name}")
            return output_path
        if wav_path.exists() and wav_path.stat().st_size > 0:
            logger.info(f"Using cached Kokoro voiceover: {wav_path.name}")
            return wav_path

        logger.info(f"Generating Kokoro TTS ({self.voice}) for: {text[:50]}...")

        try:
            import soundfile as sf
            import numpy as np

            pipeline = self._get_pipeline()

            # Generate audio using Kokoro; the pipeline yields
            # (graphemes, phonemes, audio) tuples, one per segment.
            generator = pipeline(text, voice=self.voice, speed=1.0)

            # Collect all audio chunks
            audio_chunks = [audio for _gs, _ps, audio in generator]
            if not audio_chunks:
                raise RuntimeError("No audio generated")

            # Concatenate all chunks into one waveform
            full_audio = np.concatenate(audio_chunks)

            # Save as WAV first (Kokoro outputs 24kHz)
            sf.write(str(wav_path), full_audio, 24000)

            # Convert to MP3 using ffmpeg for smaller file size. Guard the
            # subprocess call: a missing ffmpeg binary raises OSError, which
            # must trigger the WAV fallback rather than abort generation.
            import subprocess
            try:
                result = subprocess.run([
                    "ffmpeg", "-y", "-i", str(wav_path),
                    "-acodec", "libmp3lame", "-ab", "192k",
                    str(output_path)
                ], capture_output=True, text=True)
                conversion_ok = result.returncode == 0
            except OSError:
                conversion_ok = False

            if not conversion_ok:
                # Fallback: serve the WAV we already wrote (do NOT delete it
                # first and rewrite it -- the file on disk is already valid).
                logger.warning("MP3 conversion failed, using WAV")
                return wav_path

            # Conversion succeeded: the WAV intermediate is no longer needed
            if wav_path.exists():
                wav_path.unlink()

            if output_path.exists() and output_path.stat().st_size > 0:
                logger.info(f"✅ Kokoro TTS saved: {output_path} ({output_path.stat().st_size} bytes)")
                return output_path
            else:
                raise RuntimeError("Audio file was not created or is empty")

        except Exception as e:
            logger.error(f"Kokoro TTS failed: {str(e)}")
            raise RuntimeError(f"Kokoro TTS generation failed: {str(e)}") from e


# Alias for backward compatibility with the removed ElevenLabs service
SimpleElevenLabsService = KokoroTTSService


def get_available_voices() -> dict:
    """Return dictionary of available voices with their info."""
    return KOKORO_VOICES


def get_voice_names() -> list:
    """Return (voice_id, display_name) pairs for UI selectors."""
    return [(vid, info["name"]) for vid, info in KOKORO_VOICES.items()]
requirements.txt CHANGED
@@ -12,5 +12,6 @@ requests
12
  beautifulsoup4>=4.12.0
13
  lxml>=4.9.0
14
  readability-lxml>=0.8.1
15
- edge-tts>=6.1.0
16
- nest_asyncio>=1.5.0
 
 
12
  beautifulsoup4>=4.12.0
13
  lxml>=4.9.0
14
  readability-lxml>=0.8.1
15
+ kokoro>=0.9.4
16
+ soundfile>=0.12.0
17
+ mutagen>=1.45.0
streamlit_app.py CHANGED
@@ -680,6 +680,26 @@ with st.sidebar:
680
  format_func=lambda x: {"tech_system": "πŸ”§ Tech & Systems", "product_startup": "πŸš€ Product & Startup", "mathematical": "πŸ“ Mathematical"}[x]
681
  )
682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
  st.markdown("---")
684
 
685
  # About section - professional branding
 
680
  format_func=lambda x: {"tech_system": "πŸ”§ Tech & Systems", "product_startup": "πŸš€ Product & Startup", "mathematical": "πŸ“ Mathematical"}[x]
681
  )
682
 
683
+ # Voice selection for TTS
684
+ voice_options = {
685
+ "af_heart": "❀️ Heart (Female, Warm)",
686
+ "af_bella": "πŸ’Ό Bella (Female, Professional)",
687
+ "af_nicole": "🎀 Nicole (Female, Clear)",
688
+ "af_sarah": "🌸 Sarah (Female, Natural)",
689
+ "am_adam": "πŸ‘” Adam (Male, Professional)",
690
+ "am_michael": "πŸŽ™οΈ Michael (Male, Deep)",
691
+ "bf_emma": "πŸ‡¬πŸ‡§ Emma (British Female)",
692
+ "bf_isabella": "πŸ‡¬πŸ‡§ Isabella (British Female)",
693
+ "bm_george": "πŸ‡¬πŸ‡§ George (British Male)",
694
+ "bm_lewis": "πŸ‡¬πŸ‡§ Lewis (British Male)",
695
+ }
696
+ voice = st.selectbox(
697
+ "πŸŽ™οΈ Voice",
698
+ list(voice_options.keys()),
699
+ index=0,
700
+ format_func=lambda x: voice_options[x]
701
+ )
702
+
703
  st.markdown("---")
704
 
705
  # About section - professional branding