Adityahulk committed on
Commit
60e9dd0
·
1 Parent(s): 3ccc955

integrating free voice

Browse files
Dockerfile CHANGED
@@ -13,6 +13,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
13
  pkg-config \
14
  python3-dev \
15
  sudo \
 
16
  && rm -rf /var/lib/apt/lists/*
17
 
18
  # Create a non-root user (Hugging Face requirement)
 
13
  pkg-config \
14
  python3-dev \
15
  sudo \
16
+ espeak-ng \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
  # Create a non-root user (Hugging Face requirement)
manimator/scene/voiceover_scene.py CHANGED
@@ -42,7 +42,7 @@ class VoiceoverScene(Scene):
42
  # Generate audio
43
  audio_path = self.speech_service.generate_from_text(text)
44
 
45
- # Convert to absolute path to ensure it works in containerized environments
46
  absolute_audio_path = Path(audio_path).resolve()
47
 
48
  # Verify the file exists
@@ -52,18 +52,10 @@ class VoiceoverScene(Scene):
52
 
53
  logger.info(f"Adding audio to scene: {absolute_audio_path} (size: {absolute_audio_path.stat().st_size} bytes)")
54
 
55
- # Add audio to scene - use absolute path string
56
  self.add_sound(str(absolute_audio_path))
57
 
58
- # Calculate duration (approximate or exact if we could read metadata)
59
- # For now, we rely on Manim's add_sound to handle playback.
60
- # But we need to know how long to wait.
61
-
62
- # We need to get the duration of the audio file.
63
- # Since we want to avoid heavy dependencies like pydub/sox if possible,
64
- # we can try a lightweight approach or just use mutagen if available.
65
- # Given the environment has 'manim', it likely has tools to read audio duration.
66
-
67
  duration = self._get_audio_duration(absolute_audio_path)
68
 
69
  return _VoiceoverContext(self, duration)
 
42
  # Generate audio
43
  audio_path = self.speech_service.generate_from_text(text)
44
 
45
+ # Convert to absolute path
46
  absolute_audio_path = Path(audio_path).resolve()
47
 
48
  # Verify the file exists
 
52
 
53
  logger.info(f"Adding audio to scene: {absolute_audio_path} (size: {absolute_audio_path.stat().st_size} bytes)")
54
 
55
+ # Add audio to scene
56
  self.add_sound(str(absolute_audio_path))
57
 
58
+ # Get audio duration
 
 
 
 
 
 
 
 
59
  duration = self._get_audio_duration(absolute_audio_path)
60
 
61
  return _VoiceoverContext(self, duration)
manimator/services/voiceover.py CHANGED
@@ -1,185 +1,170 @@
 
 
 
 
 
1
  import os
2
  import hashlib
3
- import json
4
  import logging
5
- import requests
6
- import asyncio
7
  from pathlib import Path
8
- from typing import Optional, Dict, Any
9
 
10
  logger = logging.getLogger(__name__)
11
 
12
- # Get the base directory (2 levels up from this file: manimator/services -> manimator -> texttoanimation)
13
  BASE_DIR = Path(__file__).parent.parent.parent
14
 
15
-
16
- # Edge-TTS Voice mapping - high quality neural voices
17
- EDGE_TTS_VOICES = {
18
- "Rachel": "en-US-JennyNeural", # Female, clear and professional
19
- "Adam": "en-US-GuyNeural", # Male, professional
20
- "Bella": "en-US-AriaNeural", # Female, warm and friendly
21
- "Josh": "en-US-ChristopherNeural", # Male, deep voice
22
- "Indian": "en-IN-NeerjaNeural", # Indian English female
23
- "British": "en-GB-SoniaNeural", # British female
24
- "Australian": "en-AU-NatashaNeural", # Australian female
 
 
 
 
 
25
  }
26
 
 
 
27
 
28
- class SimpleElevenLabsService:
 
29
  """
30
- A simple, robust service for generating voiceovers using ElevenLabs API.
31
- Falls back to Edge TTS (Microsoft neural voices) if ElevenLabs fails.
32
  """
33
 
34
- DEFAULT_VOICE_ID = "21m00Tcm4TlvDq8ikWAM" # Rachel
35
-
36
- VOICE_MAPPING = {
37
- "Rachel": "21m00Tcm4TlvDq8ikWAM",
38
- "Adam": "pNInz6obpgDQGcFmaJgB",
39
- "Bella": "EXAVITQu4vr4xnSDxMaL",
40
- "Josh": "TxGEqnHWrfWFTfGW9XjX"
41
- }
42
-
43
- BASE_URL = "https://api.elevenlabs.io/v1"
44
-
45
- def __init__(self, voice_id: str = DEFAULT_VOICE_ID, cache_dir: Optional[Path] = None):
46
- # Resolve voice ID if it's a name
47
- self.voice_id = self.VOICE_MAPPING.get(voice_id, voice_id)
48
- self.voice_name = voice_id # Store the voice name for edge-tts fallback
49
- self.api_key = os.getenv("ELEVENLABS_API_KEY")
50
- if not self.api_key:
51
- logger.warning("ELEVENLABS_API_KEY not set. Will use Edge TTS (free).")
52
-
53
- # Use provided cache_dir or default (use absolute path for containerized environments)
54
  if cache_dir:
55
- self.cache_dir = Path(cache_dir) if not isinstance(cache_dir, Path) else cache_dir
56
  else:
57
- self.cache_dir = BASE_DIR / "media" / "voiceover" / "elevenlabs"
58
-
59
- # Ensure cache directory exists
60
  self.cache_dir.mkdir(parents=True, exist_ok=True)
61
- logger.info(f"Voiceover cache directory: {self.cache_dir.resolve()}")
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def generate_from_text(self, text: str, **kwargs) -> Path:
64
  """
65
  Generate audio from text and return the path to the audio file.
66
- Uses a simple hash-based caching mechanism.
 
 
 
 
 
 
67
  """
68
- if not text:
69
  raise ValueError("Text cannot be empty")
70
-
71
- # Create a stable hash for the text and voice_id
72
- content_hash = hashlib.md5(f"{text}-{self.voice_id}".encode("utf-8")).hexdigest()
73
  output_path = self.cache_dir / f"{content_hash}.mp3"
74
 
75
- # Return cached file if it exists
76
  if output_path.exists() and output_path.stat().st_size > 0:
77
- logger.info(f"Using cached voiceover for hash {content_hash}")
78
  return output_path
79
-
80
- logger.info(f"Generating new voiceover for: {text[:50]}...")
81
 
82
  try:
83
- if not self.api_key:
84
- logger.warning("ELEVENLABS_API_KEY missing, using Edge TTS")
85
- return self._generate_with_edge_tts(text)
86
-
87
- # Call ElevenLabs API
88
- url = f"{self.BASE_URL}/text-to-speech/{self.voice_id}"
89
- headers = {
90
- "Accept": "audio/mpeg",
91
- "Content-Type": "application/json",
92
- "xi-api-key": self.api_key
93
- }
94
- data = {
95
- "text": text,
96
- "model_id": "eleven_monolingual_v1",
97
- "voice_settings": {
98
- "stability": 0.5,
99
- "similarity_boost": 0.75
100
- }
101
- }
102
-
103
- response = requests.post(url, json=data, headers=headers)
104
- response.raise_for_status()
105
-
106
- with open(output_path, "wb") as f:
107
- for chunk in response.iter_content(chunk_size=1024):
108
- if chunk:
109
- f.write(chunk)
110
 
111
- # Verify file was created successfully
112
- if output_path.exists() and output_path.stat().st_size > 0:
113
- logger.info(f"βœ… ElevenLabs voiceover saved: {output_path} ({output_path.stat().st_size} bytes)")
114
- else:
115
- raise Exception("Audio file was not created or is empty")
116
-
117
- return output_path
118
 
119
- except Exception as e:
120
- logger.error(f"ElevenLabs generation failed: {str(e)}. Falling back to Edge TTS.")
121
- return self._generate_with_edge_tts(text)
122
-
123
- def _generate_with_edge_tts(self, text: str) -> Path:
124
- """
125
- Fallback generation using Microsoft Edge TTS (free, high quality).
126
- Uses neural voices that sound natural and professional.
127
- """
128
- try:
129
- import edge_tts
130
 
131
- # Use absolute path for edge-tts cache (important for containerized environments)
132
- edge_cache_dir = BASE_DIR / "media" / "voiceover" / "edge_tts"
133
- edge_cache_dir.mkdir(parents=True, exist_ok=True)
 
134
 
135
- # Map the voice name to edge-tts voice
136
- edge_voice = EDGE_TTS_VOICES.get(self.voice_name, "en-US-JennyNeural")
137
 
138
- content_hash = hashlib.md5(f"{text}-{edge_voice}".encode("utf-8")).hexdigest()
139
- output_path = edge_cache_dir / f"{content_hash}.mp3"
140
 
141
- if output_path.exists() and output_path.stat().st_size > 0:
142
- logger.info(f"Using cached Edge TTS voiceover for hash {content_hash}")
143
- return output_path
144
-
145
- logger.info(f"Generating Edge TTS ({edge_voice}) for: {text[:30]}...")
146
 
147
- # Edge-tts is async, handle event loop properly for Streamlit/Flask contexts
148
- async def _generate():
149
- communicate = edge_tts.Communicate(text, edge_voice)
150
- await communicate.save(str(output_path))
 
 
 
151
 
152
- # Try to use nest_asyncio for Streamlit/Jupyter compatibility
153
- try:
154
- import nest_asyncio
155
- nest_asyncio.apply()
156
- except ImportError:
157
- pass # nest_asyncio not available, continue anyway
158
 
159
- # Run the async function with proper event loop handling
160
- try:
161
- # Try asyncio.run() first (Python 3.7+, creates new loop)
162
- asyncio.run(_generate())
163
- except RuntimeError as e:
164
- # If there's already an event loop running (e.g., in Streamlit/Jupyter)
165
- if "cannot be called from a running event loop" in str(e) or "There is no current event loop" in str(e):
166
- loop = asyncio.new_event_loop()
167
- asyncio.set_event_loop(loop)
168
- try:
169
- loop.run_until_complete(_generate())
170
- finally:
171
- loop.close()
172
- else:
173
- raise
174
 
175
- # Verify file was created successfully
176
  if output_path.exists() and output_path.stat().st_size > 0:
177
- logger.info(f"βœ… Edge TTS voiceover saved: {output_path} ({output_path.stat().st_size} bytes)")
 
178
  else:
179
- raise Exception("Edge TTS audio file was not created or is empty")
180
 
181
- return output_path
182
-
183
  except Exception as e:
184
- logger.error(f"Edge TTS failed: {str(e)}")
185
- raise RuntimeError(f"Edge TTS voiceover generation failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Kokoro TTS Service - High-quality, local, commercially-free text-to-speech.
Uses the Kokoro TTS model (82M params, Apache 2.0 license).
"""

import os
import hashlib
import logging
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# Get the base directory (manimator/services -> manimator -> project root)
BASE_DIR = Path(__file__).parent.parent.parent

# Available Kokoro voices. Keys are Kokoro voice IDs; "lang" is the Kokoro
# pipeline language code ("a" = American English, "b" = British English).
KOKORO_VOICES = {
    # American English voices
    "af_heart": {"name": "Heart (Female)", "lang": "a", "description": "Warm, friendly American female"},
    "af_bella": {"name": "Bella (Female)", "lang": "a", "description": "Professional American female"},
    "af_nicole": {"name": "Nicole (Female)", "lang": "a", "description": "Clear American female"},
    "af_sarah": {"name": "Sarah (Female)", "lang": "a", "description": "Natural American female"},
    "af_sky": {"name": "Sky (Female)", "lang": "a", "description": "Bright American female"},
    "am_adam": {"name": "Adam (Male)", "lang": "a", "description": "Professional American male"},
    "am_michael": {"name": "Michael (Male)", "lang": "a", "description": "Deep American male"},
    # British English voices
    "bf_emma": {"name": "Emma (British Female)", "lang": "b", "description": "Professional British female"},
    "bf_isabella": {"name": "Isabella (British Female)", "lang": "b", "description": "Warm British female"},
    "bm_george": {"name": "George (British Male)", "lang": "b", "description": "Professional British male"},
    "bm_lewis": {"name": "Lewis (British Male)", "lang": "b", "description": "Friendly British male"},
}

# Default voice
DEFAULT_VOICE = "af_heart"


class KokoroTTSService:
    """
    Text-to-speech service using Kokoro TTS.
    High-quality, local, commercially-free (Apache 2.0 license).

    Results are cached on disk keyed by an MD5 of (text, voice), so repeated
    requests for the same narration do not re-run synthesis.
    """

    def __init__(self, voice: str = DEFAULT_VOICE, cache_dir: Optional[Path] = None):
        """
        Initialize Kokoro TTS service.

        Args:
            voice: Voice ID (e.g., 'af_heart', 'am_adam'). Unknown IDs
                (including legacy ElevenLabs names like 'Rachel') silently
                fall back to DEFAULT_VOICE for backward compatibility.
            cache_dir: Optional custom cache directory. Defaults to
                <project>/media/voiceover/kokoro (absolute path, which
                matters in containerized environments).
        """
        self.voice = voice if voice in KOKORO_VOICES else DEFAULT_VOICE
        # self.voice is guaranteed to be a valid key at this point.
        self.voice_info = KOKORO_VOICES[self.voice]
        self.lang_code = self.voice_info["lang"]

        # Initialize cache directory
        if cache_dir:
            self.cache_dir = Path(cache_dir)
        else:
            self.cache_dir = BASE_DIR / "media" / "voiceover" / "kokoro"
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Lazy-load pipeline to avoid loading the model until needed
        self._pipeline = None

        logger.info(f"KokoroTTS initialized with voice: {self.voice} ({self.voice_info['name']})")

    def _get_pipeline(self):
        """Lazy-load and memoize the Kokoro pipeline for self.lang_code."""
        if self._pipeline is None:
            try:
                from kokoro import KPipeline
                self._pipeline = KPipeline(lang_code=self.lang_code)
                logger.info(f"Kokoro pipeline loaded for language: {self.lang_code}")
            except ImportError as e:
                logger.error("Kokoro not installed. Install with: pip install kokoro>=0.9.4 soundfile")
                raise RuntimeError("Kokoro TTS not installed") from e
        return self._pipeline

    def generate_from_text(self, text: str, **kwargs) -> Path:
        """
        Generate audio from text and return the path to the audio file.

        Args:
            text: Text to convert to speech
            **kwargs: Additional arguments (ignored for compatibility)

        Returns:
            Path to the generated audio file (MP3, or WAV when ffmpeg is
            unavailable and MP3 conversion could not be performed)

        Raises:
            ValueError: If text is empty or whitespace-only.
            RuntimeError: If Kokoro is not installed or synthesis fails.
        """
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        # Create cache key based on text and voice
        content_hash = hashlib.md5(f"{text}-{self.voice}".encode("utf-8")).hexdigest()
        output_path = self.cache_dir / f"{content_hash}.mp3"
        wav_path = self.cache_dir / f"{content_hash}.wav"

        # Return cached file if it exists. Also check for a cached WAV: a
        # previous run that fell back to WAV (ffmpeg missing/failed) would
        # otherwise never hit the cache and re-run synthesis every call.
        if output_path.exists() and output_path.stat().st_size > 0:
            logger.info(f"Using cached Kokoro voiceover: {output_path.name}")
            return output_path
        if wav_path.exists() and wav_path.stat().st_size > 0:
            logger.info(f"Using cached Kokoro voiceover: {wav_path.name}")
            return wav_path

        logger.info(f"Generating Kokoro TTS ({self.voice}) for: {text[:50]}...")

        try:
            import soundfile as sf
            import numpy as np

            pipeline = self._get_pipeline()

            # Generate audio using Kokoro; the pipeline yields
            # (graphemes, phonemes, audio) tuples, one per segment.
            generator = pipeline(text, voice=self.voice, speed=1.0)

            # Collect all audio chunks
            audio_chunks = [audio for _gs, _ps, audio in generator]
            if not audio_chunks:
                raise RuntimeError("No audio generated")

            # Concatenate all chunks into one waveform
            full_audio = np.concatenate(audio_chunks)

            # Save as WAV first (Kokoro outputs 24kHz)
            sf.write(str(wav_path), full_audio, 24000)

            # Convert to MP3 using ffmpeg for smaller file size. Guard the
            # subprocess call: a missing ffmpeg binary raises OSError, which
            # must trigger the WAV fallback rather than abort generation.
            import subprocess
            try:
                result = subprocess.run([
                    "ffmpeg", "-y", "-i", str(wav_path),
                    "-acodec", "libmp3lame", "-ab", "192k",
                    str(output_path)
                ], capture_output=True, text=True)
                conversion_ok = result.returncode == 0
            except OSError:
                conversion_ok = False

            if not conversion_ok:
                # Fallback: serve the WAV we already wrote (do NOT delete it
                # first and rewrite it -- the file on disk is already valid).
                logger.warning("MP3 conversion failed, using WAV")
                return wav_path

            # Conversion succeeded: the WAV intermediate is no longer needed
            if wav_path.exists():
                wav_path.unlink()

            if output_path.exists() and output_path.stat().st_size > 0:
                logger.info(f"✅ Kokoro TTS saved: {output_path} ({output_path.stat().st_size} bytes)")
                return output_path
            else:
                raise RuntimeError("Audio file was not created or is empty")

        except Exception as e:
            logger.error(f"Kokoro TTS failed: {str(e)}")
            raise RuntimeError(f"Kokoro TTS generation failed: {str(e)}") from e


# Alias for backward compatibility with the removed ElevenLabs service
SimpleElevenLabsService = KokoroTTSService


def get_available_voices() -> dict:
    """Return dictionary of available voices with their info."""
    return KOKORO_VOICES


def get_voice_names() -> list:
    """Return (voice_id, display_name) pairs for UI selectors."""
    return [(vid, info["name"]) for vid, info in KOKORO_VOICES.items()]
requirements.txt CHANGED
@@ -12,5 +12,6 @@ requests
12
  beautifulsoup4>=4.12.0
13
  lxml>=4.9.0
14
  readability-lxml>=0.8.1
15
- edge-tts>=6.1.0
16
- nest_asyncio>=1.5.0
 
 
12
  beautifulsoup4>=4.12.0
13
  lxml>=4.9.0
14
  readability-lxml>=0.8.1
15
+ kokoro>=0.9.4
16
+ soundfile>=0.12.0
17
+ mutagen>=1.45.0
streamlit_app.py CHANGED
@@ -680,6 +680,26 @@ with st.sidebar:
680
  format_func=lambda x: {"tech_system": "πŸ”§ Tech & Systems", "product_startup": "πŸš€ Product & Startup", "mathematical": "πŸ“ Mathematical"}[x]
681
  )
682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
  st.markdown("---")
684
 
685
  # About section - professional branding
 
680
  format_func=lambda x: {"tech_system": "πŸ”§ Tech & Systems", "product_startup": "πŸš€ Product & Startup", "mathematical": "πŸ“ Mathematical"}[x]
681
  )
682
 
683
+ # Voice selection for TTS
684
+ voice_options = {
685
+ "af_heart": "❀️ Heart (Female, Warm)",
686
+ "af_bella": "πŸ’Ό Bella (Female, Professional)",
687
+ "af_nicole": "🎀 Nicole (Female, Clear)",
688
+ "af_sarah": "🌸 Sarah (Female, Natural)",
689
+ "am_adam": "πŸ‘” Adam (Male, Professional)",
690
+ "am_michael": "πŸŽ™οΈ Michael (Male, Deep)",
691
+ "bf_emma": "πŸ‡¬πŸ‡§ Emma (British Female)",
692
+ "bf_isabella": "πŸ‡¬πŸ‡§ Isabella (British Female)",
693
+ "bm_george": "πŸ‡¬πŸ‡§ George (British Male)",
694
+ "bm_lewis": "πŸ‡¬πŸ‡§ Lewis (British Male)",
695
+ }
696
+ voice = st.selectbox(
697
+ "πŸŽ™οΈ Voice",
698
+ list(voice_options.keys()),
699
+ index=0,
700
+ format_func=lambda x: voice_options[x]
701
+ )
702
+
703
  st.markdown("---")
704
 
705
  # About section - professional branding