Update app.py
app.py
CHANGED
@@ -1,22 +1,16 @@
 import random
 import numpy as np
 import torch
+from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES
 import gradio as gr
 import spaces
-from pathlib import Path
-from typing import Optional
-
-# Import model class as before
-from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES
 
-# --- Device detection ---
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Running on device: {DEVICE}")
 
 # --- Global Model Initialization ---
-MODEL: Optional[ChatterboxMultilingualTTS] = None
+MODEL = None
 
-# --- Language config (kept exactly as you provided) ---
 LANGUAGE_CONFIG = {
     "ar": {
         "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ar_f/ar_prompts2.flac",
@@ -113,7 +107,7 @@ LANGUAGE_CONFIG = {
 }
 
 # --- UI Helpers ---
-def default_audio_for_ui(lang: str) -> Optional[str]:
+def default_audio_for_ui(lang: str) -> str | None:
     return LANGUAGE_CONFIG.get(lang, {}).get("audio")
 
 
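Editor's note: the annotation change in this hunk swaps `typing.Optional` for the PEP 604 union syntax, which is also why the first hunk drops the `from typing import Optional` import. A minimal sketch of the two equivalent spellings (the `|` form requires Python 3.10+):

```python
from typing import Optional

def old_style(lang: str) -> Optional[str]:   # pre-3.10 spelling (removed)
    ...

def new_style(lang: str) -> str | None:      # PEP 604 spelling (added)
    ...
```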
@@ -127,118 +121,101 @@ def get_supported_languages_display() -> str:
     for code, name in sorted(SUPPORTED_LANGUAGES.items()):
         language_items.append(f"**{name}** (`{code}`)")
 
     # Split into 2 lines
     mid = len(language_items) // 2
     line1 = " • ".join(language_items[:mid])
     line2 = " • ".join(language_items[mid:])
 
     return f"""
-### Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
+### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
 {line1}
-
 {line2}
 """
 
 
-
-def _load_model_with_cpu_fallback(device_str: str) -> ChatterboxMultilingualTTS:
-    """
-    Try to load the model normally. If a CUDA-deserialization RuntimeError occurs,
-    patch torch.load temporarily to force map_location=cpu and retry.
-    """
-    global MODEL
-
-    try:
-        # First attempt: let the model loader handle device as asked.
-        print(f"[model loader] Attempting to load model with device='{device_str}'")
-        return ChatterboxMultilingualTTS.from_pretrained(device_str)
-    except RuntimeError as e:
-        msg = str(e)
-        # Detect the common CUDA-deserialization error
-        if "Attempting to deserialize object on a CUDA device" in msg or "cuda" in msg and "is not available" in msg:
-            print("[model loader] Caught CUDA-deserialization error; retrying with forced CPU map_location.")
-            # Backup original torch.load
-            original_torch_load = torch.load
-
-            def _torch_load_cpu_fallback(*args, **kwargs):
-                # If user did not pass a map_location, force CPU
-                if "map_location" not in kwargs:
-                    kwargs["map_location"] = torch.device("cpu")
-                return original_torch_load(*args, **kwargs)
-
-            try:
-                torch.load = _torch_load_cpu_fallback  # monkeypatch
-                # Try again — pass "cpu" explicitly to the loader when retrying.
-                return ChatterboxMultilingualTTS.from_pretrained("cpu")
-            finally:
-                # Restore original
-                torch.load = original_torch_load
-        else:
-            # Not the CUDA-deserialization error we expected — re-raise
-            raise
-
-
-def get_or_load_model() -> ChatterboxMultilingualTTS:
+def get_or_load_model():
     """Loads the ChatterboxMultilingualTTS model if it hasn't been loaded already,
-    …"""
+    and ensures it's on the correct device."""
     global MODEL
     if MODEL is None:
         print("Model not loaded, initializing...")
         try:
-            MODEL = _load_model_with_cpu_fallback(DEVICE)
-
-            try:
-                MODEL.to(torch.device(DEVICE))
-            except Exception as e:
-                # If moving to CUDA fails (e.g., CPU-only), log and continue using CPU.
-                print(f"[model loader] Warning: failed to move model to {DEVICE}: {e}")
-            print(f"Model loaded successfully. Internal device attribute: {getattr(MODEL, 'device', 'N/A')}")
+            MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
+            if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE:
+                MODEL.to(DEVICE)
+            print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
         except Exception as e:
-            print(f"…")
+            print(f"Error loading model: {e}")
             raise
     return MODEL
 
+# Attempt to load the model at startup.
+try:
+    get_or_load_model()
+except Exception as e:
+    print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")
 
 def set_seed(seed: int):
     """Sets the random seed for reproducibility across torch, numpy, and random."""
     torch.manual_seed(seed)
     if DEVICE == "cuda":
         torch.cuda.manual_seed(seed)
         torch.cuda.manual_seed_all(seed)
     random.seed(seed)
     np.random.seed(seed)
 
-
-def resolve_audio_prompt(language_id: str, provided_path: Optional[str]) -> Optional[str]:
-
+def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | None:
+    """
+    Decide which audio prompt to use:
+    - If user provided a path (upload/mic/url), use it.
+    - Else, fall back to language-specific default (if any).
+    """
     if provided_path and str(provided_path).strip():
         return provided_path
     return LANGUAGE_CONFIG.get(language_id, {}).get("audio")
 
 
-# --- The TTS generation function used by Gradio ---
 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
     language_id: str,
-    audio_prompt_path_input: Optional[str] = None,
+    audio_prompt_path_input: str = None,
     exaggeration_input: float = 0.5,
     temperature_input: float = 0.8,
     seed_num_input: int = 0,
     cfgw_input: float = 0.5
 ) -> tuple[int, np.ndarray]:
+    """
+    Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
+    Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
+
+    This tool synthesizes natural-sounding speech from input text. When a reference audio file
+    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
+    maintains the prosody, tone, and vocal qualities of the reference speaker, or uses a default voice if no reference is provided.
+    Args:
+        text_input (str): The text to synthesize into speech (maximum 300 characters)
+        language_id (str): The language code for synthesis (e.g. en, fr, de, es, it, pt, hi)
+        audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
+        exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
+        temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
+        seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
+        cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5; use 0 for language transfer.
+    Returns:
+        tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
+    """
     current_model = get_or_load_model()
+
     if current_model is None:
         raise RuntimeError("TTS model is not loaded.")
 
     if seed_num_input != 0:
         set_seed(int(seed_num_input))
 
-    print(f"Generating audio for text: '{…
+    print(f"Generating audio for text: '{text_input[:50]}...'")
 
-    # Resolve prompt (user-provided or default)
-    chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input)
+    # Handle optional audio prompt
+    chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
 
     generate_kwargs = {
         "exaggeration": exaggeration_input,
         "temperature": temperature_input,
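Editor's note: the deleted `_load_model_with_cpu_fallback` helper worked around a checkpoint saved from a CUDA device raising a `RuntimeError` when deserialized on a CPU-only host. The underlying fix is the `map_location` argument of `torch.load`; the helper applied it indirectly by monkeypatching `torch.load`, on the premise that `from_pretrained` calls `torch.load` internally. A minimal standalone sketch of the same pattern, with a hypothetical checkpoint path:

```python
import torch

CKPT_PATH = "checkpoints/model.pt"  # hypothetical path, for illustration only

try:
    # Fails on CPU-only hosts if the checkpoint was saved from CUDA tensors.
    state = torch.load(CKPT_PATH)
except RuntimeError:
    # Remap every storage onto the CPU instead of the device it was saved from.
    state = torch.load(CKPT_PATH, map_location=torch.device("cpu"))
```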
@@ -248,32 +225,28 @@ def generate_tts_audio(
         generate_kwargs["audio_prompt_path"] = chosen_prompt
         print(f"Using audio prompt: {chosen_prompt}")
     else:
-        print("No audio prompt provided; using default…")
+        print("No audio prompt provided; using default voice.")
 
-    # Call model.generate (keep same call signature as original)
     wav = current_model.generate(
-        text_input,
+        text_input[:300],  # Truncate text to max chars
         language_id=language_id,
         **generate_kwargs
     )
-
     print("Audio generation complete.")
-
-    sr = getattr(current_model, "sr", 22050)
-    return (sr, wav.squeeze(0).numpy())
+    return (current_model.sr, wav.squeeze(0).numpy())
 
-
-# --- UI (keeps original layout, minimal changes) ---
 with gr.Blocks() as demo:
     gr.Markdown(
         """
         # Chatterbox Multilingual Demo
-        Generate high-quality multilingual speech from text with reference audio styling, supporting…
+        Generate high-quality multilingual speech from text with reference audio styling, supporting 23 languages.
+
+        For a hosted version of Chatterbox Multilingual and for finetuning, please visit [resemble.ai](https://app.resemble.ai)
         """
     )
 
+    # Display supported languages
     gr.Markdown(get_supported_languages_display())
-
    with gr.Row():
         with gr.Column():
             initial_lang = "fr"
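Editor's note: the `return (current_model.sr, wav.squeeze(0).numpy())` added in the hunk above relies on Gradio's convention that an audio output can be returned as a `(sample_rate, numpy_array)` tuple. A self-contained sketch of that convention (the sine-tone generator is illustrative, not part of this app):

```python
import numpy as np
import gradio as gr

def tone(freq: float = 440.0) -> tuple[int, np.ndarray]:
    sr = 22050                                   # sample rate in Hz
    t = np.linspace(0, 1.0, sr, endpoint=False)  # one second of samples
    return sr, (0.5 * np.sin(2 * np.pi * freq * t)).astype(np.float32)

with gr.Blocks() as demo:
    out = gr.Audio(label="Output")
    gr.Button("Play").click(tone, outputs=[out])

demo.launch()
```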
@@ -282,26 +255,26 @@
                 label="Text to synthesize (max chars 300)",
                 max_lines=5
             )
 
             language_id = gr.Dropdown(
                 choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                 value=initial_lang,
                 label="Language",
                 info="Select the language for text-to-speech synthesis"
             )
 
             ref_wav = gr.Audio(
                 sources=["upload", "microphone"],
                 type="filepath",
                 label="Reference Audio File (Optional)",
                 value=default_audio_for_ui(initial_lang)
             )
 
             gr.Markdown(
-                "Note…",
+                "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
                 elem_classes=["audio-note"]
             )
 
             exaggeration = gr.Slider(
                 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
             )
@@ -342,12 +315,4 @@ with gr.Blocks() as demo:
         outputs=[audio_output],
     )
 
-
-try:
-    get_or_load_model()
-except Exception as e:
-    print(f"Startup warning: Model failed to warm-load. You can still try Generate; error: {e}")
-
-# Launch Gradio
-if __name__ == "__main__":
-    demo.launch(mcp_server=True)
+demo.launch(mcp_server=True)
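Editor's note: with `mcp_server=True`, Gradio also serves the app as an MCP (Model Context Protocol) server and builds each tool's description from the exposed function's docstring and type hints, which is presumably why this commit adds the long Google-style docstring to `generate_tts_audio`. Dropping the `if __name__ == "__main__":` guard means the server now launches whenever the module is executed, which is how Spaces runs `app.py` anyway. A minimal sketch of the pattern (the `letter_counter` example is illustrative, not from this repo; `mcp_server=True` requires a recent Gradio installed with the `gradio[mcp]` extra):

```python
import gradio as gr

def letter_counter(word: str, letter: str) -> int:
    """Count how many times a letter appears in a word.

    Args:
        word (str): The word to search.
        letter (str): The letter to count.
    Returns:
        int: Number of occurrences.
    """
    return word.lower().count(letter.lower())

demo = gr.Interface(fn=letter_counter, inputs=["text", "text"], outputs="number")
demo.launch(mcp_server=True)  # exposes letter_counter as an MCP tool
```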