Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,12 +10,14 @@ import hashlib
|
|
| 10 |
import tempfile
|
| 11 |
import traceback
|
| 12 |
|
|
|
|
| 13 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 14 |
if not HF_TOKEN:
|
| 15 |
print("WARNING: HF_TOKEN missing. Add it in Space → Settings → Variables & Secrets if needed.")
|
| 16 |
|
| 17 |
MODEL_ID = "ai4bharat/indic-parler-tts"
|
| 18 |
|
|
|
|
| 19 |
try:
|
| 20 |
from parler_tts import ParlerTTSForConditionalGeneration
|
| 21 |
from transformers import AutoTokenizer
|
|
@@ -28,6 +30,7 @@ print("Loading model… (this may take a while)")
|
|
| 28 |
model = ParlerTTSForConditionalGeneration.from_pretrained(MODEL_ID).to(device)
|
| 29 |
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 30 |
|
|
|
|
| 31 |
try:
|
| 32 |
desc_encoder_name = model.config.text_encoder._name_or_path
|
| 33 |
desc_tokenizer = AutoTokenizer.from_pretrained(desc_encoder_name)
|
|
@@ -36,6 +39,7 @@ except Exception:
|
|
| 36 |
|
| 37 |
sampling_rate = int(getattr(model.config, "sampling_rate", 22050))
|
| 38 |
|
|
|
|
| 39 |
sp_file = Path(__file__).parent / "speakers.json"
|
| 40 |
if sp_file.exists():
|
| 41 |
try:
|
|
@@ -45,7 +49,8 @@ if sp_file.exists():
|
|
| 45 |
else:
|
| 46 |
SPEAKERS = ["Default"]
|
| 47 |
|
| 48 |
-
def _make_outfile(text: str, speaker: str, emotion: str):
|
|
|
|
| 49 |
key = (text + "|" + str(speaker) + "|" + str(emotion)).encode("utf-8")
|
| 50 |
h = hashlib.sha256(key).hexdigest()[:20]
|
| 51 |
filename = f"out_{h}.wav"
|
|
@@ -53,10 +58,13 @@ def _make_outfile(text: str, speaker: str, emotion: str):
|
|
| 53 |
return str(Path(tmpdir) / filename)
|
| 54 |
|
| 55 |
def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
|
|
|
|
| 56 |
if not text or not str(text).strip():
|
| 57 |
return None
|
| 58 |
|
| 59 |
desc = f"{speaker}'s voice. Tone: {emotion}. Natural, clear speech, close mic."
|
|
|
|
|
|
|
| 60 |
try:
|
| 61 |
desc_ids = desc_tokenizer(desc, return_tensors="pt").to(device)
|
| 62 |
text_ids = text_tokenizer(text, return_tensors="pt").to(device)
|
|
@@ -64,6 +72,7 @@ def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
|
|
| 64 |
desc_ids = desc_tokenizer(desc, return_tensors="pt")
|
| 65 |
text_ids = text_tokenizer(text, return_tensors="pt")
|
| 66 |
|
|
|
|
| 67 |
try:
|
| 68 |
with torch.no_grad():
|
| 69 |
try:
|
|
@@ -81,6 +90,7 @@ def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
|
|
| 81 |
traceback.print_exc()
|
| 82 |
return None
|
| 83 |
|
|
|
|
| 84 |
try:
|
| 85 |
arr = audio.cpu().numpy().squeeze()
|
| 86 |
except Exception as e:
|
|
@@ -88,6 +98,7 @@ def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
|
|
| 88 |
traceback.print_exc()
|
| 89 |
return None
|
| 90 |
|
|
|
|
| 91 |
if np.issubdtype(arr.dtype, np.integer):
|
| 92 |
arr = arr.astype("float32") / np.iinfo(arr.dtype).max
|
| 93 |
else:
|
|
@@ -108,23 +119,19 @@ def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
|
|
| 108 |
print(f"Generated audio saved to: {out_path}")
|
| 109 |
return out_path
|
| 110 |
|
| 111 |
-
# --- Gradio UI
|
| 112 |
with gr.Blocks() as demo:
|
| 113 |
gr.Markdown("# Indic Parler-TTS (69 Speakers)")
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
with gr.Column(scale=1):
|
| 123 |
-
gr.Markdown("**Status**\n\n- Server running\n- If UI is blank, open browser console (F12) to view errors.")
|
| 124 |
-
out = gr.Audio(label="Output (wav)", type="filepath")
|
| 125 |
-
# Expose API endpoint
|
| 126 |
btn.click(fn=synthesize, inputs=[txt, sp, emo], outputs=out, api_name="/synthesize")
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
demo.launch(
|
|
|
|
| 10 |
import tempfile
|
| 11 |
import traceback
|
| 12 |
|
| 13 |
+
# The token must be supplied via Hugging Face Space Secrets (only needed for private resources)
|
| 14 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 15 |
if not HF_TOKEN:
|
| 16 |
print("WARNING: HF_TOKEN missing. Add it in Space → Settings → Variables & Secrets if needed.")
|
| 17 |
|
| 18 |
MODEL_ID = "ai4bharat/indic-parler-tts"
|
| 19 |
|
| 20 |
+
# Required imports (raise clear error if missing)
|
| 21 |
try:
|
| 22 |
from parler_tts import ParlerTTSForConditionalGeneration
|
| 23 |
from transformers import AutoTokenizer
|
|
|
|
| 30 |
model = ParlerTTSForConditionalGeneration.from_pretrained(MODEL_ID).to(device)
|
| 31 |
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 32 |
|
| 33 |
+
# Some models use a separate description encoder tokenizer
|
| 34 |
try:
|
| 35 |
desc_encoder_name = model.config.text_encoder._name_or_path
|
| 36 |
desc_tokenizer = AutoTokenizer.from_pretrained(desc_encoder_name)
|
|
|
|
| 39 |
|
| 40 |
sampling_rate = int(getattr(model.config, "sampling_rate", 22050))
|
| 41 |
|
| 42 |
+
# Load speakers list if present
|
| 43 |
sp_file = Path(__file__).parent / "speakers.json"
|
| 44 |
if sp_file.exists():
|
| 45 |
try:
|
|
|
|
| 49 |
else:
|
| 50 |
SPEAKERS = ["Default"]
|
| 51 |
|
| 52 |
+
def _make_outfile(text: str, speaker: str, emotion: str) -> str:
|
| 53 |
+
"""Deterministic temporary filename to reduce collisions and allow caching."""
|
| 54 |
key = (text + "|" + str(speaker) + "|" + str(emotion)).encode("utf-8")
|
| 55 |
h = hashlib.sha256(key).hexdigest()[:20]
|
| 56 |
filename = f"out_{h}.wav"
|
|
|
|
| 58 |
return str(Path(tmpdir) / filename)
|
| 59 |
|
| 60 |
def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
|
| 61 |
+
"""Generate TTS audio and return filepath (Gradio Audio expects filepath when type='filepath')."""
|
| 62 |
if not text or not str(text).strip():
|
| 63 |
return None
|
| 64 |
|
| 65 |
desc = f"{speaker}'s voice. Tone: {emotion}. Natural, clear speech, close mic."
|
| 66 |
+
|
| 67 |
+
# Tokenize the description and the text, moving the tensors to the model device when possible
|
| 68 |
try:
|
| 69 |
desc_ids = desc_tokenizer(desc, return_tensors="pt").to(device)
|
| 70 |
text_ids = text_tokenizer(text, return_tensors="pt").to(device)
|
|
|
|
| 72 |
desc_ids = desc_tokenizer(desc, return_tensors="pt")
|
| 73 |
text_ids = text_tokenizer(text, return_tensors="pt")
|
| 74 |
|
| 75 |
+
# Model generation with fallbacks
|
| 76 |
try:
|
| 77 |
with torch.no_grad():
|
| 78 |
try:
|
|
|
|
| 90 |
traceback.print_exc()
|
| 91 |
return None
|
| 92 |
|
| 93 |
+
# Convert tensor -> numpy
|
| 94 |
try:
|
| 95 |
arr = audio.cpu().numpy().squeeze()
|
| 96 |
except Exception as e:
|
|
|
|
| 98 |
traceback.print_exc()
|
| 99 |
return None
|
| 100 |
|
| 101 |
+
# Scale integer PCM to float32 in approximately [-1, 1] (divides by the dtype's max value)
|
| 102 |
if np.issubdtype(arr.dtype, np.integer):
|
| 103 |
arr = arr.astype("float32") / np.iinfo(arr.dtype).max
|
| 104 |
else:
|
|
|
|
| 119 |
print(f"Generated audio saved to: {out_path}")
|
| 120 |
return out_path
|
| 121 |
|
| 122 |
+
# --- Gradio UI and API exposure ---
|
| 123 |
with gr.Blocks() as demo:
|
| 124 |
gr.Markdown("# Indic Parler-TTS (69 Speakers)")
|
| 125 |
|
| 126 |
+
txt = gr.Textbox(value="नमस्ते, यह एक परीक्षण वाक्य है।", label="Text")
|
| 127 |
+
sp = gr.Dropdown(SPEAKERS, value=SPEAKERS[0], label="Speaker")
|
| 128 |
+
emo = gr.Dropdown(["Neutral","Happy","Sad","Angry","Narration"], value="Neutral", label="Emotion")
|
| 129 |
+
btn = gr.Button("Generate")
|
| 130 |
+
out = gr.Audio(label="Output (wav)", type="filepath")
|
| 131 |
+
|
| 132 |
+
# Expose the endpoint so the Space API is available at /synthesize
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
btn.click(fn=synthesize, inputs=[txt, sp, emo], outputs=out, api_name="/synthesize")
|
| 134 |
|
| 135 |
+
if __name__ == '__main__':
|
| 136 |
+
# Disable server-side rendering (SSR) to avoid a blank frontend in some Spaces environments
|
| 137 |
+
demo.launch(ssr_mode=False)
|