yash184 committed on
Commit
02202ef
·
verified ·
1 Parent(s): 2822aef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -16
app.py CHANGED
@@ -10,12 +10,14 @@ import hashlib
10
  import tempfile
11
  import traceback
12
 
 
13
  HF_TOKEN = os.getenv("HF_TOKEN")
14
  if not HF_TOKEN:
15
  print("WARNING: HF_TOKEN missing. Add it in Space → Settings → Variables & Secrets if needed.")
16
 
17
  MODEL_ID = "ai4bharat/indic-parler-tts"
18
 
 
19
  try:
20
  from parler_tts import ParlerTTSForConditionalGeneration
21
  from transformers import AutoTokenizer
@@ -28,6 +30,7 @@ print("Loading model… (this may take a while)")
28
  model = ParlerTTSForConditionalGeneration.from_pretrained(MODEL_ID).to(device)
29
  text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
30
 
 
31
  try:
32
  desc_encoder_name = model.config.text_encoder._name_or_path
33
  desc_tokenizer = AutoTokenizer.from_pretrained(desc_encoder_name)
@@ -36,6 +39,7 @@ except Exception:
36
 
37
  sampling_rate = int(getattr(model.config, "sampling_rate", 22050))
38
 
 
39
  sp_file = Path(__file__).parent / "speakers.json"
40
  if sp_file.exists():
41
  try:
@@ -45,7 +49,8 @@ if sp_file.exists():
45
  else:
46
  SPEAKERS = ["Default"]
47
 
48
- def _make_outfile(text: str, speaker: str, emotion: str):
 
49
  key = (text + "|" + str(speaker) + "|" + str(emotion)).encode("utf-8")
50
  h = hashlib.sha256(key).hexdigest()[:20]
51
  filename = f"out_{h}.wav"
@@ -53,10 +58,13 @@ def _make_outfile(text: str, speaker: str, emotion: str):
53
  return str(Path(tmpdir) / filename)
54
 
55
  def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
 
56
  if not text or not str(text).strip():
57
  return None
58
 
59
  desc = f"{speaker}'s voice. Tone: {emotion}. Natural, clear speech, close mic."
 
 
60
  try:
61
  desc_ids = desc_tokenizer(desc, return_tensors="pt").to(device)
62
  text_ids = text_tokenizer(text, return_tensors="pt").to(device)
@@ -64,6 +72,7 @@ def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
64
  desc_ids = desc_tokenizer(desc, return_tensors="pt")
65
  text_ids = text_tokenizer(text, return_tensors="pt")
66
 
 
67
  try:
68
  with torch.no_grad():
69
  try:
@@ -81,6 +90,7 @@ def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
81
  traceback.print_exc()
82
  return None
83
 
 
84
  try:
85
  arr = audio.cpu().numpy().squeeze()
86
  except Exception as e:
@@ -88,6 +98,7 @@ def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
88
  traceback.print_exc()
89
  return None
90
 
 
91
  if np.issubdtype(arr.dtype, np.integer):
92
  arr = arr.astype("float32") / np.iinfo(arr.dtype).max
93
  else:
@@ -108,23 +119,19 @@ def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
108
  print(f"Generated audio saved to: {out_path}")
109
  return out_path
110
 
111
- # --- Gradio UI / API ---
112
  with gr.Blocks() as demo:
113
  gr.Markdown("# Indic Parler-TTS (69 Speakers)")
114
 
115
- # Simple health fallback box (helps avoid totally blank page)
116
- with gr.Row():
117
- with gr.Column(scale=2):
118
- txt = gr.Textbox(value="नमस्ते, यह एक परीक्षण वाक्य है।", label="Text")
119
- sp = gr.Dropdown(SPEAKERS, value=SPEAKERS[0], label="Speaker")
120
- emo = gr.Dropdown(["Neutral","Happy","Sad","Angry","Narration"], value="Neutral", label="Emotion")
121
- btn = gr.Button("Generate")
122
- with gr.Column(scale=1):
123
- gr.Markdown("**Status**\n\n- Server running\n- If UI is blank, open browser console (F12) to view errors.")
124
- out = gr.Audio(label="Output (wav)", type="filepath")
125
- # Expose API endpoint
126
  btn.click(fn=synthesize, inputs=[txt, sp, emo], outputs=out, api_name="/synthesize")
127
 
128
- # Launch with SSR disabled (helps avoid blank UI in some HF environments)
129
- if __name__ == "__main__":
130
- demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
 
10
  import tempfile
11
  import traceback
12
 
13
+ # Token MUST be added via HuggingFace Space Secrets (only needed for private resources)
14
  HF_TOKEN = os.getenv("HF_TOKEN")
15
  if not HF_TOKEN:
16
  print("WARNING: HF_TOKEN missing. Add it in Space → Settings → Variables & Secrets if needed.")
17
 
18
  MODEL_ID = "ai4bharat/indic-parler-tts"
19
 
20
+ # Required imports (raise clear error if missing)
21
  try:
22
  from parler_tts import ParlerTTSForConditionalGeneration
23
  from transformers import AutoTokenizer
 
30
  model = ParlerTTSForConditionalGeneration.from_pretrained(MODEL_ID).to(device)
31
  text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
32
 
33
+ # Some models use a separate description encoder tokenizer
34
  try:
35
  desc_encoder_name = model.config.text_encoder._name_or_path
36
  desc_tokenizer = AutoTokenizer.from_pretrained(desc_encoder_name)
 
39
 
40
  sampling_rate = int(getattr(model.config, "sampling_rate", 22050))
41
 
42
+ # Load speakers list if present
43
  sp_file = Path(__file__).parent / "speakers.json"
44
  if sp_file.exists():
45
  try:
 
49
  else:
50
  SPEAKERS = ["Default"]
51
 
52
+ def _make_outfile(text: str, speaker: str, emotion: str) -> str:
53
+ """Deterministic temporary filename to reduce collisions and allow caching."""
54
  key = (text + "|" + str(speaker) + "|" + str(emotion)).encode("utf-8")
55
  h = hashlib.sha256(key).hexdigest()[:20]
56
  filename = f"out_{h}.wav"
 
58
  return str(Path(tmpdir) / filename)
59
 
60
  def synthesize(text: str, speaker: str, emotion: str = "Neutral"):
61
+ """Generate TTS audio and return filepath (Gradio Audio expects filepath when type='filepath')."""
62
  if not text or not str(text).strip():
63
  return None
64
 
65
  desc = f"{speaker}'s voice. Tone: {emotion}. Natural, clear speech, close mic."
66
+
67
+ # Tokenize (attempt to keep tensors on device)
68
  try:
69
  desc_ids = desc_tokenizer(desc, return_tensors="pt").to(device)
70
  text_ids = text_tokenizer(text, return_tensors="pt").to(device)
 
72
  desc_ids = desc_tokenizer(desc, return_tensors="pt")
73
  text_ids = text_tokenizer(text, return_tensors="pt")
74
 
75
+ # Model generation with fallbacks
76
  try:
77
  with torch.no_grad():
78
  try:
 
90
  traceback.print_exc()
91
  return None
92
 
93
+ # Convert tensor -> numpy
94
  try:
95
  arr = audio.cpu().numpy().squeeze()
96
  except Exception as e:
 
98
  traceback.print_exc()
99
  return None
100
 
101
+ # Normalize integer PCM to float32 in [-1,1] if needed
102
  if np.issubdtype(arr.dtype, np.integer):
103
  arr = arr.astype("float32") / np.iinfo(arr.dtype).max
104
  else:
 
119
  print(f"Generated audio saved to: {out_path}")
120
  return out_path
121
 
122
+ # --- Gradio UI and API exposure ---
123
  with gr.Blocks() as demo:
124
  gr.Markdown("# Indic Parler-TTS (69 Speakers)")
125
 
126
+ txt = gr.Textbox(value="नमस्ते, यह एक परीक्षण वाक्य है।", label="Text")
127
+ sp = gr.Dropdown(SPEAKERS, value=SPEAKERS[0], label="Speaker")
128
+ emo = gr.Dropdown(["Neutral","Happy","Sad","Angry","Narration"], value="Neutral", label="Emotion")
129
+ btn = gr.Button("Generate")
130
+ out = gr.Audio(label="Output (wav)", type="filepath")
131
+
132
+ # Expose the endpoint so the Space API is available at /synthesize
 
 
 
 
133
  btn.click(fn=synthesize, inputs=[txt, sp, emo], outputs=out, api_name="/synthesize")
134
 
135
+ if __name__ == '__main__':
136
+ # Disable SSR to avoid frontend blank issues in some Spaces environments
137
+ demo.launch(ssr_mode=False)