PhuongLT committed
Commit 5632ded · 1 Parent(s): 2bc7be3
app.py CHANGED
@@ -1,10 +1,9 @@
  # -*- coding: utf-8 -*-
  """
  Gradio app.py - StyleTTS2-vi with precomputed style embeddings (.pth)
- - UI alpha/beta/metrics
+ - Tidy UI with collapsible accordions
  - Style Mixer: 4 fixed slots (Kore, Puck, Algenib, Leda), weight adjustment only; auto-normalize
- - Always show the 4 reference samples (accordion)
- - No more speaker dropdown & auto reference sample
+ - Reference samples in an accordion
  """

  import os, re, glob, time, yaml, torch, librosa, numpy as np, gradio as gr
@@ -308,13 +307,13 @@ def inference_one(text, ref_feat, alpha=ALPHA, beta=BETA,
      if wav.shape[-1] > 50:
          wav = wav[:-50]

-     # Post-processing: trim + denoise + remove internal silence
+     # # Post-processing: trim + denoise + remove internal silence
      wav = postprocess_audio(
          wav, SR_OUT,
          trim_top_db=30,
          denoise=True,
          denoise_n_fft=1024, denoise_hop=256, denoise_strength=0.8,
-         remove_internal_silence=True,
+         remove_internal_silence=False,
          split_top_db=30, min_keep_ms=40, crossfade_ms=8
      )
      return wav, ps, simi_timbre, simi_prosody
@@ -434,18 +433,18 @@ def _build_mix_spec_ui_fixed(normalize, w1, w2, w3, w4, order):
      return mix_spec, mix_view, sum_md


-
  # ==============================================================
- # Gradio UI
+ # Gradio UI - Compact & Clean Version
  # ==============================================================
- with gr.Blocks(title="StyleTTS2-vi Demo") as demo:
-     gr.Markdown("# StyleTTS2-vi Demo")
+ with gr.Blocks(title="StyleTTS2-vi Demo", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🎙️ StyleTTS2-vi Demo")

      with gr.Row():
-         with gr.Column():
+         with gr.Column(scale=1):
              text_inp = gr.Textbox(
-                 label="Text",
-                 lines=4,
+                 label="📝 Text Input",
+                 lines=3,
+                 placeholder="Nhập văn bản cần đọc...",
                  value="Trăng treo lơ lửng trên đỉnh núi chơ vơ, ánh sáng bàng bạc phủ lên bãi đá ngổn ngang. Con dế thổn thức trong khe cỏ, tiếng gió hun hút lùa qua hốc núi trập trùng. Dưới thung lũng, đàn trâu gặm cỏ ung dung, hơi sương vẩn đục, lảng bảng giữa đồng khuya tĩnh mịch."
              )

@@ -461,65 +460,67 @@ with gr.Blocks(title="StyleTTS2-vi Demo") as demo:
                      if len(fixed_order) == 4:
                          break

-             # === Always show the 4 voice samples ===
-             with gr.Accordion("Reference samples", open=True):
-                 with gr.Row():
-                     spk0 = fixed_order[0] if len(fixed_order) > 0 else "Kore"
-                     spk1 = fixed_order[1] if len(fixed_order) > 1 else "Puck"
-                     with gr.Column():
-                         gr.Markdown(f"**{spk0}**")
-                         gr.Audio(value=get_ref_path_for_speaker(spk0), label=f"{spk0} sample", type="filepath", interactive=False)
-                     with gr.Column():
-                         gr.Markdown(f"**{spk1}**")
-                         gr.Audio(value=get_ref_path_for_speaker(spk1), label=f"{spk1} sample", type="filepath", interactive=False)
-                 with gr.Row():
-                     spk2 = fixed_order[2] if len(fixed_order) > 2 else "Algenib"
-                     spk3 = fixed_order[3] if len(fixed_order) > 3 else "Leda"
-                     with gr.Column():
-                         gr.Markdown(f"**{spk2}**")
-                         gr.Audio(value=get_ref_path_for_speaker(spk2), label=f"{spk2} sample", type="filepath", interactive=False)
-                     with gr.Column():
-                         gr.Markdown(f"**{spk3}**")
-                         gr.Audio(value=get_ref_path_for_speaker(spk3), label=f"{spk3} sample", type="filepath", interactive=False)
-
-             # ---- Style Mixer: 4 fixed slots ----
-             with gr.Accordion("Style Mixer", open=True):
-                 normalize_ck = gr.Checkbox(value=True, label="Normalize weights to 1")
-
-                 # Row 1: Kore & Puck
-                 with gr.Row(equal_height=True):
-                     with gr.Column():
-                         gr.Markdown(f"**{fixed_order[0]}**")
-                         w1 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 1", container=False)
-                     with gr.Column():
-                         gr.Markdown(f"**{fixed_order[1]}**")
-                         w2 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 2", container=False)
-
-                 # Row 2: Algenib & Leda
-                 with gr.Row(equal_height=True):
-                     with gr.Column():
-                         gr.Markdown(f"**{fixed_order[2]}**")
-                         w3 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 3", container=False)
-                     with gr.Column():
-                         gr.Markdown(f"**{fixed_order[3]}**")
-                         w4 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 4", container=False)
-
-                 mix_sum_md = gr.Markdown("**Sum:** 0.000")
-                 mix_view_json = gr.JSON(label="Mixer weights (view)")
-                 mix_spec_state = gr.State("")          # mix-spec string for the backend
-                 order_state = gr.State(fixed_order)    # keep the fixed order
-
-             with gr.Row():
-                 alpha_n = gr.Number(value=ALPHA, label="alpha diffusion (0-1, timbre)", precision=3)
-                 beta_n = gr.Number(value=BETA, label="beta diffusion (0-1, prosody)", precision=3)
-
-             btn = gr.Button("Đọc 🔊🔥", variant="primary")
-
-         with gr.Column():
-             out_audio = gr.Audio(label="Synthesised Audio", type="numpy")
-             metrics = gr.JSON(label="Metrics")
-
-     # Any weight/normalize change -> rebuild the fixed spec + update the sum/JSON view
+             # === Reference samples - Compact grid ===
+             with gr.Accordion("🎵 Reference Samples", open=True):
+                 gr.Markdown("*Click to preview voice samples*")
+                 for i in range(0, 4, 2):
+                     with gr.Row():
+                         for j in range(2):
+                             idx = i + j
+                             if idx < len(fixed_order):
+                                 spk = fixed_order[idx]
+                                 with gr.Column(min_width=200):
+                                     gr.Audio(
+                                         value=get_ref_path_for_speaker(spk),
+                                         label=spk,
+                                         type="filepath",
+                                         interactive=False,
+                                         show_download_button=False
+                                     )
+
+             # ---- Style Mixer - More compact ----
+             with gr.Accordion("🎨 Style Mixer", open=True):
+                 normalize_ck = gr.Checkbox(value=True, label="Auto-normalize", container=False)
+
+                 # 2x2 grid for the 4 sliders
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         gr.Markdown(f"**{fixed_order[0]}**")
+                         w1 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, show_label=False, container=False)
+                     with gr.Column(scale=1):
+                         gr.Markdown(f"**{fixed_order[1]}**")
+                         w2 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, show_label=False, container=False)
+
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         gr.Markdown(f"**{fixed_order[2]}**")
+                         w3 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, show_label=False, container=False)
+                     with gr.Column(scale=1):
+                         gr.Markdown(f"**{fixed_order[3]}**")
+                         w4 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, show_label=False, container=False)
+
+                 with gr.Row():
+                     mix_sum_md = gr.Markdown("**Sum:** 0.000")
+
+                 mix_view_json = gr.JSON(label="Current Mix", visible=False)
+                 mix_spec_state = gr.State("")
+                 order_state = gr.State(fixed_order)
+
+                 # Advanced settings - Collapsed by default
+                 with gr.Accordion("⚙️ Advanced Settings", open=False):
+                     with gr.Row():
+                         alpha_n = gr.Number(value=ALPHA, label="Alpha (timbre)", precision=3, minimum=0, maximum=1)
+                         beta_n = gr.Number(value=BETA, label="Beta (prosody)", precision=3, minimum=0, maximum=1)
+
+             btn = gr.Button("🔊 Generate Speech", variant="primary", size="lg")
+
+         with gr.Column(scale=1):
+             out_audio = gr.Audio(label="🎧 Output Audio", type="numpy")
+
+             with gr.Accordion("📊 Generation Metrics", open=False):
+                 metrics = gr.JSON(label="Details")
+
+     # Event handlers
      def _ui_build_wrapper_fixed(normalize, w1, w2, w3, w4, order):
          spec, view, summ = _build_mix_spec_ui_fixed(normalize, w1, w2, w3, w4, order)
          return spec, view, summ
@@ -531,8 +532,6 @@ with gr.Blocks(title="StyleTTS2-vi Demo") as demo:
              outputs=[mix_spec_state, mix_view_json, mix_sum_md]
          )

-
-     # Read button: uses mix_spec_state; if empty => fall back to DEFAULT_SPK
      btn.click(
          run_inference,
          inputs=[text_inp, alpha_n, beta_n, mix_spec_state],
@@ -540,4 +539,4 @@ with gr.Blocks(title="StyleTTS2-vi Demo") as demo:
      )

  if __name__ == "__main__":
-     demo.launch()
+     demo.launch()
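Note on the one non-UI change above (the @@ -308,13 +307,13 @@ hunk): flipping remove_internal_silence from True to False means postprocess_audio still trims the edges and denoises, but no longer cuts pauses out of the middle of the waveform. Below is a minimal, standalone sketch of what that now-disabled step does, assuming 24 kHz mono float32 audio; it is not the repo's exact code (that lives in _reduce_internal_silence / _concat_with_crossfade in app_old.py further down), just the same split-and-crossfade idea.

import numpy as np
import librosa

def drop_internal_silence(y: np.ndarray, sr: int, top_db: int = 30,
                          crossfade_ms: float = 8.0) -> np.ndarray:
    """Keep non-silent intervals (per librosa.effects.split) and
    linearly crossfade the kept segments back together."""
    intervals = librosa.effects.split(y, top_db=top_db)  # array of [start, end) pairs
    if len(intervals) == 0:
        return y
    cf = int(sr * crossfade_ms / 1000.0)
    out = y[intervals[0][0]:intervals[0][1]].astype(np.float32, copy=True)
    for s, e in intervals[1:]:
        seg = y[s:e].astype(np.float32)
        n = min(cf, out.size, seg.size)
        if n > 0:
            fade = np.linspace(1.0, 0.0, n, dtype=np.float32)
            # overlap-add the join: fade the tail out, the new head in
            out[-n:] = out[-n:] * fade + seg[:n] * (1.0 - fade)
        out = np.concatenate([out, seg[n:]])
    return out

# 1 s of tone, 0.5 s of silence, 1 s of tone -> joined to roughly 2 s
sr = 24000
t = np.linspace(0, 1, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 220 * t).astype(np.float32)
y = np.concatenate([tone, np.zeros(sr // 2, dtype=np.float32), tone])
print(len(drop_internal_silence(y, sr)) / sr)  # ≈ 2.0 seconds

With the flag now False, synthesized pauses survive intact, at the cost of occasionally longer gaps in the output.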
app_old.py ADDED
@@ -0,0 +1,543 @@
+ # -*- coding: utf-8 -*-
+ """
+ Gradio app.py - StyleTTS2-vi with precomputed style embeddings (.pth)
+ - UI with alpha/beta/metrics
+ - Style Mixer: 4 fixed slots (Kore, Puck, Algenib, Leda), weight adjustment only; auto-normalize
+ - Always show the 4 reference samples (accordion)
+ - No more speaker dropdown & auto reference sample
+ """
+
+ import os, re, glob, time, yaml, torch, librosa, numpy as np, gradio as gr
+ from munch import Munch
+ from soe_vinorm import SoeNormalizer
+
+ # ==============================================================
+ # Basic configuration
+ # ==============================================================
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ SR_OUT = 24000
+ ALPHA, BETA, DIFFUSION_STEPS, EMBEDDING_SCALE = 0.0, 0.0, 5, 1.0
+
+ REF_DIR = "ref_voice"  # directory of reference audio samples (.wav)
+
+ # ==============================================================
+ # Import StyleTTS2 modules
+ # ==============================================================
+ from models import *
+ from utils import *
+ from models import build_model
+ from text_utils import TextCleaner
+ from Utils_extend_v1.PLBERT.util import load_plbert
+ from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
+
+ textcleaner = TextCleaner()
+
+ # ==============================================================
+ # Load model and checkpoint
+ # ==============================================================
+ from huggingface_hub import hf_hub_download
+ hf_hub_download(
+     repo_id="ltphuongunited/styletts2_vi",
+     filename="gemini_2nd_00045.pth",
+     local_dir="Models/gemini_vi",
+     local_dir_use_symlinks=False,
+ )
+
+ CHECKPOINT_PTH = "Models/gemini_vi/gemini_2nd_00045.pth"
+ CONFIG_PATH = "Models/gemini_vi/config_gemini_vi_en.yml"
+ config = yaml.safe_load(open(CONFIG_PATH))
+
+ ASR_config = config.get("ASR_config", False)
+ ASR_path = config.get("ASR_path", False)
+ F0_path = config.get("F0_path", False)
+ PLBERT_dir = config.get("PLBERT_dir", False)
+
+ text_aligner = load_ASR_models(ASR_path, ASR_config)
+ pitch_extractor = load_F0_models(F0_path)
+ plbert = load_plbert(PLBERT_dir)
+ model_params = recursive_munch(config["model_params"])
+ model = build_model(model_params, text_aligner, pitch_extractor, plbert)
+
+ _ = [model[k].to(DEVICE) for k in model]
+ _ = [model[k].eval() for k in model]
+
+ ckpt = torch.load(CHECKPOINT_PTH, map_location="cpu")["net"]
+ for key in model:
+     if key in ckpt:
+         try:
+             model[key].load_state_dict(ckpt[key])
+         except Exception:
+             from collections import OrderedDict
+             new_state = OrderedDict()
+             for k, v in ckpt[key].items():
+                 new_state[k[7:]] = v
+             model[key].load_state_dict(new_state, strict=False)
+
+ sampler = DiffusionSampler(
+     model.diffusion.diffusion,
+     sampler=ADPM2Sampler(),
+     sigma_schedule=KarrasSchedule(sigma_min=1e-4, sigma_max=3.0, rho=9.0),
+     clamp=False,
+ )
+
+ # ==============================================================
+ # Phonemizer
+ # ==============================================================
+ import phonemizer
+ vi_phonemizer = phonemizer.backend.EspeakBackend(
+     language="vi", preserve_punctuation=True, with_stress=True
+ )
+
+ def phonemize_text(text: str) -> str:
+     ps = vi_phonemizer.phonemize([text])[0]
+     return ps.replace("(en)", "").replace("(vi)", "").strip()
+
+ def length_to_mask(lengths: torch.LongTensor) -> torch.Tensor:
+     mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+     mask = torch.gt(mask + 1, lengths.unsqueeze(1))
+     return mask
+
+ # ==============================================================
+ # Load precomputed style embeddings
+ # ==============================================================
+ STYLE_PTH = "Models/styles_speaker_parallel.pth"
+ print(f"Loading precomputed styles: {STYLE_PTH}")
+ styles_dict = torch.load(STYLE_PTH, map_location=DEVICE)
+
+ # fallback speaker if the mixer is empty
+ SPEAKER_ORDER_PREF = ["Kore", "Puck", "Algenib", "Leda"]
+ DEFAULT_SPK = next((s for s in SPEAKER_ORDER_PREF if s in styles_dict), list(styles_dict.keys())[0])
+
+ def get_style_by_length(speaker: str, phoneme_len: int):
+     spk_tensor = styles_dict[speaker]  # [510, 1, 256] or [510, 256]
+     idx = min(max(phoneme_len, 1), spk_tensor.shape[0]) - 1
+     feat = spk_tensor[idx]
+     # force to [1,256]
+     if feat.ndim == 3:  # [1,1,256]
+         feat = feat.squeeze(0)
+     if feat.ndim == 2:  # [1,256]
+         feat = feat.squeeze(0)
+     return feat.unsqueeze(0).to(DEVICE)  # [1,256]
+
+ # ==============================================================
+ # Style mixing utils
+ # ==============================================================
+ def parse_mix_spec(spec: str) -> dict:
+     """Parse 'Kore:0.75,Puck:0.25' -> {'Kore':0.75,'Puck':0.25} (drop invalid entries, merge duplicates)."""
+     mix = {}
+     if not spec or not isinstance(spec, str):
+         return mix
+     for part in spec.split(","):
+         if ":" not in part:
+             continue
+         k, v = part.split(":", 1)
+         k = (k or "").strip()
+         if not k:
+             continue
+         try:
+             w = float((v or "").strip())
+         except Exception:
+             continue
+         if not np.isfinite(w) or w <= 0:
+             continue
+         mix[k] = mix.get(k, 0.0) + w
+     return mix
+
+ def get_style_mixed_by_length(mix_dict: dict, phoneme_len: int):
+     """Mix several speakers' styles by weight. Returns [1,256] on DEVICE."""
+     if not mix_dict:
+         return get_style_by_length(DEFAULT_SPK, phoneme_len)
+
+     total = sum(max(0.0, float(w)) for w in mix_dict.values())
+     if total <= 0:
+         return get_style_by_length(DEFAULT_SPK, phoneme_len)
+
+     mix_feat = None
+     for spk, w in mix_dict.items():
+         if spk not in styles_dict:
+             print(f"[WARN] Speaker '{spk}' is not in styles_dict, skipping.")
+             continue
+         feat_i = get_style_by_length(spk, phoneme_len)  # [1,256]
+         wi = float(w) / total
+         mix_feat = feat_i * wi if mix_feat is None else mix_feat + feat_i * wi
+
+     if mix_feat is None:
+         return get_style_by_length(DEFAULT_SPK, phoneme_len)
+     return mix_feat  # [1,256]
+
+ # ==============================================================
+ # Audio postprocess (librosa): trim + denoise + remove internal silence
+ # ==============================================================
+ def _simple_spectral_denoise(y, sr, n_fft=1024, hop=256, prop_decrease=0.8):
+     if y.size == 0:
+         return y
+     D = librosa.stft(y, n_fft=n_fft, hop_length=hop, win_length=n_fft)
+     S = np.abs(D)
+     noise = np.median(S, axis=1, keepdims=True)
+     S_clean = S - prop_decrease * noise
+     S_clean = np.maximum(S_clean, 0.0)
+     gain = S_clean / (S + 1e-8)
+     D_denoised = D * gain
+     y_out = librosa.istft(D_denoised, hop_length=hop, win_length=n_fft, length=len(y))
+     return y_out
+
+ def _concat_with_crossfade(segments, crossfade_samples=0):
+     if not segments:
+         return np.array([], dtype=np.float32)
+     out = segments[0].astype(np.float32, copy=True)
+     for seg in segments[1:]:
+         seg = seg.astype(np.float32, copy=False)
+         if crossfade_samples > 0 and out.size > 0 and seg.size > 0:
+             cf = min(crossfade_samples, out.size, seg.size)
+             fade_out = np.linspace(1.0, 0.0, cf, dtype=np.float32)
+             fade_in = 1.0 - fade_out
+             tail = out[-cf:] * fade_out + seg[:cf] * fade_in
+             out = np.concatenate([out[:-cf], tail, seg[cf:]], axis=0)
+         else:
+             out = np.concatenate([out, seg], axis=0)
+     return out
+
+ def _reduce_internal_silence(y, sr, top_db=30, min_keep_ms=40, crossfade_ms=8):
+     if y.size == 0:
+         return y
+     intervals = librosa.effects.split(y, top_db=top_db)
+     if intervals.size == 0:
+         return y
+     min_keep = int(sr * (min_keep_ms / 1000.0))
+     segs = []
+     for s, e in intervals:
+         if e - s >= min_keep:
+             segs.append(y[s:e])
+     if not segs:
+         return y
+     crossfade = int(sr * (crossfade_ms / 1000.0))
+     y_out = _concat_with_crossfade(segs, crossfade_samples=crossfade)
+     return y_out
+
+ def postprocess_audio(y, sr,
+                       trim_top_db=30,
+                       denoise=True,
+                       denoise_n_fft=1024,
+                       denoise_hop=256,
+                       denoise_strength=0.8,
+                       remove_internal_silence=True,
+                       split_top_db=30,
+                       min_keep_ms=40,
+                       crossfade_ms=8):
+     if y.size == 0:
+         return y.astype(np.float32)
+     y_trim, _ = librosa.effects.trim(y, top_db=trim_top_db)
+     if denoise:
+         y_trim = _simple_spectral_denoise(
+             y_trim, sr, n_fft=denoise_n_fft, hop=denoise_hop, prop_decrease=denoise_strength
+         )
+     if remove_internal_silence:
+         y_trim = _reduce_internal_silence(
+             y_trim, sr, top_db=split_top_db, min_keep_ms=min_keep_ms, crossfade_ms=crossfade_ms
+         )
+     y_trim = np.nan_to_num(y_trim, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
+     m = np.max(np.abs(y_trim)) + 1e-8
+     if m > 1.0:
+         y_trim = y_trim / m
+     return y_trim
+
+ # ==============================================================
+ # Inference core
+ # ==============================================================
+ def inference_one(text, ref_feat, alpha=ALPHA, beta=BETA,
+                   diffusion_steps=DIFFUSION_STEPS, embedding_scale=EMBEDDING_SCALE):
+     ps = phonemize_text(text)
+     tokens = textcleaner(ps)
+     tokens.insert(0, 0)
+     tokens = torch.LongTensor(tokens).unsqueeze(0).to(DEVICE)
+     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(DEVICE)
+     text_mask = length_to_mask(input_lengths).to(DEVICE)
+
+     with torch.no_grad():
+         t_en = model.text_encoder(tokens, input_lengths, text_mask)
+         bert_d = model.bert(tokens, attention_mask=(~text_mask).int())
+         d_en = model.bert_encoder(bert_d).transpose(-1, -2)
+
+         if alpha == 0 and beta == 0:
+             s_pred = ref_feat.clone()  # [1,256]
+         else:
+             s_pred = sampler(
+                 noise=torch.randn((1, 256)).unsqueeze(1).to(DEVICE),
+                 embedding=bert_d,
+                 embedding_scale=embedding_scale,
+                 features=ref_feat,  # [1,256]
+                 num_steps=diffusion_steps,
+             ).squeeze(1)  # [1,256]
+
+         s, ref = s_pred[:, 128:], s_pred[:, :128]
+         ref = alpha * ref + (1 - alpha) * ref_feat[:, :128]
+         s = beta * s + (1 - beta) * ref_feat[:, 128:]
+
+         # --- Metrics (cosine) ---
+         def cosine_sim(a, b):
+             return torch.nn.functional.cosine_similarity(a, b, dim=1).mean().item()
+         simi_timbre = cosine_sim(s_pred[:, :128], ref_feat[:, :128])
+         simi_prosody = cosine_sim(s_pred[:, 128:], ref_feat[:, 128:])
+
+         # --- Duration / Alignment ---
+         d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+         x, _ = model.predictor.lstm(d)
+         duration = torch.sigmoid(model.predictor.duration_proj(x)).sum(axis=-1)
+         pred_dur = torch.round(duration.squeeze()).clamp(min=1)
+
+         T = int(pred_dur.sum().item())
+         pred_aln = torch.zeros(input_lengths.item(), T, device=DEVICE)
+         c = 0
+         for i in range(input_lengths.item()):
+             span = int(pred_dur[i].item())
+             pred_aln[i, c:c+span] = 1.0
+             c += span
+
+         en = (d.transpose(-1, -2) @ pred_aln.unsqueeze(0))
+         if model_params.decoder.type == "hifigan":
+             en = torch.cat([en[:, :, :1], en[:, :, :-1]], dim=2)
+
+         F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+         asr = (t_en @ pred_aln.unsqueeze(0))
+         if model_params.decoder.type == "hifigan":
+             asr = torch.cat([asr[:, :, :1], asr[:, :, :-1]], dim=2)
+
+         out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))
+
+     wav = out.squeeze().detach().cpu().numpy()
+     if wav.shape[-1] > 50:
+         wav = wav[:-50]
+
+     # Post-processing: trim + denoise + remove internal silence
+     # wav = postprocess_audio(
+     #     wav, SR_OUT,
+     #     trim_top_db=30,
+     #     denoise=True,
+     #     denoise_n_fft=1024, denoise_hop=256, denoise_strength=0.8,
+     #     remove_internal_silence=True,
+     #     split_top_db=30, min_keep_ms=40, crossfade_ms=8
+     # )
+     return wav, ps, simi_timbre, simi_prosody
+
+ # ==============================================================
+ # Ref-audio mapping (scan ./ref_voice for each speaker's sample file)
+ # ==============================================================
+ def _norm(s: str) -> str:
+     import unicodedata
+     s = unicodedata.normalize("NFKD", s)
+     s = "".join([c for c in s if not unicodedata.combining(c)])
+     s = s.lower()
+     s = re.sub(r"[^a-z0-9_\-\.]+", "", s)
+     return s
+
+ def build_ref_map(ref_dir: str) -> dict:
+     paths = glob.glob(os.path.join(ref_dir, "**", "*.wav"), recursive=True)
+     by_name = {}
+     for p in paths:
+         fname = os.path.basename(p)
+         by_name[_norm(fname)] = p
+
+     spk_map = {}
+     speakers = list(styles_dict.keys()) if isinstance(styles_dict, dict) else ["Kore","Algenib","Puck","Leda"]
+
+     for spk in speakers:
+         spk_n = _norm(spk)
+         hit = None
+         for k, p in by_name.items():
+             if f"_{spk_n}_" in k:
+                 hit = p
+                 break
+         if not hit:
+             for k, p in by_name.items():
+                 if spk_n in k:
+                     hit = p
+                     break
+         if hit:
+             spk_map[spk] = hit
+     return spk_map
+
+ REF_MAP = build_ref_map(REF_DIR)
+
+ def get_ref_path_for_speaker(spk: str):
+     return REF_MAP.get(spk)
+
+ # ==============================================================
+ # Wrapper for Gradio (takes speaker_mix_spec as a hidden string)
+ # ==============================================================
+ def run_inference(text, alpha, beta, speaker_mix_spec):
+     normalizer = SoeNormalizer()
+     text = normalizer.normalize(text).replace(" ,", ",").replace(" .", ".")
+
+     ps = phonemize_text(text)
+     phoneme_len = len(ps.replace(" ", ""))
+
+     mix_dict = parse_mix_spec(speaker_mix_spec)
+     if len(mix_dict) > 0:
+         ref_feat = get_style_mixed_by_length(mix_dict, phoneme_len)
+         ref_idx = min(phoneme_len, 510)
+         total = sum(mix_dict.values())
+         mix_info = {k: round(float(v / total), 3) for k, v in mix_dict.items()}
+         chosen_speakers = list(mix_dict.keys())
+     else:
+         ref_feat = get_style_by_length(DEFAULT_SPK, phoneme_len)
+         ref_idx = min(phoneme_len, 510)
+         mix_info = {DEFAULT_SPK: 1.0}
+         chosen_speakers = [DEFAULT_SPK]
+
+     t0 = time.time()
+     wav, ps_out, simi_timbre, simi_prosody = inference_one(
+         text, ref_feat, alpha=float(alpha), beta=float(beta)
+     )
+     gen_time = time.time() - t0
+     rtf = gen_time / max(1e-6, len(wav) / SR_OUT)
+
+     info = {
+         "Text after soe_vinorm": text,
+         "Speakers": chosen_speakers,
+         "Mix weights (normalized)": mix_info,
+         "Phonemes": ps_out,
+         "Phoneme length": phoneme_len,
+         "Ref index": ref_idx,
+         "simi_timbre": round(float(simi_timbre), 4),
+         "simi_prosody": round(float(simi_prosody), 4),
+         "alpha": float(alpha),
+         "beta": float(beta),
+         "RTF": round(float(rtf), 3),
+         "Device": DEVICE,
+     }
+     return (SR_OUT, wav.astype(np.float32)), info
+
+ # ==============================================================
+ # UI helper: build a FIXED mix-spec from the 4 speakers
+ # ==============================================================
+ def _build_mix_spec_ui_fixed(normalize, w1, w2, w3, w4, order):
+     pairs = [(order[0], float(w1 or 0.0)),
+              (order[1], float(w2 or 0.0)),
+              (order[2], float(w3 or 0.0)),
+              (order[3], float(w4 or 0.0))]
+     pairs = [(s, w) for s, w in pairs if w > 0]
+
+     if not pairs:
+         return "", {}, "**Sum:** 0.000"
+
+     total = sum(w for _, w in pairs)
+     if normalize and total > 0:
+         pairs = [(s, w/total) for s, w in pairs]
+
+     acc = {}
+     for s, w in pairs:
+         acc[s] = acc.get(s, 0.0) + w
+
+     mix_spec = ",".join([f"{s}:{w:.4f}" for s, w in acc.items()])
+     mix_view = {"weights": {s: round(w, 3) for s, w in acc.items()}, "normalized": bool(normalize)}
+     sum_md = f"**Sum:** {round(sum(acc.values()), 3)}"
+     return mix_spec, mix_view, sum_md
+
+
+
+ # ==============================================================
+ # Gradio UI
+ # ==============================================================
+ with gr.Blocks(title="StyleTTS2-vi Demo") as demo:
+     gr.Markdown("# StyleTTS2-vi Demo")
+
+     with gr.Row():
+         with gr.Column():
+             text_inp = gr.Textbox(
+                 label="Text",
+                 lines=4,
+                 value="Trăng treo lơ lửng trên đỉnh núi chơ vơ, ánh sáng bàng bạc phủ lên bãi đá ngổn ngang. Con dế thổn thức trong khe cỏ, tiếng gió hun hút lùa qua hốc núi trập trùng. Dưới thung lũng, đàn trâu gặm cỏ ung dung, hơi sương vẩn đục, lảng bảng giữa đồng khuya tĩnh mịch."
+             )
+
+             # Speakers available in styles_dict
+             spk_choices = list(styles_dict.keys()) if isinstance(styles_dict, dict) else ["Kore","Algenib","Puck","Leda"]
+
+             # FIXED order for the mixer
+             fixed_order = [s for s in ["Kore", "Puck", "Algenib", "Leda"] if s in spk_choices]
+             if len(fixed_order) < 4:
+                 for s in spk_choices:
+                     if s not in fixed_order:
+                         fixed_order.append(s)
+                     if len(fixed_order) == 4:
+                         break
+
+             # === Always show the 4 voice samples ===
+             with gr.Accordion("Reference samples", open=True):
+                 with gr.Row():
+                     spk0 = fixed_order[0] if len(fixed_order) > 0 else "Kore"
+                     spk1 = fixed_order[1] if len(fixed_order) > 1 else "Puck"
+                     with gr.Column():
+                         gr.Markdown(f"**{spk0}**")
+                         gr.Audio(value=get_ref_path_for_speaker(spk0), label=f"{spk0} sample", type="filepath", interactive=False)
+                     with gr.Column():
+                         gr.Markdown(f"**{spk1}**")
+                         gr.Audio(value=get_ref_path_for_speaker(spk1), label=f"{spk1} sample", type="filepath", interactive=False)
+                 with gr.Row():
+                     spk2 = fixed_order[2] if len(fixed_order) > 2 else "Algenib"
+                     spk3 = fixed_order[3] if len(fixed_order) > 3 else "Leda"
+                     with gr.Column():
+                         gr.Markdown(f"**{spk2}**")
+                         gr.Audio(value=get_ref_path_for_speaker(spk2), label=f"{spk2} sample", type="filepath", interactive=False)
+                     with gr.Column():
+                         gr.Markdown(f"**{spk3}**")
+                         gr.Audio(value=get_ref_path_for_speaker(spk3), label=f"{spk3} sample", type="filepath", interactive=False)
+
+             # ---- Style Mixer: 4 fixed slots ----
+             with gr.Accordion("Style Mixer", open=True):
+                 normalize_ck = gr.Checkbox(value=True, label="Normalize weights to 1")
+
+                 # Row 1: Kore & Puck
+                 with gr.Row(equal_height=True):
+                     with gr.Column():
+                         gr.Markdown(f"**{fixed_order[0]}**")
+                         w1 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 1", container=False)
+                     with gr.Column():
+                         gr.Markdown(f"**{fixed_order[1]}**")
+                         w2 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 2", container=False)
+
+                 # Row 2: Algenib & Leda
+                 with gr.Row(equal_height=True):
+                     with gr.Column():
+                         gr.Markdown(f"**{fixed_order[2]}**")
+                         w3 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 3", container=False)
+                     with gr.Column():
+                         gr.Markdown(f"**{fixed_order[3]}**")
+                         w4 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 4", container=False)
+
+                 mix_sum_md = gr.Markdown("**Sum:** 0.000")
+                 mix_view_json = gr.JSON(label="Mixer weights (view)")
+                 mix_spec_state = gr.State("")          # mix-spec string for the backend
+                 order_state = gr.State(fixed_order)    # keep the fixed order
+
+             with gr.Row():
+                 alpha_n = gr.Number(value=ALPHA, label="alpha diffusion (0-1, timbre)", precision=3)
+                 beta_n = gr.Number(value=BETA, label="beta diffusion (0-1, prosody)", precision=3)
+
+             btn = gr.Button("Đọc 🔊🔥", variant="primary")
+
+         with gr.Column():
+             out_audio = gr.Audio(label="Synthesised Audio", type="numpy")
+             metrics = gr.JSON(label="Metrics")
+
+     # Any weight/normalize change -> rebuild the fixed spec + update the sum/JSON view
+     def _ui_build_wrapper_fixed(normalize, w1, w2, w3, w4, order):
+         spec, view, summ = _build_mix_spec_ui_fixed(normalize, w1, w2, w3, w4, order)
+         return spec, view, summ
+
+     for comp in [normalize_ck, w1, w2, w3, w4]:
+         comp.change(
+             _ui_build_wrapper_fixed,
+             inputs=[normalize_ck, w1, w2, w3, w4, order_state],
+             outputs=[mix_spec_state, mix_view_json, mix_sum_md]
+         )
+
+
+     # Read button: uses mix_spec_state; if empty => fall back to DEFAULT_SPK
+     btn.click(
+         run_inference,
+         inputs=[text_inp, alpha_n, beta_n, mix_spec_state],
+         outputs=[out_audio, metrics]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
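For reference, the Style Mixer plumbing that both versions of the app share: slider weights are serialized into a "Name:weight" spec string by _build_mix_spec_ui_fixed, parsed back into a dict by parse_mix_spec, and applied as a normalized weighted average over the per-speaker style embeddings by get_style_mixed_by_length. Below is a self-contained sketch of that round trip; the random [1, 256] tensors are stand-ins for the real styles_dict entries, and the parsing is simplified relative to the repo's version.

import torch

# Stand-in style embeddings; the app loads these from Models/styles_speaker_parallel.pth
styles = {name: torch.randn(1, 256) for name in ["Kore", "Puck", "Algenib", "Leda"]}

def mix_from_spec(spec: str) -> torch.Tensor:
    """'Kore:0.7500,Puck:0.2500' -> normalized weighted average of style vectors."""
    mix = {}
    for part in spec.split(","):
        name, _, w = part.partition(":")
        try:
            w = float(w)
        except ValueError:
            continue  # the real parse_mix_spec also drops malformed entries
        if w > 0 and name.strip() in styles:
            mix[name.strip()] = mix.get(name.strip(), 0.0) + w
    total = sum(mix.values())  # the app falls back to DEFAULT_SPK when this is empty/zero
    return sum(styles[s] * (w / total) for s, w in mix.items())

blend = mix_from_spec("Kore:0.7500,Puck:0.2500")
print(blend.shape)  # torch.Size([1, 256]) -- passed to inference_one as ref_feat

With alpha = beta = 0 (the defaults), inference_one uses this blended vector directly as the style (s_pred = ref_feat.clone()), so the mixer weights alone shape timbre and prosody.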
ref_voice/{0000000_Kore_Quân_sự.wav → 0000012_Kore_Thể_thao.wav} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:abfd3f8771395bbcb0789f2f5e61fab93ac186672d0ff756fbe5c44854bb4cc3
- size 284730
+ oid sha256:2c797fbd6d6ff2aff8f0fa618a8767325ab680a6f21dc49ffde3a6295b457904
+ size 680250
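The .wav entries in this diff are Git LFS pointer files rather than raw audio: three "key value" lines giving the spec version, the SHA-256 of the real file, and its size in bytes. A tiny illustrative parser for that layout (a hypothetical helper, not part of the repo):

def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS pointer ('key value' per line) into a dict."""
    return dict(line.split(" ", 1) for line in text.strip().splitlines())

ptr = parse_lfs_pointer(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:2c797fbd6d6ff2aff8f0fa618a8767325ab680a6f21dc49ffde3a6295b457904\n"
    "size 680250"
)
print(ptr["oid"], ptr["size"])  # the renamed Kore sample is 680,250 bytes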