PhuongLT committed
Commit 5632ded · 1 Parent(s): 2bc7be3
update UI
Browse files
- app.py +63 -64
- app_old.py +543 -0
- ref_voice/{0000000_Kore_Quân_sự.wav → 0000012_Kore_Thể_thao.wav} +2 -2
app.py
CHANGED
@@ -1,10 +1,9 @@
 # -*- coding: utf-8 -*-
 """
 Gradio app.py - StyleTTS2-vi with precomputed style embeddings (.pth)
-- UI with alpha/beta/metrics
+- Compact UI with collapsible accordions
 - Style Mixer: 4 fixed slots (Kore, Puck, Algenib, Leda), weight-only; auto-normalize
-- Always shows the 4 reference samples (accordion)
-- No more speaker dropdown & auto reference sample
+- Reference samples inside an accordion
 """
 
 import os, re, glob, time, yaml, torch, librosa, numpy as np, gradio as gr
@@ -308,13 +307,13 @@ def inference_one(text, ref_feat, alpha=ALPHA, beta=BETA,
     if wav.shape[-1] > 50:
         wav = wav[:-50]
 
-    # Post-processing: trim + denoise + remove internal silence
-    # wav = postprocess_audio(
-    #     wav, SR_OUT,
-    #     trim_top_db=30,
-    #     denoise=True,
-    #     denoise_n_fft=1024, denoise_hop=256, denoise_strength=0.8,
-    #     remove_internal_silence=True,
-    #     split_top_db=30, min_keep_ms=40, crossfade_ms=8
-    # )
+    # Post-processing: trim + denoise + remove internal silence
+    wav = postprocess_audio(
+        wav, SR_OUT,
+        trim_top_db=30,
+        denoise=True,
+        denoise_n_fft=1024, denoise_hop=256, denoise_strength=0.8,
+        remove_internal_silence=False,
+        split_top_db=30, min_keep_ms=40, crossfade_ms=8
+    )
     return wav, ps, simi_timbre, simi_prosody
@@ -434,18 +433,18 @@ def _build_mix_spec_ui_fixed(normalize, w1, w2, w3, w4, order):
     return mix_spec, mix_view, sum_md
 
 
-
 # ==============================================================
-# Gradio UI
+# Gradio UI - Compact & Clean Version
 # ==============================================================
-with gr.Blocks(title="StyleTTS2-vi Demo") as demo:
-    gr.Markdown("# StyleTTS2-vi Demo")
+with gr.Blocks(title="StyleTTS2-vi Demo", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎙️ StyleTTS2-vi Demo")
 
     with gr.Row():
-        with gr.Column():
+        with gr.Column(scale=1):
             text_inp = gr.Textbox(
-                label="Text",
-                lines=4,
+                label="📝 Text Input",
+                lines=3,
+                placeholder="Nhập văn bản cần đọc...",
                 value="Trăng treo lơ lửng trên đỉnh núi chơ vơ, ánh sáng bàng bạc phủ lên bãi đá ngổn ngang. Con dế thổn thức trong khe cỏ, tiếng gió hun hút lùa qua hốc núi trập trùng. Dưới thung lũng, đàn trâu gặm cỏ ung dung, hơi sương vẩn đục, lảng bảng giữa đồng khuya tĩnh mịch."
             )
 
@@ -461,65 +460,67 @@ with gr.Blocks(title="StyleTTS2-vi Demo") as demo:
                         if len(fixed_order) == 4:
                             break
 
-            # === Always show the 4 voice samples ===
-            with gr.Accordion("Reference samples", open=True):
-                with gr.Row():
-                    spk0 = fixed_order[0] if len(fixed_order) > 0 else "Kore"
-                    spk1 = fixed_order[1] if len(fixed_order) > 1 else "Puck"
-                    with gr.Column():
-                        gr.Markdown(f"**{spk0}**")
-                        gr.Audio(value=get_ref_path_for_speaker(spk0), label=f"{spk0} sample", type="filepath", interactive=False)
-                    with gr.Column():
-                        gr.Markdown(f"**{spk1}**")
-                        gr.Audio(value=get_ref_path_for_speaker(spk1), label=f"{spk1} sample", type="filepath", interactive=False)
-                with gr.Row():
-                    spk2 = fixed_order[2] if len(fixed_order) > 2 else "Algenib"
-                    spk3 = fixed_order[3] if len(fixed_order) > 3 else "Leda"
-                    with gr.Column():
-                        gr.Markdown(f"**{spk2}**")
-                        gr.Audio(value=get_ref_path_for_speaker(spk2), label=f"{spk2} sample", type="filepath", interactive=False)
-                    with gr.Column():
-                        gr.Markdown(f"**{spk3}**")
-                        gr.Audio(value=get_ref_path_for_speaker(spk3), label=f"{spk3} sample", type="filepath", interactive=False)
+            # === Reference samples - Compact grid ===
+            with gr.Accordion("🎵 Reference Samples", open=True):
+                gr.Markdown("*Click to preview voice samples*")
+                for i in range(0, 4, 2):
+                    with gr.Row():
+                        for j in range(2):
+                            idx = i + j
+                            if idx < len(fixed_order):
+                                spk = fixed_order[idx]
+                                with gr.Column(min_width=200):
+                                    gr.Audio(
+                                        value=get_ref_path_for_speaker(spk),
+                                        label=spk,
+                                        type="filepath",
+                                        interactive=False,
+                                        show_download_button=False
+                                    )
 
-            # ---- Style Mixer: 4 fixed slots ----
-            with gr.Accordion("Style Mixer", open=True):
-                normalize_ck = gr.Checkbox(value=True, label="Normalize weights to 1")
+            # ---- Style Mixer - More compact ----
+            with gr.Accordion("🎨 Style Mixer", open=True):
+                normalize_ck = gr.Checkbox(value=True, label="Auto-normalize", container=False)
 
-                # Row 1: Kore & Puck
-                with gr.Row(equal_height=True):
-                    with gr.Column():
+                # 2x2 grid for the 4 sliders
+                with gr.Row():
+                    with gr.Column(scale=1):
                         gr.Markdown(f"**{fixed_order[0]}**")
-                        w1 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 1", container=False)
-                    with gr.Column():
+                        w1 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, show_label=False, container=False)
+                    with gr.Column(scale=1):
                         gr.Markdown(f"**{fixed_order[1]}**")
-                        w2 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 2", container=False)
+                        w2 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, show_label=False, container=False)
 
-                # Row 2: Algenib & Leda
-                with gr.Row(equal_height=True):
-                    with gr.Column():
+                with gr.Row():
+                    with gr.Column(scale=1):
                         gr.Markdown(f"**{fixed_order[2]}**")
-                        w3 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 3", container=False)
-                    with gr.Column():
+                        w3 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, show_label=False, container=False)
+                    with gr.Column(scale=1):
                         gr.Markdown(f"**{fixed_order[3]}**")
-                        w4 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 4", container=False)
+                        w4 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, show_label=False, container=False)
 
-                mix_sum_md = gr.Markdown("**Sum:** 0.000")
-                mix_view_json = gr.JSON(label="Mixer weights (view)")
-                mix_spec_state = gr.State("")  # mix-spec string for the backend
-                order_state = gr.State(fixed_order)  # keep the fixed order
+                with gr.Row():
+                    mix_sum_md = gr.Markdown("**Sum:** 0.000")
+
+                mix_view_json = gr.JSON(label="Current Mix", visible=False)
+                mix_spec_state = gr.State("")
+                order_state = gr.State(fixed_order)
 
-            with gr.Row():
-                alpha_n = gr.Number(value=ALPHA, label="alpha diffusion (0-1, timbre)", precision=3)
-                beta_n = gr.Number(value=BETA, label="beta diffusion (0-1, prosody)", precision=3)
+            # Advanced settings - Collapsed by default
+            with gr.Accordion("⚙️ Advanced Settings", open=False):
+                with gr.Row():
+                    alpha_n = gr.Number(value=ALPHA, label="Alpha (timbre)", precision=3, minimum=0, maximum=1)
+                    beta_n = gr.Number(value=BETA, label="Beta (prosody)", precision=3, minimum=0, maximum=1)
 
-            btn = gr.Button("Đọc 🔊🔥", variant="primary")
+            btn = gr.Button("🔊 Generate Speech", variant="primary", size="lg")
 
-        with gr.Column():
-            out_audio = gr.Audio(label="Synthesised Audio", type="numpy")
-            metrics = gr.JSON(label="Metrics")
+        with gr.Column(scale=1):
+            out_audio = gr.Audio(label="🎧 Output Audio", type="numpy")
+
+            with gr.Accordion("📊 Generation Metrics", open=False):
+                metrics = gr.JSON(label="Details")
 
-    # Any weight/normalize change -> rebuild the fixed spec + update sum/JSON
+    # Event handlers
    def _ui_build_wrapper_fixed(normalize, w1, w2, w3, w4, order):
         spec, view, summ = _build_mix_spec_ui_fixed(normalize, w1, w2, w3, w4, order)
         return spec, view, summ
@@ -531,8 +532,6 @@ with gr.Blocks(title="StyleTTS2-vi Demo") as demo:
             outputs=[mix_spec_state, mix_view_json, mix_sum_md]
         )
 
-
-    # Read button: uses mix_spec_state; if empty => fall back to DEFAULT_SPK
     btn.click(
         run_inference,
         inputs=[text_inp, alpha_n, beta_n, mix_spec_state],
@@ -540,4 +539,4 @@ with gr.Blocks(title="StyleTTS2-vi Demo") as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
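For orientation: both UI versions serialize the four slider weights into a plain name:weight string via _build_mix_spec_ui_fixed, and run_inference parses it back with parse_mix_spec. A minimal standalone sketch of that round trip, with purely illustrative slider values:

# Illustrative sketch of the mixer round trip (values are examples)
order = ["Kore", "Puck", "Algenib", "Leda"]
weights = [0.5, 0.5, 0.0, 0.0]                      # example slider positions
pairs = [(s, w) for s, w in zip(order, weights) if w > 0]
total = sum(w for _, w in pairs)
mix_spec = ",".join(f"{s}:{w / total:.4f}" for s, w in pairs)
# mix_spec == "Kore:0.5000,Puck:0.5000"; parse_mix_spec (in app_old.py below)
# turns it back into {"Kore": 0.5, "Puck": 0.5}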
app_old.py
ADDED
@@ -0,0 +1,543 @@
+# -*- coding: utf-8 -*-
+"""
+Gradio app.py - StyleTTS2-vi with precomputed style embeddings (.pth)
+- UI with alpha/beta/metrics
+- Style Mixer: 4 fixed slots (Kore, Puck, Algenib, Leda), weight-only; auto-normalize
+- Always shows the 4 reference samples (accordion)
+- No more speaker dropdown & auto reference sample
+"""
+
+import os, re, glob, time, yaml, torch, librosa, numpy as np, gradio as gr
+from munch import Munch
+from soe_vinorm import SoeNormalizer
+
+# ==============================================================
+# Basic configuration
+# ==============================================================
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+SR_OUT = 24000
+ALPHA, BETA, DIFFUSION_STEPS, EMBEDDING_SCALE = 0.0, 0.0, 5, 1.0
+
+REF_DIR = "ref_voice"  # directory with the sample audio (.wav)
+
+# ==============================================================
+# StyleTTS2 module imports
+# ==============================================================
+from models import *
+from utils import *
+from models import build_model
+from text_utils import TextCleaner
+from Utils_extend_v1.PLBERT.util import load_plbert
+from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
+
+textcleaner = TextCleaner()
+
+# ==============================================================
+# Load model and checkpoint
+# ==============================================================
+from huggingface_hub import hf_hub_download
+hf_hub_download(
+    repo_id="ltphuongunited/styletts2_vi",
+    filename="gemini_2nd_00045.pth",
+    local_dir="Models/gemini_vi",
+    local_dir_use_symlinks=False,
+)
+
+CHECKPOINT_PTH = "Models/gemini_vi/gemini_2nd_00045.pth"
+CONFIG_PATH = "Models/gemini_vi/config_gemini_vi_en.yml"
+config = yaml.safe_load(open(CONFIG_PATH))
+
+ASR_config = config.get("ASR_config", False)
+ASR_path = config.get("ASR_path", False)
+F0_path = config.get("F0_path", False)
+PLBERT_dir = config.get("PLBERT_dir", False)
+
+text_aligner = load_ASR_models(ASR_path, ASR_config)
+pitch_extractor = load_F0_models(F0_path)
+plbert = load_plbert(PLBERT_dir)
+model_params = recursive_munch(config["model_params"])
+model = build_model(model_params, text_aligner, pitch_extractor, plbert)
+
+_ = [model[k].to(DEVICE) for k in model]
+_ = [model[k].eval() for k in model]
+
+ckpt = torch.load(CHECKPOINT_PTH, map_location="cpu")["net"]
+for key in model:
+    if key in ckpt:
+        try:
+            model[key].load_state_dict(ckpt[key])
+        except Exception:
+            from collections import OrderedDict
+            new_state = OrderedDict()
+            for k, v in ckpt[key].items():
+                new_state[k[7:]] = v
+            model[key].load_state_dict(new_state, strict=False)
+
+sampler = DiffusionSampler(
+    model.diffusion.diffusion,
+    sampler=ADPM2Sampler(),
+    sigma_schedule=KarrasSchedule(sigma_min=1e-4, sigma_max=3.0, rho=9.0),
+    clamp=False,
+)
+
+# ==============================================================
+# Phonemizer
+# ==============================================================
+import phonemizer
+vi_phonemizer = phonemizer.backend.EspeakBackend(
+    language="vi", preserve_punctuation=True, with_stress=True
+)
+
+def phonemize_text(text: str) -> str:
+    ps = vi_phonemizer.phonemize([text])[0]
+    return ps.replace("(en)", "").replace("(vi)", "").strip()
+
+def length_to_mask(lengths: torch.LongTensor) -> torch.Tensor:
+    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+    mask = torch.gt(mask + 1, lengths.unsqueeze(1))
+    return mask
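As a quick, illustrative check of length_to_mask above: padded positions come back True.

# Illustrative: lengths [3, 5] -> row 0 masks its last two (padded) positions
m = length_to_mask(torch.LongTensor([3, 5]))
# tensor([[False, False, False,  True,  True],
#         [False, False, False, False, False]])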
+
+# ==============================================================
+# Load precomputed style embeddings
+# ==============================================================
+STYLE_PTH = "Models/styles_speaker_parallel.pth"
+print(f"Loading precomputed styles: {STYLE_PTH}")
+styles_dict = torch.load(STYLE_PTH, map_location=DEVICE)
+
+# fallback speaker when the mixer is empty
+SPEAKER_ORDER_PREF = ["Kore", "Puck", "Algenib", "Leda"]
+DEFAULT_SPK = next((s for s in SPEAKER_ORDER_PREF if s in styles_dict), list(styles_dict.keys())[0])
+
+def get_style_by_length(speaker: str, phoneme_len: int):
+    spk_tensor = styles_dict[speaker]  # [510, 1, 256] or [510, 256]
+    idx = min(max(phoneme_len, 1), spk_tensor.shape[0]) - 1
+    feat = spk_tensor[idx]
+    # force to [1,256]
+    if feat.ndim == 3:  # [1,1,256]
+        feat = feat.squeeze(0)
+    if feat.ndim == 2:  # [1,256]
+        feat = feat.squeeze(0)
+    return feat.unsqueeze(0).to(DEVICE)  # [1,256]
+
+# ==============================================================
+# Style mixing utils
+# ==============================================================
+def parse_mix_spec(spec: str) -> dict:
+    """Parse 'Kore:0.75,Puck:0.25' -> {'Kore':0.75,'Puck':0.25} (drops bad entries, merges duplicates)."""
+    mix = {}
+    if not spec or not isinstance(spec, str):
+        return mix
+    for part in spec.split(","):
+        if ":" not in part:
+            continue
+        k, v = part.split(":", 1)
+        k = (k or "").strip()
+        if not k:
+            continue
+        try:
+            w = float((v or "").strip())
+        except Exception:
+            continue
+        if not np.isfinite(w) or w <= 0:
+            continue
+        mix[k] = mix.get(k, 0.0) + w
+    return mix
+
+def get_style_mixed_by_length(mix_dict: dict, phoneme_len: int):
+    """Mix several speakers' styles by weight. Returns [1,256] on DEVICE."""
+    if not mix_dict:
+        return get_style_by_length(DEFAULT_SPK, phoneme_len)
+
+    total = sum(max(0.0, float(w)) for w in mix_dict.values())
+    if total <= 0:
+        return get_style_by_length(DEFAULT_SPK, phoneme_len)
+
+    mix_feat = None
+    for spk, w in mix_dict.items():
+        if spk not in styles_dict:
+            print(f"[WARN] Speaker '{spk}' is not in styles_dict, skipping.")
+            continue
+        feat_i = get_style_by_length(spk, phoneme_len)  # [1,256]
+        wi = float(w) / total
+        mix_feat = feat_i * wi if mix_feat is None else mix_feat + feat_i * wi
+
+    if mix_feat is None:
+        return get_style_by_length(DEFAULT_SPK, phoneme_len)
+    return mix_feat  # [1,256]
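Hypothetical usage of the two helpers above, assuming "Kore" and "Puck" exist in styles_dict:

mix = parse_mix_spec("Kore:0.75,Puck:0.25")            # {'Kore': 0.75, 'Puck': 0.25}
ref = get_style_mixed_by_length(mix, phoneme_len=120)  # weighted average, shape [1, 256]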
+
+# ==============================================================
+# Audio postprocess (librosa): trim + denoise + remove internal silence
+# ==============================================================
+def _simple_spectral_denoise(y, sr, n_fft=1024, hop=256, prop_decrease=0.8):
+    if y.size == 0:
+        return y
+    D = librosa.stft(y, n_fft=n_fft, hop_length=hop, win_length=n_fft)
+    S = np.abs(D)
+    noise = np.median(S, axis=1, keepdims=True)
+    S_clean = S - prop_decrease * noise
+    S_clean = np.maximum(S_clean, 0.0)
+    gain = S_clean / (S + 1e-8)
+    D_denoised = D * gain
+    y_out = librosa.istft(D_denoised, hop_length=hop, win_length=n_fft, length=len(y))
+    return y_out
+
+def _concat_with_crossfade(segments, crossfade_samples=0):
+    if not segments:
+        return np.array([], dtype=np.float32)
+    out = segments[0].astype(np.float32, copy=True)
+    for seg in segments[1:]:
+        seg = seg.astype(np.float32, copy=False)
+        if crossfade_samples > 0 and out.size > 0 and seg.size > 0:
+            cf = min(crossfade_samples, out.size, seg.size)
+            fade_out = np.linspace(1.0, 0.0, cf, dtype=np.float32)
+            fade_in = 1.0 - fade_out
+            tail = out[-cf:] * fade_out + seg[:cf] * fade_in
+            out = np.concatenate([out[:-cf], tail, seg[cf:]], axis=0)
+        else:
+            out = np.concatenate([out, seg], axis=0)
+    return out
+
+def _reduce_internal_silence(y, sr, top_db=30, min_keep_ms=40, crossfade_ms=8):
+    if y.size == 0:
+        return y
+    intervals = librosa.effects.split(y, top_db=top_db)
+    if intervals.size == 0:
+        return y
+    min_keep = int(sr * (min_keep_ms / 1000.0))
+    segs = []
+    for s, e in intervals:
+        if e - s >= min_keep:
+            segs.append(y[s:e])
+    if not segs:
+        return y
+    crossfade = int(sr * (crossfade_ms / 1000.0))
+    y_out = _concat_with_crossfade(segs, crossfade_samples=crossfade)
+    return y_out
+
+def postprocess_audio(y, sr,
+                      trim_top_db=30,
+                      denoise=True,
+                      denoise_n_fft=1024,
+                      denoise_hop=256,
+                      denoise_strength=0.8,
+                      remove_internal_silence=True,
+                      split_top_db=30,
+                      min_keep_ms=40,
+                      crossfade_ms=8):
+    if y.size == 0:
+        return y.astype(np.float32)
+    y_trim, _ = librosa.effects.trim(y, top_db=trim_top_db)
+    if denoise:
+        y_trim = _simple_spectral_denoise(
+            y_trim, sr, n_fft=denoise_n_fft, hop=denoise_hop, prop_decrease=denoise_strength
+        )
+    if remove_internal_silence:
+        y_trim = _reduce_internal_silence(
+            y_trim, sr, top_db=split_top_db, min_keep_ms=min_keep_ms, crossfade_ms=crossfade_ms
+        )
+    y_trim = np.nan_to_num(y_trim, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
+    m = np.max(np.abs(y_trim)) + 1e-8
+    if m > 1.0:
+        y_trim = y_trim / m
+    return y_trim
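A minimal sketch of the post-processing chain above on a dummy signal (purely illustrative):

y = 0.1 * np.random.randn(24000).astype(np.float32)  # 1 s of noise at SR_OUT
y_clean = postprocess_audio(y, 24000, remove_internal_silence=False)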
+
+# ==============================================================
+# Inference core
+# ==============================================================
+def inference_one(text, ref_feat, alpha=ALPHA, beta=BETA,
+                  diffusion_steps=DIFFUSION_STEPS, embedding_scale=EMBEDDING_SCALE):
+    ps = phonemize_text(text)
+    tokens = textcleaner(ps)
+    tokens.insert(0, 0)
+    tokens = torch.LongTensor(tokens).unsqueeze(0).to(DEVICE)
+    input_lengths = torch.LongTensor([tokens.shape[-1]]).to(DEVICE)
+    text_mask = length_to_mask(input_lengths).to(DEVICE)
+
+    with torch.no_grad():
+        t_en = model.text_encoder(tokens, input_lengths, text_mask)
+        bert_d = model.bert(tokens, attention_mask=(~text_mask).int())
+        d_en = model.bert_encoder(bert_d).transpose(-1, -2)
+
+        if alpha == 0 and beta == 0:
+            s_pred = ref_feat.clone()  # [1,256]
+        else:
+            s_pred = sampler(
+                noise=torch.randn((1, 256)).unsqueeze(1).to(DEVICE),
+                embedding=bert_d,
+                embedding_scale=embedding_scale,
+                features=ref_feat,  # [1,256]
+                num_steps=diffusion_steps,
+            ).squeeze(1)  # [1,256]
+
+        s, ref = s_pred[:, 128:], s_pred[:, :128]
+        ref = alpha * ref + (1 - alpha) * ref_feat[:, :128]
+        s = beta * s + (1 - beta) * ref_feat[:, 128:]
+
+        # --- Metrics (cosine) ---
+        def cosine_sim(a, b):
+            return torch.nn.functional.cosine_similarity(a, b, dim=1).mean().item()
+        simi_timbre = cosine_sim(s_pred[:, :128], ref_feat[:, :128])
+        simi_prosody = cosine_sim(s_pred[:, 128:], ref_feat[:, 128:])
+
+        # --- Duration / Alignment ---
+        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+        x, _ = model.predictor.lstm(d)
+        duration = torch.sigmoid(model.predictor.duration_proj(x)).sum(axis=-1)
+        pred_dur = torch.round(duration.squeeze()).clamp(min=1)
+
+        T = int(pred_dur.sum().item())
+        pred_aln = torch.zeros(input_lengths.item(), T, device=DEVICE)
+        c = 0
+        for i in range(input_lengths.item()):
+            span = int(pred_dur[i].item())
+            pred_aln[i, c:c+span] = 1.0
+            c += span
+
+        en = (d.transpose(-1, -2) @ pred_aln.unsqueeze(0))
+        if model_params.decoder.type == "hifigan":
+            en = torch.cat([en[:, :, :1], en[:, :, :-1]], dim=2)
+
+        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+        asr = (t_en @ pred_aln.unsqueeze(0))
+        if model_params.decoder.type == "hifigan":
+            asr = torch.cat([asr[:, :, :1], asr[:, :, :-1]], dim=2)
+
+        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))
+
+    wav = out.squeeze().detach().cpu().numpy()
+    if wav.shape[-1] > 50:
+        wav = wav[:-50]
+
+    # Post-processing: trim + denoise + remove internal silence
+    # wav = postprocess_audio(
+    #     wav, SR_OUT,
+    #     trim_top_db=30,
+    #     denoise=True,
+    #     denoise_n_fft=1024, denoise_hop=256, denoise_strength=0.8,
+    #     remove_internal_silence=True,
+    #     split_top_db=30, min_keep_ms=40, crossfade_ms=8
+    # )
+    return wav, ps, simi_timbre, simi_prosody
+
+# ==============================================================
+# Ref-audio mapping (scan ./ref_voice for each speaker's sample file)
+# ==============================================================
+def _norm(s: str) -> str:
+    import unicodedata
+    s = unicodedata.normalize("NFKD", s)
+    s = "".join([c for c in s if not unicodedata.combining(c)])
+    s = s.lower()
+    s = re.sub(r"[^a-z0-9_\-\.]+", "", s)
+    return s
+
+def build_ref_map(ref_dir: str) -> dict:
+    paths = glob.glob(os.path.join(ref_dir, "**", "*.wav"), recursive=True)
+    by_name = {}
+    for p in paths:
+        fname = os.path.basename(p)
+        by_name[_norm(fname)] = p
+
+    spk_map = {}
+    speakers = list(styles_dict.keys()) if isinstance(styles_dict, dict) else ["Kore","Algenib","Puck","Leda"]
+
+    for spk in speakers:
+        spk_n = _norm(spk)
+        hit = None
+        for k, p in by_name.items():
+            if f"_{spk_n}_" in k:
+                hit = p
+                break
+        if not hit:
+            for k, p in by_name.items():
+                if spk_n in k:
+                    hit = p
+                    break
+        if hit:
+            spk_map[spk] = hit
+    return spk_map
+
+REF_MAP = build_ref_map(REF_DIR)
+
+def get_ref_path_for_speaker(spk: str):
+    return REF_MAP.get(spk)
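For example (illustrative), _norm strips diacritics, so the file renamed in this commit still matches speaker "Kore":

_norm("0000012_Kore_Thể_thao.wav")   # -> '0000012_kore_the_thao.wav'
# build_ref_map then assigns it to "Kore" via the '_kore_' substring match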
+
+# ==============================================================
+# Gradio wrapper (receives speaker_mix_spec as a hidden string)
+# ==============================================================
+def run_inference(text, alpha, beta, speaker_mix_spec):
+    normalizer = SoeNormalizer()
+    text = normalizer.normalize(text).replace(" ,", ",").replace(" .", ".")
+
+    ps = phonemize_text(text)
+    phoneme_len = len(ps.replace(" ", ""))
+
+    mix_dict = parse_mix_spec(speaker_mix_spec)
+    if len(mix_dict) > 0:
+        ref_feat = get_style_mixed_by_length(mix_dict, phoneme_len)
+        ref_idx = min(phoneme_len, 510)
+        total = sum(mix_dict.values())
+        mix_info = {k: round(float(v / total), 3) for k, v in mix_dict.items()}
+        chosen_speakers = list(mix_dict.keys())
+    else:
+        ref_feat = get_style_by_length(DEFAULT_SPK, phoneme_len)
+        ref_idx = min(phoneme_len, 510)
+        mix_info = {DEFAULT_SPK: 1.0}
+        chosen_speakers = [DEFAULT_SPK]
+
+    t0 = time.time()
+    wav, ps_out, simi_timbre, simi_prosody = inference_one(
+        text, ref_feat, alpha=float(alpha), beta=float(beta)
+    )
+    gen_time = time.time() - t0
+    rtf = gen_time / max(1e-6, len(wav) / SR_OUT)
+
+    info = {
+        "Text after soe_vinorm": text,
+        "Speakers": chosen_speakers,
+        "Mix weights (normalized)": mix_info,
+        "Phonemes": ps_out,
+        "Phoneme length": phoneme_len,
+        "Ref index": ref_idx,
+        "simi_timbre": round(float(simi_timbre), 4),
+        "simi_prosody": round(float(simi_prosody), 4),
+        "alpha": float(alpha),
+        "beta": float(beta),
+        "RTF": round(float(rtf), 3),
+        "Device": DEVICE,
+    }
+    return (SR_OUT, wav.astype(np.float32)), info
+
+# ==============================================================
+# UI helper: build the FIXED mix-spec for the 4 speakers
+# ==============================================================
+def _build_mix_spec_ui_fixed(normalize, w1, w2, w3, w4, order):
+    pairs = [(order[0], float(w1 or 0.0)),
+             (order[1], float(w2 or 0.0)),
+             (order[2], float(w3 or 0.0)),
+             (order[3], float(w4 or 0.0))]
+    pairs = [(s, w) for s, w in pairs if w > 0]
+
+    if not pairs:
+        return "", {}, "**Sum:** 0.000"
+
+    total = sum(w for _, w in pairs)
+    if normalize and total > 0:
+        pairs = [(s, w/total) for s, w in pairs]
+
+    acc = {}
+    for s, w in pairs:
+        acc[s] = acc.get(s, 0.0) + w
+
+    mix_spec = ",".join([f"{s}:{w:.4f}" for s, w in acc.items()])
+    mix_view = {"weights": {s: round(w, 3) for s, w in acc.items()}, "normalized": bool(normalize)}
+    sum_md = f"**Sum:** {round(sum(acc.values()), 3)}"
+    return mix_spec, mix_view, sum_md
+
+
+
+# ==============================================================
+# Gradio UI
+# ==============================================================
+with gr.Blocks(title="StyleTTS2-vi Demo") as demo:
+    gr.Markdown("# StyleTTS2-vi Demo")
+
+    with gr.Row():
+        with gr.Column():
+            text_inp = gr.Textbox(
+                label="Text",
+                lines=4,
+                value="Trăng treo lơ lửng trên đỉnh núi chơ vơ, ánh sáng bàng bạc phủ lên bãi đá ngổn ngang. Con dế thổn thức trong khe cỏ, tiếng gió hun hút lùa qua hốc núi trập trùng. Dưới thung lũng, đàn trâu gặm cỏ ung dung, hơi sương vẩn đục, lảng bảng giữa đồng khuya tĩnh mịch."
+            )
+
+            # Speakers available in styles_dict
+            spk_choices = list(styles_dict.keys()) if isinstance(styles_dict, dict) else ["Kore","Algenib","Puck","Leda"]
+
+            # FIXED order for the mixer
+            fixed_order = [s for s in ["Kore", "Puck", "Algenib", "Leda"] if s in spk_choices]
+            if len(fixed_order) < 4:
+                for s in spk_choices:
+                    if s not in fixed_order:
+                        fixed_order.append(s)
+                        if len(fixed_order) == 4:
+                            break
+
+            # === Always show the 4 voice samples ===
+            with gr.Accordion("Reference samples", open=True):
+                with gr.Row():
+                    spk0 = fixed_order[0] if len(fixed_order) > 0 else "Kore"
+                    spk1 = fixed_order[1] if len(fixed_order) > 1 else "Puck"
+                    with gr.Column():
+                        gr.Markdown(f"**{spk0}**")
+                        gr.Audio(value=get_ref_path_for_speaker(spk0), label=f"{spk0} sample", type="filepath", interactive=False)
+                    with gr.Column():
+                        gr.Markdown(f"**{spk1}**")
+                        gr.Audio(value=get_ref_path_for_speaker(spk1), label=f"{spk1} sample", type="filepath", interactive=False)
+                with gr.Row():
+                    spk2 = fixed_order[2] if len(fixed_order) > 2 else "Algenib"
+                    spk3 = fixed_order[3] if len(fixed_order) > 3 else "Leda"
+                    with gr.Column():
+                        gr.Markdown(f"**{spk2}**")
+                        gr.Audio(value=get_ref_path_for_speaker(spk2), label=f"{spk2} sample", type="filepath", interactive=False)
+                    with gr.Column():
+                        gr.Markdown(f"**{spk3}**")
+                        gr.Audio(value=get_ref_path_for_speaker(spk3), label=f"{spk3} sample", type="filepath", interactive=False)
+
+            # ---- Style Mixer: 4 fixed slots ----
+            with gr.Accordion("Style Mixer", open=True):
+                normalize_ck = gr.Checkbox(value=True, label="Normalize weights to 1")
+
+                # Row 1: Kore & Puck
+                with gr.Row(equal_height=True):
+                    with gr.Column():
+                        gr.Markdown(f"**{fixed_order[0]}**")
+                        w1 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 1", container=False)
+                    with gr.Column():
+                        gr.Markdown(f"**{fixed_order[1]}**")
+                        w2 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 2", container=False)
+
+                # Row 2: Algenib & Leda
+                with gr.Row(equal_height=True):
+                    with gr.Column():
+                        gr.Markdown(f"**{fixed_order[2]}**")
+                        w3 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 3", container=False)
+                    with gr.Column():
+                        gr.Markdown(f"**{fixed_order[3]}**")
+                        w4 = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Weight 4", container=False)
+
+                mix_sum_md = gr.Markdown("**Sum:** 0.000")
+                mix_view_json = gr.JSON(label="Mixer weights (view)")
+                mix_spec_state = gr.State("")  # mix-spec string for the backend
+                order_state = gr.State(fixed_order)  # keep the fixed order
+
+            with gr.Row():
+                alpha_n = gr.Number(value=ALPHA, label="alpha diffusion (0-1, timbre)", precision=3)
+                beta_n = gr.Number(value=BETA, label="beta diffusion (0-1, prosody)", precision=3)
+
+            btn = gr.Button("Đọc 🔊🔥", variant="primary")
+
+        with gr.Column():
+            out_audio = gr.Audio(label="Synthesised Audio", type="numpy")
+            metrics = gr.JSON(label="Metrics")
+
+    # Any weight/normalize change -> rebuild the fixed spec + update sum/JSON
+    def _ui_build_wrapper_fixed(normalize, w1, w2, w3, w4, order):
+        spec, view, summ = _build_mix_spec_ui_fixed(normalize, w1, w2, w3, w4, order)
+        return spec, view, summ
+
+    for comp in [normalize_ck, w1, w2, w3, w4]:
+        comp.change(
+            _ui_build_wrapper_fixed,
+            inputs=[normalize_ck, w1, w2, w3, w4, order_state],
+            outputs=[mix_spec_state, mix_view_json, mix_sum_md]
+        )
+
+
+    # Read button: uses mix_spec_state; if empty => fall back to DEFAULT_SPK
+    btn.click(
+        run_inference,
+        inputs=[text_inp, alpha_n, beta_n, mix_spec_state],
+        outputs=[out_audio, metrics]
+    )
+
+if __name__ == "__main__":
+    demo.launch()
ref_voice/{0000000_Kore_Quân_sự.wav → 0000012_Kore_Thể_thao.wav}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2c797fbd6d6ff2aff8f0fa618a8767325ab680a6f21dc49ffde3a6295b457904
+size 680250