#!/usr/bin/env python3
"""
Hugging Face Space entry page for MOSS-TTSD.

Based on the fnlp/MOSS-TTSD Space implementation, extended with this
repository's UI and documentation:
- Chinese interface by default, with a streamlined workflow
- Scenario selection with one-click loading
- Optional text normalization
- Concise usage instructions and documentation links in the right-hand panel

To run this Space script locally:
    python hf_space/app.py
"""
| import os | |
| import json | |
| import time | |
| import shutil | |
| import tempfile | |
| from typing import Optional, Tuple | |
| import gradio as gr | |
| import torch | |
| import torchaudio | |
# Hugging Face Spaces GPU scheduling helper.
try:
    import spaces  # Available inside HF Spaces; its absence locally is tolerated.
except Exception:  # noqa: BLE001

    class _DummySpaces:
        """Local stand-in for the ``spaces`` package so the script runs without it."""

        def GPU(self, *args, **kwargs):  # type: ignore[override]
            """Return the decorated function unchanged.

            Supports both decorator forms:

            * bare: ``@spaces.GPU`` (the function arrives as the sole positional
              argument and is returned as-is);
            * factory: ``@spaces.GPU(duration=60)`` (a pass-through decorator is
              returned).
            """
            # Bare usage: without this check the function would be replaced by
            # the inner decorator instead of being left untouched.
            if len(args) == 1 and not kwargs and callable(args[0]):
                return args[0]

            def deco(fn):
                return fn

            return deco

    spaces = _DummySpaces()  # type: ignore
| from huggingface_hub import hf_hub_download | |
| # 复用本仓通用推理工具 | |
| from generation_utils import load_model, process_batch | |
# =========================
# Configuration
# =========================
# System prompt handed to process_batch() for every synthesis request.
SYSTEM_PROMPT = (
    "You are a speech synthesizer that generates natural, realistic, and human-like conversational audio from dialogue text."
)
# Scenario registry: scenario key -> display title, short description and the
# data file holding the dialogue. Every data file is named after its key, so
# the path is derived instead of being repeated by hand.
_SCENARIO_ROWS = (
    ("科技播客_AI发展", "🤖 科技播客 - AI发展趋势", "探讨人工智能的最新发展与未来趋势"),
    ("教育播客_学习方法", "📚 教育播客 - 高效学习方法", "分享科学的学习方法与技巧"),
    ("生活播客_美食文化", "🍜 生活播客 - 美食文化探索", "品味各地美食文化的魅力"),
    ("商业播客_创业经验", "💼 商业播客 - 创业经验分享", "创业路上的经验教训与心得"),
    ("健康播客_运动健身", "🏃 健康播客 - 运动健身指南", "科学健身与健康生活方式"),
    ("心理播客_情绪管理", "🧠 心理播客 - 情绪管理技巧", "探索情绪管理与心理健康"),
)
SCENARIO_CONFIG = {
    scenario_key: {
        "title": scenario_title,
        "description": scenario_description,
        "file": f"scenarios/{scenario_key}.jsonl",
    }
    for scenario_key, scenario_title, scenario_description in _SCENARIO_ROWS
}
# Default reference prompt for each speaker: an audio file path plus the
# accompanying reference text (presumably the transcript of that clip; the UI
# asks for text that matches the audio exactly).
DEFAULT_AUDIO_CONFIG = {
    "speaker1": {
        "audio": "examples/zh_spk1_moon.wav",
        "text": "周一到周五,每天早晨七点半到九点半的直播片段。言下之意呢,就是废话有点多,大家也别嫌弃,因为这都是直播间最真实的状态了。"
    },
    "speaker2": {
        "audio": "examples/zh_spk2_moon.wav",
        "text": "如果大家想听到更丰富更及时的直播内容,记得在周一到周五准时进入直播间,和大家一起畅聊新消费新科技新趋势。"
    }
}
# Model identifier and speech-tokenizer configuration location.
MODEL_PATH = "fnlp/MOSS-TTSD-v0.5"
SPT_CONFIG_PATH = "XY_Tokenizer/config/xy_tokenizer_config.yaml"

# Download the XY_Tokenizer checkpoint into a local cache directory at import
# time; hf_hub_download returns the path of the cached file.
_SPT_WEIGHTS_DIR = "XY_Tokenizer/weights"
os.makedirs(_SPT_WEIGHTS_DIR, exist_ok=True)
try:
    SPT_CHECKPOINT_PATH = hf_hub_download(
        repo_id="fnlp/XY_Tokenizer_TTSD_V0",
        filename="xy_tokenizer.ckpt",
        cache_dir=_SPT_WEIGHTS_DIR,
    )
except Exception as download_error:  # noqa: BLE001
    # Fall back to a placeholder path; initialize_model() raises a clear error
    # later if nothing exists at that location.
    print(f"⚠️ XY_Tokenizer 权重下载失败: {download_error}")
    SPT_CHECKPOINT_PATH = f"{_SPT_WEIGHTS_DIR}/xy_tokenizer.ckpt"

# Process-wide cache, populated on first use by initialize_model().
tokenizer = model = spt = device = None
| # ========================= | |
| # 工具函数 | |
| # ========================= | |
def _parse_scenario_payload(raw: str) -> dict:
    """Parse scenario file content given as one JSON document or as JSON Lines.

    The whole text is tried as a single JSON document first. If that fails, the
    first non-blank line is parsed instead, so a multi-record ``.jsonl`` file
    yields its first record rather than an "Extra data" error.

    Raises:
        ValueError: if no JSON object can be extracted from ``raw``.
    """
    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        first_line = next((line for line in raw.splitlines() if line.strip()), "")
        payload = json.loads(first_line)
    if not isinstance(payload, dict):
        raise ValueError("scenario payload must be a JSON object")
    return payload


def get_scenario_examples():
    """Collect every selectable scenario, keyed by its display title.

    Each entry of ``SCENARIO_CONFIG`` whose data file can be read contributes
    one scenario; unreadable or missing files are reported on stdout and
    skipped. A built-in default example is always appended, so the result is
    never empty.

    Returns:
        dict mapping title -> dict with the keys ``text``, ``description``,
        ``audio1``, ``text1``, ``audio2``, ``text2`` and ``base_path``.
    """
    scenarios = {}

    # Scenarios backed by data files.
    for key, config in SCENARIO_CONFIG.items():
        file_path = config["file"]
        print(f"🔍 检查场景文件: {file_path}")
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = _parse_scenario_payload(f.read())
        except FileNotFoundError:
            print(f"❌ 场景文件不存在: {file_path}")
            continue
        except (OSError, ValueError) as e:
            # ValueError also covers JSON and Unicode decoding errors.
            print(f"⚠️ 加载场景 {key} 失败: {e}")
            continue

        scenarios[config["title"]] = {
            "text": data.get("text", ""),
            "description": config["description"],
            "audio1": data.get("prompt_audio_speaker1", ""),
            "text1": data.get("prompt_text_speaker1", ""),
            "audio2": data.get("prompt_audio_speaker2", ""),
            "text2": data.get("prompt_text_speaker2", ""),
            "base_path": data.get("base_path", ""),
        }
        print(f"✅ 成功加载场景: {config['title']}")

    # Built-in example, guaranteeing at least one usable scenario.
    scenarios["🎧 默认示例"] = {
        "text": (
            "[S1]大家好,欢迎收听今天的节目,我是主播小雨。"
            "[S2]大家好,我是嘉宾阿明,很高兴和大家见面。"
            "[S1]今天我们要聊的话题非常有趣,相信大家会喜欢的。"
            "[S2]是的,让我们开始今天的精彩内容吧!"
        ),
        "description": "默认的示例对话,适合快速体验",
        "audio1": DEFAULT_AUDIO_CONFIG["speaker1"]["audio"],
        "text1": DEFAULT_AUDIO_CONFIG["speaker1"]["text"],
        "audio2": DEFAULT_AUDIO_CONFIG["speaker2"]["audio"],
        "text2": DEFAULT_AUDIO_CONFIG["speaker2"]["text"],
        "base_path": "",
    }
    print(f"📊 总共加载了 {len(scenarios)} 个场景")
    return scenarios
def load_scenario_data(scenario_key: str):
    """Load one scenario's dialogue text and speaker prompts from its data file.

    Relative prompt-audio paths are resolved against the file's ``base_path``
    field. An audio entry that is missing, null, not a string, or does not
    exist on disk is returned as ``None`` without discarding the rest of the
    scenario.

    Args:
        scenario_key: key into ``SCENARIO_CONFIG``.

    Returns:
        A 5-tuple ``(text, audio1, text1, audio2, text2)``. All five items are
        ``None`` when the key is unknown or the file is missing or unreadable.
    """
    nothing = (None, None, None, None, None)
    if scenario_key not in SCENARIO_CONFIG:
        return nothing

    scenario_file = SCENARIO_CONFIG[scenario_key]["file"]
    try:
        with open(scenario_file, "r", encoding="utf-8") as f:
            raw = f.read()
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            # JSON Lines file holding several records: use the first one.
            data = json.loads(next((line for line in raw.splitlines() if line.strip()), ""))
        if not isinstance(data, dict):
            raise ValueError("scenario payload must be a JSON object")
    except FileNotFoundError:
        return nothing
    except (OSError, ValueError) as e:
        print(f"❌ 加载场景失败: {e}")
        return nothing

    base_path = data.get("base_path")
    if not isinstance(base_path, str):
        base_path = ""

    def resolve_audio(field: str):
        """Return an existing path for ``field``, or None when unusable."""
        path = data.get(field)
        if not isinstance(path, str) or not path:
            return None
        # os.path.isabs is portable; a leading "/" test is POSIX-only.
        if not os.path.isabs(path):
            path = os.path.join(base_path, path)
        return path if os.path.exists(path) else None

    return (
        data.get("text", ""),
        resolve_audio("prompt_audio_speaker1"),
        data.get("prompt_text_speaker1", ""),
        resolve_audio("prompt_audio_speaker2"),
        data.get("prompt_text_speaker2", ""),
    )
def load_default_audio():
    """Return the built-in demo dialogue together with both default speaker prompts.

    Returns:
        A 5-tuple ``(dialogue_text, audio1, text1, audio2, text2)``. Each audio
        item is the absolute path of the configured file when it exists on
        disk, otherwise ``None``.
    """
    first, second = DEFAULT_AUDIO_CONFIG["speaker1"], DEFAULT_AUDIO_CONFIG["speaker2"]
    wav1, ref1 = first["audio"], first["text"]
    wav2, ref2 = second["audio"], second["text"]

    # Built-in demo dialogue.
    dialogue = "".join(
        [
            "[S1]大家好,欢迎收听今天的节目,我是主播小雨。",
            "[S2]大家好,我是嘉宾阿明,很高兴和大家见面。",
            "[S1]今天我们要聊的话题非常有趣,相信大家会喜欢的。",
            "[S2]是的,让我们开始今天的精彩内容吧!",
        ]
    )

    # Report which of the configured audio files are present.
    found1, found2 = os.path.exists(wav1), os.path.exists(wav2)
    print(f"🔍 默认音频检查: {wav1}={found1}, {wav2}={found2}")

    # Only files that exist are handed back, as absolute paths.
    resolved1 = None
    if found1:
        resolved1 = os.path.abspath(wav1)
    resolved2 = None
    if found2:
        resolved2 = os.path.abspath(wav2)
    print(f"🎵 返回音频路径: audio1={resolved1}, audio2={resolved2}")

    return dialogue, resolved1, ref1, resolved2, ref2
def initialize_model():
    """Load the tokenizer, model and speech tokenizer once and cache them globally.

    The module-level cache is only written after loading and device placement
    have succeeded, so a failure part-way through cannot leave a
    half-initialised state that later calls would mistake for a ready model.

    Returns:
        Tuple ``(tokenizer, model, spt, device)``; ``device`` is ``"cuda"`` when
        CUDA is available, otherwise ``"cpu"``.

    Raises:
        FileNotFoundError: if the XY_Tokenizer checkpoint is not on disk.
    """
    global tokenizer, model, spt, device

    if tokenizer is not None:
        return tokenizer, model, spt, device

    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🔧 初始化模型,设备: {target_device}")

    if not os.path.exists(SPT_CHECKPOINT_PATH):
        raise FileNotFoundError(
            "未找到 XY_Tokenizer 权重,请检查网络或手动放置到 XY_Tokenizer/weights/xy_tokenizer.ckpt"
        )

    loaded_tokenizer, loaded_model, loaded_spt = load_model(
        MODEL_PATH,
        SPT_CONFIG_PATH,
        SPT_CHECKPOINT_PATH,
    )
    loaded_model = loaded_model.to(target_device)
    loaded_spt = loaded_spt.to(target_device)

    # Default generation parameters; generate_dialogue_audio() overrides most
    # of them per request.
    try:
        gen_cfg = loaded_model.generation_config

        # Cap the default length at 1024. The attribute may exist but be None,
        # in which case min() would raise and skip every setting below.
        current_max = getattr(gen_cfg, "max_new_tokens", None)
        gen_cfg.max_new_tokens = 1024 if current_max is None else min(current_max, 1024)

        gen_cfg.do_sample = True
        gen_cfg.temperature = 1.0
        gen_cfg.top_k = 50
        gen_cfg.top_p = 0.9
        gen_cfg.repetition_penalty = 1.1
        gen_cfg.num_beams = 1  # single beam; with do_sample=True this is sampling, not greedy search
        # NOTE(review): `epsilon` does not look like a standard generation
        # parameter - confirm that setting it has any effect.
        gen_cfg.epsilon = 1e-8
        gen_cfg.pad_token_id = loaded_model.config.eos_token_id
        print(f"🚀 应用稳定生成参数: temp={gen_cfg.temperature}, top_k={gen_cfg.top_k}, top_p={gen_cfg.top_p}")
    except Exception as e:  # noqa: BLE001
        # Best effort: the model stays usable with whatever defaults it has.
        print(f"⚠️ 生成参数设置失败: {e}")

    # Publish the fully prepared objects in one step.
    tokenizer, model, spt, device = loaded_tokenizer, loaded_model, loaded_spt, target_device
    print("✅ 模型初始化完成!")
    return tokenizer, model, spt, device
| # ========================= | |
| # 推理函数(供 UI 调用) | |
| # ========================= | |
| # 减少GPU持续时间,提升响应速度 | |
def generate_dialogue_audio(
    dialogue_text: str,
    speaker1_audio: Optional[str],
    speaker1_text: str,
    speaker2_audio: Optional[str],
    speaker2_text: str,
    use_normalize: bool,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
    max_new_tokens: int = 2048,
    do_sample: bool = True,
) -> Tuple[Optional[str], str]:
    """Synthesize dialogue audio for the UI and describe the outcome.

    Args:
        dialogue_text: dialogue to synthesize.
        speaker1_audio: file path of speaker 1's reference audio, if any.
        speaker1_text: reference text belonging to speaker 1's audio.
        speaker2_audio: file path of speaker 2's reference audio, if any.
        speaker2_text: reference text belonging to speaker 2's audio.
        use_normalize: forwarded to ``process_batch`` as ``use_normalize``.
        temperature: sampling temperature written to the generation config.
        top_k: top-k value written to the generation config (coerced to int).
        top_p: top-p value written to the generation config.
        repetition_penalty: repetition penalty written to the generation config.
        max_new_tokens: generation length limit, capped at 4096.
        do_sample: whether sampling is enabled.

    Returns:
        ``(wav_path, status_message)``. ``wav_path`` is ``None`` whenever input
        validation or generation fails; this function never raises, failures
        are reported through the status message.
    """
    # NOTE(review): `spaces` is imported by this file but no `@spaces.GPU`
    # decorator is applied here - confirm whether one is needed for the
    # hosting hardware.
    try:
        if not dialogue_text or not dialogue_text.strip():
            return None, "❌ 请输入对话文本"

        # At least one reference audio is required.
        if not speaker1_audio and not speaker2_audio:
            return None, "💡 页面应该已自动加载默认音频,如未加载请点击 '🎧 默认音频' 按钮,或上传您自己的参考音频文件!"

        tokenizer, model, spt, device = initialize_model()

        # Apply the user's parameters on top of the defaults.
        print(f"🎛️ 应用用户参数: temp={temperature}, top_k={top_k}, top_p={top_p}, penalty={repetition_penalty}")
        gen_cfg = model.generation_config
        gen_cfg.temperature = temperature
        gen_cfg.top_k = int(top_k)  # UI sliders may deliver floats
        gen_cfg.top_p = top_p
        gen_cfg.repetition_penalty = repetition_penalty
        gen_cfg.max_new_tokens = min(int(max_new_tokens), 4096)  # safety cap
        gen_cfg.do_sample = do_sample

        # Build the batch item: two-speaker prompts or one single prompt.
        item = {"text": dialogue_text}
        if speaker1_audio and speaker2_audio:
            item.update(
                {
                    "prompt_audio_speaker1": speaker1_audio,
                    "prompt_text_speaker1": speaker1_text or "",
                    "prompt_audio_speaker2": speaker2_audio,
                    "prompt_text_speaker2": speaker2_text or "",
                }
            )
        elif speaker1_audio:
            # Single-prompt mode: the text must belong to the audio in use.
            item.update({"prompt_audio": speaker1_audio, "prompt_text": speaker1_text or ""})
        else:
            item.update({"prompt_audio": speaker2_audio, "prompt_text": speaker2_text or ""})

        def synthesize():
            """Run one synthesis pass with the current generation config."""
            return process_batch(
                batch_items=[item],
                tokenizer=tokenizer,
                model=model,
                spt=spt,
                device=device,
                system_prompt=SYSTEM_PROMPT,
                start_idx=0,
                use_normalize=use_normalize,
            )

        try:
            _, audio_results = synthesize()
        except RuntimeError as e:
            if "probability tensor contains" not in str(e):
                raise
            # Sampling failed on an invalid probability tensor: retry once
            # with sampling disabled, then restore the previous setting.
            print("⚠️ 检测到数值不稳定,尝试使用确定性生成...")
            original_do_sample = gen_cfg.do_sample
            gen_cfg.do_sample = False
            try:
                _, audio_results = synthesize()
            finally:
                gen_cfg.do_sample = original_do_sample

        if not audio_results or audio_results[0] is None:
            return None, "❌ 音频生成失败"

        audio_result = audio_results[0]
        # Reserve a file name and close the handle before writing to the path,
        # so no descriptor is leaked.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            out_path = tmp.name
        torchaudio.save(out_path, audio_result["audio_data"], audio_result["sample_rate"])  # type: ignore[index]

        status = (
            f"✅ 生成成功!\n\n"
            f"📊 音频信息:\n"
            f"- 采样率: {audio_result['sample_rate']} Hz\n"
            f"- 时长: {audio_result['audio_data'].shape[-1] / audio_result['sample_rate']:.2f} 秒\n"
            f"- 通道数: {audio_result['audio_data'].shape[0]}\n\n"
            f"📝 文本处理:\n"
            f"- 是否规范化: {use_normalize}\n"
        )
        return out_path, status
    except Exception as e:  # noqa: BLE001
        # UI boundary: report the failure in the status box instead of raising.
        import traceback

        return None, f"❌ 生成出错: {e}\n\n{traceback.format_exc()}"
| # ========================= | |
| # UI 构建 | |
| # ========================= | |
def create_space_ui() -> gr.Blocks:
    """Build the Gradio Blocks UI and wire its event handlers.

    The left column holds the dialogue text, scenario loader, per-speaker
    reference inputs and generation settings; the right column holds the
    generated audio, a status box and usage instructions.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` app.
    """
    # NOTE(review): the exact nesting of layout containers was reconstructed
    # from a copy of this file that had lost its indentation - verify the
    # grouping against the rendered page.

    # Dark theme stylesheet.
    custom_css = """
    /* 全局样式 - Apple暗色风格 */
    .gradio-container {
    max-width: 1400px !important;
    margin: 0 auto !important;
    font-family: -apple-system, BlinkMacSystemFont, 'SF Pro Display', system-ui, sans-serif !important;
    background: #0d1117 !important;
    --primary-color: #0969da;
    --primary-hover: #1f6feb;
    --surface-color: #161b22;
    --surface-secondary: #21262d;
    --border-color: #30363d;
    --border-secondary: #21262d;
    --text-primary: #f0f6fc;
    --text-secondary: #8b949e;
    --text-muted: #656d76;
    --success-color: #238636;
    --shadow: 0 4px 16px rgba(0,0,0,0.4);
    --shadow-elevated: 0 8px 32px rgba(0,0,0,0.6);
    --radius: 12px;
    }
    /* 主标题区域 */
    .header {
    text-align: center;
    margin-bottom: 2rem;
    background: var(--surface-color);
    padding: 3rem 2rem;
    border-radius: var(--radius);
    color: var(--text-primary);
    box-shadow: var(--shadow);
    border: 1px solid var(--border-color);
    }
    .header h1 {
    font-size: 2.75rem;
    margin: 0 0 0.5rem 0;
    font-weight: 700;
    letter-spacing: -0.02em;
    color: var(--text-primary);
    }
    .header p {
    font-size: 1.1rem;
    margin: 0;
    color: var(--text-secondary);
    font-weight: 400;
    }
    /* 卡片组件 - 暗色主题 */
    .section {
    background: var(--surface-color);
    border-radius: var(--radius);
    padding: 2rem;
    border: 1px solid var(--border-color);
    margin: 1rem 0;
    box-shadow: var(--shadow);
    transition: all 0.2s ease;
    }
    .section:hover {
    box-shadow: var(--shadow-elevated);
    transform: translateY(-2px);
    border-color: var(--primary-color);
    }
    /* 按钮样式 - 暗色主题 */
    .quick-btn {
    background: var(--primary-color) !important;
    border: none !important;
    color: var(--text-primary) !important;
    font-weight: 600 !important;
    border-radius: var(--radius) !important;
    padding: 0.875rem 2rem !important;
    transition: all 0.2s ease !important;
    }
    .quick-btn:hover {
    background: var(--primary-hover) !important;
    transform: translateY(-1px) !important;
    box-shadow: 0 8px 24px rgba(9,105,218,0.4) !important;
    }
    .generate-btn {
    background: var(--primary-color) !important;
    border: none !important;
    color: var(--text-primary) !important;
    font-weight: 700 !important;
    font-size: 1.1rem !important;
    border-radius: var(--radius) !important;
    padding: 1rem 2rem !important;
    width: 100% !important;
    transition: all 0.2s ease !important;
    box-shadow: var(--shadow) !important;
    }
    .generate-btn:hover {
    background: var(--primary-hover) !important;
    transform: translateY(-2px) !important;
    box-shadow: var(--shadow-elevated) !important;
    }
    .speaker-section {
    background: var(--surface-secondary);
    padding: 1.5rem;
    border-radius: var(--radius);
    border: 1px solid var(--border-color);
    }
    /* Gradio 组件暗色主题覆盖 */
    .gradio-container .gr-textbox,
    .gradio-container .gr-textarea,
    .gradio-container .gr-dropdown,
    .gradio-container .gr-audio,
    .gradio-container .gr-slider,
    .gradio-container .gr-checkbox,
    .gradio-container .gr-accordion {
    background: var(--surface-color) !important;
    border: 1px solid var(--border-color) !important;
    color: var(--text-primary) !important;
    border-radius: var(--radius) !important;
    }
    .gradio-container .gr-textbox:focus,
    .gradio-container .gr-textarea:focus,
    .gradio-container .gr-dropdown:focus {
    border-color: var(--primary-color) !important;
    box-shadow: 0 0 0 3px rgba(9,105,218,0.2) !important;
    }
    /* 文本和标签暗色主题 */
    .gradio-container .gr-markdown,
    .gradio-container .gr-markdown *,
    .gradio-container label,
    .gradio-container p,
    .gradio-container span {
    color: var(--text-primary) !important;
    }
    .gradio-container .gr-markdown code {
    background: var(--surface-secondary) !important;
    color: var(--text-primary) !important;
    border-radius: 4px !important;
    padding: 2px 6px !important;
    }
    /* 按钮统一暗色主题 */
    .gradio-container .gr-button {
    background: var(--surface-color) !important;
    border: 1px solid var(--border-color) !important;
    color: var(--text-primary) !important;
    border-radius: var(--radius) !important;
    }
    .gradio-container .gr-button:hover {
    background: var(--surface-secondary) !important;
    border-color: var(--primary-color) !important;
    }
    .gradio-container .gr-button.primary {
    background: var(--primary-color) !important;
    border: none !important;
    color: var(--text-primary) !important;
    }
    .gradio-container .gr-button.primary:hover {
    background: var(--primary-hover) !important;
    }
    """

    def _speaker_inputs(index: int):
        """Create the reference-audio and reference-text widgets for one speaker.

        Must be called inside the layout context the widgets belong to. The
        configured default clip is pre-loaded when its file exists; if anything
        goes wrong while applying the defaults, empty widgets are created.
        """
        audio_kwargs = {"label": "参考音频", "type": "filepath"}
        text_kwargs = {
            "label": "参考文本",
            "lines": 2,
            "placeholder": "请输入与参考音频内容完全匹配的文本...",
        }
        try:
            defaults = DEFAULT_AUDIO_CONFIG[f"speaker{index}"]
            default_audio, default_ref_text = defaults["audio"], defaults["text"]
            if os.path.exists(default_audio):
                audio = gr.Audio(value=default_audio, **audio_kwargs)
            else:
                audio = gr.Audio(**audio_kwargs)
            text = gr.TextArea(value=default_ref_text, **text_kwargs)
        except Exception as e:  # noqa: BLE001
            print(f"⚠️ 无法预设说话者{index}默认内容: {e}")
            audio = gr.Audio(**audio_kwargs)
            text = gr.TextArea(**text_kwargs)
        return audio, text

    # NOTE(review): "dark" may not be a built-in Gradio theme name - confirm
    # what Gradio does with it at startup.
    with gr.Blocks(css=custom_css, title="🎙️ MOSS-TTSD | Hugging Face Space", theme="dark") as demo:
        gr.HTML(
            """
            <div class="header">
            <h1>🎙️ MOSS-TTSD 对话语音合成</h1>
            <p>零样本双说话者对话合成 · 默认中文界面 · 一键加载场景</p>
            </div>
            """
        )
        with gr.Row():
            # Left column: inputs.
            with gr.Column(scale=3):
                with gr.Group():
                    gr.Markdown("### 📝 对话文本")
                    # Pre-filled so the page can synthesize without any typing.
                    default_text = (
                        "[S1]大家好,欢迎收听今天的节目,我是主播小雨。"
                        "[S2]大家好,我是嘉宾阿明,很高兴和大家见面。"
                        "[S1]今天我们要聊的话题非常有趣,相信大家会喜欢的。"
                        "[S2]是的,让我们开始今天的精彩内容吧!"
                    )
                    dialogue_text = gr.TextArea(
                        label="",
                        lines=6,
                        placeholder="请输入对话内容,使用[S1]/[S2]标记不同说话者...",
                        value=default_text,
                    )
                with gr.Group():
                    gr.Markdown("### 🚀 快速操作")
                    # Choices come from the static scenario table (not from the
                    # data files), so the list is stable and cannot drift from
                    # the titles that get_scenario_examples() uses as keys.
                    predefined_scenarios = [
                        "🎧 默认示例",
                        *(config["title"] for config in SCENARIO_CONFIG.values()),
                    ]
                    scenario_dropdown = gr.Dropdown(
                        choices=predefined_scenarios,
                        value=predefined_scenarios[0],
                        label="🎭 选择场景",
                        info="选择一个预设场景,获取不同主题的对话文本",
                    )
                    with gr.Row():
                        btn_load_scenario = gr.Button("📝 加载场景文本", variant="secondary")
                        btn_load_default = gr.Button("🎧 加载默认音频", variant="secondary")
                with gr.Row():
                    with gr.Group():
                        gr.Markdown("### 🎵 说话者1 (女声)")
                        speaker1_audio, speaker1_text = _speaker_inputs(1)
                    with gr.Group():
                        gr.Markdown("### 🎵 说话者2 (男声)")
                        speaker2_audio, speaker2_text = _speaker_inputs(2)
                with gr.Group():
                    gr.Markdown("### ⚙️ 基础设置")
                    with gr.Row():
                        use_normalize = gr.Checkbox(label="✅ 文本标准化(推荐)", value=True)
                # Advanced parameters, collapsed by default.
                with gr.Accordion("🎛️ 高级参数设置", open=False):
                    gr.Markdown("**🎯 生成风格控制** - 根据需要调整参数以获得不同的语音风格")
                    # Style preset selector.
                    with gr.Row():
                        style_preset = gr.Dropdown(
                            label="🎨 预设风格",
                            choices=["轻松对话", "新闻播报", "娱乐节目", "教育讲解", "自定义"],
                            value="轻松对话",
                            interactive=True,
                        )
                    gr.Markdown("**⚙️ 自定义参数** - 微调生成效果")
                    with gr.Row():
                        with gr.Column():
                            temperature = gr.Slider(
                                minimum=0.5,
                                maximum=1.5,
                                value=1.0,
                                step=0.1,
                                label="🌡️ 语气温度",
                                info="控制语气自然度 (0.5=稳定, 1.0=自然, 1.5=活泼)",
                            )
                            top_k = gr.Slider(
                                minimum=20,
                                maximum=100,
                                value=50,
                                step=10,
                                label="🔝 词汇多样性",
                                info="控制词汇选择范围",
                            )
                        with gr.Column():
                            top_p = gr.Slider(
                                minimum=0.7,
                                maximum=1.0,
                                value=0.9,
                                step=0.05,
                                label="🎯 表达流畅度",
                                info="控制表达的连贯性",
                            )
                            repetition_penalty = gr.Slider(
                                minimum=1.0,
                                maximum=1.3,
                                value=1.1,
                                step=0.05,
                                label="🔄 重复避免",
                                info="避免重复表达的强度",
                            )
                    with gr.Row():
                        max_new_tokens = gr.Slider(
                            minimum=512,
                            maximum=2048,
                            value=1024,
                            step=128,
                            label="📏 最大生成长度",
                            info="控制生成音频的长度 (512=快速, 1024=平衡, 2048=完整)",
                        )
                        do_sample = gr.Checkbox(
                            label="🎲 启用采样",
                            value=True,
                            info="关闭后使用确定性生成,更稳定但缺乏变化",
                        )
                btn_generate = gr.Button("🎬 开始合成", variant="primary", size="lg")
                gr.Markdown("💡 **开箱即用**: 页面已预填充默认内容,可直接合成 | **生成优化**: 预计20-40秒完成")
            # Right column: output and instructions.
            with gr.Column(scale=2):
                with gr.Group():
                    gr.Markdown("### 🎧 生成结果")
                    output_audio = gr.Audio(label="生成的音频", type="filepath")
                    status_info = gr.TextArea(label="状态信息", lines=12, interactive=False)
                with gr.Group():
                    gr.Markdown("### 📚 使用说明")
                    gr.Markdown(
                        """
                        **🎯 快速开始:**
                        1. 【文本】选择场景并点击"📝 加载场景文本",或自己输入对话文本
                        2. 【音频】点击"🎧 加载默认音频"使用示例音频,或上传自己的参考音频
                        3. 【参考文本】确保参考文本与音频内容完全匹配
                        4. 【设置】勾选"文本标准化",可选调整高级参数
                        5. 【生成】点击"🎬 开始合成"
                        **📝 格式要求:**
                        - 使用 `[S1]`/`[S2]` 标记不同说话者
                        - 参考文本需与参考音频内容完全匹配
                        - 支持上传两个参考音频(双说话者)或一个(单说话者)
                        **🎵 音频建议:**
                        - 格式: WAV, MP3, FLAC
                        - 时长: 10-30秒最佳
                        - 质量: 清晰无背景噪音
                        - 语速: 自然正常语速
                        **💡 提示:**
                        - 文本标准化开启可提升质量(数字、标点等处理更稳定)
                        - 文本尽量短句、自然口语化
                        - 生成时间根据文本长度而定,请耐心等待
                        """
                    )

        # ===== Event handlers =====
        def on_load_scenario(name: str):
            """Replace the dialogue text with the selected scenario's text.

            Only the dialogue text changes; the reference audio is untouched.
            """
            if not name or not name.strip():
                gr.Warning("⚠️ 请先选择一个场景")
                return gr.update()
            scenarios = get_scenario_examples()
            if name not in scenarios:
                # gr.Error is an exception: it is only shown when raised.
                raise gr.Error(f"❌ 场景不存在: {name}")
            gr.Info(f"✅ 成功加载场景: {name} (仅更换对话文本)")
            return scenarios[name].get("text", "")

        def on_load_default():
            """Load the default dialogue text and both default speaker prompts."""
            try:
                result = load_default_audio()
            except Exception as e:  # noqa: BLE001
                raise gr.Error(f"❌ 加载默认音频时出错: {str(e)}") from e
            gr.Info("✅ 成功加载默认音频和文本")
            return result

        # Slider values applied by each style preset.
        STYLE_PRESETS = {
            "轻松对话": {"temperature": 1.0, "top_k": 50, "top_p": 0.9, "repetition_penalty": 1.1},
            "新闻播报": {"temperature": 0.8, "top_k": 30, "top_p": 0.85, "repetition_penalty": 1.05},
            "娱乐节目": {"temperature": 1.2, "top_k": 80, "top_p": 0.95, "repetition_penalty": 1.15},
            "教育讲解": {"temperature": 0.9, "top_k": 40, "top_p": 0.88, "repetition_penalty": 1.08},
            "自定义": {"temperature": 1.0, "top_k": 50, "top_p": 0.9, "repetition_penalty": 1.1},
        }

        def on_style_preset_change(preset_name):
            """Move the four parameter sliders to the chosen preset's values."""
            if preset_name in STYLE_PRESETS:
                params = STYLE_PRESETS[preset_name]
                return (
                    gr.update(value=params["temperature"]),
                    gr.update(value=params["top_k"]),
                    gr.update(value=params["top_p"]),
                    gr.update(value=params["repetition_penalty"]),
                )
            return gr.update(), gr.update(), gr.update(), gr.update()

        # Wire up the events.
        style_preset.change(
            fn=on_style_preset_change,
            inputs=[style_preset],
            outputs=[temperature, top_k, top_p, repetition_penalty],
        )
        btn_load_scenario.click(
            fn=on_load_scenario,
            inputs=[scenario_dropdown],
            outputs=[dialogue_text],  # only the dialogue text is updated
        )
        btn_load_default.click(
            fn=on_load_default,
            outputs=[dialogue_text, speaker1_audio, speaker1_text, speaker2_audio, speaker2_text],
        )
        btn_generate.click(
            fn=generate_dialogue_audio,
            inputs=[
                dialogue_text, speaker1_audio, speaker1_text, speaker2_audio, speaker2_text,
                use_normalize, temperature, top_k, top_p, repetition_penalty, max_new_tokens, do_sample,
            ],
            outputs=[output_audio, status_info],
            show_progress=True,
        )
    return demo
# Module-level app object, built at import time so HF Spaces can load it directly.
demo = create_space_ui()


def main():
    """Enable request queuing (at most 16 waiting requests) and start the app."""
    queued_app = demo.queue(max_size=16)
    queued_app.launch()


if __name__ == "__main__":
    main()