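"""Gradio Space: F5-TTS Vietnamese text-to-speech chained with SadTalker.

The user supplies a sample voice (with an optional transcript), the text to
speak, and a source image; a single button first synthesizes the speech and
then animates the image into a talking-head video driven by that audio.
"""
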
import os, sys
import tempfile

import gradio as gr
import soundfile as sf

from app_tts import infer_tts
from src.gradio_demo import SadTalker
# from src.utils.text2speech import TTSTalker
from huggingface_hub import snapshot_download


def get_source_image(image):
    return image

try:
    import webui  # in webui
    in_webui = True
except ImportError:
    in_webui = False

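
# UI callbacks carried over from the original SadTalker demo; they are not
# wired to any component in the Blocks layout below.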
def toggle_audio_file(choice):
    if not choice:
        return gr.update(visible=True), gr.update(visible=False)
    else:
        return gr.update(visible=False), gr.update(visible=True)

def ref_video_fn(path_of_ref_video):
    if path_of_ref_video is not None:
        return gr.update(value=True)
    else:
        return gr.update(value=False)

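
# Fetch the SadTalker checkpoints from the Hugging Face Hub into ./checkpoints
# before the UI is built.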
def download_model():
    REPO_ID = 'vinthony/SadTalker-V002rc'
    snapshot_download(repo_id=REPO_ID, local_dir='./checkpoints', local_dir_use_symlinks=True)


# New: the two buttons are merged into one; the generated audio is fed straight into the video step.
def generate_voice_and_video(ref_audio, ref_text, gen_text, speed, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every):
    # 1. Synthesize the audio with the TTS model
    (final_sample_rate, final_wave), _ = infer_tts(ref_audio, ref_text, gen_text, speed)

    # Write the waveform to a temporary file
    tmp_audio = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    sf.write(tmp_audio.name, final_wave, final_sample_rate)

    # 2. Run SadTalker on the freshly generated audio
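    # Note: a new SadTalker instance is created per request; lazy_load=True is
    # expected to defer checkpoint loading until test() runs.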
    sad_talker = SadTalker(lazy_load=True)
    video_path = sad_talker.test(
        source_image,
        tmp_audio.name,
        preprocess_type,
        is_still_mode,
        enhancer,
        batch_size,
        size_of_image,
        pose_style,
        facerender,
        exp_weight,
        use_ref_video,
        ref_video,
        ref_info,
        use_idle_mode,
        length_of_audio,
        blink_every
    )

    return tmp_audio.name, video_path


def sadtalker_demo():
    download_model()

    with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
        gr.Markdown("""
        # 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis & SadTalker Video
        Enter text, then upload a sample voice and a source image to generate a talking video.
        """)
        with gr.Row():
            ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
            ref_text = gr.Textbox(label="📝 Reference Transcript (optional)", placeholder="Enter the Vietnamese transcript of the sample voice, if available...", lines=2)
            gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
            speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
        with gr.Row():
            source_image = gr.Image(label="Source image", type="filepath", elem_id="img2img_image")
        with gr.Row():
            # SadTalker settings
            with gr.Column():
                preprocess_type = gr.Radio(['crop', 'resize', 'full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle the input image?")
                is_still_mode = gr.Checkbox(label="Still Mode (less head motion, works with preprocess `full`)")
                enhancer = gr.Checkbox(label="GFPGAN as Face enhancer")
                batch_size = gr.Slider(label="batch size in generation", step=1, minimum=1, maximum=10, value=1)
                size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="Use the 256 or 512 face model?")
                pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0)
                facerender = gr.Radio(['facevid2vid', 'pirender'], value='facevid2vid', label='facerender', info="Which face renderer?")
                exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
                use_ref_video = gr.Checkbox(label="Use Reference Video")
                ref_video = gr.Video(label="Reference Video", elem_id="vidref")
                ref_info = gr.Radio(['pose', 'blink', 'pose+blink', 'all'], value='pose', label='Reference Video', info="What to borrow from the reference video? ('all' transfers it fully, i.e. video driving mode)")
                use_idle_mode = gr.Checkbox(label="Use Idle Animation")
                length_of_audio = gr.Number(value=5, label="Length (seconds) of the generated video")
                blink_every = gr.Checkbox(label="use eye blink", value=True)

        btn_generate = gr.Button("🔥 Generate Voice & Video")

        with gr.Row():
            output_audio = gr.Audio(label="🎧 Generated Audio", type="filepath")
            gen_video = gr.Video(label="Generated video", format="mp4", scale=1)

        btn_generate.click(
            generate_voice_and_video,
            inputs=[ref_audio, ref_text, gen_text, speed, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every],
            outputs=[output_audio, gen_video]
        )

    return sadtalker_interface

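
# queue() caps the number of pending requests and keeps the REST API open;
# server_name="0.0.0.0" makes the app reachable from outside the container.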
if __name__ == "__main__":
    demo = sadtalker_demo()
    demo.queue(max_size=10, api_open=True)
    demo.launch(debug=True, server_name="0.0.0.0")
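
# A minimal client-side sketch (commented out, not part of the app), assuming
# gradio_client is installed and <SPACE_URL> points at this running app; with
# api_name unset, the endpoint defaults to the wired function's name,
# i.e. "/generate_voice_and_video":
#
#   from gradio_client import Client, handle_file
#   client = Client("<SPACE_URL>")
#   audio_path, video_path = client.predict(
#       handle_file("sample_voice.wav"),  # ref_audio
#       "",                               # ref_text (optional transcript)
#       "Xin chào!",                      # gen_text
#       1.0,                              # speed
#       handle_file("face.png"),          # source_image
#       "crop", False, False, 1, 256, 0,  # preprocess, still, enhancer, batch, size, pose
#       "facevid2vid", 1, False, None,    # facerender, exp_weight, use_ref_video, ref_video
#       "pose", False, 5, True,           # ref_info, use_idle_mode, length_of_audio, blink_every
#       api_name="/generate_voice_and_video",
#   )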