File size: 5,317 Bytes
cff9535
 
 
b8a79bd
416263d
95ba447
0ce42bd
cff9535
 
 
 
9ab094a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ce42bd
9ab094a
0ce42bd
416263d
cff9535
2bf87e7
 
0ce42bd
2bf87e7
 
 
 
 
 
 
0ce42bd
2bf87e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416263d
2bf87e7
 
cff9535
b8a79bd
2bf87e7
 
b8a79bd
 
 
2bf87e7
b8a79bd
 
 
2bf87e7
cff9535
2bf87e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cff9535
a22eb82
cff9535
416263d
a663a58
2bf87e7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os, sys
import tempfile
import gradio as gr
from app_tts import infer_tts
from src.gradio_demo import SadTalker  
# from src.utils.text2speech import TTSTalker
from huggingface_hub import snapshot_download

def get_source_image(image):
    """Identity passthrough used as a Gradio callback: echo the chosen image path."""
    return image

# Detect whether we are running as an extension inside stable-diffusion-webui.
# Narrowed from a bare `except:` — that form also swallows SystemExit and
# KeyboardInterrupt; only a failed import should mean "not in webui".
try:
    import webui  # importable only when hosted inside webui
    in_webui = True
except ImportError:
    in_webui = False


def toggle_audio_file(choice):
    """Swap visibility between the audio-upload widget and the TTS widget.

    Parameters
    ----------
    choice : bool
        Checkbox state; unchecked shows the first widget, checked the second.

    Returns
    -------
    tuple
        Gradio visibility updates for (upload widget, TTS widget).
    """
    # PEP 8: never compare to False with `==`; the checkbox yields a plain bool.
    if not choice:
        return gr.update(visible=True), gr.update(visible=False)
    return gr.update(visible=False), gr.update(visible=True)
    
def ref_video_fn(path_of_ref_video):
    """Auto-tick the 'use reference video' checkbox when a video is supplied."""
    has_video = path_of_ref_video is not None
    return gr.update(value=has_video)
    
def download_model():
    """Fetch the SadTalker V002rc checkpoints into ./checkpoints.

    NOTE(review): `local_dir_use_symlinks` is deprecated in recent
    huggingface_hub releases — confirm the pinned version still accepts it.
    """
    repo_id = 'vinthony/SadTalker-V002rc'
    snapshot_download(
        repo_id=repo_id,
        local_dir='./checkpoints',
        local_dir_use_symlinks=True,
    )


# New: merged the two buttons into one — the generated audio is the input for the video step
import soundfile as sf

def generate_voice_and_video(ref_audio, ref_text, gen_text, speed, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every):
    """Run TTS then SadTalker in one step.

    Synthesizes speech for `gen_text` in the voice of `ref_audio`, writes the
    waveform to a temporary WAV file, then animates `source_image` speaking it.

    Returns
    -------
    tuple[str, str]
        (path to the generated WAV file, path to the generated video).
    """
    # 1. Synthesize audio from text with the TTS model.
    (final_sample_rate, final_wave), _ = infer_tts(ref_audio, ref_text, gen_text, speed)

    # Write the waveform to a temp file SadTalker can read by path.
    # mkstemp + close avoids the NamedTemporaryFile(delete=False) pitfall:
    # the still-open handle blocks re-opening the file on Windows and leaks
    # a file descriptor on every call elsewhere.
    fd, tmp_audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    sf.write(tmp_audio_path, final_wave, final_sample_rate)

    # 2. Drive SadTalker with the freshly generated audio.
    sad_talker = SadTalker(lazy_load=True)
    video_path = sad_talker.test(
        source_image,
        tmp_audio_path,
        preprocess_type,
        is_still_mode,
        enhancer,
        batch_size,
        size_of_image,
        pose_style,
        facerender,
        exp_weight,
        use_ref_video,
        ref_video,
        ref_info,
        use_idle_mode,
        length_of_audio,
        blink_every
    )
    return tmp_audio_path, video_path

def sadtalker_demo():
    """Build the Gradio Blocks UI that chains F5-TTS speech synthesis into
    SadTalker video generation.

    Downloads the SadTalker checkpoints first, then lays out the inputs
    (voice sample, texts, source image, SadTalker options), a single
    generate button, and the audio/video outputs.

    Returns
    -------
    gr.Blocks
        The assembled (not yet launched) interface.
    """
    # Ensure checkpoints exist before the UI is served.
    download_model()
    with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
        gr.Markdown("""
# 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis & SadTalker Video
# Nhập text, upload sample voice và ảnh để tạo video nói chuyện.
""")
        with gr.Row():
            ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
            ref_text = gr.Textbox(label="📝 Reference Transcript (optional)", placeholder="Nhập transcript tiếng Việt cho sample voice nếu có...", lines=2)
            gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
        speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
        with gr.Row():
            source_image = gr.Image(label="Source image", type="filepath", elem_id="img2img_image")
        with gr.Row():
            # Settings for SadTalker
            with gr.Column():
                preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
                is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
                enhancer = gr.Checkbox(label="GFPGAN as Face enhancer")
                batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
                size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?")
                pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0)
                facerender = gr.Radio(['facevid2vid','pirender'], value='facevid2vid', label='facerender', info="which face render?")
                exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
                use_ref_video = gr.Checkbox(label="Use Reference Video")
                ref_video = gr.Video(label="Reference Video", elem_id="vidref")
                ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video',info="How to borrow from reference Video?((fully transfer, aka, video driving mode))")
                use_idle_mode = gr.Checkbox(label="Use Idle Animation")
                length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.")
                blink_every = gr.Checkbox(label="use eye blink", value=True)
        btn_generate = gr.Button("🔥 Generate Voice & Video")
        with gr.Row():
            output_audio = gr.Audio(label="🎧 Generated Audio", type="filepath")
            gen_video = gr.Video(label="Generated video", format="mp4", scale=1)
        # One click runs TTS + SadTalker; the input list must match the
        # parameter order of generate_voice_and_video exactly.
        btn_generate.click(
            generate_voice_and_video,
            inputs=[ref_audio, ref_text, gen_text, speed, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every],
            outputs=[output_audio, gen_video]
        )
    return sadtalker_interface

if __name__ == "__main__":
    # Build the interface, enable request queueing, and serve on all interfaces.
    interface = sadtalker_demo()
    interface.queue(max_size=10, api_open=True)
    interface.launch(debug=True, server_name="0.0.0.0")