File size: 10,859 Bytes
cff9535
 
 
b8a79bd
dc78718
 
95ba447
0ce42bd
dc78718
 
 
 
 
cff9535
 
9ab094a
 
dc78718
9ab094a
 
 
 
 
 
 
 
 
 
dc78718
 
9ab094a
 
 
 
 
dc78718
 
0ce42bd
dc78718
 
 
 
 
 
 
 
 
 
1dce2dd
 
dc78718
 
416263d
cff9535
2bf87e7
 
0ce42bd
dc78718
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3fd3c7
1dce2dd
a3fd3c7
 
 
 
1dce2dd
 
a3fd3c7
 
2bf87e7
 
 
a3fd3c7
1dce2dd
2bf87e7
a3fd3c7
 
 
 
1dce2dd
 
a3fd3c7
 
2bf87e7
0ce42bd
2bf87e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc78718
2bf87e7
a3fd3c7
 
 
 
1dce2dd
 
a3fd3c7
e066130
 
 
 
 
 
416263d
dc78718
2bf87e7
 
dc78718
 
 
 
 
 
 
e578b02
dc78718
e578b02
dc78718
2bf87e7
dc78718
 
 
 
e578b02
dc78718
 
 
 
 
e578b02
dc78718
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e578b02
 
 
dc78718
 
 
a3fd3c7
dc78718
1dce2dd
 
 
 
 
 
e578b02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd5587a
 
e578b02
1dce2dd
e578b02
 
 
 
 
 
 
 
fd5587a
 
e578b02
e066130
 
 
 
 
e578b02
1dce2dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cff9535
a22eb82
dc78718
cff9535
416263d
a663a58
2bf87e7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
import os, sys
import tempfile
import gradio as gr
from app_tts import infer_tts
from src.gradio_demo import SadTalker

# from src.utils.text2speech import TTSTalker
from huggingface_hub import snapshot_download
import glob


def get_source_image(image):
    """Return ``image`` unchanged (identity passthrough)."""
    source = image
    return source


# Detect whether this module is running inside a host that provides a
# ``webui`` module; ``in_webui`` records the outcome.
try:
    import webui  # in webui
except ImportError:
    # Only a failed import means "not in webui". A bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit raised while importing.
    in_webui = False
else:
    in_webui = True


def toggle_audio_file(choice):
    """Return two Gradio updates with opposite visibility, driven by ``choice``.

    When ``choice`` compares equal to ``False`` the first update is visible
    and the second hidden; for any other value the visibility is swapped.
    """
    # The equality test (rather than ``not choice``) is deliberate: values
    # such as ``None`` or ``""`` do not compare equal to ``False``.
    show_first = bool(choice == False)  # noqa: E712
    return gr.update(visible=show_first), gr.update(visible=not show_first)


def ref_video_fn(path_of_ref_video):
    """Return a Gradio update whose value is ``True`` iff a reference video is set.

    "Set" means ``path_of_ref_video`` is not ``None``.
    """
    has_reference = path_of_ref_video is not None
    return gr.update(value=has_reference)


def download_model():
    """Download the ``vinthony/SadTalker-V002rc`` snapshot into ``./checkpoints``.

    Delegates to ``huggingface_hub.snapshot_download``; the return value of
    that call is discarded.
    """
    download_options = {
        "repo_id": "vinthony/SadTalker-V002rc",
        "local_dir": "./checkpoints",
        "local_dir_use_symlinks": True,
    }
    snapshot_download(**download_options)


def list_videos():
    """Return the paths of all ``.mp4`` files found under ``results``.

    The search is recursive. Paths are relative to the current working
    directory and ordered by descending string comparison of the path
    (not by modification time).
    """
    results_root = "results"
    # Collect every mp4 file below the results folder.
    found = glob.glob(f"{results_root}/**/*.mp4", recursive=True)
    # Reverse lexicographic order of the path strings.
    found.sort(reverse=True)
    return found


# New: the two buttons are merged into one; the generated audio is used as the input for video generation
import soundfile as sf


def generate_voice_and_video(
    ref_audio,
    ref_text,
    gen_text,
    speed,
    source_image,
    preprocess_type,
    is_still_mode,
    enhancer,
    batch_size,
    size_of_image,
    pose_style,
    facerender,
    exp_weight,
    use_ref_video,
    ref_video,
    ref_info,
    use_idle_mode,
    length_of_audio,
    blink_every,
):
    """Generate speech with TTS, then a talking-head video from that speech.

    This is a generator intended to be used as a Gradio event handler. It
    yields three progress snapshots, each a 4-tuple of ``gr.update`` objects
    in the order (generated audio, generated video, status text, video
    history choices):

    1. before TTS starts (both media outputs cleared),
    2. after the audio file has been written (audio filled in),
    3. after the video has been rendered (audio and video filled in, and the
       new video selected in the history choices).

    Args:
        ref_audio, ref_text, gen_text, speed: Forwarded unchanged to
            ``infer_tts``.
        source_image and all remaining parameters: Forwarded unchanged, in
            order, to ``SadTalker.test`` together with the generated audio
            path.

    Yields:
        tuple: Four ``gr.update`` objects as described above.
    """
    # Start: show the "generating audio" status.
    yield (
        gr.update(value=None, visible=True, interactive=False),
        gr.update(value=None, visible=True, interactive=False),
        gr.update(value="⏳ Đang tạo âm thanh...", visible=True),
        gr.update(choices=list_videos()),
    )

    # 1. Generate audio with TTS.
    (final_sample_rate, final_wave), _ = infer_tts(ref_audio, ref_text, gen_text, speed)
    # Reserve a unique .wav path and close the handle immediately: the file
    # is kept on disk (delete=False) and soundfile reopens it by name, so
    # leaving the handle open would leak a descriptor on every request.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    sf.write(audio_path, final_wave, final_sample_rate)

    # Audio is ready; move on to video generation.
    yield (
        gr.update(value=audio_path, visible=True, interactive=True),
        gr.update(value=None, visible=True, interactive=False),
        gr.update(value="⏳ Đang tạo video...", visible=True),
        gr.update(choices=list_videos()),
    )

    # 2. Run SadTalker on the freshly generated audio.
    sad_talker = SadTalker(lazy_load=True)
    video_path = sad_talker.test(
        source_image,
        audio_path,
        preprocess_type,
        is_still_mode,
        enhancer,
        batch_size,
        size_of_image,
        pose_style,
        facerender,
        exp_weight,
        use_ref_video,
        ref_video,
        ref_info,
        use_idle_mode,
        length_of_audio,
        blink_every,
    )

    # Both audio and video are done.
    yield (
        gr.update(value=audio_path, visible=True, interactive=True),
        gr.update(value=video_path, visible=True, interactive=True),
        gr.update(value="✅ Hoàn thành!", visible=True),
        gr.update(choices=list_videos(), value=video_path),
    )
def list_files(directory):
    """Return the entries of ``directory`` as a newline-separated string.

    Any exception raised while listing or joining is caught and its message
    is returned as the result instead.
    """
    try:
        return "\n".join(os.listdir(directory))
    except Exception as err:
        return str(err)


def sadtalker_demo():
    """Build the Gradio Blocks interface for the TTS + SadTalker demo.

    Calls ``download_model()`` first, then lays out three tabs:

    * a creation tab (inputs, advanced SadTalker settings, generate button
      and outputs),
    * a history tab (dropdown of videos from ``list_videos()`` and a player),
    * a debug tab (directory listing via ``list_files``).

    The generate button is wired to ``generate_voice_and_video``.

    Returns:
        The constructed ``gr.Blocks`` object; it is not launched here.
    """
    download_model()
    with gr.Blocks(
        analytics_enabled=False,
    ) as sadtalker_interface:
        gr.Markdown(
            """
    ![logo](https://vietnam.atalink.com/favicon.ico)

    ## Atalink TTS_Talker

    Nhập text, upload sample voice và ảnh để tạo video nói chuyện.
        """
        )
        with gr.Tab("Tạo video mới"):
            # --- TTS inputs -------------------------------------------------
            with gr.Row(elem_classes="gr-row"):
                ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
                ref_text = gr.Textbox(
                    label="📝 Nội dung tham khảo (tùy chọn)",
                    placeholder="Nhập transcript tiếng Việt cho sample voice nếu có...",
                    lines=2,
                )
            with gr.Row(elem_classes="gr-row"):
                gen_text = gr.Textbox(
                    label="📝 Nội dung cần tạo",
                    placeholder="Nhập nội dung để tạo giọng nói...",
                    lines=3,
                )
                speed = gr.Slider(
                    0.3,
                    2.0,
                    value=1.0,
                    step=0.1,
                    label="⚡ Tốc độ nói",
                    info="Chỉnh tốc độ phát âm",
                )
            # --- Source image -----------------------------------------------
            with gr.Row(elem_classes="gr-row"):
                source_image = gr.Image(
                    label="Ảnh nguồn", type="filepath", elem_id="img2img_image"
                )
            # --- Advanced SadTalker settings (collapsed by default) ---------
            with gr.Accordion(
                "Cài đặt nâng cao SadTalker", open=False, elem_classes="gr-button"
            ):
                with gr.Row(elem_classes="gr-row"):
                    preprocess_type = gr.Radio(
                        ["crop", "resize", "full", "extcrop", "extfull"],
                        value="crop",
                        label="Tiền xử lý ảnh",
                        info="Cách xử lý ảnh đầu vào?",
                    )
                    is_still_mode = gr.Checkbox(
                        label="Chế độ tĩnh (ít chuyển động đầu)"
                    )
                    enhancer = gr.Checkbox(label="Dùng GFPGAN làm đẹp mặt")
                    batch_size = gr.Slider(
                        label="Batch size", step=1, maximum=10, value=1
                    )
                    size_of_image = gr.Radio(
                        [256, 512],
                        value=256,
                        label="Độ phân giải khuôn mặt",
                        info="Dùng model 256/512?",
                    )
                with gr.Row(elem_classes="gr-row"):
                    pose_style = gr.Slider(
                        minimum=0, maximum=45, step=1, label="Kiểu pose", value=0
                    )
                    facerender = gr.Radio(
                        ["facevid2vid", "pirender"],
                        value="facevid2vid",
                        label="Face render",
                        info="Chọn kiểu render mặt",
                    )
                    exp_weight = gr.Slider(
                        minimum=0,
                        maximum=3,
                        step=0.1,
                        label="Biên độ biểu cảm",
                        value=1,
                    )
                    use_ref_video = gr.Checkbox(label="Dùng video tham chiếu")
                    ref_video = gr.Video(
                        label="Video tham chiếu",
                        elem_id="vidref",
                        height=120,
                        width=120,
                    )
                    ref_info = gr.Radio(
                        ["pose", "blink", "pose+blink", "all"],
                        value="pose",
                        label="Tham chiếu",
                        info="Cách lấy thông tin từ video tham chiếu?",
                    )
                    use_idle_mode = gr.Checkbox(label="Idle Animation")
                    length_of_audio = gr.Number(value=5, label="Độ dài video (giây)")
                    blink_every = gr.Checkbox(label="Chớp mắt", value=True)
            # The button starts disabled; it is enabled by ``enable_generate``
            # once the required inputs are present.
            btn_generate = gr.Button(
                "🔥 Tạo giọng nói & video", elem_id="btn-generate", interactive=False
            )
            # --- Outputs ----------------------------------------------------
            with gr.Row(elem_classes="gr-row"):
                output_audio = gr.Audio(label="🎧 Audio đã tạo", type="filepath")
                gen_video = gr.Video(
                    label="Video đã tạo", format="mp4", scale=1, width=180
                )
                status_box = gr.Textbox(
                    label="Trạng thái tiến trình",
                    interactive=False,
                    value="",
                    visible=True,
                )

            def enable_generate(audio, text, image):
                """Enable the generate button only when all three inputs are truthy."""
                return gr.update(interactive=bool(audio and text and image))

            # Re-evaluate the button state whenever any required input changes.
            required_inputs = [ref_audio, gen_text, source_image]
            for required_input in required_inputs:
                required_input.change(enable_generate, required_inputs, btn_generate)

        with gr.Tab("Lịch sử video"):
            with gr.Row(elem_classes="gr-row"):
                refresh_btn = gr.Button("🔄 Refresh File List")

                # Scan the results folder once so the default value and the
                # choices come from the same snapshot (separate scans could
                # disagree if files change in between).
                existing_videos = list_videos()
                video_list = gr.Dropdown(
                    value=existing_videos[0] if existing_videos else None,
                    choices=existing_videos,
                    label="Chọn video để xem",
                    interactive=True,
                    scale=1,
                )
                video_player = gr.Video(
                    height=180, width=180, label="Video lịch sử", scale=1
                )

            refresh_btn.click(fn=lambda: gr.update(choices=list_videos()), outputs=video_list)
            # Selecting an entry plays it: the chosen path is passed straight
            # to the player.
            video_list.change(lambda x: x, inputs=video_list, outputs=video_player)
        with gr.Tab("Debug"):
            # NOTE(review): this lists any directory path typed by the user
            # on the machine running the app; consider removing or restricting
            # it before exposing the app publicly.
            directory_input = gr.Textbox(label="Enter Directory Path", value=".")
            file_list_output = gr.Textbox(label="Files", lines=10)

            directory_input.change(fn=list_files, inputs=directory_input, outputs=file_list_output)

        # The input order must match the parameter order of
        # ``generate_voice_and_video``; the output order must match the
        # tuples it yields.
        btn_generate.click(
            generate_voice_and_video,
            inputs=[
                ref_audio,
                ref_text,
                gen_text,
                speed,
                source_image,
                preprocess_type,
                is_still_mode,
                enhancer,
                batch_size,
                size_of_image,
                pose_style,
                facerender,
                exp_weight,
                use_ref_video,
                ref_video,
                ref_info,
                use_idle_mode,
                length_of_audio,
                blink_every,
            ],
            outputs=[output_audio, gen_video, status_box, video_list],
        )
    return sadtalker_interface


if __name__ == "__main__":
    # Build the UI, enable the request queue (at most 10 waiting requests,
    # API endpoints open), then serve on all network interfaces in debug mode.
    demo = sadtalker_demo()
    demo.queue(max_size=10, api_open=True).launch(
        debug=True,
        server_name="0.0.0.0",
    )