Spaces:

atalink
/

TTS-Talker

Runtime error

File size: 10,859 Bytes

cff9535
 
 
b8a79bd
dc78718
 
95ba447
0ce42bd
dc78718
 
 
 
 
cff9535
 
9ab094a
 
dc78718
9ab094a
 
 
 
 
 
 
 
 
 
dc78718
 
9ab094a
 
 
 
 
dc78718
 
0ce42bd
dc78718
 
 
 
 
 
 
 
 
 
1dce2dd
 
dc78718
 
416263d
cff9535
2bf87e7
 
0ce42bd
dc78718
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3fd3c7
1dce2dd
a3fd3c7
 
 
 
1dce2dd
 
a3fd3c7
 
2bf87e7
 
 
a3fd3c7
1dce2dd
2bf87e7
a3fd3c7
 
 
 
1dce2dd
 
a3fd3c7
 
2bf87e7
0ce42bd
2bf87e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc78718
2bf87e7
a3fd3c7
 
 
 
1dce2dd
 
a3fd3c7
e066130
 
 
 
 
 
416263d
dc78718
2bf87e7
 
dc78718
 
 
 
 
 
 
e578b02
dc78718
e578b02
dc78718
2bf87e7
dc78718
 
 
 
e578b02
dc78718
 
 
 
 
e578b02
dc78718
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e578b02
 
 
dc78718
 
 
a3fd3c7
dc78718
1dce2dd
 
 
 
 
 
e578b02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd5587a
 
e578b02
1dce2dd
e578b02
 
 
 
 
 
 
 
fd5587a
 
e578b02
e066130
 
 
 
 
e578b02
1dce2dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cff9535
a22eb82
dc78718
cff9535
416263d
a663a58
2bf87e7

import os, sys
import tempfile
import gradio as gr
from app_tts import infer_tts
from src.gradio_demo import SadTalker

# from src.utils.text2speech import TTSTalker
from huggingface_hub import snapshot_download
import glob


def get_source_image(image):
    """Return ``image`` unchanged (identity pass-through)."""
    return image


# Probe for an importable ``webui`` module and record the outcome in
# ``in_webui``.
try:
    import webui  # noqa: F401  (imported only to test availability)
except Exception:
    # Any failure while importing counts as "webui not available". Unlike the
    # previous bare ``except:``, this no longer swallows KeyboardInterrupt or
    # SystemExit raised during start-up.
    in_webui = False
else:
    in_webui = True


def toggle_audio_file(choice):
    """Return a pair of visibility updates that are always opposite.

    The first update is visible exactly when ``choice == False``; the second
    is visible in every other case.
    """
    # Deliberately an equality test (not ``not choice``): None must take the
    # "other case" branch, exactly as before.
    first_visible = bool(choice == False)  # noqa: E712
    return gr.update(visible=first_visible), gr.update(visible=not first_visible)


def ref_video_fn(path_of_ref_video):
    """Return an update whose value is True iff a reference video path is set.

    Only ``None`` counts as "not set"; any other value yields ``True``.
    """
    has_reference = path_of_ref_video is not None
    return gr.update(value=has_reference)


def download_model():
    """Download the ``vinthony/SadTalker-V002rc`` snapshot into ``./checkpoints``.

    Delegates to ``huggingface_hub.snapshot_download``; the return value of
    that call is discarded.
    """
    snapshot_options = {
        "repo_id": "vinthony/SadTalker-V002rc",
        "local_dir": "./checkpoints",
        "local_dir_use_symlinks": True,
    }
    snapshot_download(**snapshot_options)


def list_videos():
    """Return all ``.mp4`` paths found under ``results``, sorted descending.

    The search is recursive and relative to the current working directory.
    Ordering is a plain reverse string sort of the paths, not a sort by
    modification time.
    """
    results_dir = "results"
    # Collect every mp4 file at any depth below the results folder.
    found = glob.glob(f"{results_dir}/**/*.mp4", recursive=True)
    # Reverse lexicographic order of the path strings.
    found.sort(reverse=True)
    return found


# New: the two buttons are merged into one; the generated audio is used as the input for the video step.
import soundfile as sf


def generate_voice_and_video(
    ref_audio,
    ref_text,
    gen_text,
    speed,
    source_image,
    preprocess_type,
    is_still_mode,
    enhancer,
    batch_size,
    size_of_image,
    pose_style,
    facerender,
    exp_weight,
    use_ref_video,
    ref_video,
    ref_info,
    use_idle_mode,
    length_of_audio,
    blink_every,
):
    """Generate speech with TTS, then a talking-head video driven by it.

    This is a generator: it yields three progress snapshots, each a 4-tuple
    of ``gr.update`` objects. In ``sadtalker_demo`` the click handler maps
    them, in order, onto the audio output, the video output, the status text
    box and the video-history dropdown.

    Args:
        ref_audio, ref_text, gen_text, speed: Passed unchanged to
            ``infer_tts``.
        source_image, preprocess_type, is_still_mode, enhancer, batch_size,
        size_of_image, pose_style, facerender, exp_weight, use_ref_video,
        ref_video, ref_info, use_idle_mode, length_of_audio, blink_every:
            Passed unchanged and positionally to ``SadTalker.test``, with the
            generated audio path inserted right after ``source_image``.

    Yields:
        1. Audio and video cleared, status "creating audio".
        2. Audio set to the generated ``.wav`` path, status "creating video".
        3. Audio and video both set, status "done", dropdown refreshed and
           pointed at the new video.

    Any exception raised by ``infer_tts``, ``soundfile.write`` or
    ``SadTalker.test`` propagates to the caller.
    """
    # Function-scope imports retained from the original implementation so the
    # handler does not depend on the module-level aliases.
    import gradio as gr
    import soundfile as sf

    # Start: show that audio generation is in progress.
    yield (
        gr.update(value=None, visible=True, interactive=False),
        gr.update(value=None, visible=True, interactive=False),
        gr.update(value="⏳ Đang tạo âm thanh...", visible=True),
        gr.update(choices=list_videos()),
    )

    # 1. Synthesize the audio with TTS.
    (final_sample_rate, final_wave), _ = infer_tts(ref_audio, ref_text, gen_text, speed)

    # Reserve a unique .wav path and close the handle right away. The file is
    # kept (delete=False) because soundfile reopens it by name; leaving the
    # handle open leaked one descriptor per request.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    sf.write(audio_path, final_wave, final_sample_rate)

    # Audio is ready; move on to video generation.
    yield (
        gr.update(value=audio_path, visible=True, interactive=True),
        gr.update(value=None, visible=True, interactive=False),
        gr.update(value="⏳ Đang tạo video...", visible=True),
        gr.update(choices=list_videos()),
    )

    # 2. Run SadTalker on the freshly generated audio.
    sad_talker = SadTalker(lazy_load=True)
    video_path = sad_talker.test(
        source_image,
        audio_path,
        preprocess_type,
        is_still_mode,
        enhancer,
        batch_size,
        size_of_image,
        pose_style,
        facerender,
        exp_weight,
        use_ref_video,
        ref_video,
        ref_info,
        use_idle_mode,
        length_of_audio,
        blink_every,
    )

    # Both audio and video are finished.
    yield (
        gr.update(value=audio_path, visible=True, interactive=True),
        gr.update(value=video_path, visible=True, interactive=True),
        gr.update(value="✅ Hoàn thành!", visible=True),
        gr.update(choices=list_videos(), value=video_path),
    )
def list_files(directory):
    """Return the entries of ``directory`` as a newline-separated string.

    Entries appear in the order ``os.listdir`` returns them. If listing or
    joining fails for any reason, the exception's message is returned as the
    result instead of being raised.
    """
    try:
        return "\n".join(os.listdir(directory))
    except Exception as err:
        # Best-effort debug helper: report the failure as text.
        return str(err)


def sadtalker_demo():
    """Build the Gradio Blocks UI for the TTS + SadTalker demo.

    Calls ``download_model()`` first, then lays out three tabs:

    * a creation tab with the TTS inputs, the source image, the advanced
      SadTalker settings, the generate button and the audio/video/status
      outputs;
    * a history tab with a dropdown of videos from ``list_videos()`` and a
      player;
    * a debug tab that shows the result of ``list_files`` for a typed path.

    The generate button starts disabled and is enabled only while the sample
    voice, the text to synthesize and the source image are all truthy.

    Returns:
        The constructed ``gr.Blocks`` object. It is not queued or launched
        here.
    """
    download_model()
    with gr.Blocks(
        analytics_enabled=False,
    ) as sadtalker_interface:
        gr.Markdown(
            """
    ![logo](https://vietnam.atalink.com/favicon.ico)

    ## Atalink TTS_Talker

    Nhập text, upload sample voice và ảnh để tạo video nói chuyện.
        """
        )
        with gr.Tab("Tạo video mới"):
            # --- TTS inputs ---
            with gr.Row(elem_classes="gr-row"):
                ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
                ref_text = gr.Textbox(
                    label="📝 Nội dung tham khảo (tùy chọn)",
                    placeholder="Nhập transcript tiếng Việt cho sample voice nếu có...",
                    lines=2,
                )
            with gr.Row(elem_classes="gr-row"):
                gen_text = gr.Textbox(
                    label="📝 Nội dung cần tạo",
                    placeholder="Nhập nội dung để tạo giọng nói...",
                    lines=3,
                )
                speed = gr.Slider(
                    0.3,
                    2.0,
                    value=1.0,
                    step=0.1,
                    label="⚡ Tốc độ nói",
                    info="Chỉnh tốc độ phát âm",
                )
            # --- Source image ---
            with gr.Row(elem_classes="gr-row"):
                source_image = gr.Image(
                    label="Ảnh nguồn", type="filepath", elem_id="img2img_image"
                )
            # --- Advanced SadTalker settings (collapsed by default) ---
            with gr.Accordion(
                "Cài đặt nâng cao SadTalker", open=False, elem_classes="gr-button"
            ):
                with gr.Row(elem_classes="gr-row"):
                    preprocess_type = gr.Radio(
                        ["crop", "resize", "full", "extcrop", "extfull"],
                        value="crop",
                        label="Tiền xử lý ảnh",
                        info="Cách xử lý ảnh đầu vào?",
                    )
                    is_still_mode = gr.Checkbox(
                        label="Chế độ tĩnh (ít chuyển động đầu)"
                    )
                    enhancer = gr.Checkbox(label="Dùng GFPGAN làm đẹp mặt")
                    batch_size = gr.Slider(
                        label="Batch size", step=1, maximum=10, value=1
                    )
                    size_of_image = gr.Radio(
                        [256, 512],
                        value=256,
                        label="Độ phân giải khuôn mặt",
                        info="Dùng model 256/512?",
                    )
                with gr.Row(elem_classes="gr-row"):
                    pose_style = gr.Slider(
                        minimum=0, maximum=45, step=1, label="Kiểu pose", value=0
                    )
                    facerender = gr.Radio(
                        ["facevid2vid", "pirender"],
                        value="facevid2vid",
                        label="Face render",
                        info="Chọn kiểu render mặt",
                    )
                    exp_weight = gr.Slider(
                        minimum=0,
                        maximum=3,
                        step=0.1,
                        label="Biên độ biểu cảm",
                        value=1,
                    )
                    use_ref_video = gr.Checkbox(label="Dùng video tham chiếu")
                    ref_video = gr.Video(
                        label="Video tham chiếu",
                        elem_id="vidref",
                        height=120,
                        width=120,
                    )
                    ref_info = gr.Radio(
                        ["pose", "blink", "pose+blink", "all"],
                        value="pose",
                        label="Tham chiếu",
                        info="Cách lấy thông tin từ video tham chiếu?",
                    )
                    use_idle_mode = gr.Checkbox(label="Idle Animation")
                    length_of_audio = gr.Number(value=5, label="Độ dài video (giây)")
                    blink_every = gr.Checkbox(label="Chớp mắt", value=True)
            # Disabled until all required inputs are present (see below).
            btn_generate = gr.Button(
                "🔥 Tạo giọng nói & video", elem_id="btn-generate", interactive=False
            )
            # --- Outputs ---
            with gr.Row(elem_classes="gr-row"):
                output_audio = gr.Audio(label="🎧 Audio đã tạo", type="filepath")
                gen_video = gr.Video(
                    label="Video đã tạo", format="mp4", scale=1, width=180
                )
                status_box = gr.Textbox(
                    label="Trạng thái tiến trình",
                    interactive=False,
                    value="",
                    visible=True,
                )

            def enable_generate(audio, text, image):
                """Enable the generate button only when all three inputs are truthy."""
                return gr.update(interactive=bool(audio and text and image))

            # Re-evaluate the button state whenever any required input changes.
            required_inputs = [ref_audio, gen_text, source_image]
            for component in (ref_audio, gen_text, source_image):
                component.change(enable_generate, required_inputs, btn_generate)

        with gr.Tab("Lịch sử video"):
            with gr.Row(elem_classes="gr-row"):
                refresh_btn = gr.Button("🔄 Refresh File List")

                # Scan the results folder once, so the default value is always
                # taken from the same listing as the choices.
                existing_videos = list_videos()
                video_list = gr.Dropdown(
                    value=existing_videos[0] if existing_videos else None,
                    choices=existing_videos,
                    label="Chọn video để xem",
                    interactive=True,
                    scale=1,
                )
                video_player = gr.Video(
                    height=180, width=180, label="Video lịch sử", scale=1
                )

            # Refresh re-scans the folder; selecting an entry loads it in the player.
            refresh_btn.click(fn=lambda: gr.update(choices=list_videos()), outputs=video_list)
            video_list.change(lambda x: x, inputs=video_list, outputs=video_player)
        with gr.Tab("Debug"):
            directory_input = gr.Textbox(label="Enter Directory Path", value=".")
            file_list_output = gr.Textbox(label="Files", lines=10)

            directory_input.change(fn=list_files, inputs=directory_input, outputs=file_list_output)

        # The input order must match the parameter order of
        # generate_voice_and_video; the output order must match the tuples it yields.
        btn_generate.click(
            generate_voice_and_video,
            inputs=[
                ref_audio,
                ref_text,
                gen_text,
                speed,
                source_image,
                preprocess_type,
                is_still_mode,
                enhancer,
                batch_size,
                size_of_image,
                pose_style,
                facerender,
                exp_weight,
                use_ref_video,
                ref_video,
                ref_info,
                use_idle_mode,
                length_of_audio,
                blink_every,
            ],
            outputs=[output_audio, gen_video, status_box, video_list],
        )
    return sadtalker_interface


# Script entry point: build the interface, configure its queue, then launch it.
if __name__ == "__main__":
    demo = sadtalker_demo()
    # Queue is configured with max_size=10 and api_open=True.
    demo.queue(max_size=10, api_open=True)
    # NOTE(review): server_name="0.0.0.0" presumably exposes the server on all
    # network interfaces, and debug=True is enabled — confirm both are
    # intended for the deployed Space.
    demo.launch(debug=True, server_name="0.0.0.0")