Spaces:
Runtime error
Runtime error
File size: 5,317 Bytes
cff9535 b8a79bd 416263d 95ba447 0ce42bd cff9535 9ab094a 0ce42bd 9ab094a 0ce42bd 416263d cff9535 2bf87e7 0ce42bd 2bf87e7 0ce42bd 2bf87e7 416263d 2bf87e7 cff9535 b8a79bd 2bf87e7 b8a79bd 2bf87e7 b8a79bd 2bf87e7 cff9535 2bf87e7 cff9535 a22eb82 cff9535 416263d a663a58 2bf87e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import os, sys
import tempfile
import gradio as gr
from app_tts import infer_tts
from src.gradio_demo import SadTalker
# from src.utils.text2speech import TTSTalker
from huggingface_hub import snapshot_download
def get_source_image(image):
    """Return ``image`` unchanged (identity pass-through)."""
    return image
# Record whether a ``webui`` module is importable in this environment.
# Only ImportError is caught: a bare ``except`` would also swallow
# KeyboardInterrupt/SystemExit and hide unrelated failures.
try:
    import webui  # in webui
except ImportError:
    in_webui = False
else:
    in_webui = True
def toggle_audio_file(choice):
    """Return two Gradio updates with complementary visibility.

    The first update is visible (and the second hidden) when ``choice``
    compares equal to ``False``; otherwise the visibility is swapped.
    """
    # Equality (not truthiness) is kept on purpose so that values such as
    # ``None`` fall into the "second component visible" branch.
    first_visible = bool(choice == False)  # noqa: E712
    return gr.update(visible=first_visible), gr.update(visible=not first_visible)
def ref_video_fn(path_of_ref_video):
    """Return a Gradio update whose value reflects whether a path was given.

    The update carries ``value=True`` when ``path_of_ref_video`` is not
    ``None`` and ``value=False`` otherwise.
    """
    has_reference = path_of_ref_video is not None
    return gr.update(value=has_reference)
def download_model():
    """Download the ``vinthony/SadTalker-V002rc`` snapshot into ``./checkpoints``."""
    snapshot_download(
        repo_id='vinthony/SadTalker-V002rc',
        local_dir='./checkpoints',
        local_dir_use_symlinks=True,
    )
# New: merge the two buttons into one; the generated audio is the input for the video
import soundfile as sf
def generate_voice_and_video(ref_audio, ref_text, gen_text, speed, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every):
    """Synthesize speech with TTS, then drive a SadTalker video with it.

    ``ref_audio``, ``ref_text``, ``gen_text`` and ``speed`` are forwarded to
    ``infer_tts``. The generated waveform is written to a temporary ``.wav``
    file, and that file's path is passed to ``SadTalker.test`` together with
    ``source_image`` and the remaining arguments, in the order given here.

    Returns:
        A ``(audio_path, video_path)`` tuple: the path of the temporary
        ``.wav`` file and the value returned by ``SadTalker.test``.
    """
    # 1. Generate the audio with TTS.
    (final_sample_rate, final_wave), _ = infer_tts(ref_audio, ref_text, gen_text, speed)

    # Save it to a temporary file. The handle is closed right away so the
    # descriptor is not leaked on every request and so the path can be
    # reopened for writing on platforms that lock open files. The file itself
    # is kept (delete=False) because its path is returned to the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    sf.write(audio_path, final_wave, final_sample_rate)

    # 2. Call SadTalker with the audio that was just generated.
    sad_talker = SadTalker(lazy_load=True)
    video_path = sad_talker.test(
        source_image,
        audio_path,
        preprocess_type,
        is_still_mode,
        enhancer,
        batch_size,
        size_of_image,
        pose_style,
        facerender,
        exp_weight,
        use_ref_video,
        ref_video,
        ref_info,
        use_idle_mode,
        length_of_audio,
        blink_every,
    )
    return audio_path, video_path
def sadtalker_demo():
    """Build the combined TTS + SadTalker Gradio interface.

    Downloads the model checkpoints first, then creates the voice inputs,
    the source image input, the SadTalker settings and the output widgets,
    and wires the single generate button to ``generate_voice_and_video``.

    NOTE(review): the nesting of the layout containers below was
    reconstructed from a copy of this file that had lost its indentation;
    confirm the row/column membership against the intended layout.

    Returns:
        The ``gr.Blocks`` interface (not launched).
    """
    download_model()

    with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
        gr.Markdown(
            "\n"
            "# 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis & SadTalker Video\n"
            "# Nhập text, upload sample voice và ảnh để tạo video nói chuyện.\n"
        )

        # Voice-cloning inputs.
        with gr.Row():
            ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
            ref_text = gr.Textbox(
                label="📝 Reference Transcript (optional)",
                placeholder="Nhập transcript tiếng Việt cho sample voice nếu có...",
                lines=2,
            )
            gen_text = gr.Textbox(
                label="📝 Text",
                placeholder="Enter the text to generate voice...",
                lines=3,
            )
            speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")

        # Face image to animate.
        with gr.Row():
            source_image = gr.Image(
                label="Source image",
                type="filepath",
                elem_id="img2img_image",
            )

        with gr.Row():
            # SadTalker settings.
            with gr.Column():
                preprocess_type = gr.Radio(
                    ['crop', 'resize', 'full', 'extcrop', 'extfull'],
                    value='crop',
                    label='preprocess',
                    info="How to handle input image?",
                )
                is_still_mode = gr.Checkbox(
                    label="Still Mode (fewer head motion, works with preprocess `full`)"
                )
                enhancer = gr.Checkbox(label="GFPGAN as Face enhancer")
                batch_size = gr.Slider(
                    label="batch size in generation",
                    step=1,
                    maximum=10,
                    value=1,
                )
                size_of_image = gr.Radio(
                    [256, 512],
                    value=256,
                    label='face model resolution',
                    info="use 256/512 model?",
                )
                pose_style = gr.Slider(
                    minimum=0,
                    maximum=45,
                    step=1,
                    label="Pose style",
                    value=0,
                )
                facerender = gr.Radio(
                    ['facevid2vid', 'pirender'],
                    value='facevid2vid',
                    label='facerender',
                    info="which face render?",
                )
                exp_weight = gr.Slider(
                    minimum=0,
                    maximum=3,
                    step=0.1,
                    label="expression scale",
                    value=1,
                )
                use_ref_video = gr.Checkbox(label="Use Reference Video")
                ref_video = gr.Video(label="Reference Video", elem_id="vidref")
                ref_info = gr.Radio(
                    ['pose', 'blink', 'pose+blink', 'all'],
                    value='pose',
                    label='Reference Video',
                    info="How to borrow from reference Video?((fully transfer, aka, video driving mode))",
                )
                use_idle_mode = gr.Checkbox(label="Use Idle Animation")
                length_of_audio = gr.Number(
                    value=5,
                    label="The length(seconds) of the generated video.",
                )
                blink_every = gr.Checkbox(label="use eye blink", value=True)

        btn_generate = gr.Button("🔥 Generate Voice & Video")

        # Results of the combined pipeline.
        with gr.Row():
            output_audio = gr.Audio(label="🎧 Generated Audio", type="filepath")
            gen_video = gr.Video(label="Generated video", format="mp4", scale=1)

        # Order must match the positional parameters of
        # generate_voice_and_video.
        pipeline_inputs = [
            ref_audio,
            ref_text,
            gen_text,
            speed,
            source_image,
            preprocess_type,
            is_still_mode,
            enhancer,
            batch_size,
            size_of_image,
            pose_style,
            facerender,
            exp_weight,
            use_ref_video,
            ref_video,
            ref_info,
            use_idle_mode,
            length_of_audio,
            blink_every,
        ]
        pipeline_outputs = [output_audio, gen_video]
        btn_generate.click(
            generate_voice_and_video,
            inputs=pipeline_inputs,
            outputs=pipeline_outputs,
        )

    return sadtalker_interface
if __name__ == "__main__":
    # Build the interface, enable the request queue (at most 10 pending
    # requests, API open), then serve on all network interfaces in debug mode.
    demo = sadtalker_demo()
    demo.queue(api_open=True, max_size=10)
    demo.launch(server_name="0.0.0.0", debug=True)
|