Spaces:

atalink
/

TTS-Talker

Runtime error

App Files Files Community

TTS-Talker / app.py

congcuong-cse

Merge branch 'main' of hf.co:spaces/longtrinhquang/TTS-Talker

34f063d 3 months ago

raw

history blame contribute delete

10.9 kB

	import os, sys
	import tempfile
	import gradio as gr
	from app_tts import infer_tts
	from src.gradio_demo import SadTalker

	# from src.utils.text2speech import TTSTalker
	from huggingface_hub import snapshot_download
	import glob


	def get_source_image(image):
	    """Return *image* unchanged (identity pass-through)."""
	    passthrough = image
	    return passthrough


	# Probe for an importable ``webui`` module and record the outcome in the
	# module-level flag ``in_webui``.
	try:
	    import webui  # noqa: F401  # imported only to test availability

	    in_webui = True
	except Exception:
	    # Best-effort probe: any import-time failure means "not in webui".
	    # ``Exception`` (instead of a bare ``except``) still lets
	    # KeyboardInterrupt and SystemExit propagate.
	    in_webui = False


	def toggle_audio_file(choice):
	    """Return two Gradio visibility updates that are always opposite.

	    When ``choice == False`` the first update is visible and the second is
	    hidden; for every other value the two are swapped.
	    """
	    # Equality (not ``not choice``) is kept so that values such as ``None``
	    # or ``""`` still select the second branch.
	    show_first = bool(choice == False)  # noqa: E712
	    return gr.update(visible=show_first), gr.update(visible=not show_first)


	def ref_video_fn(path_of_ref_video):
	    """Return a Gradio update whose value says whether a path was supplied.

	    The value is ``True`` when *path_of_ref_video* is not ``None`` and
	    ``False`` otherwise.
	    """
	    has_reference = path_of_ref_video is not None
	    return gr.update(value=has_reference)


	def download_model():
	    """Download the ``vinthony/SadTalker-V002rc`` repo into ``./checkpoints``.

	    Delegates entirely to ``huggingface_hub.snapshot_download``; nothing is
	    returned.
	    """
	    snapshot_download(
	        repo_id="vinthony/SadTalker-V002rc",
	        local_dir="./checkpoints",
	        local_dir_use_symlinks=True,
	    )


	def list_videos(results_dir="results"):
	    """Return every ``.mp4`` file found anywhere under *results_dir*.

	    Args:
	        results_dir: Directory to search. Defaults to ``"results"``,
	            relative to the current working directory.

	    Returns:
	        A list of path strings sorted in reverse lexicographic order. The
	        list is empty when the directory is missing or holds no videos.
	    """
	    # ``**`` is what makes ``recursive=True`` effective: it matches zero or
	    # more nested directories. The previous pattern ``*/.mp4`` only matched
	    # files literally named ".mp4" exactly one level down.
	    pattern = f"{results_dir}/**/*.mp4"
	    video_files = glob.glob(pattern, recursive=True)
	    # Reverse name order; this is not a modification-time sort.
	    return sorted(video_files, reverse=True)


	# New: Gộp 2 nút thành 1, output audio là input cho video
	import soundfile as sf


	def generate_voice_and_video(
	    ref_audio,
	    ref_text,
	    gen_text,
	    speed,
	    source_image,
	    preprocess_type,
	    is_still_mode,
	    enhancer,
	    batch_size,
	    size_of_image,
	    pose_style,
	    facerender,
	    exp_weight,
	    use_ref_video,
	    ref_video,
	    ref_info,
	    use_idle_mode,
	    length_of_audio,
	    blink_every,
	):
	    """Synthesize speech, then render a talking-head video from that audio.

	    This is a generator: it yields three progress snapshots so the UI can
	    be refreshed while the work is running.

	    Args:
	        ref_audio, ref_text, gen_text, speed: Passed unchanged to
	            ``infer_tts``.
	        source_image: Passed to ``SadTalker.test`` as its first argument,
	            immediately followed by the path of the generated audio.
	        preprocess_type, is_still_mode, enhancer, batch_size, size_of_image,
	        pose_style, facerender, exp_weight, use_ref_video, ref_video,
	        ref_info, use_idle_mode, length_of_audio, blink_every: Passed
	            unchanged, in this order, to ``SadTalker.test``.

	    Yields:
	        4-tuples of ``gr.update`` objects: (audio output, video output,
	        status text, video-history choices). The snapshots are emitted
	        before TTS starts, after the audio file is written, and after the
	        video is produced.
	    """
	    # Start: show the "generating audio" status and clear both outputs.
	    yield (
	        gr.update(value=None, visible=True, interactive=False),
	        gr.update(value=None, visible=True, interactive=False),
	        gr.update(value="⏳ Đang tạo âm thanh...", visible=True),
	        gr.update(choices=list_videos()),
	    )

	    # 1. Generate the audio with TTS.
	    (final_sample_rate, final_wave), _ = infer_tts(ref_audio, ref_text, gen_text, speed)

	    # Reserve a unique .wav path and close the handle immediately. It used to
	    # stay open for the life of the process, which leaks a descriptor per call
	    # and prevents writing by name on Windows. ``delete=False`` keeps the file
	    # on disk because its path is handed to the UI and to SadTalker below.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
	        audio_path = tmp_audio.name
	    sf.write(audio_path, final_wave, final_sample_rate)

	    # Audio is ready; switch the status to "generating video".
	    yield (
	        gr.update(value=audio_path, visible=True, interactive=True),
	        gr.update(value=None, visible=True, interactive=False),
	        gr.update(value="⏳ Đang tạo video...", visible=True),
	        gr.update(choices=list_videos()),
	    )

	    # 2. Run SadTalker on the audio that was just generated.
	    sad_talker = SadTalker(lazy_load=True)
	    video_path = sad_talker.test(
	        source_image,
	        audio_path,
	        preprocess_type,
	        is_still_mode,
	        enhancer,
	        batch_size,
	        size_of_image,
	        pose_style,
	        facerender,
	        exp_weight,
	        use_ref_video,
	        ref_video,
	        ref_info,
	        use_idle_mode,
	        length_of_audio,
	        blink_every,
	    )

	    # Both audio and video are done; also select the new video in the history.
	    yield (
	        gr.update(value=audio_path, visible=True, interactive=True),
	        gr.update(value=video_path, visible=True, interactive=True),
	        gr.update(value="✅ Hoàn thành!", visible=True),
	        gr.update(choices=list_videos(), value=video_path),
	    )
	def list_files(directory):
	    """Return the entry names of *directory*, one per line.

	    If listing fails for any reason, the exception's message is returned as
	    the result instead of being raised.
	    """
	    try:
	        listing = "\n".join(os.listdir(directory))
	    except Exception as err:
	        return str(err)
	    return listing


	def sadtalker_demo():
	    """Build the Gradio interface with its three tabs and event wiring.

	    Calls ``download_model()`` first, then creates a ``gr.Blocks`` layout
	    with a generation tab, a video-history tab and a debug tab.

	    Returns:
	        The ``gr.Blocks`` instance. It is not queued or launched here.
	    """
	    download_model()
	    with gr.Blocks(
	        analytics_enabled=False,
	    ) as sadtalker_interface:
	        gr.Markdown(
	            """
	![logo](https://vietnam.atalink.com/favicon.ico)

	## Atalink TTS_Talker

	Nhập text, upload sample voice và ảnh để tạo video nói chuyện.
	"""
	        )

	        # ---- Tab 1: create a new video -------------------------------------
	        with gr.Tab("Tạo video mới"):
	            with gr.Row(elem_classes="gr-row"):
	                ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
	                ref_text = gr.Textbox(
	                    label="📝 Nội dung tham khảo (tùy chọn)",
	                    placeholder="Nhập transcript tiếng Việt cho sample voice nếu có...",
	                    lines=2,
	                )
	            with gr.Row(elem_classes="gr-row"):
	                gen_text = gr.Textbox(
	                    label="📝 Nội dung cần tạo",
	                    placeholder="Nhập nội dung để tạo giọng nói...",
	                    lines=3,
	                )
	                speed = gr.Slider(
	                    0.3,
	                    2.0,
	                    value=1.0,
	                    step=0.1,
	                    label="⚡ Tốc độ nói",
	                    info="Chỉnh tốc độ phát âm",
	                )
	            with gr.Row(elem_classes="gr-row"):
	                source_image = gr.Image(
	                    label="Ảnh nguồn", type="filepath", elem_id="img2img_image"
	                )
	            with gr.Accordion(
	                "Cài đặt nâng cao SadTalker", open=False, elem_classes="gr-button"
	            ):
	                with gr.Row(elem_classes="gr-row"):
	                    preprocess_type = gr.Radio(
	                        ["crop", "resize", "full", "extcrop", "extfull"],
	                        value="crop",
	                        label="Tiền xử lý ảnh",
	                        info="Cách xử lý ảnh đầu vào?",
	                    )
	                    is_still_mode = gr.Checkbox(
	                        label="Chế độ tĩnh (ít chuyển động đầu)"
	                    )
	                    enhancer = gr.Checkbox(label="Dùng GFPGAN làm đẹp mặt")
	                    batch_size = gr.Slider(
	                        label="Batch size", step=1, maximum=10, value=1
	                    )
	                    size_of_image = gr.Radio(
	                        [256, 512],
	                        value=256,
	                        label="Độ phân giải khuôn mặt",
	                        info="Dùng model 256/512?",
	                    )
	                with gr.Row(elem_classes="gr-row"):
	                    pose_style = gr.Slider(
	                        minimum=0, maximum=45, step=1, label="Kiểu pose", value=0
	                    )
	                    facerender = gr.Radio(
	                        ["facevid2vid", "pirender"],
	                        value="facevid2vid",
	                        label="Face render",
	                        info="Chọn kiểu render mặt",
	                    )
	                    exp_weight = gr.Slider(
	                        minimum=0,
	                        maximum=3,
	                        step=0.1,
	                        label="Biên độ biểu cảm",
	                        value=1,
	                    )
	                    use_ref_video = gr.Checkbox(label="Dùng video tham chiếu")
	                    ref_video = gr.Video(
	                        label="Video tham chiếu",
	                        elem_id="vidref",
	                        height=120,
	                        width=120,
	                    )
	                    ref_info = gr.Radio(
	                        ["pose", "blink", "pose+blink", "all"],
	                        value="pose",
	                        label="Tham chiếu",
	                        info="Cách lấy thông tin từ video tham chiếu?",
	                    )
	                    use_idle_mode = gr.Checkbox(label="Idle Animation")
	                    length_of_audio = gr.Number(value=5, label="Độ dài video (giây)")
	                    blink_every = gr.Checkbox(label="Chớp mắt", value=True)

	            # The button starts disabled; enable_generate() below turns it on.
	            btn_generate = gr.Button(
	                "🔥 Tạo giọng nói & video", elem_id="btn-generate", interactive=False
	            )
	            with gr.Row(elem_classes="gr-row"):
	                output_audio = gr.Audio(label="🎧 Audio đã tạo", type="filepath")
	                gen_video = gr.Video(
	                    label="Video đã tạo", format="mp4", scale=1, width=180
	                )
	            status_box = gr.Textbox(
	                label="Trạng thái tiến trình",
	                interactive=False,
	                value="",
	                visible=True,
	            )

	            def enable_generate(audio, text, image):
	                # Interactive only when all three inputs are truthy.
	                return gr.update(interactive=bool(audio and text and image))

	            # Re-evaluate the button state whenever any required input changes.
	            ref_audio.change(
	                enable_generate, [ref_audio, gen_text, source_image], btn_generate
	            )
	            gen_text.change(
	                enable_generate, [ref_audio, gen_text, source_image], btn_generate
	            )
	            source_image.change(
	                enable_generate, [ref_audio, gen_text, source_image], btn_generate
	            )

	        # ---- Tab 2: video history ------------------------------------------
	        with gr.Tab("Lịch sử video"):
	            # NOTE(review): the row membership of the dropdown and player is
	            # assumed from their ``scale`` arguments — confirm against the
	            # intended layout.
	            with gr.Row(elem_classes="gr-row"):
	                refresh_btn = gr.Button("🔄 Refresh File List")

	                # Scan the results folder once so that ``value`` and ``choices``
	                # come from the same listing (previously three separate scans).
	                existing_videos = list_videos()
	                video_list = gr.Dropdown(
	                    value=existing_videos[0] if existing_videos else None,
	                    choices=existing_videos,
	                    label="Chọn video để xem",
	                    interactive=True,
	                    scale=1,
	                )
	                video_player = gr.Video(
	                    height=180, width=180, label="Video lịch sử", scale=1
	                )

	            refresh_btn.click(fn=lambda: gr.update(choices=list_videos()), outputs=video_list)
	            # Selecting an entry feeds its path straight into the player.
	            video_list.change(lambda x: x, inputs=video_list, outputs=video_player)

	        # ---- Tab 3: debug ----------------------------------------------------
	        with gr.Tab("Debug"):
	            # NOTE(review): list_files() lists whatever directory path the user
	            # types; confirm this tab should be reachable in deployed builds.
	            directory_input = gr.Textbox(label="Enter Directory Path", value=".")
	            file_list_output = gr.Textbox(label="Files", lines=10)

	            directory_input.change(fn=list_files, inputs=directory_input, outputs=file_list_output)

	        # Wired after all tabs exist because the outputs span tabs 1 and 2.
	        btn_generate.click(
	            generate_voice_and_video,
	            inputs=[
	                ref_audio,
	                ref_text,
	                gen_text,
	                speed,
	                source_image,
	                preprocess_type,
	                is_still_mode,
	                enhancer,
	                batch_size,
	                size_of_image,
	                pose_style,
	                facerender,
	                exp_weight,
	                use_ref_video,
	                ref_video,
	                ref_info,
	                use_idle_mode,
	                length_of_audio,
	                blink_every,
	            ],
	            outputs=[output_audio, gen_video, status_box, video_list],
	        )
	    return sadtalker_interface


	if __name__ == "__main__":
	    # Script entry point: build the UI, configure its queue, then launch it.
	    demo = sadtalker_demo()
	    demo.queue(
	        max_size=10,
	        api_open=True,
	    )
	    demo.launch(
	        debug=True,
	        server_name="0.0.0.0",
	    )