Long Trinh-Quang committed on
Commit
dc78718
·
1 Parent(s): 59ae2b9

[DEV] update ui app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -46
app.py CHANGED
@@ -2,15 +2,20 @@ import os, sys
2
  import tempfile
3
  import gradio as gr
4
  from app_tts import infer_tts
5
- from src.gradio_demo import SadTalker
 
6
  # from src.utils.text2speech import TTSTalker
7
  from huggingface_hub import snapshot_download
 
 
 
 
 
8
 
9
- def get_source_image(image):
10
- return image
11
 
12
  try:
13
  import webui # in webui
 
14
  in_webui = True
15
  except:
16
  in_webui = False
@@ -21,22 +26,56 @@ def toggle_audio_file(choice):
21
  return gr.update(visible=True), gr.update(visible=False)
22
  else:
23
  return gr.update(visible=False), gr.update(visible=True)
24
-
 
25
  def ref_video_fn(path_of_ref_video):
26
  if path_of_ref_video is not None:
27
  return gr.update(value=True)
28
  else:
29
  return gr.update(value=False)
30
-
 
31
  def download_model():
32
- REPO_ID = 'vinthony/SadTalker-V002rc'
33
- snapshot_download(repo_id=REPO_ID, local_dir='./checkpoints', local_dir_use_symlinks=True)
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  # New: Gộp 2 nút thành 1, output audio là input cho video
37
  import soundfile as sf
38
 
39
- def generate_voice_and_video(ref_audio, ref_text, gen_text, speed, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  # 1. Sinh audio từ TTS
41
  (final_sample_rate, final_wave), _ = infer_tts(ref_audio, ref_text, gen_text, speed)
42
  # Lưu ra file tạm
@@ -60,52 +99,155 @@ def generate_voice_and_video(ref_audio, ref_text, gen_text, speed, source_image,
60
  ref_info,
61
  use_idle_mode,
62
  length_of_audio,
63
- blink_every
64
  )
65
  return tmp_audio.name, video_path
66
 
 
67
  def sadtalker_demo():
68
  download_model()
69
- with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
70
- gr.Markdown("""
71
- # 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis & SadTalker Video
72
- # Nhập text, upload sample voice và ảnh để tạo video nói chuyện.
73
- """)
74
- with gr.Row():
75
- ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
76
- ref_text = gr.Textbox(label="📝 Reference Transcript (optional)", placeholder="Nhập transcript tiếng Việt cho sample voice nếu có...", lines=2)
77
- gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
78
- speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
79
- with gr.Row():
80
- source_image = gr.Image(label="Source image", type="filepath", elem_id="img2img_image")
81
- with gr.Row():
82
- # Các setting cho SadTalker
83
- with gr.Column():
84
- preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
85
- is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
86
- enhancer = gr.Checkbox(label="GFPGAN as Face enhancer")
87
- batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
88
- size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?")
89
- pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0)
90
- facerender = gr.Radio(['facevid2vid','pirender'], value='facevid2vid', label='facerender', info="which face render?")
91
- exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
92
- use_ref_video = gr.Checkbox(label="Use Reference Video")
93
- ref_video = gr.Video(label="Reference Video", elem_id="vidref")
94
- ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video',info="How to borrow from reference Video?((fully transfer, aka, video driving mode))")
95
- use_idle_mode = gr.Checkbox(label="Use Idle Animation")
96
- length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.")
97
- blink_every = gr.Checkbox(label="use eye blink", value=True)
98
- btn_generate = gr.Button("🔥 Generate Voice & Video")
99
- with gr.Row():
100
- output_audio = gr.Audio(label="🎧 Generated Audio", type="filepath")
101
- gen_video = gr.Video(label="Generated video", format="mp4", scale=1)
102
- btn_generate.click(
103
- generate_voice_and_video,
104
- inputs=[ref_audio, ref_text, gen_text, speed, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every],
105
- outputs=[output_audio, gen_video]
106
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  return sadtalker_interface
108
 
 
109
  if __name__ == "__main__":
110
  demo = sadtalker_demo()
111
  demo.queue(max_size=10, api_open=True)
 
2
  import tempfile
3
  import gradio as gr
4
  from app_tts import infer_tts
5
+ from src.gradio_demo import SadTalker
6
+
7
  # from src.utils.text2speech import TTSTalker
8
  from huggingface_hub import snapshot_download
9
+ import glob
10
+
11
+
12
+ def get_source_image(image):
13
+ return image
14
 
 
 
15
 
16
  try:
17
  import webui # in webui
18
+
19
  in_webui = True
20
  except:
21
  in_webui = False
 
26
  return gr.update(visible=True), gr.update(visible=False)
27
  else:
28
  return gr.update(visible=False), gr.update(visible=True)
29
+
30
+
31
  def ref_video_fn(path_of_ref_video):
32
  if path_of_ref_video is not None:
33
  return gr.update(value=True)
34
  else:
35
  return gr.update(value=False)
36
+
37
+
38
  def download_model():
39
+ REPO_ID = "vinthony/SadTalker-V002rc"
40
+ snapshot_download(
41
+ repo_id=REPO_ID,
42
+ local_dir="./checkpoints",
43
+ local_dir_use_symlinks=True,
44
+ )
45
+
46
+
47
+ def list_videos():
48
+ # Lấy danh sách tất cả file mp4 trong results
49
+ video_files = glob.glob("results/**/*.mp4", recursive=True)
50
+ # Trả về danh sách file (có thể sort theo thời gian)
51
+ return sorted(video_files, reverse=True)
52
 
53
 
54
  # New: Gộp 2 nút thành 1, output audio là input cho video
55
  import soundfile as sf
56
 
57
+
58
+ def generate_voice_and_video(
59
+ ref_audio,
60
+ ref_text,
61
+ gen_text,
62
+ speed,
63
+ source_image,
64
+ preprocess_type,
65
+ is_still_mode,
66
+ enhancer,
67
+ batch_size,
68
+ size_of_image,
69
+ pose_style,
70
+ facerender,
71
+ exp_weight,
72
+ use_ref_video,
73
+ ref_video,
74
+ ref_info,
75
+ use_idle_mode,
76
+ length_of_audio,
77
+ blink_every,
78
+ ):
79
  # 1. Sinh audio từ TTS
80
  (final_sample_rate, final_wave), _ = infer_tts(ref_audio, ref_text, gen_text, speed)
81
  # Lưu ra file tạm
 
99
  ref_info,
100
  use_idle_mode,
101
  length_of_audio,
102
+ blink_every,
103
  )
104
  return tmp_audio.name, video_path
105
 
106
+
107
  def sadtalker_demo():
108
  download_model()
109
+ with gr.Blocks(
110
+ analytics_enabled=False,
111
+ css="src/assets/css/atalink_theme.css",
112
+ ) as sadtalker_interface:
113
+ gr.Markdown(
114
+ f"""
115
+ ![logo](https://vietnam.atalink.com/favicon.ico)
116
+
117
+ ## F5-TTS & SadTalker
118
+
119
+ Nhập text, upload sample voice và ảnh để tạo video nói chuyện. Chủ đạo Atalink Blue.
120
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  )
122
+ with gr.Tab("Lịch sử video"):
123
+ with gr.Row(elem_classes="gr-row"):
124
+ video_list = gr.Dropdown(
125
+ choices=list_videos(),
126
+ label="Chọn video để xem",
127
+ interactive=True,
128
+ scale=1,
129
+ )
130
+ video_player = gr.Video(
131
+ height=180, width=180, label="Video lịch sử", scale=1
132
+ )
133
+ video_list.change(lambda x: x, inputs=video_list, outputs=video_player)
134
+
135
+ with gr.Tab("Tạo video mới"):
136
+ with gr.Row(elem_classes="gr-row"):
137
+ ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
138
+ ref_text = gr.Textbox(
139
+ label="📝 Reference Transcript (optional)",
140
+ placeholder="Nhập transcript tiếng Việt cho sample voice nếu có...",
141
+ lines=2,
142
+ )
143
+ with gr.Row(elem_classes="gr-row"):
144
+ gen_text = gr.Textbox(
145
+ label="📝 Text",
146
+ placeholder="Nhập nội dung để tạo giọng nói...",
147
+ lines=3,
148
+ )
149
+ speed = gr.Slider(
150
+ 0.3,
151
+ 2.0,
152
+ value=1.0,
153
+ step=0.1,
154
+ label="⚡ Tốc độ nói",
155
+ info="Chỉnh tốc độ phát âm",
156
+ )
157
+ with gr.Row(elem_classes="gr-row"):
158
+ source_image = gr.Image(
159
+ label="Ảnh nguồn", type="filepath", elem_id="img2img_image"
160
+ )
161
+ with gr.Accordion(
162
+ "Cài đặt nâng cao SadTalker", open=False, elem_classes="gr-button"
163
+ ):
164
+ with gr.Row(elem_classes="gr-row"):
165
+ preprocess_type = gr.Radio(
166
+ ["crop", "resize", "full", "extcrop", "extfull"],
167
+ value="crop",
168
+ label="Tiền xử lý ảnh",
169
+ info="Cách xử lý ảnh đầu vào?",
170
+ )
171
+ is_still_mode = gr.Checkbox(
172
+ label="Chế độ tĩnh (ít chuyển động đầu)"
173
+ )
174
+ enhancer = gr.Checkbox(label="Dùng GFPGAN làm đẹp mặt")
175
+ batch_size = gr.Slider(
176
+ label="Batch size", step=1, maximum=10, value=1
177
+ )
178
+ size_of_image = gr.Radio(
179
+ [256, 512],
180
+ value=256,
181
+ label="Độ phân giải khuôn mặt",
182
+ info="Dùng model 256/512?",
183
+ )
184
+ with gr.Row(elem_classes="gr-row"):
185
+ pose_style = gr.Slider(
186
+ minimum=0, maximum=45, step=1, label="Kiểu pose", value=0
187
+ )
188
+ facerender = gr.Radio(
189
+ ["facevid2vid", "pirender"],
190
+ value="facevid2vid",
191
+ label="Face render",
192
+ info="Chọn kiểu render mặt",
193
+ )
194
+ exp_weight = gr.Slider(
195
+ minimum=0,
196
+ maximum=3,
197
+ step=0.1,
198
+ label="Biên độ biểu cảm",
199
+ value=1,
200
+ )
201
+ use_ref_video = gr.Checkbox(label="Dùng video tham chiếu")
202
+ ref_video = gr.Video(
203
+ label="Video tham chiếu",
204
+ elem_id="vidref",
205
+ height=120,
206
+ width=120,
207
+ )
208
+ ref_info = gr.Radio(
209
+ ["pose", "blink", "pose+blink", "all"],
210
+ value="pose",
211
+ label="Tham chiếu",
212
+ info="Cách lấy thông tin từ video tham chiếu?",
213
+ )
214
+ use_idle_mode = gr.Checkbox(label="Idle Animation")
215
+ length_of_audio = gr.Number(value=5, label="Độ dài video (giây)")
216
+ blink_every = gr.Checkbox(label="Chớp mắt", value=True)
217
+ btn_generate = gr.Button("🔥 Tạo giọng nói & video", elem_id="btn-generate")
218
+ with gr.Row(elem_classes="gr-row"):
219
+ output_audio = gr.Audio(label="🎧 Audio đã tạo", type="filepath")
220
+ gen_video = gr.Video(
221
+ label="Video đã tạo", format="mp4", scale=1, height=180, width=180
222
+ )
223
+ btn_generate.click(
224
+ generate_voice_and_video,
225
+ inputs=[
226
+ ref_audio,
227
+ ref_text,
228
+ gen_text,
229
+ speed,
230
+ source_image,
231
+ preprocess_type,
232
+ is_still_mode,
233
+ enhancer,
234
+ batch_size,
235
+ size_of_image,
236
+ pose_style,
237
+ facerender,
238
+ exp_weight,
239
+ use_ref_video,
240
+ ref_video,
241
+ ref_info,
242
+ use_idle_mode,
243
+ length_of_audio,
244
+ blink_every,
245
+ ],
246
+ outputs=[output_audio, gen_video],
247
+ )
248
  return sadtalker_interface
249
 
250
+
251
  if __name__ == "__main__":
252
  demo = sadtalker_demo()
253
  demo.queue(max_size=10, api_open=True)