manhteky123 committed
Commit 9d836aa (verified) · Parent: 27303a3

Update app.py

Passes share=True to demo.queue().launch() so the demo also exposes a temporary public gradio.live link.

Files changed (1)
  1. app.py +103 -103
app.py CHANGED
@@ -1,103 +1,103 @@
-import spaces
-import os
-from huggingface_hub import login
-import gradio as gr
-from cached_path import cached_path
-import tempfile
-from vinorm import TTSnorm
-
-from f5_tts.model import DiT
-from f5_tts.infer.utils_infer import (
-    preprocess_ref_audio_text,
-    load_vocoder,
-    load_model,
-    infer_process,
-    save_spectrogram,
-)
-
-# Retrieve token from secrets
-hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
-
-
-# Log in to Hugging Face
-if hf_token:
-    login(token=hf_token)
-
-def post_process(text):
-    text = " " + text + " "
-    text = text.replace(" . . ", " . ")
-    text = " " + text + " "
-    text = text.replace(" .. ", " . ")
-    text = " " + text + " "
-    text = text.replace(" , , ", " , ")
-    text = " " + text + " "
-    text = text.replace(" ,, ", " , ")
-    text = " " + text + " "
-    text = text.replace('"', "")
-    return " ".join(text.split())
-
-# Load models
-vocoder = load_vocoder()
-model = load_model(
-    DiT,
-    dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
-    ckpt_path=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
-    vocab_file=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
-)
-
-@spaces.GPU
-def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
-
-    if not ref_audio_orig:
-        raise gr.Error("Please upload a sample audio file.")
-    if not gen_text.strip():
-        raise gr.Error("Please enter the text content to generate voice.")
-    if len(gen_text.split()) > 1000:
-        raise gr.Error("Please enter text content with less than 1000 words.")
-
-    try:
-        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
-        final_wave, final_sample_rate, spectrogram = infer_process(
-            ref_audio, ref_text.lower(), post_process(TTSnorm(gen_text)).lower(), model, vocoder, speed=speed
-        )
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
-            spectrogram_path = tmp_spectrogram.name
-            save_spectrogram(spectrogram, spectrogram_path)
-
-        return (final_sample_rate, final_wave), spectrogram_path
-    except Exception as e:
-        raise gr.Error(f"Error generating voice: {e}")
-
-# Gradio UI
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis.
-    # The model was trained with approximately 1000 hours of data on a RTX 3090 GPU.
-    Enter text and upload a sample voice to generate natural speech.
-    """)
-
-    with gr.Row():
-        ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
-        gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
-
-    speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
-    btn_synthesize = gr.Button("🔥 Generate Voice")
-
-    with gr.Row():
-        output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
-        output_spectrogram = gr.Image(label="📊 Spectrogram")
-
-    model_limitations = gr.Textbox(
-        value="""1. This model may not perform well with numerical characters, dates, special characters, etc. => A text normalization module is needed.
-2. The rhythm of some generated audios may be inconsistent or choppy => It is recommended to select clearly pronounced sample audios with minimal pauses for better synthesis quality.
-3. Default, reference audio text uses the pho-whisper-medium model, which may not always accurately recognize Vietnamese, resulting in poor voice synthesis quality.
-4. Inference with overly long paragraphs may produce poor results.""",
-        label="❗ Model Limitations",
-        lines=4,
-        interactive=False
-    )
-
-    btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])
-
-# Run Gradio with share=True to get a gradio.live link
-demo.queue().launch()
+import spaces
+import os
+from huggingface_hub import login
+import gradio as gr
+from cached_path import cached_path
+import tempfile
+from vinorm import TTSnorm
+
+from f5_tts.model import DiT
+from f5_tts.infer.utils_infer import (
+    preprocess_ref_audio_text,
+    load_vocoder,
+    load_model,
+    infer_process,
+    save_spectrogram,
+)
+
+# Retrieve token from secrets
+hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+
+# Log in to Hugging Face
+if hf_token:
+    login(token=hf_token)
+
+def post_process(text):
+    text = " " + text + " "
+    text = text.replace(" . . ", " . ")
+    text = " " + text + " "
+    text = text.replace(" .. ", " . ")
+    text = " " + text + " "
+    text = text.replace(" , , ", " , ")
+    text = " " + text + " "
+    text = text.replace(" ,, ", " , ")
+    text = " " + text + " "
+    text = text.replace('"', "")
+    return " ".join(text.split())
+
+# Load models
+vocoder = load_vocoder()
+model = load_model(
+    DiT,
+    dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
+    ckpt_path=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
+    vocab_file=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
+)
+
+@spaces.GPU
+def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
+
+    if not ref_audio_orig:
+        raise gr.Error("Please upload a sample audio file.")
+    if not gen_text.strip():
+        raise gr.Error("Please enter the text content to generate voice.")
+    if len(gen_text.split()) > 1000:
+        raise gr.Error("Please enter text content with less than 1000 words.")
+
+    try:
+        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
+        final_wave, final_sample_rate, spectrogram = infer_process(
+            ref_audio, ref_text.lower(), post_process(TTSnorm(gen_text)).lower(), model, vocoder, speed=speed
+        )
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
+            spectrogram_path = tmp_spectrogram.name
+            save_spectrogram(spectrogram, spectrogram_path)
+
+        return (final_sample_rate, final_wave), spectrogram_path
+    except Exception as e:
+        raise gr.Error(f"Error generating voice: {e}")
+
+# Gradio UI
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis.
+    # The model was trained with approximately 1000 hours of data on a RTX 3090 GPU.
+    Enter text and upload a sample voice to generate natural speech.
+    """)
+
+    with gr.Row():
+        ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
+        gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
+
+    speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
+    btn_synthesize = gr.Button("🔥 Generate Voice")
+
+    with gr.Row():
+        output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
+        output_spectrogram = gr.Image(label="📊 Spectrogram")
+
+    model_limitations = gr.Textbox(
+        value="""1. This model may not perform well with numerical characters, dates, special characters, etc. => A text normalization module is needed.
+2. The rhythm of some generated audios may be inconsistent or choppy => It is recommended to select clearly pronounced sample audios with minimal pauses for better synthesis quality.
+3. Default, reference audio text uses the pho-whisper-medium model, which may not always accurately recognize Vietnamese, resulting in poor voice synthesis quality.
+4. Inference with overly long paragraphs may produce poor results.""",
+        label="❗ Model Limitations",
+        lines=4,
+        interactive=False
+    )
+
+    btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])
+
+# Run Gradio with share=True to get a gradio.live link
+demo.queue().launch(share=True)
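
Once the app is running with share=True, the click handler can also be driven programmatically. The sketch below is a minimal, hypothetical client-side example and is not part of this commit: the gradio.live URL is a placeholder, the "/infer_tts" endpoint name assumes Gradio's default naming for an un-named click handler, and handle_file assumes a recent gradio_client release.

# Hypothetical client sketch (not part of this commit): call the running demo
# over the Gradio API. Assumes a recent gradio_client and that the click
# handler is auto-exposed as "/infer_tts" after the Python function name.
from gradio_client import Client, handle_file

# Placeholder: use the public *.gradio.live URL printed by launch(share=True).
client = Client("https://xxxxxxxx.gradio.live")

result = client.predict(
    handle_file("sample_voice.wav"),  # reference voice (filepath input)
    "Xin chào",                       # Vietnamese text to synthesize
    1.0,                              # speed
    api_name="/infer_tts",
)
print(result)  # typically local paths to the generated audio and spectrogram PNG

If the endpoint name differs in practice, client.view_api() lists the routes the running app actually exposes.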