manhteky123 commited on
Commit
258b448
·
verified ·
1 Parent(s): 9c3db2a

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile +3 -2
  2. app.py +102 -102
  3. flask_app.py +184 -0
  4. requirements.txt +1 -0
  5. templates/index.html +451 -0
Dockerfile CHANGED
@@ -65,6 +65,7 @@ EXPOSE 7860
65
 
66
  # Set environment variables
67
  ENV PYTHONUNBUFFERED=1
 
68
 
69
- # Run the application
70
- CMD ["python", "app.py"]
 
65
 
66
  # Set environment variables
67
  ENV PYTHONUNBUFFERED=1
68
+ ENV FLASK_APP=flask_app.py
69
 
70
+ # Run the Flask application
71
+ CMD ["python", "flask_app.py"]
app.py CHANGED
@@ -1,103 +1,103 @@
1
- import spaces
2
- import os
3
- from huggingface_hub import login
4
- import gradio as gr
5
- from cached_path import cached_path
6
- import tempfile
7
- from vinorm import TTSnorm
8
-
9
- from f5_tts.model import DiT
10
- from f5_tts.infer.utils_infer import (
11
- preprocess_ref_audio_text,
12
- load_vocoder,
13
- load_model,
14
- infer_process,
15
- save_spectrogram,
16
- )
17
-
18
- # Retrieve token from secrets
19
- hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
20
-
21
-
22
- # Log in to Hugging Face
23
- if hf_token:
24
- login(token=hf_token)
25
-
26
- def post_process(text):
27
- text = " " + text + " "
28
- text = text.replace(" . . ", " . ")
29
- text = " " + text + " "
30
- text = text.replace(" .. ", " . ")
31
- text = " " + text + " "
32
- text = text.replace(" , , ", " , ")
33
- text = " " + text + " "
34
- text = text.replace(" ,, ", " , ")
35
- text = " " + text + " "
36
- text = text.replace('"', "")
37
- return " ".join(text.split())
38
-
39
- # Load models
40
- vocoder = load_vocoder()
41
- model = load_model(
42
- DiT,
43
- dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
44
- ckpt_path=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
45
- vocab_file=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
46
- )
47
-
48
- @spaces.GPU
49
- def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
50
-
51
- if not ref_audio_orig:
52
- raise gr.Error("Please upload a sample audio file.")
53
- if not gen_text.strip():
54
- raise gr.Error("Please enter the text content to generate voice.")
55
- if len(gen_text.split()) > 1000:
56
- raise gr.Error("Please enter text content with less than 1000 words.")
57
-
58
- try:
59
- ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
60
- final_wave, final_sample_rate, spectrogram = infer_process(
61
- ref_audio, ref_text.lower(), post_process(TTSnorm(gen_text)).lower(), model, vocoder, speed=speed
62
- )
63
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
64
- spectrogram_path = tmp_spectrogram.name
65
- save_spectrogram(spectrogram, spectrogram_path)
66
-
67
- return (final_sample_rate, final_wave), spectrogram_path
68
- except Exception as e:
69
- raise gr.Error(f"Error generating voice: {e}")
70
-
71
- # Gradio UI
72
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
73
- gr.Markdown("""
74
- # 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis.
75
- # The model was trained with approximately 1000 hours of data on a RTX 3090 GPU.
76
- Enter text and upload a sample voice to generate natural speech.
77
- """)
78
-
79
- with gr.Row():
80
- ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
81
- gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
82
-
83
- speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
84
- btn_synthesize = gr.Button("🔥 Generate Voice")
85
-
86
- with gr.Row():
87
- output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
88
- output_spectrogram = gr.Image(label="📊 Spectrogram")
89
-
90
- model_limitations = gr.Textbox(
91
- value="""1. This model may not perform well with numerical characters, dates, special characters, etc. => A text normalization module is needed.
92
- 2. The rhythm of some generated audios may be inconsistent or choppy => It is recommended to select clearly pronounced sample audios with minimal pauses for better synthesis quality.
93
- 3. Default, reference audio text uses the pho-whisper-medium model, which may not always accurately recognize Vietnamese, resulting in poor voice synthesis quality.
94
- 4. Inference with overly long paragraphs may produce poor results.""",
95
- label="❗ Model Limitations",
96
- lines=4,
97
- interactive=False
98
- )
99
-
100
- btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])
101
-
102
- # Run Gradio with share=True to get a gradio.live link
103
  demo.queue().launch()
 
1
+ import spaces
2
+ import os
3
+ from huggingface_hub import login
4
+ import gradio as gr
5
+ from cached_path import cached_path
6
+ import tempfile
7
+ from vinorm import TTSnorm
8
+
9
+ from f5_tts.model import DiT
10
+ from f5_tts.infer.utils_infer import (
11
+ preprocess_ref_audio_text,
12
+ load_vocoder,
13
+ load_model,
14
+ infer_process,
15
+ save_spectrogram,
16
+ )
17
+
18
+ # Retrieve token from secrets
19
+ hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
20
+
21
+
22
+ # Log in to Hugging Face
23
+ if hf_token:
24
+ login(token=hf_token)
25
+
26
def post_process(text):
    """Collapse doubled punctuation and stray double quotes, then normalize whitespace."""
    cleaned = " " + text + " "
    # Apply each doubled-punctuation fix, re-padding between passes exactly
    # as the sequential pad/replace steps did.
    for doubled, single in ((" . . ", " . "), (" .. ", " . "),
                            (" , , ", " , "), (" ,, ", " , ")):
        cleaned = cleaned.replace(doubled, single)
        cleaned = " " + cleaned + " "
    cleaned = cleaned.replace('"', "")
    return " ".join(cleaned.split())
38
+
39
+ # Load models
40
+ vocoder = load_vocoder()
41
+ model = load_model(
42
+ DiT,
43
+ dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
44
+ ckpt_path=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
45
+ vocab_file=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
46
+ )
47
+
48
@spaces.GPU
def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
    """Synthesize Vietnamese speech from text using a reference voice.

    Returns a ((sample_rate, waveform), spectrogram_png_path) pair for Gradio.
    Raises gr.Error on invalid input or any synthesis failure.
    """
    # Guard clauses: reject missing audio, empty text, and over-long text.
    if not ref_audio_orig:
        raise gr.Error("Please upload a sample audio file.")
    if not gen_text.strip():
        raise gr.Error("Please enter the text content to generate voice.")
    if len(gen_text.split()) > 1000:
        raise gr.Error("Please enter text content with less than 1000 words.")

    try:
        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
        normalized_text = post_process(TTSnorm(gen_text)).lower()
        final_wave, final_sample_rate, spectrogram = infer_process(
            ref_audio, ref_text.lower(), normalized_text, model, vocoder, speed=speed
        )
        # delete=False: Gradio serves the file from this path after we return.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name
            save_spectrogram(spectrogram, spectrogram_path)

        return (final_sample_rate, final_wave), spectrogram_path
    except Exception as e:
        raise gr.Error(f"Error generating voice: {e}")
70
+
71
# Gradio UI: inputs (reference audio, text, speed), outputs (audio, spectrogram).
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis.
    # The model was trained with approximately 1000 hours of data on a RTX 3090 GPU.
    Enter text and upload a sample voice to generate natural speech.
    """)

    with gr.Row():
        ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
        gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)

    speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
    btn_synthesize = gr.Button("🔥 Generate Voice")

    with gr.Row():
        output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
        output_spectrogram = gr.Image(label="📊 Spectrogram")

    model_limitations = gr.Textbox(
        value="""1. This model may not perform well with numerical characters, dates, special characters, etc. => A text normalization module is needed.
2. The rhythm of some generated audios may be inconsistent or choppy => It is recommended to select clearly pronounced sample audios with minimal pauses for better synthesis quality.
3. Default, reference audio text uses the pho-whisper-medium model, which may not always accurately recognize Vietnamese, resulting in poor voice synthesis quality.
4. Inference with overly long paragraphs may produce poor results.""",
        label="❗ Model Limitations",
        lines=4,
        interactive=False,
    )

    btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])

# Run Gradio with share=True to get a gradio.live link
demo.queue().launch()
flask_app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import tempfile
4
+ from flask import Flask, render_template, request, jsonify, send_file
5
+ from werkzeug.utils import secure_filename
6
+ from cached_path import cached_path
7
+ from vinorm import TTSnorm
8
+ from huggingface_hub import login
9
+ import numpy as np
10
+ import soundfile as sf
11
+
12
+ from f5_tts.model import DiT
13
+ from f5_tts.infer.utils_infer import (
14
+ preprocess_ref_audio_text,
15
+ load_vocoder,
16
+ load_model,
17
+ infer_process,
18
+ save_spectrogram,
19
+ )
20
+
21
# Flask application and upload policy.
app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024  # 50MB max file size
app.config['UPLOAD_FOLDER'] = tempfile.gettempdir()
app.config['ALLOWED_EXTENSIONS'] = {'wav', 'mp3', 'ogg', 'flac', 'm4a'}

# Authenticate with Hugging Face only when a token is present in the environment.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_token:
    login(token=hf_token)
32
+
33
def post_process(text):
    """Post-process text: de-duplicate punctuation, drop double quotes, collapse spacing."""
    padded = " " + text + " "
    replacements = (
        (" . . ", " . "),
        (" .. ", " . "),
        (" , , ", " , "),
        (" ,, ", " , "),
    )
    # Re-pad after every replacement, preserving the original pad/replace cadence.
    for before, after in replacements:
        padded = padded.replace(before, after)
        padded = " " + padded + " "
    padded = padded.replace('"', "")
    return " ".join(padded.split())
46
+
47
def allowed_file(filename):
    """Return True when *filename* has an extension in the configured whitelist."""
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in app.config['ALLOWED_EXTENSIONS']
50
+
51
# Load the vocoder and DiT model once at process startup so every request
# shares a single in-memory instance.
print("Loading models...")
vocoder = load_vocoder()
_DIT_CONFIG = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
model = load_model(
    DiT,
    _DIT_CONFIG,
    ckpt_path=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
    vocab_file=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
)
print("Models loaded successfully!")
61
+
62
@app.route('/')
def index():
    """Serve the single-page synthesis UI."""
    return render_template('index.html')
66
+
67
@app.route('/api/synthesize', methods=['POST'])
def synthesize():
    """
    API endpoint for text-to-speech synthesis.

    Form fields (multipart/form-data):
    - ref_audio: reference audio file (wav, mp3, ogg, flac, m4a)
    - gen_text:  text to synthesize (string, max 1000 words)
    - speed:     synthesis speed (float in [0.3, 2.0], default 1.0)

    Returns:
    - 200 JSON with base64-encoded audio + spectrogram on success
    - 400 JSON error for invalid input, 500 JSON error on synthesis failure
    """
    # Track every temp path so cleanup can run in `finally` — the original
    # only cleaned up on the success path, leaking files on any exception.
    filepath = audio_path = spec_path = None
    ref_audio = None
    try:
        # --- Validate request ---
        if 'ref_audio' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        file = request.files['ref_audio']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(file.filename):
            return jsonify({'error': 'Invalid file format. Allowed: wav, mp3, ogg, flac, m4a'}), 400

        gen_text = request.form.get('gen_text', '').strip()
        if not gen_text:
            return jsonify({'error': 'No text provided'}), 400

        if len(gen_text.split()) > 1000:
            return jsonify({'error': 'Text too long. Maximum 1000 words'}), 400

        # Malformed (non-numeric) speed is a client error, not a 500.
        try:
            speed = float(request.form.get('speed', 1.0))
        except (TypeError, ValueError):
            return jsonify({'error': 'Speed must be between 0.3 and 2.0'}), 400
        if speed < 0.3 or speed > 2.0:
            return jsonify({'error': 'Speed must be between 0.3 and 2.0'}), 400

        # --- Save uploaded file under a unique name ---
        # Fixed names ('output.wav', 'spectrogram.png') in a shared temp dir
        # race between concurrent requests; mkstemp gives each request its own
        # files. secure_filename may return '' for odd names, hence the fallback.
        safe_name = secure_filename(file.filename) or 'ref_audio'
        upload_suffix = os.path.splitext(safe_name)[1]
        fd, filepath = tempfile.mkstemp(suffix=upload_suffix, dir=app.config['UPLOAD_FOLDER'])
        os.close(fd)
        file.save(filepath)

        # --- Process audio and generate speech ---
        ref_audio, ref_text = preprocess_ref_audio_text(filepath, "")

        final_wave, final_sample_rate, spectrogram = infer_process(
            ref_audio,
            ref_text.lower(),
            post_process(TTSnorm(gen_text)).lower(),
            model,
            vocoder,
            speed=speed
        )

        # --- Encode results as base64 for the JSON response ---
        fd, audio_path = tempfile.mkstemp(suffix='.wav', dir=app.config['UPLOAD_FOLDER'])
        os.close(fd)
        sf.write(audio_path, final_wave, final_sample_rate)
        with open(audio_path, 'rb') as f:
            audio_base64 = base64.b64encode(f.read()).decode('utf-8')

        fd, spec_path = tempfile.mkstemp(suffix='.png', dir=app.config['UPLOAD_FOLDER'])
        os.close(fd)
        save_spectrogram(spectrogram, spec_path)
        with open(spec_path, 'rb') as f:
            spec_base64 = base64.b64encode(f.read()).decode('utf-8')

        return jsonify({
            'success': True,
            'audio': audio_base64,
            'spectrogram': spec_base64,
            'sample_rate': final_sample_rate,
            'message': 'Speech synthesized successfully'
        })

    except Exception as e:
        return jsonify({'error': f'Error generating speech: {str(e)}'}), 500
    finally:
        # Cleanup runs on success AND failure. ref_audio is presumably a path
        # string produced by preprocess_ref_audio_text (the original also
        # os.remove'd it) — guard with isinstance in case it is not.
        for path in (filepath, audio_path, spec_path, ref_audio):
            if isinstance(path, str) and path and os.path.exists(path):
                try:
                    os.remove(path)
                except OSError:
                    pass  # best-effort cleanup; never mask the real response
154
+
155
@app.route('/api/health', methods=['GET'])
def health():
    """Health check endpoint"""
    payload = {
        'status': 'healthy',
        'model': 'F5-TTS Vietnamese',
        'version': '1.0.0',
    }
    return jsonify(payload)
163
+
164
@app.route('/api/info', methods=['GET'])
def info():
    """Get model information and limitations"""
    limitations = [
        'May not perform well with numerical characters, dates, special characters',
        'Rhythm of some generated audios may be inconsistent or choppy',
        'Reference audio text uses pho-whisper-medium which may not always accurately recognize Vietnamese',
        'Inference with overly long paragraphs may produce poor results',
    ]
    return jsonify({
        'model_name': 'F5-TTS Vietnamese',
        'description': 'Vietnamese Text-to-Speech synthesis model trained on ~1000 hours of data',
        'limitations': limitations,
        'max_words': 1000,
        'speed_range': [0.3, 2.0],
        'supported_audio_formats': ['wav', 'mp3', 'ogg', 'flac', 'm4a'],
    })
180
+
181
if __name__ == '__main__':
    # Launch the Flask server; the PORT env var overrides the default of 5000.
    listen_port = int(os.environ.get('PORT', 5000))
    app.run(host='0.0.0.0', port=listen_port, debug=False)
requirements.txt CHANGED
@@ -7,6 +7,7 @@ vinorm
7
  cached_path
8
  huggingface_hub
9
  gradio
 
10
  accelerate>=0.33.0
11
  click
12
  datasets
 
7
  cached_path
8
  huggingface_hub
9
  gradio
10
+ flask
11
  accelerate>=0.33.0
12
  click
13
  datasets
templates/index.html ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="vi">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>F5-TTS Vietnamese - Text-to-Speech</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
16
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
17
+ min-height: 100vh;
18
+ padding: 20px;
19
+ }
20
+
21
+ .container {
22
+ max-width: 900px;
23
+ margin: 0 auto;
24
+ background: white;
25
+ border-radius: 20px;
26
+ box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
27
+ overflow: hidden;
28
+ }
29
+
30
+ .header {
31
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
32
+ color: white;
33
+ padding: 30px;
34
+ text-align: center;
35
+ }
36
+
37
+ .header h1 {
38
+ font-size: 2.5em;
39
+ margin-bottom: 10px;
40
+ }
41
+
42
+ .header p {
43
+ font-size: 1.1em;
44
+ opacity: 0.9;
45
+ }
46
+
47
+ .content {
48
+ padding: 40px;
49
+ }
50
+
51
+ .form-group {
52
+ margin-bottom: 25px;
53
+ }
54
+
55
+ label {
56
+ display: block;
57
+ font-weight: 600;
58
+ margin-bottom: 10px;
59
+ color: #333;
60
+ font-size: 1.1em;
61
+ }
62
+
63
+ .file-input-wrapper {
64
+ position: relative;
65
+ overflow: hidden;
66
+ display: inline-block;
67
+ width: 100%;
68
+ }
69
+
70
+ .file-input-wrapper input[type=file] {
71
+ position: absolute;
72
+ left: -9999px;
73
+ }
74
+
75
+ .file-input-label {
76
+ display: block;
77
+ padding: 15px 20px;
78
+ background: #f8f9fa;
79
+ border: 2px dashed #667eea;
80
+ border-radius: 10px;
81
+ cursor: pointer;
82
+ text-align: center;
83
+ transition: all 0.3s;
84
+ }
85
+
86
+ .file-input-label:hover {
87
+ background: #e7e9fc;
88
+ border-color: #764ba2;
89
+ }
90
+
91
+ .file-name {
92
+ margin-top: 10px;
93
+ font-size: 0.9em;
94
+ color: #666;
95
+ }
96
+
97
+ textarea {
98
+ width: 100%;
99
+ padding: 15px;
100
+ border: 2px solid #e0e0e0;
101
+ border-radius: 10px;
102
+ font-size: 1em;
103
+ resize: vertical;
104
+ min-height: 120px;
105
+ font-family: inherit;
106
+ transition: border-color 0.3s;
107
+ }
108
+
109
+ textarea:focus {
110
+ outline: none;
111
+ border-color: #667eea;
112
+ }
113
+
114
+ .slider-group {
115
+ margin-bottom: 25px;
116
+ }
117
+
118
+ .slider-label {
119
+ display: flex;
120
+ justify-content: space-between;
121
+ margin-bottom: 10px;
122
+ }
123
+
124
+ input[type="range"] {
125
+ width: 100%;
126
+ height: 8px;
127
+ border-radius: 5px;
128
+ background: #e0e0e0;
129
+ outline: none;
130
+ -webkit-appearance: none;
131
+ }
132
+
133
+ input[type="range"]::-webkit-slider-thumb {
134
+ -webkit-appearance: none;
135
+ appearance: none;
136
+ width: 20px;
137
+ height: 20px;
138
+ border-radius: 50%;
139
+ background: #667eea;
140
+ cursor: pointer;
141
+ }
142
+
143
+ input[type="range"]::-moz-range-thumb {
144
+ width: 20px;
145
+ height: 20px;
146
+ border-radius: 50%;
147
+ background: #667eea;
148
+ cursor: pointer;
149
+ }
150
+
151
+ .btn {
152
+ width: 100%;
153
+ padding: 15px;
154
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
155
+ color: white;
156
+ border: none;
157
+ border-radius: 10px;
158
+ font-size: 1.2em;
159
+ font-weight: 600;
160
+ cursor: pointer;
161
+ transition: transform 0.2s, box-shadow 0.2s;
162
+ }
163
+
164
+ .btn:hover {
165
+ transform: translateY(-2px);
166
+ box-shadow: 0 10px 20px rgba(102, 126, 234, 0.4);
167
+ }
168
+
169
+ .btn:disabled {
170
+ background: #ccc;
171
+ cursor: not-allowed;
172
+ transform: none;
173
+ }
174
+
175
+ .loading {
176
+ display: none;
177
+ text-align: center;
178
+ margin: 20px 0;
179
+ }
180
+
181
+ .spinner {
182
+ border: 4px solid #f3f3f3;
183
+ border-top: 4px solid #667eea;
184
+ border-radius: 50%;
185
+ width: 40px;
186
+ height: 40px;
187
+ animation: spin 1s linear infinite;
188
+ margin: 0 auto;
189
+ }
190
+
191
+ @keyframes spin {
192
+ 0% { transform: rotate(0deg); }
193
+ 100% { transform: rotate(360deg); }
194
+ }
195
+
196
+ .result {
197
+ display: none;
198
+ margin-top: 30px;
199
+ padding: 20px;
200
+ background: #f8f9fa;
201
+ border-radius: 10px;
202
+ }
203
+
204
+ .result h3 {
205
+ margin-bottom: 15px;
206
+ color: #333;
207
+ }
208
+
209
+ audio {
210
+ width: 100%;
211
+ margin-bottom: 15px;
212
+ }
213
+
214
+ .spectrogram {
215
+ width: 100%;
216
+ border-radius: 10px;
217
+ margin-top: 15px;
218
+ }
219
+
220
+ .error {
221
+ display: none;
222
+ padding: 15px;
223
+ background: #fee;
224
+ border-left: 4px solid #f44;
225
+ border-radius: 5px;
226
+ color: #c00;
227
+ margin-top: 20px;
228
+ }
229
+
230
+ .info-box {
231
+ background: #fff3cd;
232
+ border-left: 4px solid #ffc107;
233
+ padding: 15px;
234
+ border-radius: 5px;
235
+ margin-top: 30px;
236
+ }
237
+
238
+ .info-box h4 {
239
+ margin-bottom: 10px;
240
+ color: #856404;
241
+ }
242
+
243
+ .info-box ul {
244
+ margin-left: 20px;
245
+ }
246
+
247
+ .info-box li {
248
+ margin-bottom: 5px;
249
+ color: #856404;
250
+ }
251
+
252
+ .api-docs {
253
+ margin-top: 30px;
254
+ padding: 20px;
255
+ background: #f8f9fa;
256
+ border-radius: 10px;
257
+ }
258
+
259
+ .api-docs h3 {
260
+ margin-bottom: 15px;
261
+ color: #333;
262
+ }
263
+
264
+ .api-docs pre {
265
+ background: #2d2d2d;
266
+ color: #f8f8f2;
267
+ padding: 15px;
268
+ border-radius: 5px;
269
+ overflow-x: auto;
270
+ font-size: 0.9em;
271
+ }
272
+
273
+ .api-docs code {
274
+ font-family: 'Courier New', monospace;
275
+ }
276
+ </style>
277
+ </head>
278
+ <body>
279
+ <div class="container">
280
+ <div class="header">
281
+ <h1>🎤 F5-TTS Vietnamese</h1>
282
+ <p>Text-to-Speech Synthesis • Trained on ~1000 hours of data</p>
283
+ </div>
284
+
285
+ <div class="content">
286
+ <form id="ttsForm">
287
+ <div class="form-group">
288
+ <label>🔊 Sample Voice (Audio Reference)</label>
289
+ <div class="file-input-wrapper">
290
+ <input type="file" id="refAudio" name="ref_audio" accept="audio/*" required>
291
+ <label for="refAudio" class="file-input-label">
292
+ 📁 Click to upload audio file
293
+ </label>
294
+ </div>
295
+ <div class="file-name" id="fileName"></div>
296
+ </div>
297
+
298
+ <div class="form-group">
299
+ <label for="genText">📝 Text to Synthesize</label>
300
+ <textarea id="genText" name="gen_text" placeholder="Nhập văn bản tiếng Việt để tạo giọng nói..." required></textarea>
301
+ </div>
302
+
303
+ <div class="slider-group">
304
+ <div class="slider-label">
305
+ <label>⚡ Speed</label>
306
+ <span id="speedValue">1.0x</span>
307
+ </div>
308
+ <input type="range" id="speed" name="speed" min="0.3" max="2.0" step="0.1" value="1.0">
309
+ </div>
310
+
311
+ <button type="submit" class="btn" id="submitBtn">
312
+ 🔥 Generate Speech
313
+ </button>
314
+ </form>
315
+
316
+ <div class="loading" id="loading">
317
+ <div class="spinner"></div>
318
+ <p style="margin-top: 15px; color: #666;">Generating speech... Please wait...</p>
319
+ </div>
320
+
321
+ <div class="error" id="error"></div>
322
+
323
+ <div class="result" id="result">
324
+ <h3>🎧 Generated Audio</h3>
325
+ <audio id="audioPlayer" controls></audio>
326
+ <h3>📊 Spectrogram</h3>
327
+ <img id="spectrogram" class="spectrogram" alt="Spectrogram">
328
+ </div>
329
+
330
+ <div class="info-box">
331
+ <h4>❗ Model Limitations</h4>
332
+ <ul>
333
+ <li>May not perform well with numbers, dates, and special characters</li>
334
+ <li>Rhythm may be inconsistent with some texts</li>
335
+ <li>Works best with clear, well-pronounced reference audio</li>
336
+ <li>Maximum 1000 words per request</li>
337
+ </ul>
338
+ </div>
339
+
340
+ <div class="api-docs">
341
+ <h3>📡 API Documentation</h3>
342
+ <p style="margin-bottom: 15px;">Use the following endpoint to integrate with your application:</p>
343
+
344
+ <h4>POST /api/synthesize</h4>
345
+ <pre><code>curl -X POST http://localhost:5000/api/synthesize \
346
+ -F "ref_audio=@sample_voice.wav" \
347
+ -F "gen_text=Xin chào, đây là giọng nói tổng hợp" \
348
+ -F "speed=1.0"</code></pre>
349
+
350
+ <h4 style="margin-top: 20px;">Response:</h4>
351
+ <pre><code>{
352
+ "success": true,
353
+ "audio": "base64_encoded_audio_data",
354
+ "spectrogram": "base64_encoded_image_data",
355
+ "sample_rate": 24000,
356
+ "message": "Speech synthesized successfully"
357
+ }</code></pre>
358
+
359
+ <h4 style="margin-top: 20px;">GET /api/health</h4>
360
+ <p style="margin-bottom: 10px;">Check if the service is running:</p>
361
+ <pre><code>curl http://localhost:5000/api/health</code></pre>
362
+
363
+ <h4 style="margin-top: 20px;">GET /api/info</h4>
364
+ <p style="margin-bottom: 10px;">Get model information:</p>
365
+ <pre><code>curl http://localhost:5000/api/info</code></pre>
366
+ </div>
367
+ </div>
368
+ </div>
369
+
370
+ <script>
371
+ const form = document.getElementById('ttsForm');
372
+ const refAudioInput = document.getElementById('refAudio');
373
+ const fileNameDiv = document.getElementById('fileName');
374
+ const speedSlider = document.getElementById('speed');
375
+ const speedValue = document.getElementById('speedValue');
376
+ const submitBtn = document.getElementById('submitBtn');
377
+ const loading = document.getElementById('loading');
378
+ const error = document.getElementById('error');
379
+ const result = document.getElementById('result');
380
+ const audioPlayer = document.getElementById('audioPlayer');
381
+ const spectrogram = document.getElementById('spectrogram');
382
+
383
+ // Update file name display
384
+ refAudioInput.addEventListener('change', function(e) {
385
+ if (e.target.files.length > 0) {
386
+ fileNameDiv.textContent = '✅ ' + e.target.files[0].name;
387
+ }
388
+ });
389
+
390
+ // Update speed value display
391
+ speedSlider.addEventListener('input', function(e) {
392
+ speedValue.textContent = e.target.value + 'x';
393
+ });
394
+
395
+ // Handle form submission
396
+ form.addEventListener('submit', async function(e) {
397
+ e.preventDefault();
398
+
399
+ // Hide previous results and errors
400
+ result.style.display = 'none';
401
+ error.style.display = 'none';
402
+
403
+ // Show loading
404
+ loading.style.display = 'block';
405
+ submitBtn.disabled = true;
406
+
407
+ try {
408
+ const formData = new FormData(form);
409
+
410
+ const response = await fetch('/api/synthesize', {
411
+ method: 'POST',
412
+ body: formData
413
+ });
414
+
415
+ const data = await response.json();
416
+
417
+ if (response.ok && data.success) {
418
+ // Display audio
419
+ const audioBlob = base64ToBlob(data.audio, 'audio/wav');
420
+ const audioUrl = URL.createObjectURL(audioBlob);
421
+ audioPlayer.src = audioUrl;
422
+
423
+ // Display spectrogram
424
+ spectrogram.src = 'data:image/png;base64,' + data.spectrogram;
425
+
426
+ result.style.display = 'block';
427
+ } else {
428
+ throw new Error(data.error || 'Unknown error occurred');
429
+ }
430
+ } catch (err) {
431
+ error.textContent = '❌ ' + err.message;
432
+ error.style.display = 'block';
433
+ } finally {
434
+ loading.style.display = 'none';
435
+ submitBtn.disabled = false;
436
+ }
437
+ });
438
+
439
// Helper function to convert base64 to Blob
function base64ToBlob(base64, mimeType) {
    const decoded = atob(base64);
    const bytes = new Uint8Array(decoded.length);
    for (let idx = 0; idx < decoded.length; idx++) {
        bytes[idx] = decoded.charCodeAt(idx);
    }
    return new Blob([bytes], { type: mimeType });
}
449
+ </script>
450
+ </body>
451
+ </html>