manhteky123 commited on
Commit
258b448
·
verified ·
1 Parent(s): 9c3db2a

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile +3 -2
  2. app.py +102 -102
  3. flask_app.py +184 -0
  4. requirements.txt +1 -0
  5. templates/index.html +451 -0
Dockerfile CHANGED
@@ -65,6 +65,7 @@ EXPOSE 7860
65
 
66
  # Set environment variables
67
  ENV PYTHONUNBUFFERED=1
 
68
 
69
- # Run the application
70
- CMD ["python", "app.py"]
 
65
 
66
  # Set environment variables
67
  ENV PYTHONUNBUFFERED=1
68
+ ENV FLASK_APP=flask_app.py
69
 
70
+ # Run the Flask application
71
+ CMD ["python", "flask_app.py"]
app.py CHANGED
@@ -1,103 +1,103 @@
1
- import spaces
2
- import os
3
- from huggingface_hub import login
4
- import gradio as gr
5
- from cached_path import cached_path
6
- import tempfile
7
- from vinorm import TTSnorm
8
-
9
- from f5_tts.model import DiT
10
- from f5_tts.infer.utils_infer import (
11
- preprocess_ref_audio_text,
12
- load_vocoder,
13
- load_model,
14
- infer_process,
15
- save_spectrogram,
16
- )
17
-
18
- # Retrieve token from secrets
19
- hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
20
-
21
-
22
- # Log in to Hugging Face
23
- if hf_token:
24
- login(token=hf_token)
25
-
26
- def post_process(text):
27
- text = " " + text + " "
28
- text = text.replace(" . . ", " . ")
29
- text = " " + text + " "
30
- text = text.replace(" .. ", " . ")
31
- text = " " + text + " "
32
- text = text.replace(" , , ", " , ")
33
- text = " " + text + " "
34
- text = text.replace(" ,, ", " , ")
35
- text = " " + text + " "
36
- text = text.replace('"', "")
37
- return " ".join(text.split())
38
-
39
- # Load models
40
- vocoder = load_vocoder()
41
- model = load_model(
42
- DiT,
43
- dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
44
- ckpt_path=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
45
- vocab_file=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
46
- )
47
-
48
- @spaces.GPU
49
- def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
50
-
51
- if not ref_audio_orig:
52
- raise gr.Error("Please upload a sample audio file.")
53
- if not gen_text.strip():
54
- raise gr.Error("Please enter the text content to generate voice.")
55
- if len(gen_text.split()) > 1000:
56
- raise gr.Error("Please enter text content with less than 1000 words.")
57
-
58
- try:
59
- ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
60
- final_wave, final_sample_rate, spectrogram = infer_process(
61
- ref_audio, ref_text.lower(), post_process(TTSnorm(gen_text)).lower(), model, vocoder, speed=speed
62
- )
63
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
64
- spectrogram_path = tmp_spectrogram.name
65
- save_spectrogram(spectrogram, spectrogram_path)
66
-
67
- return (final_sample_rate, final_wave), spectrogram_path
68
- except Exception as e:
69
- raise gr.Error(f"Error generating voice: {e}")
70
-
71
- # Gradio UI
72
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
73
- gr.Markdown("""
74
- # 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis.
75
- # The model was trained with approximately 1000 hours of data on a RTX 3090 GPU.
76
- Enter text and upload a sample voice to generate natural speech.
77
- """)
78
-
79
- with gr.Row():
80
- ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
81
- gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
82
-
83
- speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
84
- btn_synthesize = gr.Button("🔥 Generate Voice")
85
-
86
- with gr.Row():
87
- output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
88
- output_spectrogram = gr.Image(label="📊 Spectrogram")
89
-
90
- model_limitations = gr.Textbox(
91
- value="""1. This model may not perform well with numerical characters, dates, special characters, etc. => A text normalization module is needed.
92
- 2. The rhythm of some generated audios may be inconsistent or choppy => It is recommended to select clearly pronounced sample audios with minimal pauses for better synthesis quality.
93
- 3. Default, reference audio text uses the pho-whisper-medium model, which may not always accurately recognize Vietnamese, resulting in poor voice synthesis quality.
94
- 4. Inference with overly long paragraphs may produce poor results.""",
95
- label="❗ Model Limitations",
96
- lines=4,
97
- interactive=False
98
- )
99
-
100
- btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])
101
-
102
- # Run Gradio with share=True to get a gradio.live link
103
  demo.queue().launch()
 
1
+ import spaces
2
+ import os
3
+ from huggingface_hub import login
4
+ import gradio as gr
5
+ from cached_path import cached_path
6
+ import tempfile
7
+ from vinorm import TTSnorm
8
+
9
+ from f5_tts.model import DiT
10
+ from f5_tts.infer.utils_infer import (
11
+ preprocess_ref_audio_text,
12
+ load_vocoder,
13
+ load_model,
14
+ infer_process,
15
+ save_spectrogram,
16
+ )
17
+
18
+ # Retrieve token from secrets
19
+ hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
20
+
21
+
22
+ # Log in to Hugging Face
23
+ if hf_token:
24
+ login(token=hf_token)
25
+
26
def post_process(text):
    """Collapse doubled punctuation and stray double quotes, then normalize whitespace."""
    cleaned = " " + text + " "
    # Apply each doubled-punctuation fix, re-padding between passes exactly
    # as the sequential pad/replace steps did.
    for doubled, single in ((" . . ", " . "), (" .. ", " . "),
                            (" , , ", " , "), (" ,, ", " , ")):
        cleaned = cleaned.replace(doubled, single)
        cleaned = " " + cleaned + " "
    cleaned = cleaned.replace('"', "")
    return " ".join(cleaned.split())
38
+
39
+ # Load models
40
+ vocoder = load_vocoder()
41
+ model = load_model(
42
+ DiT,
43
+ dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
44
+ ckpt_path=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
45
+ vocab_file=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
46
+ )
47
+
48
@spaces.GPU
def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
    """Synthesize Vietnamese speech from text using a reference voice.

    Returns a ((sample_rate, waveform), spectrogram_png_path) pair for Gradio.
    Raises gr.Error on invalid input or any synthesis failure.
    """
    # Guard clauses: reject missing audio, empty text, and over-long text.
    if not ref_audio_orig:
        raise gr.Error("Please upload a sample audio file.")
    if not gen_text.strip():
        raise gr.Error("Please enter the text content to generate voice.")
    if len(gen_text.split()) > 1000:
        raise gr.Error("Please enter text content with less than 1000 words.")

    try:
        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
        normalized_text = post_process(TTSnorm(gen_text)).lower()
        final_wave, final_sample_rate, spectrogram = infer_process(
            ref_audio, ref_text.lower(), normalized_text, model, vocoder, speed=speed
        )
        # delete=False: Gradio serves the file from this path after we return.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name
            save_spectrogram(spectrogram, spectrogram_path)

        return (final_sample_rate, final_wave), spectrogram_path
    except Exception as e:
        raise gr.Error(f"Error generating voice: {e}")
70
+
71
# Gradio UI: inputs (reference audio, text, speed), outputs (audio, spectrogram).
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis.
    # The model was trained with approximately 1000 hours of data on a RTX 3090 GPU.
    Enter text and upload a sample voice to generate natural speech.
    """)

    with gr.Row():
        ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
        gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)

    speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
    btn_synthesize = gr.Button("🔥 Generate Voice")

    with gr.Row():
        output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
        output_spectrogram = gr.Image(label="📊 Spectrogram")

    model_limitations = gr.Textbox(
        value="""1. This model may not perform well with numerical characters, dates, special characters, etc. => A text normalization module is needed.
2. The rhythm of some generated audios may be inconsistent or choppy => It is recommended to select clearly pronounced sample audios with minimal pauses for better synthesis quality.
3. Default, reference audio text uses the pho-whisper-medium model, which may not always accurately recognize Vietnamese, resulting in poor voice synthesis quality.
4. Inference with overly long paragraphs may produce poor results.""",
        label="❗ Model Limitations",
        lines=4,
        interactive=False,
    )

    btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])

# Run Gradio with share=True to get a gradio.live link
demo.queue().launch()
flask_app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import tempfile
4
+ from flask import Flask, render_template, request, jsonify, send_file
5
+ from werkzeug.utils import secure_filename
6
+ from cached_path import cached_path
7
+ from vinorm import TTSnorm
8
+ from huggingface_hub import login
9
+ import numpy as np
10
+ import soundfile as sf
11
+
12
+ from f5_tts.model import DiT
13
+ from f5_tts.infer.utils_infer import (
14
+ preprocess_ref_audio_text,
15
+ load_vocoder,
16
+ load_model,
17
+ infer_process,
18
+ save_spectrogram,
19
+ )
20
+
21
# Flask application and upload policy.
app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024  # 50MB max file size
app.config['UPLOAD_FOLDER'] = tempfile.gettempdir()
app.config['ALLOWED_EXTENSIONS'] = {'wav', 'mp3', 'ogg', 'flac', 'm4a'}

# Authenticate with Hugging Face only when a token is present in the environment.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_token:
    login(token=hf_token)
32
+
33
def post_process(text):
    """Post-process text: de-duplicate punctuation, drop double quotes, collapse spacing."""
    padded = " " + text + " "
    replacements = (
        (" . . ", " . "),
        (" .. ", " . "),
        (" , , ", " , "),
        (" ,, ", " , "),
    )
    # Re-pad after every replacement, preserving the original pad/replace cadence.
    for before, after in replacements:
        padded = padded.replace(before, after)
        padded = " " + padded + " "
    padded = padded.replace('"', "")
    return " ".join(padded.split())
46
+
47
def allowed_file(filename):
    """Return True when *filename* has an extension in the configured whitelist."""
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in app.config['ALLOWED_EXTENSIONS']
50
+
51
# Load the vocoder and DiT model once at process startup so every request
# shares a single in-memory instance.
print("Loading models...")
vocoder = load_vocoder()
_DIT_CONFIG = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
model = load_model(
    DiT,
    _DIT_CONFIG,
    ckpt_path=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
    vocab_file=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
)
print("Models loaded successfully!")
61
+
62
@app.route('/')
def index():
    """Serve the single-page synthesis UI."""
    return render_template('index.html')
66
+
67
@app.route('/api/synthesize', methods=['POST'])
def synthesize():
    """
    API endpoint for text-to-speech synthesis.

    Form fields (multipart/form-data):
    - ref_audio: reference audio file (wav, mp3, ogg, flac, m4a)
    - gen_text:  text to synthesize (string, max 1000 words)
    - speed:     synthesis speed (float in [0.3, 2.0], default 1.0)

    Returns:
    - 200 JSON with base64-encoded audio + spectrogram on success
    - 400 JSON error for invalid input, 500 JSON error on synthesis failure
    """
    # Track every temp path so cleanup can run in `finally` — the original
    # only cleaned up on the success path, leaking files on any exception.
    filepath = audio_path = spec_path = None
    ref_audio = None
    try:
        # --- Validate request ---
        if 'ref_audio' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        file = request.files['ref_audio']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(file.filename):
            return jsonify({'error': 'Invalid file format. Allowed: wav, mp3, ogg, flac, m4a'}), 400

        gen_text = request.form.get('gen_text', '').strip()
        if not gen_text:
            return jsonify({'error': 'No text provided'}), 400

        if len(gen_text.split()) > 1000:
            return jsonify({'error': 'Text too long. Maximum 1000 words'}), 400

        # Malformed (non-numeric) speed is a client error, not a 500.
        try:
            speed = float(request.form.get('speed', 1.0))
        except (TypeError, ValueError):
            return jsonify({'error': 'Speed must be between 0.3 and 2.0'}), 400
        if speed < 0.3 or speed > 2.0:
            return jsonify({'error': 'Speed must be between 0.3 and 2.0'}), 400

        # --- Save uploaded file under a unique name ---
        # Fixed names ('output.wav', 'spectrogram.png') in a shared temp dir
        # race between concurrent requests; mkstemp gives each request its own
        # files. secure_filename may return '' for odd names, hence the fallback.
        safe_name = secure_filename(file.filename) or 'ref_audio'
        upload_suffix = os.path.splitext(safe_name)[1]
        fd, filepath = tempfile.mkstemp(suffix=upload_suffix, dir=app.config['UPLOAD_FOLDER'])
        os.close(fd)
        file.save(filepath)

        # --- Process audio and generate speech ---
        ref_audio, ref_text = preprocess_ref_audio_text(filepath, "")

        final_wave, final_sample_rate, spectrogram = infer_process(
            ref_audio,
            ref_text.lower(),
            post_process(TTSnorm(gen_text)).lower(),
            model,
            vocoder,
            speed=speed
        )

        # --- Encode results as base64 for the JSON response ---
        fd, audio_path = tempfile.mkstemp(suffix='.wav', dir=app.config['UPLOAD_FOLDER'])
        os.close(fd)
        sf.write(audio_path, final_wave, final_sample_rate)
        with open(audio_path, 'rb') as f:
            audio_base64 = base64.b64encode(f.read()).decode('utf-8')

        fd, spec_path = tempfile.mkstemp(suffix='.png', dir=app.config['UPLOAD_FOLDER'])
        os.close(fd)
        save_spectrogram(spectrogram, spec_path)
        with open(spec_path, 'rb') as f:
            spec_base64 = base64.b64encode(f.read()).decode('utf-8')

        return jsonify({
            'success': True,
            'audio': audio_base64,
            'spectrogram': spec_base64,
            'sample_rate': final_sample_rate,
            'message': 'Speech synthesized successfully'
        })

    except Exception as e:
        return jsonify({'error': f'Error generating speech: {str(e)}'}), 500
    finally:
        # Cleanup runs on success AND failure. ref_audio is presumably a path
        # string produced by preprocess_ref_audio_text (the original also
        # os.remove'd it) — guard with isinstance in case it is not.
        for path in (filepath, audio_path, spec_path, ref_audio):
            if isinstance(path, str) and path and os.path.exists(path):
                try:
                    os.remove(path)
                except OSError:
                    pass  # best-effort cleanup; never mask the real response
154
+
155
@app.route('/api/health', methods=['GET'])
def health():
    """Health check endpoint"""
    payload = {
        'status': 'healthy',
        'model': 'F5-TTS Vietnamese',
        'version': '1.0.0',
    }
    return jsonify(payload)
163
+
164
@app.route('/api/info', methods=['GET'])
def info():
    """Get model information and limitations"""
    limitations = [
        'May not perform well with numerical characters, dates, special characters',
        'Rhythm of some generated audios may be inconsistent or choppy',
        'Reference audio text uses pho-whisper-medium which may not always accurately recognize Vietnamese',
        'Inference with overly long paragraphs may produce poor results',
    ]
    return jsonify({
        'model_name': 'F5-TTS Vietnamese',
        'description': 'Vietnamese Text-to-Speech synthesis model trained on ~1000 hours of data',
        'limitations': limitations,
        'max_words': 1000,
        'speed_range': [0.3, 2.0],
        'supported_audio_formats': ['wav', 'mp3', 'ogg', 'flac', 'm4a'],
    })
180
+
181
if __name__ == '__main__':
    # Launch the Flask server; the PORT env var overrides the default of 5000.
    listen_port = int(os.environ.get('PORT', 5000))
    app.run(host='0.0.0.0', port=listen_port, debug=False)
requirements.txt CHANGED
@@ -7,6 +7,7 @@ vinorm
7
  cached_path
8
  huggingface_hub
9
  gradio
 
10
  accelerate>=0.33.0
11
  click
12
  datasets
 
7
  cached_path
8
  huggingface_hub
9
  gradio
10
+ flask
11
  accelerate>=0.33.0
12
  click
13
  datasets
templates/index.html ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="vi">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>F5-TTS Vietnamese - Text-to-Speech</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
16
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
17
+ min-height: 100vh;
18
+ padding: 20px;
19
+ }
20
+
21
+ .container {
22
+ max-width: 900px;
23
+ margin: 0 auto;
24
+ background: white;
25
+ border-radius: 20px;
26
+ box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
27
+ overflow: hidden;
28
+ }
29
+
30
+ .header {
31
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
32
+ color: white;
33
+ padding: 30px;
34
+ text-align: center;
35
+ }
36
+
37
+ .header h1 {
38
+ font-size: 2.5em;
39
+ margin-bottom: 10px;
40
+ }
41
+
42
+ .header p {
43
+ font-size: 1.1em;
44
+ opacity: 0.9;
45
+ }
46
+
47
+ .content {
48
+ padding: 40px;
49
+ }
50
+
51
+ .form-group {
52
+ margin-bottom: 25px;
53
+ }
54
+
55
+ label {
56
+ display: block;
57
+ font-weight: 600;
58
+ margin-bottom: 10px;
59
+ color: #333;
60
+ font-size: 1.1em;
61
+ }
62
+
63
+ .file-input-wrapper {
64
+ position: relative;
65
+ overflow: hidden;
66
+ display: inline-block;
67
+ width: 100%;
68
+ }
69
+
70
+ .file-input-wrapper input[type=file] {
71
+ position: absolute;
72
+ left: -9999px;
73
+ }
74
+
75
+ .file-input-label {
76
+ display: block;
77
+ padding: 15px 20px;
78
+ background: #f8f9fa;
79
+ border: 2px dashed #667eea;
80
+ border-radius: 10px;
81
+ cursor: pointer;
82
+ text-align: center;
83
+ transition: all 0.3s;
84
+ }
85
+
86
+ .file-input-label:hover {
87
+ background: #e7e9fc;
88
+ border-color: #764ba2;
89
+ }
90
+
91
+ .file-name {
92
+ margin-top: 10px;
93
+ font-size: 0.9em;
94
+ color: #666;
95
+ }
96
+
97
+ textarea {
98
+ width: 100%;
99
+ padding: 15px;
100
+ border: 2px solid #e0e0e0;
101
+ border-radius: 10px;
102
+ font-size: 1em;
103
+ resize: vertical;
104
+ min-height: 120px;
105
+ font-family: inherit;
106
+ transition: border-color 0.3s;
107
+ }
108
+
109
+ textarea:focus {
110
+ outline: none;
111
+ border-color: #667eea;
112
+ }
113
+
114
+ .slider-group {
115
+ margin-bottom: 25px;
116
+ }
117
+
118
+ .slider-label {
119
+ display: flex;
120
+ justify-content: space-between;
121
+ margin-bottom: 10px;
122
+ }
123
+
124
+ input[type="range"] {
125
+ width: 100%;
126
+ height: 8px;
127
+ border-radius: 5px;
128
+ background: #e0e0e0;
129
+ outline: none;
130
+ -webkit-appearance: none;
131
+ }
132
+
133
+ input[type="range"]::-webkit-slider-thumb {
134
+ -webkit-appearance: none;
135
+ appearance: none;
136
+ width: 20px;
137
+ height: 20px;
138
+ border-radius: 50%;
139
+ background: #667eea;
140
+ cursor: pointer;
141
+ }
142
+
143
+ input[type="range"]::-moz-range-thumb {
144
+ width: 20px;
145
+ height: 20px;
146
+ border-radius: 50%;
147
+ background: #667eea;
148
+ cursor: pointer;
149
+ }
150
+
151
+ .btn {
152
+ width: 100%;
153
+ padding: 15px;
154
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
155
+ color: white;
156
+ border: none;
157
+ border-radius: 10px;
158
+ font-size: 1.2em;
159
+ font-weight: 600;
160
+ cursor: pointer;
161
+ transition: transform 0.2s, box-shadow 0.2s;
162
+ }
163
+
164
+ .btn:hover {
165
+ transform: translateY(-2px);
166
+ box-shadow: 0 10px 20px rgba(102, 126, 234, 0.4);
167
+ }
168
+
169
+ .btn:disabled {
170
+ background: #ccc;
171
+ cursor: not-allowed;
172
+ transform: none;
173
+ }
174
+
175
+ .loading {
176
+ display: none;
177
+ text-align: center;
178
+ margin: 20px 0;
179
+ }
180
+
181
+ .spinner {
182
+ border: 4px solid #f3f3f3;
183
+ border-top: 4px solid #667eea;
184
+ border-radius: 50%;
185
+ width: 40px;
186
+ height: 40px;
187
+ animation: spin 1s linear infinite;
188
+ margin: 0 auto;
189
+ }
190
+
191
+ @keyframes spin {
192
+ 0% { transform: rotate(0deg); }
193
+ 100% { transform: rotate(360deg); }
194
+ }
195
+
196
+ .result {
197
+ display: none;
198
+ margin-top: 30px;
199
+ padding: 20px;
200
+ background: #f8f9fa;
201
+ border-radius: 10px;
202
+ }
203
+
204
+ .result h3 {
205
+ margin-bottom: 15px;
206
+ color: #333;
207
+ }
208
+
209
+ audio {
210
+ width: 100%;
211
+ margin-bottom: 15px;
212
+ }
213
+
214
+ .spectrogram {
215
+ width: 100%;
216
+ border-radius: 10px;
217
+ margin-top: 15px;
218
+ }
219
+
220
+ .error {
221
+ display: none;
222
+ padding: 15px;
223
+ background: #fee;
224
+ border-left: 4px solid #f44;
225
+ border-radius: 5px;
226
+ color: #c00;
227
+ margin-top: 20px;
228
+ }
229
+
230
+ .info-box {
231
+ background: #fff3cd;
232
+ border-left: 4px solid #ffc107;
233
+ padding: 15px;
234
+ border-radius: 5px;
235
+ margin-top: 30px;
236
+ }
237
+
238
+ .info-box h4 {
239
+ margin-bottom: 10px;
240
+ color: #856404;
241
+ }
242
+
243
+ .info-box ul {
244
+ margin-left: 20px;
245
+ }
246
+
247
+ .info-box li {
248
+ margin-bottom: 5px;
249
+ color: #856404;
250
+ }
251
+
252
+ .api-docs {
253
+ margin-top: 30px;
254
+ padding: 20px;
255
+ background: #f8f9fa;
256
+ border-radius: 10px;
257
+ }
258
+
259
+ .api-docs h3 {
260
+ margin-bottom: 15px;
261
+ color: #333;
262
+ }
263
+
264
+ .api-docs pre {
265
+ background: #2d2d2d;
266
+ color: #f8f8f2;
267
+ padding: 15px;
268
+ border-radius: 5px;
269
+ overflow-x: auto;
270
+ font-size: 0.9em;
271
+ }
272
+
273
+ .api-docs code {
274
+ font-family: 'Courier New', monospace;
275
+ }
276
+ </style>
277
+ </head>
278
+ <body>
279
+ <div class="container">
280
+ <div class="header">
281
+ <h1>🎤 F5-TTS Vietnamese</h1>
282
+ <p>Text-to-Speech Synthesis • Trained on ~1000 hours of data</p>
283
+ </div>
284
+
285
+ <div class="content">
286
+ <form id="ttsForm">
287
+ <div class="form-group">
288
+ <label>🔊 Sample Voice (Audio Reference)</label>
289
+ <div class="file-input-wrapper">
290
+ <input type="file" id="refAudio" name="ref_audio" accept="audio/*" required>
291
+ <label for="refAudio" class="file-input-label">
292
+ 📁 Click to upload audio file
293
+ </label>
294
+ </div>
295
+ <div class="file-name" id="fileName"></div>
296
+ </div>
297
+
298
+ <div class="form-group">
299
+ <label for="genText">📝 Text to Synthesize</label>
300
+ <textarea id="genText" name="gen_text" placeholder="Nhập văn bản tiếng Việt để tạo giọng nói..." required></textarea>
301
+ </div>
302
+
303
+ <div class="slider-group">
304
+ <div class="slider-label">
305
+ <label>⚡ Speed</label>
306
+ <span id="speedValue">1.0x</span>
307
+ </div>
308
+ <input type="range" id="speed" name="speed" min="0.3" max="2.0" step="0.1" value="1.0">
309
+ </div>
310
+
311
+ <button type="submit" class="btn" id="submitBtn">
312
+ 🔥 Generate Speech
313
+ </button>
314
+ </form>
315
+
316
+ <div class="loading" id="loading">
317
+ <div class="spinner"></div>
318
+ <p style="margin-top: 15px; color: #666;">Generating speech... Please wait...</p>
319
+ </div>
320
+
321
+ <div class="error" id="error"></div>
322
+
323
+ <div class="result" id="result">
324
+ <h3>🎧 Generated Audio</h3>
325
+ <audio id="audioPlayer" controls></audio>
326
+ <h3>📊 Spectrogram</h3>
327
+ <img id="spectrogram" class="spectrogram" alt="Spectrogram">
328
+ </div>
329
+
330
+ <div class="info-box">
331
+ <h4>❗ Model Limitations</h4>
332
+ <ul>
333
+ <li>May not perform well with numbers, dates, and special characters</li>
334
+ <li>Rhythm may be inconsistent with some texts</li>
335
+ <li>Works best with clear, well-pronounced reference audio</li>
336
+ <li>Maximum 1000 words per request</li>
337
+ </ul>
338
+ </div>
339
+
340
+ <div class="api-docs">
341
+ <h3>📡 API Documentation</h3>
342
+ <p style="margin-bottom: 15px;">Use the following endpoint to integrate with your application:</p>
343
+
344
+ <h4>POST /api/synthesize</h4>
345
+ <pre><code>curl -X POST http://localhost:5000/api/synthesize \
346
+ -F "ref_audio=@sample_voice.wav" \
347
+ -F "gen_text=Xin chào, đây là giọng nói tổng hợp" \
348
+ -F "speed=1.0"</code></pre>
349
+
350
+ <h4 style="margin-top: 20px;">Response:</h4>
351
+ <pre><code>{
352
+ "success": true,
353
+ "audio": "base64_encoded_audio_data",
354
+ "spectrogram": "base64_encoded_image_data",
355
+ "sample_rate": 24000,
356
+ "message": "Speech synthesized successfully"
357
+ }</code></pre>
358
+
359
+ <h4 style="margin-top: 20px;">GET /api/health</h4>
360
+ <p style="margin-bottom: 10px;">Check if the service is running:</p>
361
+ <pre><code>curl http://localhost:5000/api/health</code></pre>
362
+
363
+ <h4 style="margin-top: 20px;">GET /api/info</h4>
364
+ <p style="margin-bottom: 10px;">Get model information:</p>
365
+ <pre><code>curl http://localhost:5000/api/info</code></pre>
366
+ </div>
367
+ </div>
368
+ </div>
369
+
370
+ <script>
371
+ const form = document.getElementById('ttsForm');
372
+ const refAudioInput = document.getElementById('refAudio');
373
+ const fileNameDiv = document.getElementById('fileName');
374
+ const speedSlider = document.getElementById('speed');
375
+ const speedValue = document.getElementById('speedValue');
376
+ const submitBtn = document.getElementById('submitBtn');
377
+ const loading = document.getElementById('loading');
378
+ const error = document.getElementById('error');
379
+ const result = document.getElementById('result');
380
+ const audioPlayer = document.getElementById('audioPlayer');
381
+ const spectrogram = document.getElementById('spectrogram');
382
+
383
+ // Update file name display
384
+ refAudioInput.addEventListener('change', function(e) {
385
+ if (e.target.files.length > 0) {
386
+ fileNameDiv.textContent = '✅ ' + e.target.files[0].name;
387
+ }
388
+ });
389
+
390
+ // Update speed value display
391
+ speedSlider.addEventListener('input', function(e) {
392
+ speedValue.textContent = e.target.value + 'x';
393
+ });
394
+
395
+ // Handle form submission
396
+ form.addEventListener('submit', async function(e) {
397
+ e.preventDefault();
398
+
399
+ // Hide previous results and errors
400
+ result.style.display = 'none';
401
+ error.style.display = 'none';
402
+
403
+ // Show loading
404
+ loading.style.display = 'block';
405
+ submitBtn.disabled = true;
406
+
407
+ try {
408
+ const formData = new FormData(form);
409
+
410
+ const response = await fetch('/api/synthesize', {
411
+ method: 'POST',
412
+ body: formData
413
+ });
414
+
415
+ const data = await response.json();
416
+
417
+ if (response.ok && data.success) {
418
+ // Display audio
419
+ const audioBlob = base64ToBlob(data.audio, 'audio/wav');
420
+ const audioUrl = URL.createObjectURL(audioBlob);
421
+ audioPlayer.src = audioUrl;
422
+
423
+ // Display spectrogram
424
+ spectrogram.src = 'data:image/png;base64,' + data.spectrogram;
425
+
426
+ result.style.display = 'block';
427
+ } else {
428
+ throw new Error(data.error || 'Unknown error occurred');
429
+ }
430
+ } catch (err) {
431
+ error.textContent = '❌ ' + err.message;
432
+ error.style.display = 'block';
433
+ } finally {
434
+ loading.style.display = 'none';
435
+ submitBtn.disabled = false;
436
+ }
437
+ });
438
+
439
// Helper function to convert base64 to Blob
function base64ToBlob(base64, mimeType) {
    const decoded = atob(base64);
    const bytes = new Uint8Array(decoded.length);
    for (let idx = 0; idx < decoded.length; idx++) {
        bytes[idx] = decoded.charCodeAt(idx);
    }
    return new Blob([bytes], { type: mimeType });
}
449
+ </script>
450
+ </body>
451
+ </html>