Spaces:

sheep52031
/

mediatek-asr-test

Runtime error

File size: 5,533 Bytes

1a073e6
 
 
c989afe
1a073e6
 
 
 
 
 
 
 
 
c989afe
 
 
 
 
 
1a073e6
c989afe
1a073e6
 
c989afe
 
 
 
1a073e6
 
 
 
 
 
 
 
c989afe
 
1a073e6
 
 
 
 
 
c989afe
1a073e6
c989afe
1a073e6
c989afe
 
 
1a073e6
 
 
 
c989afe
 
 
 
 
 
1a073e6
c989afe
 
 
1a073e6
 
c989afe
 
1a073e6
 
 
 
c989afe
 
 
 
 
 
 
 
 
 
 
1a073e6
 
 
 
c989afe
 
 
 
 
 
 
 
 
 
1a073e6
 
 
c989afe
1a073e6
 
c989afe
 
1a073e6
c989afe
 
 
 
 
 
 
 
 
 
1a073e6
 
c989afe
1a073e6
c989afe
1a073e6
 
 
c989afe
1a073e6
 
c989afe
 
1a073e6
c989afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a073e6
c989afe
1a073e6

"""
MediaTek Breeze-ASR-25 台灣國語識別測試 Space
適用於 HuggingFace Zero GPU Spaces 部署
修復版：解決 ZeroGPU 會話間模型載入問題
"""

import gradio as gr
import spaces
from transformers import pipeline
import torch
import time
import torchaudio

@spaces.GPU(duration=60)
def transcribe_audio(audio_file):
    """Transcribe an audio file with Breeze-ASR-25 and report timing metrics.

    The ASR pipeline is created inside every call rather than once at import
    time (the file describes this as a ZeroGPU limitation), so the reported
    total time and RTF include the model load.

    Args:
        audio_file: Filesystem path of the uploaded audio, or None/empty when
            nothing was uploaded.

    Returns:
        A 3-tuple of strings ``(transcript, performance, status)``. When no
        file is given the first element is a prompt to upload one; on any
        processing error the first element is the error message and the
        performance string is empty.
    """
    if not audio_file:
        return "❌ 請上傳音訊檔案", "", ""

    start_total = time.time()

    try:
        # Measure the clip first: an unreadable or zero-length file should
        # fail here, before any GPU time is spent loading the model, and it
        # must never reach the RTF division below.
        waveform, sample_rate = torchaudio.load(audio_file)
        audio_duration = waveform.shape[1] / sample_rate
        if audio_duration <= 0:
            raise ValueError("音訊長度為 0，無法進行識別")

        # Build the pipeline on every call (ZeroGPU limitation).
        print("🔄 載入 MediaTek Breeze-ASR-25 模型...")
        model_load_start = time.time()

        asr_model = pipeline(
            "automatic-speech-recognition",
            model="MediaTek-Research/Breeze-ASR-25",
            torch_dtype=torch.float16,
            device="cuda",
            return_timestamps=True
        )

        model_load_time = time.time() - model_load_start
        print(f"✅ 模型載入完成 ({model_load_time:.2f}s)")

        # Run ASR inference.
        inference_start = time.time()
        result = asr_model(audio_file)
        inference_time = time.time() - inference_start

        # RTF is based on the total wall time, i.e. it includes model loading.
        total_time = time.time() - start_total
        rtf = total_time / audio_duration

        # The pipeline normally returns a dict; fall back to str() otherwise.
        transcript = result["text"] if isinstance(result, dict) else str(result)

        # Report the GPU memory currently allocated by torch, when available.
        gpu_info = ""
        if torch.cuda.is_available():
            gpu_memory = torch.cuda.memory_allocated() / 1024**3
            gpu_info = f"💾 GPU 記憶體: {gpu_memory:.2f}GB"

        # Format the performance report shown in the UI.
        performance = f"""⏱️ 總處理時間: {total_time:.2f}s
🔄 模型載入時間: {model_load_time:.2f}s  
🎯 推論時間: {inference_time:.2f}s
🎵 音訊長度: {audio_duration:.2f}s
📈 RTF: {rtf:.3f} ({'實時' if rtf < 1.0 else '非實時'})
💾 模型: MediaTek Breeze-ASR-25
{gpu_info}"""

        return transcript, performance, "✅ 識別成功"

    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing
        # the Gradio event handler.
        error_msg = f"❌ 處理失敗: {str(e)}"
        print(error_msg)
        return error_msg, "", "❌ 處理失敗"

def get_model_info():
    """Return the static model description shown in the UI (runs on CPU)."""
    info_lines = (
        "🤖 MediaTek Breeze-ASR-25 模型資訊:",
        "- 基於 Whisper 架構，專為台灣國語優化",
        "- 支援繁體中文語音識別",
        "- ZeroGPU 動態載入模式",
        "- 每次推論重新載入以確保穩定性",
    )
    # Joined without a trailing newline, exactly like the original literal.
    return "\n".join(info_lines)

# Gradio interface: model info, audio upload on the left, results on the right.
with gr.Blocks(title="MediaTek ASR 台灣國語測試") as demo:
    gr.Markdown("# 🎤 MediaTek Breeze-ASR-25 台灣國語識別測試")
    gr.Markdown("**專為台灣國語優化的語音識別測試平台**")

    # Collapsible panel with the static model description.
    with gr.Accordion("🤖 模型資訊", open=False):
        model_info = gr.Textbox(
            value=get_model_info(),
            label="模型詳細資訊",
            lines=6,
            interactive=False
        )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎙️ 音訊輸入")
            # type="filepath" hands transcribe_audio a path on disk.
            audio_input = gr.Audio(
                type="filepath",
                label="上傳音訊檔案 (wav, mp3, m4a)",
                format="wav"
            )

            gr.Markdown("### 📋 測試說明")
            gr.Markdown("""
            - 🎯 上傳 5-60 秒的台灣國語音訊
            - 🔊 建議使用清晰、低噪音的錄音
            - ⚡ 每次識別會重新載入模型 (ZeroGPU 限制)
            - 📊 系統會顯示詳細的性能指標
            """)

            submit_btn = gr.Button("🚀 開始識別", variant="primary", size="lg")

        with gr.Column():
            gr.Markdown("### 📄 識別結果")
            transcript_output = gr.Textbox(
                label="✨ 識別文字",
                lines=5,
                placeholder="識別結果將顯示在這裡..."
            )

            performance_output = gr.Textbox(
                label="⚡ 性能指標",
                lines=8,
                placeholder="性能數據將顯示在這裡..."
            )

            status_output = gr.Textbox(
                label="📊 處理狀態",
                lines=2
            )

    # Usage examples for programmatic access.
    with gr.Accordion("📖 使用範例與 API", open=False):
        gr.Markdown("""
        ## 🔗 Gradio Client API 使用
        
        ```python
        from gradio_client import Client
        
        client = Client("sheep52031/mediatek-asr-test")
        result = client.predict("audio_file.wav", api_name="/predict")
        
        transcript = result[0]    # 識別文字
        performance = result[1]   # 性能指標  
        status = result[2]        # 處理狀態
        ```
        
        ## 📊 評估指標
        - **RTF < 1.0**: 實時處理能力
        - **準確度**: 台灣國語識別正確率
        - **處理時間**: 總耗時包含模型載入
        """)

    # Event binding. api_name is set explicitly so the endpoint is exposed as
    # "/predict", matching the client example documented above; without it the
    # endpoint would not be published under that name.
    submit_btn.click(
        transcribe_audio,
        inputs=[audio_input],
        outputs=[transcript_output, performance_output, status_output],
        api_name="predict"
    )

if __name__ == "__main__":
    demo.launch()