Spaces:

badrex
/

JASRv1.1

Running on Zero

App Files Files Community

badrex committed on Oct 20

Commit

08eda6a

verified ·

1 Parent(s): 01eb311

Update app.py

Browse files

Files changed (1) hide show

app.py +231 -121

app.py CHANGED Viewed

@@ -1,143 +1,253 @@
-import gradio as gr
-from transformers import pipeline
-import numpy as np
 import os
-from huggingface_hub import login
 import spaces
-HF_TOKEN = os.environ.get("HF_TOKEN")
-if HF_TOKEN:
-    login(token=HF_TOKEN)
-MODEL_ID = "badrex/JASRv1.1"
-transcriber = pipeline("automatic-speech-recognition", model=MODEL_ID)
-# @spaces.GPU
-# def transcribe(audio):
-#     sr, y = audio
-#     # convert to mono if stereo
-#     #if y.ndim > 1:
-#     #    y = y.mean(axis=1)
-#     #y = y.astype(np.float32)
-#     #y /= np.max(np.abs(y))
-#     return transcriber({"sampling_rate": sr, "raw": y})["text"]
-# @spaces.GPU
-# def transcribe(audio):
-#     sr, y = audio
-#     # Convert stereo → mono
-#     if y.ndim > 1:
-#         y = np.mean(y, axis=1)
-#     # Ensure float32
-#     y = y.astype(np.float32)
-#     # Normalize to [-1, 1] if it's not already
-#     if np.max(np.abs(y)) > 1.0:
-#         y /= np.max(np.abs(y))
-@spaces.GPU
-def transcribe(audio):
-    sr, y = audio
-    # convert to mono if stereo
-    if y.ndim > 1:
-        y = y.mean(axis=1)
-    # resample to 16kHz if needed
-    if sr != 16000:
-        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-    return transcriber({"sampling_rate": sr, "raw": y})["text"]
-examples = []
-examples_dir = "examples"
-if os.path.exists(examples_dir):
-    for filename in os.listdir(examples_dir):
-        if filename.endswith((".wav", ".mp3", ".ogg")):
-            examples.append([os.path.join(examples_dir, filename)])
-    print(f"Found {len(examples)} example files")
-else:
-    print("Examples directory not found")
-# @spaces.GPU
-# def transcribe(audio):
-#     sr, y = audio
-#     if y.ndim > 1:
-#         y = np.mean(y, axis=1)
-#     y = y.astype(np.float32)
-#     # normalize to [-1, 1]
-#     max_val = np.max(np.abs(y))
-#     if max_val > 0:
-#         y /= max_val
-#     target_sr = transcriber.model.config.sampling_rate if hasattr(transcriber.model, "config") else 16000
-#     if sr != target_sr:
-#         import librosa
-#         y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
-#         sr = target_sr
-#     return transcriber({"sampling_rate": sr, "raw": y})["text"]
-demo = gr.Interface(
-    fn=transcribe,
-    inputs=gr.Audio(),
-    outputs="text",
-    title="<div>JASR v1.1 🎙️ <br>Speech Recognition for Dialectal Arabic</div>",
-    description="""
-        <div class="centered-content">
-            <div>
-                <p>
-                Developed with ❤ by <a href="https://badrex.github.io/" style="color: #2563eb;">Badr al-Absi</a>
-                </p>
-                <br>
-                <p style="font-size: 15px; line-height: 1.8;">
-                Marhaban 👋🏼
-                <br>
-                <br>
-                 This is a demo for JASR, pronounced <i>Jāsir</i> [جاسِر], a Transformer-based automatic speech recognition (ASR) system for dialectal Arabic.
-                 The current running instance is optimized for the regional dialects of <i>Jazirat al-Arab</i>, or the Arabian Peninsula.
-                 JASR is still under active development.
-                <br>
-                <p style="font-size: 15px; line-height: 1.8;">
-                Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
-                </p>
-            </div>
-        </div>
-        """,
-    examples=examples if examples else None,
-    example_labels=[
-        "Kuwait Theatre",
-        "Saudi Radio Poetry",
-        "News Report (MSA)",
-        "San3ani Arabic male",
-        "San3ani Arabic female",
-        "Khaleeji Theatre",
-        "TEDx KSA",
-        "Yousif Saif Football Commentary",
-        "Khaleeji Theatre 2",
-        "TV Drama",
-        "KSA Theatre",
-        "TV Drama 2",
-        "Radio Jeddah (KSA)",
-        "Omani Theatre",
-        "Khaleeji Drama",
-        "Radio News",
-        "TEDx KSA 2",
-        "Radio Jeddah (KSA) 2",
-    ],
-    cache_examples=False,
-    examples_per_page=18,
-    flagging_mode=None,
-)
-if __name__ == "__main__":
-    demo.launch()

 import os
+import gradio as gr
 import spaces
+import torch
+from transformers import AutoProcessor, AutoModelForCTC
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+# load examples
+examples = []
+examples_dir = "examples"
+if os.path.exists(examples_dir):
+    for filename in os.listdir(examples_dir):
+        if filename.endswith((".wav", ".mp3", ".ogg")):
+            examples.append([os.path.join(examples_dir, filename)])
+# Load model and processor
+MODEL_PATH = "badrex/JASRv1.1"
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+model = AutoModelForCTC.from_pretrained(MODEL_PATH)
+@spaces.GPU()
+def process_audio(audio_path):
+    """Process audio with return the generated respotextnse.
+    Args:
+        audio_path: Path to the audio file to be transcribed.
+    Returns:
+        String containing the transcribed text from the audio file, or an error message
+        if the audio file is missing.
+    """
+    if not audio_path:
+        return "Please upload an audio file."
+    inputs = inputs = processor(audio_path, sampling_rate=16000, return_tensors="pt")
+    inputs = inputs.to(device, dtype=torch.bfloat16)
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    outputs = torch.argmax(logits, dim=-1)
+    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+    return decoded_outputs[0]
+# Define Gradio interface
+with gr.Blocks(title="Voxtral Demo") as demo:
+    gr.Markdown("<div>JASR v1.1 🎙️ <br>Speech Recognition for Dialectal Arabic</div>")
+    gr.Markdown("Upload an audio file and get a transcription from JASR v1.1.")
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(type="filepath", label="Upload Audio")
+            # model_selector = gr.Dropdown(
+            #     choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
+            #     value="Voxtral Mini (3B)",
+            #     label="Select Model"
+            # )
+            # language = gr.Dropdown(
+            #     choices=list(LANGUAGES.keys()),
+            #     value="English",
+            #     label="Language"
+            # )
+            #max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
+            submit_btn = gr.Button("Transcribe Audio", variant="primary")
+        with gr.Column():
+            output_text = gr.Textbox(label="Text Transcription", lines=10)
+    submit_btn.click(
+        fn=process_audio,
+        inputs=[audio_input],
+        outputs=output_text
+    )
+    gr.Examples(
+        examples=examples if examples else None,
+        inputs=[audio_input],
+        example_labels=[
+            "Kuwait Theatre",
+            "Saudi Radio Poetry",
+            "News Report (MSA)",
+            "San3ani Arabic male",
+            "San3ani Arabic female",
+            "Khaleeji Theatre",
+            "TEDx KSA",
+            "Yousif Saif Football Commentary",
+            "Khaleeji Theatre 2",
+            "TV Drama",
+            "KSA Theatre",
+            "TV Drama 2",
+            "Radio Jeddah (KSA)",
+            "Omani Theatre",
+            "Khaleeji Drama",
+            "Radio News",
+            "TEDx KSA 2",
+            "Radio Jeddah (KSA) 2",
+        ],
+    )
+# Launch the app
+if __name__ == "__main__":
+    demo.queue().launch() #share=False, ssr_mode=False, mcp_server=True
+# import gradio as gr
+# from transformers import pipeline
+# import numpy as np
+# import os
+# from huggingface_hub import login
+# import spaces
+# HF_TOKEN = os.environ.get("HF_TOKEN")
+# if HF_TOKEN:
+#     login(token=HF_TOKEN)
+# MODEL_ID = "badrex/JASRv1.1"
+# transcriber = pipeline("automatic-speech-recognition", model=MODEL_ID)
+# # @spaces.GPU
+# # def transcribe(audio):
+# #     sr, y = audio
+# #     # convert to mono if stereo
+# #     #if y.ndim > 1:
+# #     #    y = y.mean(axis=1)
+# #     #y = y.astype(np.float32)
+# #     #y /= np.max(np.abs(y))
+# #     return transcriber({"sampling_rate": sr, "raw": y})["text"]
+# # @spaces.GPU
+# # def transcribe(audio):
+# #     sr, y = audio
+# #     # Convert stereo → mono
+# #     if y.ndim > 1:
+# #         y = np.mean(y, axis=1)
+# #     # Ensure float32
+# #     y = y.astype(np.float32)
+# #     # Normalize to [-1, 1] if it's not already
+# #     if np.max(np.abs(y)) > 1.0:
+# #         y /= np.max(np.abs(y))
+# @spaces.GPU
+# def transcribe(audio):
+#     sr, y = audio
+#     # convert to mono if stereo
+#     if y.ndim > 1:
+#         y = y.mean(axis=1)
+#     # resample to 16kHz if needed
+#     if sr != 16000:
+#         y = librosa.resample(y, orig_sr=sr, target_sr=16000)
+#     y = y.astype(np.float32)
+#     y /= np.max(np.abs(y))
+#     return transcriber({"sampling_rate": sr, "raw": y})["text"]
+# examples = []
+# examples_dir = "examples"
+# if os.path.exists(examples_dir):
+#     for filename in os.listdir(examples_dir):
+#         if filename.endswith((".wav", ".mp3", ".ogg")):
+#             examples.append([os.path.join(examples_dir, filename)])
+#     print(f"Found {len(examples)} example files")
+# else:
+#     print("Examples directory not found")
+# # @spaces.GPU
+# # def transcribe(audio):
+# #     sr, y = audio
+# #     if y.ndim > 1:
+# #         y = np.mean(y, axis=1)
+# #     y = y.astype(np.float32)
+# #     # normalize to [-1, 1]
+# #     max_val = np.max(np.abs(y))
+# #     if max_val > 0:
+# #         y /= max_val
+# #     target_sr = transcriber.model.config.sampling_rate if hasattr(transcriber.model, "config") else 16000
+# #     if sr != target_sr:
+# #         import librosa
+# #         y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
+# #         sr = target_sr
+# #     return transcriber({"sampling_rate": sr, "raw": y})["text"]
+# demo = gr.Interface(
+#     fn=transcribe,
+#     inputs=gr.Audio(),
+#     outputs="text",
+#     title="<div>JASR v1.1 🎙️ <br>Speech Recognition for Dialectal Arabic</div>",
+#     description="""
+#         <div class="centered-content">
+#             <div>
+#                 <p>
+#                 Developed with ❤ by <a href="https://badrex.github.io/" style="color: #2563eb;">Badr al-Absi</a>
+#                 </p>
+#                 <br>
+#                 <p style="font-size: 15px; line-height: 1.8;">
+#                 Marhaban 👋🏼
+#                 <br>
+#                 <br>
+#                  This is a demo for JASR, pronounced <i>Jāsir</i> [جاسِر], a Transformer-based automatic speech recognition (ASR) system for dialectal Arabic.
+#                  The current running instance is optimized for the regional dialects of <i>Jazirat al-Arab</i>, or the Arabian Peninsula.
+#                  JASR is still under active development.
+#                 <br>
+#                 <p style="font-size: 15px; line-height: 1.8;">
+#                 Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
+#                 </p>
+#             </div>
+#         </div>
+#         """,
+#     examples=examples if examples else None,
+#     example_labels=[
+#         "Kuwait Theatre",
+#         "Saudi Radio Poetry",
+#         "News Report (MSA)",
+#         "San3ani Arabic male",
+#         "San3ani Arabic female",
+#         "Khaleeji Theatre",
+#         "TEDx KSA",
+#         "Yousif Saif Football Commentary",
+#         "Khaleeji Theatre 2",
+#         "TV Drama",
+#         "KSA Theatre",
+#         "TV Drama 2",
+#         "Radio Jeddah (KSA)",
+#         "Omani Theatre",
+#         "Khaleeji Drama",
+#         "Radio News",
+#         "TEDx KSA 2",
+#         "Radio Jeddah (KSA) 2",
+#     ],
+#     cache_examples=False,
+#     examples_per_page=18,
+#     flagging_mode=None,
+# )
+# if __name__ == "__main__":
+#     demo.launch()