badrex committed
Commit 08eda6a · verified · 1 Parent(s): 01eb311

Update app.py

Files changed (1):
  app.py +231 -121
app.py CHANGED
@@ -1,143 +1,253 @@
- import gradio as gr
- from transformers import pipeline
- import numpy as np
  import os
- from huggingface_hub import login
  import spaces

- HF_TOKEN = os.environ.get("HF_TOKEN")
- if HF_TOKEN:
-     login(token=HF_TOKEN)
-
- MODEL_ID = "badrex/JASRv1.1"
- transcriber = pipeline("automatic-speech-recognition", model=MODEL_ID)

- # @spaces.GPU
- # def transcribe(audio):
- #     sr, y = audio
- #     # convert to mono if stereo
- #     #if y.ndim > 1:
- #     #    y = y.mean(axis=1)
- #     #y = y.astype(np.float32)
- #     #y /= np.max(np.abs(y))
- #     return transcriber({"sampling_rate": sr, "raw": y})["text"]

- # @spaces.GPU
- # def transcribe(audio):
- #     sr, y = audio

- #     # Convert stereo → mono
- #     if y.ndim > 1:
- #         y = np.mean(y, axis=1)

- #     # Ensure float32
- #     y = y.astype(np.float32)

- #     # Normalize to [-1, 1] if it's not already
- #     if np.max(np.abs(y)) > 1.0:
- #         y /= np.max(np.abs(y))

- @spaces.GPU
- def transcribe(audio):
-     sr, y = audio

-     # convert to mono if stereo
-     if y.ndim > 1:
-         y = y.mean(axis=1)

-     # resample to 16kHz if needed
-     if sr != 16000:
-         y = librosa.resample(y, orig_sr=sr, target_sr=16000)

-     y = y.astype(np.float32)
-     y /= np.max(np.abs(y))

-     return transcriber({"sampling_rate": sr, "raw": y})["text"]

- examples = []
- examples_dir = "examples"
- if os.path.exists(examples_dir):
-     for filename in os.listdir(examples_dir):
-         if filename.endswith((".wav", ".mp3", ".ogg")):
-             examples.append([os.path.join(examples_dir, filename)])

-     print(f"Found {len(examples)} example files")
- else:
-     print("Examples directory not found")

- # @spaces.GPU
- # def transcribe(audio):
- #     sr, y = audio

- #     if y.ndim > 1:
- #         y = np.mean(y, axis=1)

- #     y = y.astype(np.float32)

- #     # normalize to [-1, 1]
- #     max_val = np.max(np.abs(y))
- #     if max_val > 0:
- #         y /= max_val

- #     target_sr = transcriber.model.config.sampling_rate if hasattr(transcriber.model, "config") else 16000
- #     if sr != target_sr:
- #         import librosa
- #         y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
- #         sr = target_sr

- #     return transcriber({"sampling_rate": sr, "raw": y})["text"]

- demo = gr.Interface(
-     fn=transcribe,
-     inputs=gr.Audio(),
-     outputs="text",
-     title="<div>JASR v1.1 🎙️ <br>Speech Recognition for Dialectal Arabic</div>",
-     description="""
-     <div class="centered-content">
-         <div>
-             <p>
-                 Developed with ❤ by <a href="https://badrex.github.io/" style="color: #2563eb;">Badr al-Absi</a>
-             </p>
-             <br>
-             <p style="font-size: 15px; line-height: 1.8;">
-                 Marhaban 👋🏼
-                 <br>
-                 <br>
-                 This is a demo for JASR, pronounced <i>Jāsir</i> [جاسِر], a Transformer-based automatic speech recognition (ASR) system for dialectal Arabic.
-                 The current running instance is optimized for the regional dialects of <i>Jazirat al-Arab</i>, or the Arabian Peninsula.
-                 JASR is still under active development.
-                 <br>
-             <p style="font-size: 15px; line-height: 1.8;">
-                 Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
-             </p>
-         </div>
-     </div>
-     """,
-     examples=examples if examples else None,
-     example_labels=[
-         "Kuwait Theatre",
-         "Saudi Radio Poetry",
-         "News Report (MSA)",
-         "San3ani Arabic male",
-         "San3ani Arabic female",
-         "Khaleeji Theatre",
-         "TEDx KSA",
-         "Yousif Saif Football Commentary",
-         "Khaleeji Theatre 2",
-         "TV Drama",
-         "KSA Theatre",
-         "TV Drama 2",
-         "Radio Jeddah (KSA)",
-         "Omani Theatre",
-         "Khaleeji Drama",
-         "Radio News",
-         "TEDx KSA 2",
-         "Radio Jeddah (KSA) 2",
-     ],
-     cache_examples=False,
-     examples_per_page=18,
-     flagging_mode=None,
- )
-
- if __name__ == "__main__":
-     demo.launch()

  import os
+ import gradio as gr
  import spaces
+ import torch
+ import librosa
+ from transformers import AutoProcessor, AutoModelForCTC
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+ # load examples
+ examples = []
+ examples_dir = "examples"
+ if os.path.exists(examples_dir):
+     for filename in os.listdir(examples_dir):
+         if filename.endswith((".wav", ".mp3", ".ogg")):
+             examples.append([os.path.join(examples_dir, filename)])
+
+ # Load model and processor, moving the model to the active device
+ MODEL_PATH = "badrex/JASRv1.1"
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
+ model = AutoModelForCTC.from_pretrained(MODEL_PATH).to(device)
+
+ @spaces.GPU()
+ def process_audio(audio_path):
+     """Transcribe the given audio file and return the decoded text.
+
+     Args:
+         audio_path: Path to the audio file to be transcribed.
+     Returns:
+         String containing the transcribed text from the audio file, or an
+         error message if the audio file is missing.
+     """
+     if not audio_path:
+         return "Please upload an audio file."
+
+     # Wav2Vec2-style CTC processors expect a raw waveform array rather than
+     # a file path, so load the file as 16 kHz mono first
+     speech, _ = librosa.load(audio_path, sr=16000, mono=True)
+     inputs = processor(speech, sampling_rate=16000, return_tensors="pt")
+     # keep inputs in float32 to match the model weights
+     inputs = inputs.to(device)
+
+     with torch.no_grad():
+         logits = model(**inputs).logits
+
+     # greedy CTC decoding: pick the most likely token per frame and let the
+     # processor collapse repeats and strip blank tokens
+     predicted_ids = torch.argmax(logits, dim=-1)
+     decoded_outputs = processor.batch_decode(predicted_ids)
+
+     return decoded_outputs[0]
 
 
 
+ # Define Gradio interface
+ with gr.Blocks(title="JASR v1.1 Demo") as demo:
+     gr.Markdown("<div>JASR v1.1 🎙️ <br>Speech Recognition for Dialectal Arabic</div>")
+     gr.Markdown("Upload an audio file and get a transcription from JASR v1.1.")
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(type="filepath", label="Upload Audio")
+
+             submit_btn = gr.Button("Transcribe Audio", variant="primary")
+
+         with gr.Column():
+             output_text = gr.Textbox(label="Text Transcription", lines=10)
+
+     submit_btn.click(
+         fn=process_audio,
+         inputs=[audio_input],
+         outputs=output_text,
+     )
+
+     gr.Examples(
+         examples=examples if examples else None,
+         inputs=[audio_input],
+         example_labels=[
+             "Kuwait Theatre",
+             "Saudi Radio Poetry",
+             "News Report (MSA)",
+             "San3ani Arabic male",
+             "San3ani Arabic female",
+             "Khaleeji Theatre",
+             "TEDx KSA",
+             "Yousif Saif Football Commentary",
+             "Khaleeji Theatre 2",
+             "TV Drama",
+             "KSA Theatre",
+             "TV Drama 2",
+             "Radio Jeddah (KSA)",
+             "Omani Theatre",
+             "Khaleeji Drama",
+             "Radio News",
+             "TEDx KSA 2",
+             "Radio Jeddah (KSA) 2",
+         ],
+     )
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.queue().launch()  # share=False, ssr_mode=False, mcp_server=True
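
For quick local testing outside the Space, a minimal sketch along the lines of the previous pipeline-based version of this file (assuming the badrex/JASRv1.1 checkpoint remains compatible with the transformers ASR pipeline; the example filename is illustrative):

    from transformers import pipeline

    # the ASR pipeline accepts a file path directly and handles decoding
    transcriber = pipeline("automatic-speech-recognition", model="badrex/JASRv1.1")
    print(transcriber("examples/sample.wav")["text"])  # hypothetical file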
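
Both versions assume 16 kHz mono float32 input. A small helper sketch for preparing arbitrary audio before calling the processor, mirroring the mono/resample/normalize steps of the removed transcribe() (the helper name is illustrative):

    import numpy as np
    import librosa

    def prepare_audio(path: str) -> np.ndarray:
        # load as mono and resample to 16 kHz in one step
        y, _ = librosa.load(path, sr=16000, mono=True)
        y = y.astype(np.float32)
        # peak-normalize to [-1, 1], guarding against silent clips
        peak = np.max(np.abs(y))
        return y / peak if peak > 0 else y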