Spaces:

mozilla-ai
/

speech-to-text-alignment

Running

App Files Community

kostissz committed on Apr 17

Commit

41deb18

verified ·

1 Parent(s): b7a92bd

Fix mic audio input

Browse files

Files changed (1) hide show

app.py +24 -5

app.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import csv
-from pathlib import Path
 from typing import Tuple
 import gradio as gr
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from whisper_bidec import decode_wav, get_logits_processor, load_corpus_from_sentences
 def _parse_file(file_path: str) -> list[str]:
@@ -22,9 +24,22 @@ def _parse_file(file_path: str) -> list[str]:
     return sentences
 def transcribe(
     processor_name: str,
-    audio: str,
     bias_strength: float,
     bias_text: str | None,
     bias_text_file: str | None,
@@ -36,21 +51,25 @@ def transcribe(
     if bias_text:
         sentences = bias_text.split(",")
-    elif Path(bias_text_file).is_file():
         sentences = _parse_file(bias_text_file)
     if sentences:
         corpus = load_corpus_from_sentences(sentences, processor)
         logits_processor = get_logits_processor(
             corpus=corpus, processor=processor, bias_towards_lm=bias_strength
         )
         text_with_bias = decode_wav(
-            model, processor, audio, logits_processor=logits_processor
         )
     else:
         text_with_bias = ""
-    text_no_bias = decode_wav(model, processor, audio, logits_processor=None)
     return text_no_bias, text_with_bias

 import csv
+import os
+import tempfile
 from typing import Tuple
 import gradio as gr
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from whisper_bidec import decode_wav, get_logits_processor, load_corpus_from_sentences
+from pydub import AudioSegment
 def _parse_file(file_path: str) -> list[str]:
     return sentences
+def _convert_audio(input_audio_path: str) -> str:
+    """Whisper decoder expects wav files with 16kHz sample rate and mono channel.
+    Convert the audio file to this format, save it in a tmp file and return the path.
+    """
+    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)  # Close file descriptor
+    audio = AudioSegment.from_file(input_audio_path)
+    audio = audio.set_channels(1).set_frame_rate(16000)
+    audio.export(tmp_path, format="wav")
+    return tmp_path
 def transcribe(
     processor_name: str,
+    audio_path: str,
     bias_strength: float,
     bias_text: str | None,
     bias_text_file: str | None,
     if bias_text:
         sentences = bias_text.split(",")
+    elif bias_text_file:
         sentences = _parse_file(bias_text_file)
+    converted_audio_path = _convert_audio(audio_path)
     if sentences:
         corpus = load_corpus_from_sentences(sentences, processor)
         logits_processor = get_logits_processor(
             corpus=corpus, processor=processor, bias_towards_lm=bias_strength
         )
         text_with_bias = decode_wav(
+            model, processor, converted_audio_path, logits_processor=logits_processor
         )
     else:
         text_with_bias = ""
+    text_no_bias = decode_wav(
+        model, processor, converted_audio_path, logits_processor=None
+    )
     return text_no_bias, text_with_bias