Update app.py

app.py CHANGED
@@ -21,10 +21,8 @@ from gpuinfo import GPUInfo
 
 import wave
 import contextlib
-
+from transformers import pipeline
 import psutil
-num_cores = psutil.cpu_count()
-os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
 
 whisper_models = ["base", "small", "medium", "large"]
 source_languages = {
@@ -128,16 +126,60 @@ source_languages = {
     "jw": "Javanese",
     "su": "Sundanese",
 }
+
+source_language_list = [key[0] for key in source_languages.items()]
+
+MODEL_NAME = "vumichien/whisper-medium-jp"
+lang = "ja"
+
+device = 0 if torch.cuda.is_available() else "cpu"
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    chunk_length_s=30,
+    device=device,
+)
+
+pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+
 embedding_model = PretrainedSpeakerEmbedding(
     "speechbrain/spkrec-ecapa-voxceleb",
-    device=torch.device("cuda"))
+    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 
-print("DEVICE IS: ")
-print(device)
+def transcribe(microphone, file_upload):
+    warn_output = ""
+    if (microphone is not None) and (file_upload is not None):
+        warn_output = (
+            "WARNING: You've uploaded an audio file and used the microphone. "
+            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+        )
+    elif (microphone is None) and (file_upload is None):
+        return "ERROR: You have to either use the microphone or upload an audio file"
+
+    file = microphone if microphone is not None else file_upload
+
+    text = pipe(file)["text"]
+
+    return warn_output + text
+
+def _return_yt_html_embed(yt_url):
+    video_id = yt_url.split("?v=")[-1]
+    HTML_str = (
+        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+        " </center>"
+    )
+    return HTML_str
+
+def yt_transcribe(yt_url):
+    yt = YouTube(yt_url)
+    html_embed_str = _return_yt_html_embed(yt_url)
+    stream = yt.streams.filter(only_audio=True)[0]
+    stream.download(filename="audio.mp3")
+
+    text = pipe("audio.mp3")["text"]
+
+    return html_embed_str, text
 
 def convert_time(secs):
     return datetime.timedelta(seconds=round(secs))
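The Japanese ASR path added above is self-contained and can be smoke-tested outside the Space. A minimal sketch, using the same `transformers` calls as the diff; `sample1.wav` is one of the example files referenced later in this commit:

```python
# Standalone sketch of the new transcription path; mirrors the hunk above.
import torch
from transformers import pipeline

MODEL_NAME = "vumichien/whisper-medium-jp"
device = 0 if torch.cuda.is_available() else "cpu"  # GPU index, or CPU fallback

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,  # process long audio in 30-second chunks
    device=device,
)
# Pin the decoder to Japanese transcription instead of language auto-detection.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language="ja", task="transcribe"
)

print(pipe("sample1.wav")["text"])
```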
@@ -149,14 +191,12 @@ def get_youtube(video_url):
     print(abs_video_path)
     return abs_video_path
 
-
 def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
     """
     # Transcribe youtube link using OpenAI Whisper
-
-
-
-    3. Run automatic speech recognition and diarization (speaker identification)
+    1. Using Open AI's Whisper model to separate audio into segments and generate transcripts.
+    2. Generating speaker embeddings for each segment.
+    3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
 
     Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
     Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
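Step 3 of the updated docstring (agglomerative clustering of speaker embeddings) is the one step not otherwise visible in this diff. A minimal sketch of the idea, assuming scikit-learn; the embedding array is stand-in data, not taken from app.py (192 matches the ECAPA-TDNN embedding size):

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

num_speakers = 2
# One embedding per transcript segment (stand-in values for illustration).
embeddings = np.random.rand(12, 192)

# Group segments into num_speakers clusters; labels[i] is segment i's speaker.
labels = AgglomerativeClustering(n_clusters=num_speakers).fit_predict(embeddings)
speakers = [f"SPEAKER {label + 1}" for label in labels]  # e.g. "SPEAKER 1"
```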
@@ -257,7 +297,6 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
 # Inspiration from https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
 video_in = gr.Video(label="Video file", mirror_webcam=False)
 youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
-
 df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
 memory = psutil.virtual_memory()
 selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
@@ -265,72 +304,111 @@ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value
 number_speakers = gr.Number(precision=0, value=2, label="Selected number of speakers", interactive=True)
 system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
 transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10, wrap=True, overflow_row_behaviour='paginate')
-
 title = "Whisper speaker diarization"
 demo = gr.Blocks(title=title)
 demo.encrypt = False
 
+
 with demo:
-    gr.Markdown('''
-        <div>
-        <h1 style='text-align: center'>Whisper speaker diarization</h1>
-        This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recognize the speech and the ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers
-        </div>
-    ''')
-    with gr.Row():
-        gr.Markdown('''
-            ### Transcribe youtube link using OpenAI Whisper
-            ##### 1. Using Open AI's Whisper model to separate audio into segments and generate transcripts.
-            ##### 2. Generating speaker embeddings for each segment.
-            ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
-        ''')
-    with gr.Row():
-        gr.Markdown('''
-            ### You can test by following examples:
-        ''')
-        examples = gr.Examples(examples=
-            [ "https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
-              "https://www.youtube.com/watch?v=-UX0X45sYe4",
-              "https://www.youtube.com/watch?v=7minSgqi-Gw"],
-            label="Examples", inputs=[youtube_url_in])
-
-    with gr.Row():
-        with gr.Column():
-            youtube_url_in.render()
-            download_youtube_btn = gr.Button("Download Youtube video")
-            download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
-            print(video_in)
-
-    with gr.Row():
-        with gr.Column():
-            video_in.render()
-        with gr.Column():
-            gr.Markdown('''
-                ##### Here you can start the transcription process.
-                ##### Please select the source language for transcription.
-                ##### You should select a number of speakers for getting better results.
-            ''')
-            selected_source_lang.render()
-            selected_whisper_model.render()
-            number_speakers.render()
-            transcribe_btn = gr.Button("Transcribe audio and diarization")
-            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model, number_speakers], [transcription_df, system_info])
-
-    with gr.Row():
-        gr.Markdown('''
-            ##### Here you will get transcription output
-        ''')
-
-    with gr.Row():
-        with gr.Column():
-            transcription_df.render()
-            system_info.render()
-            gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'></center>''')
+    with gr.Tab("Whisper speaker diarization"):
+        gr.Markdown('''
+            <div>
+            <h1 style='text-align: center'>Whisper speaker diarization</h1>
+            This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recognize the speech and the ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers
+            </div>
+        ''')
+
+        with gr.Row():
+            gr.Markdown('''
+                ### Transcribe youtube link using OpenAI Whisper
+                ##### 1. Using Open AI's Whisper model to separate audio into segments and generate transcripts.
+                ##### 2. Generating speaker embeddings for each segment.
+                ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+            ''')
+
+        with gr.Row():
+            gr.Markdown('''
+                ### You can test by following examples:
+            ''')
+            examples = gr.Examples(examples=
+                [ "https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
+                  "https://www.youtube.com/watch?v=-UX0X45sYe4",
+                  "https://www.youtube.com/watch?v=7minSgqi-Gw"],
+                label="Examples", inputs=[youtube_url_in])
+
+        with gr.Row():
+            with gr.Column():
+                youtube_url_in.render()
+                download_youtube_btn = gr.Button("Download Youtube video")
+                download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+                print(video_in)
+
+        with gr.Row():
+            with gr.Column():
+                video_in.render()
+            with gr.Column():
+                gr.Markdown('''
+                    ##### Here you can start the transcription process.
+                    ##### Please select the source language for transcription.
+                    ##### You should select a number of speakers for getting better results.
+                ''')
+                selected_source_lang.render()
+                selected_whisper_model.render()
+                number_speakers.render()
+                transcribe_btn = gr.Button("Transcribe audio and diarization")
+                transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model, number_speakers], [transcription_df, system_info])
+
+        with gr.Row():
+            gr.Markdown('''
+                ##### Here you will get transcription output
+            ''')
+
+        with gr.Row():
+            with gr.Column():
+                transcription_df.render()
+                system_info.render()
+                gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'></center>''')
+
+    with gr.Tab("Whisper Transcribe Japanese Audio"):
+        gr.Markdown(f'''
+            <div>
+            <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
+            </div>
+            Transcribe long-form microphone or audio inputs with the click of a button! This demo uses the fine-tuned
+            checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+        ''')
+        microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
+        upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
+        transcribe_btn = gr.Button("Transcribe Audio")
+        text_output = gr.Textbox()
+        with gr.Row():
+            gr.Markdown('''
+                ### You can test by following examples:
+            ''')
+        examples = gr.Examples(examples=
+            [ "sample1.wav",
+              "sample2.wav",
+            ],
+            label="Examples", inputs=[upload])
+        transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
+
+    with gr.Tab("Whisper Transcribe Japanese YouTube"):
+        gr.Markdown(f'''
+            <div>
+            <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
+            </div>
+            Transcribe long-form YouTube videos with the click of a button! This demo uses the fine-tuned checkpoint
+            <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+        ''')
+        youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+        yt_transcribe_btn = gr.Button("Transcribe YouTube")
+        text_output2 = gr.Textbox()
+        html_output = gr.Markdown()
+        yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])
 
 demo.launch(debug=True)
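Stripped of content, the UI change in the last hunk is a move from one flat Blocks page to three tabs. A bare-bones sketch of the resulting layout, with placeholder bodies, under the gradio 3.x-era API the diff uses:

```python
import gradio as gr

demo = gr.Blocks(title="Whisper speaker diarization")
with demo:
    with gr.Tab("Whisper speaker diarization"):
        pass  # video/YouTube inputs, model options, transcription dataframe
    with gr.Tab("Whisper Transcribe Japanese Audio"):
        pass  # microphone/file upload wired to transcribe()
    with gr.Tab("Whisper Transcribe Japanese YouTube"):
        pass  # YouTube URL wired to yt_transcribe()
demo.launch(debug=True)
```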