Spaces:

ACloudCenter
/

canary-qwen-transcriber-2.5b

Runtime error

App Files Files Community

ACloudCenter committed on Sep 2, 2025

Commit

b39fef1

1 Parent(s): 3bf9123

Modify main app to remove redundant chatbot issues

Browse files

Files changed (1) hide show

app.py +51 -86

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import os
-# Set tokenizers parallelism to avoid fork warning in Spaces
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import gradio as gr
@@ -59,52 +58,30 @@ def transcribe_audio(audio_filepath):
     return transcript, transcript
-# Streaming Q&A function
 @spaces.GPU
 def transcript_qa(transcript, question, history):
     if not transcript:
-        yield history + [{"role": "assistant", "content": "Please transcribe audio first"}], ""
-        return
     if not question:
-        yield history, ""
-        return
-    # Add user message to history
-    history = history + [{"role": "user", "content": question}]
-    # Add placeholder for assistant response
-    history = history + [{"role": "assistant", "content": ""}]
     with torch.inference_mode(), model.llm.disable_adapter():
-        # For streaming, we'd need to use a different generation method
-        # Since model.generate doesn't support streaming, we'll generate full response
-        # but simulate streaming for better UX
         output_ids = model.generate(
-            prompts=[[{"role": "user", "content": f"When answering questions about the transcript, use markdown when appropriate, such as lists, bullet points, and code blocks: {question}\n\n{transcript}"}]],
-            max_new_tokens=2048,  # Reduced for faster responses
         )
     ans = model.tokenizer.ids_to_text(output_ids[0].cpu())
-    ans = ans.split("<|im_start|>assistant")[-1]  # get rid of the prompt
     if "<think>" in ans:
         if "</think>" in ans:
             ans = ans.split("<think>")[-1]
-            _, ans = ans.split("</think>")  # get rid of the thinking
-        ans = ans.strip()
-    # Simulate streaming by yielding words progressively
-    words = ans.split()
-    current_response = ""
-    for i, word in enumerate(words):
-        current_response += word + " "
-        history[-1] = {"role": "assistant", "content": current_response.strip()}
-        yield history, ""
-        # Small delay to make streaming more visible
-        if i % 3 == 0:  # Try every 3 words for smoother streaming
-            import time
-            time.sleep(0.01)
 def disable_transcribe():
     return gr.update(interactive=False)
@@ -113,7 +90,7 @@ def enable_transcribe():
     return gr.update(interactive=True)
 def reset_chatbot():
-    return [], []  # Reset both chatbot display and state
 # Build the Gradio interface
 with gr.Blocks(theme=theme) as demo:
@@ -179,12 +156,11 @@ with gr.Blocks(theme=theme) as demo:
                 its transcript. This model is ready for commercial use.''')
     # State variables
-    transcript_state = gr.State()
-    chatbot_state = gr.State()
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### Step1 - Audio Input")
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
@@ -194,7 +170,7 @@ with gr.Blocks(theme=theme) as demo:
             transcribe_btn = gr.Button("Transcribe Audio", variant="primary", size="lg")
         with gr.Column(scale=1):
-            gr.Markdown("### Step2 - Transcript")
             transcript_output = gr.Textbox(
                 label="",
                 lines=10,
@@ -202,71 +178,60 @@ with gr.Blocks(theme=theme) as demo:
                 max_lines=10
             )
-    gr.Markdown("### Step3 - Interactive Q&A")
-    # Create a wrapper function for Chatbot
-    def qa_wrapper(message, history, transcript):
-        # Check if we have a transcript
-        if not transcript:
-            yield "Please transcribe audio first before asking questions."
-        # Convert Chatbot history format to our format
-        formatted_history = []
-        if history:
-            for msg in history:
-                if isinstance(msg, dict):
-                    formatted_history.append(msg)
-                elif isinstance(msg, tuple):
-                    # Handle tuple format (user, assistant)
-                    if msg[0]:
-                        formatted_history.append({"role": "user", "content": msg[0]})
-                    if len(msg) > 1 and msg[1]:
-                        formatted_history.append({"role": "assistant", "content": msg[1]})
-        # Process the Q&A with the current message
-        response_generator = transcript_qa(transcript, message, formatted_history)
-        # Stream the response
-        for response_history, _ in response_generator:
-            if response_history and response_history[-1]["role"] == "assistant":
-                yield response_history[-1]["content"]
-    # Use ChatInterface for cleaner UI
-    chatbot = gr.Chatbot(
-        type="messages",
-        height=450,
-        label="",
-        render_markdown=True,
-        layout="panel"
     )
-    msg = gr.Textbox(
-        placeholder="Ask a question about the transcript...",
-        container=False
     )
-    examples = [
-        ["Can you please summarize this?", None],
-        ["What were the key points discussed?", None],
-        ["What was the main topic?", None],
-        ["What is the TLDR version so I can just leave this conference call early?", None]
-    ]
-    additional_inputs = [transcript_state]
-    # Event handlers - simplified since ChatInterface handles Q&A
     transcribe_btn.click(
         fn=disable_transcribe,
-        inputs=None,
         outputs=[transcribe_btn]
     ).then(
         fn=reset_chatbot,
-        inputs=None,
-        outputs=[chatbot, chatbot_state]  # Reset both the display and state
     ).then(
         fn=transcribe_audio,
         inputs=[audio_input],
         outputs=[transcript_output, transcript_state]
     ).then(
         fn=enable_transcribe,
-        inputs=None,
         outputs=[transcribe_btn]
     )

 import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import gradio as gr
     return transcript, transcript
 @spaces.GPU
 def transcript_qa(transcript, question, history):
     if not transcript:
+        return history + [{"role": "user", "content": question}, {"role": "assistant", "content": "Please transcribe audio first before asking questions."}]
     if not question:
+        return history
     with torch.inference_mode(), model.llm.disable_adapter():
         output_ids = model.generate(
+            prompts=[[{"role": "user", "content": f"Answer this question about the transcript: {question}\n\nTranscript: {transcript}"}]],
+            max_new_tokens=512,
         )
     ans = model.tokenizer.ids_to_text(output_ids[0].cpu())
+    ans = ans.split("<|im_start|>assistant")[-1]
     if "<think>" in ans:
         if "</think>" in ans:
             ans = ans.split("<think>")[-1]
+            _, ans = ans.split("</think>")
+    ans = ans.strip()
+    return history + [{"role": "user", "content": question}, {"role": "assistant", "content": ans}]
 def disable_transcribe():
     return gr.update(interactive=False)
     return gr.update(interactive=True)
 def reset_chatbot():
+    return []
 # Build the Gradio interface
 with gr.Blocks(theme=theme) as demo:
                 its transcript. This model is ready for commercial use.''')
     # State variables
+    transcript_state = gr.State("")
     with gr.Row():
         with gr.Column(scale=1):
+            gr.Markdown("### Step 1 - Audio Input")
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
             transcribe_btn = gr.Button("Transcribe Audio", variant="primary", size="lg")
         with gr.Column(scale=1):
+            gr.Markdown("### Step 2 - Transcript")
             transcript_output = gr.Textbox(
                 label="",
                 lines=10,
                 max_lines=10
             )
+    gr.Markdown("### Step 3 - Interactive Q&A")
+    chatbot = gr.Chatbot(type="messages", height=450)
+    msg = gr.Textbox(placeholder="Ask a question about the transcript...", label="")
+    with gr.Row():
+        submit_btn = gr.Button("Submit", variant="primary")
+        clear_btn = gr.Button("Clear Chat")
+    gr.Examples(
+        examples=[
+            "Can you please summarize this?",
+            "What were the key points discussed?",
+            "What was the main topic?",
+            "What is the TLDR version?"
+        ],
+        inputs=msg
+    )
+    # Event handlers
+    def submit_question(question, history, transcript):
+        if not question:
+            return "", history
+        new_history = transcript_qa(transcript, question, history)
+        return "", new_history
+    msg.submit(
+        fn=submit_question,
+        inputs=[msg, chatbot, transcript_state],
+        outputs=[msg, chatbot]
     )
+    submit_btn.click(
+        fn=submit_question,
+        inputs=[msg, chatbot, transcript_state],
+        outputs=[msg, chatbot]
     )
+    clear_btn.click(
+        fn=lambda: [],
+        outputs=[chatbot]
+    )
     transcribe_btn.click(
         fn=disable_transcribe,
         outputs=[transcribe_btn]
     ).then(
         fn=reset_chatbot,
+        outputs=[chatbot]
     ).then(
         fn=transcribe_audio,
         inputs=[audio_input],
         outputs=[transcript_output, transcript_state]
     ).then(
         fn=enable_transcribe,
         outputs=[transcribe_btn]
     )