Upload 19 files
- .gitattributes +1 -0
- assets/jfk.flac +3 -0
- moyoyo_asr_models/ggml-medium-encoder.mlmodelc/analytics/coremldata.bin +3 -0
- moyoyo_asr_models/ggml-medium-encoder.mlmodelc/coremldata.bin +3 -0
- moyoyo_asr_models/ggml-medium-encoder.mlmodelc/metadata.json +64 -0
- moyoyo_asr_models/ggml-medium-encoder.mlmodelc/model.mil +0 -0
- moyoyo_asr_models/ggml-medium-encoder.mlmodelc/weights/weight.bin +3 -0
- moyoyo_asr_models/ggml-medium-q5_0.bin +3 -0
- run_client.py +15 -0
- run_server.py +31 -0
- transcribe/__init__.py +0 -0
- transcribe/__pycache__/__init__.cpython-311.pyc +0 -0
- transcribe/__pycache__/client.cpython-311.pyc +0 -0
- transcribe/__pycache__/server.cpython-311.pyc +0 -0
- transcribe/__pycache__/utils.cpython-311.pyc +0 -0
- transcribe/__pycache__/vad.cpython-311.pyc +0 -0
- transcribe/client.py +675 -0
- transcribe/server.py +684 -0
- transcribe/utils.py +81 -0
- transcribe/vad.py +160 -0
.gitattributes
CHANGED
@@ -33,4 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/jfk.flac filter=lfs diff=lfs merge=lfs -text
 *.icns filter=lfs diff=lfs merge=lfs -text
assets/jfk.flac
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:63a4b1e4c1dc655ac70961ffbf518acd249df237e5a0152faae9a4a836949715
size 1152693
moyoyo_asr_models/ggml-medium-encoder.mlmodelc/analytics/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:adbe456375e7eb3407732a426ecb65bbda86860e4aa801f3a696b70b8a533cdd
size 207
moyoyo_asr_models/ggml-medium-encoder.mlmodelc/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:05fe28591b40616fa0c34ad7b853133623f5300923ec812acb11459c411acf3b
size 149
moyoyo_asr_models/ggml-medium-encoder.mlmodelc/metadata.json
ADDED
@@ -0,0 +1,64 @@
[
  {
    "metadataOutputVersion" : "3.0",
    "storagePrecision" : "Float16",
    "outputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32)",
        "shortDescription" : "",
        "shape" : "[]",
        "name" : "output",
        "type" : "MultiArray"
      }
    ],
    "modelParameters" : [

    ],
    "specificationVersion" : 6,
    "mlProgramOperationTypeHistogram" : {
      "Linear" : 144,
      "Matmul" : 48,
      "Cast" : 2,
      "Conv" : 2,
      "Softmax" : 24,
      "Add" : 49,
      "LayerNorm" : 49,
      "Mul" : 48,
      "Transpose" : 97,
      "Gelu" : 26,
      "Reshape" : 96
    },
    "computePrecision" : "Mixed (Float16, Float32, Int32)",
    "isUpdatable" : "0",
    "availability" : {
      "macOS" : "12.0",
      "tvOS" : "15.0",
      "watchOS" : "8.0",
      "iOS" : "15.0",
      "macCatalyst" : "15.0"
    },
    "modelType" : {
      "name" : "MLModelType_mlProgram"
    },
    "userDefinedMetadata" : {

    },
    "inputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 80 × 3000)",
        "shortDescription" : "",
        "shape" : "[1, 80, 3000]",
        "name" : "logmel_data",
        "type" : "MultiArray"
      }
    ],
    "generatedClassName" : "coreml_encoder_medium",
    "method" : "predict"
  }
]
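The metadata above describes the Core ML encoder: it takes a single log-mel spectrogram named logmel_data of shape [1, 80, 3000] (Float32), returns one MultiArray named output, and requires macOS 12 / iOS 15 or newer. whisper.cpp picks up the .mlmodelc automatically when it sits next to the ggml weights, but the compiled model can also be inspected directly from Python. A minimal sketch, assuming coremltools 6.2+ on an Apple machine; the dummy input and this loading path are illustrative, not part of the upload:

```python
import numpy as np
import coremltools as ct  # assumption: coremltools >= 6.2 installed on macOS

# Load the already-compiled encoder (.mlmodelc) directly.
encoder = ct.models.CompiledMLModel(
    "moyoyo_asr_models/ggml-medium-encoder.mlmodelc"
)

# Input name and shape come from metadata.json: "logmel_data", [1, 80, 3000], Float32.
logmel = np.zeros((1, 80, 3000), dtype=np.float32)

# predict() returns a dict keyed by the output name from metadata.json ("output").
result = encoder.predict({"logmel_data": logmel})
print(result["output"].shape)
```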
moyoyo_asr_models/ggml-medium-encoder.mlmodelc/model.mil
ADDED
The diff for this file is too large to render.
moyoyo_asr_models/ggml-medium-encoder.mlmodelc/weights/weight.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6a188b0e4e3109f28f38f1f47ea2497ffe623923419df8e1ae12cb5f809a1815
size 614507008
moyoyo_asr_models/ggml-medium-q5_0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:19fea4b380c3a618ec4723c3eef2eb785ffba0d0538cf43f8f235e7b3b34220f
size 539212467
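The upload ships two model artifacts: the 5-bit quantized ggml weights (ggml-medium-q5_0.bin, ~539 MB) used by the pywhispercpp backend, and the Core ML encoder above that whisper.cpp can use to speed up encoding on Apple hardware. For a quick offline check that the weights load outside the websocket server, something like the following should work; treat it as a sketch, since the exact pywhispercpp keyword arguments vary between versions:

```python
# Sketch: run the uploaded quantized model once with pywhispercpp,
# the same backend transcribe/server.py imports (pywhispercpp.model.Model).
from pywhispercpp.model import Model

# Assumption: a local ggml file path is accepted in place of a model name.
model = Model("moyoyo_asr_models/ggml-medium-q5_0.bin")
segments = model.transcribe("assets/jfk.flac", language="en")
for seg in segments:
    print(seg.text)
```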
run_client.py
ADDED
@@ -0,0 +1,15 @@
from transcribe.client import TranscriptionClient

client = TranscriptionClient(
    "localhost",
    9000,
    lang="zh",
    save_output_recording=False,  # Only used for microphone input, False by Default
    output_recording_filename="./output_recording.wav",  # Only used for microphone input
    max_clients=4,
    max_connection_time=600,
    mute_audio_playback=False,  # Only used for file input, False by Default
)

if __name__ == '__main__':
    client()
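run_client.py drives the microphone path, but the same TranscriptionClient can also stream a local file or a network stream, because TranscriptionTeeClient.__call__ accepts audio, rtsp_url, or hls_url (at most one of them). A small sketch, assuming a server is already listening on the host/port configured above and that transcribe.utils.resample can read the FLAC sample:

```python
from transcribe.client import TranscriptionClient

client = TranscriptionClient("localhost", 9000, lang="en")

# Stream the bundled sample file instead of the microphone.
client(audio="assets/jfk.flac")

# Or an HLS stream (URL is a placeholder):
# client(hls_url="https://example.com/live/stream.m3u8")
```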
run_server.py
ADDED
@@ -0,0 +1,31 @@
import argparse
import os

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--port', '-p',
                        type=int,
                        default=9090,
                        help="Websocket port to run the server on.")
    parser.add_argument('--backend', '-b',
                        type=str,
                        default='pywhispercpp',
                        help='Backends from ["pywhispercpp"]')

    parser.add_argument('--omp_num_threads', '-omp',
                        type=int,
                        default=1,
                        help="Number of threads to use for OpenMP")

    args = parser.parse_args()

    if "OMP_NUM_THREADS" not in os.environ:
        os.environ["OMP_NUM_THREADS"] = str(args.omp_num_threads)

    from transcribe.server import TranscriptionServer
    server = TranscriptionServer()
    server.run(
        "0.0.0.0",
        port=args.port,
        backend=args.backend,
    )
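Note that run_client.py connects to port 9000 while run_server.py defaults to 9090, so out of the box the two scripts will not find each other. Either start the server with --port 9000 or point the client at 9090. The equivalent programmatic call, shown only as a sketch of aligning the ports, is:

```python
from transcribe.server import TranscriptionServer

# Bind on the port run_client.py expects (9000) instead of the 9090 default.
server = TranscriptionServer()
server.run("0.0.0.0", port=9000, backend="pywhispercpp")
```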
transcribe/__init__.py
ADDED
File without changes
transcribe/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (183 Bytes).
transcribe/__pycache__/client.cpython-311.pyc
ADDED
Binary file (39 kB).
transcribe/__pycache__/server.cpython-311.pyc
ADDED
Binary file (36 kB).
transcribe/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (4.64 kB).
transcribe/__pycache__/vad.cpython-311.pyc
ADDED
Binary file (9.36 kB).
transcribe/client.py
ADDED
@@ -0,0 +1,675 @@
import json
import os
import shutil
import threading
import time
import uuid
import wave

import av
import numpy as np
import pyaudio
import websocket

import transcribe.utils as utils


class Client:
    """
    Handles communication with a server using WebSocket.
    """
    INSTANCES = {}
    END_OF_AUDIO = "END_OF_AUDIO"

    def __init__(
        self,
        host=None,
        port=None,
        lang=None,
        log_transcription=True,
        max_clients=4,
        max_connection_time=600,
    ):
        """
        Initializes a Client instance for audio recording and streaming to a server.

        If host and port are not provided, the WebSocket connection will not be established.
        The audio recording starts immediately upon initialization.

        Args:
            host (str): The hostname or IP address of the server.
            port (int): The port number for the WebSocket server.
            lang (str, optional): The selected language for transcription. Default is None.
            log_transcription (bool, optional): Whether to log transcription output to the console. Default is True.
            max_clients (int, optional): Maximum number of client connections allowed. Default is 4.
            max_connection_time (int, optional): Maximum allowed connection time in seconds. Default is 600.
        """
        self.recording = False
        self.uid = str(uuid.uuid4())
        self.waiting = False
        self.last_response_received = None
        self.disconnect_if_no_response_for = 15
        self.language = lang
        self.server_error = False
        self.last_segment = None
        self.last_received_segment = None
        self.log_transcription = log_transcription
        self.max_clients = max_clients
        self.max_connection_time = max_connection_time

        self.audio_bytes = None

        if host is not None and port is not None:
            socket_url = f"ws://{host}:{port}"
            self.client_socket = websocket.WebSocketApp(
                socket_url,
                on_open=lambda ws: self.on_open(ws),
                on_message=lambda ws, message: self.on_message(ws, message),
                on_error=lambda ws, error: self.on_error(ws, error),
                on_close=lambda ws, close_status_code, close_msg: self.on_close(
                    ws, close_status_code, close_msg
                ),
            )
        else:
            print("[ERROR]: No host or port specified.")
            return

        Client.INSTANCES[self.uid] = self

        # start websocket client in a thread
        self.ws_thread = threading.Thread(target=self.client_socket.run_forever)
        self.ws_thread.daemon = True
        self.ws_thread.start()

        self.transcript = []
        print("[INFO]: * recording")

    def handle_status_messages(self, message_data):
        """Handles server status messages."""
        status = message_data["status"]
        if status == "WAIT":
            self.waiting = True
            print(f"[INFO]: Server is full. Estimated wait time {round(message_data['message'])} minutes.")
        elif status == "ERROR":
            print(f"Message from Server: {message_data['message']}")
            self.server_error = True
        elif status == "WARNING":
            print(f"Message from Server: {message_data['message']}")

    def process_segments(self, segments):
        """Processes transcript segments."""
        text = []
        for i, seg in enumerate(segments):
            if not text or text[-1] != seg["text"]:
                text.append(seg["text"])
                if i == len(segments) - 1 and not seg.get("completed", False):
                    self.last_segment = seg

        # update last received segment and last valid response time
        if self.last_received_segment is None or self.last_received_segment != segments[-1]["text"]:
            self.last_response_received = time.time()
            self.last_received_segment = segments[-1]["text"]

        if self.log_transcription:
            # Truncate to last 3 entries for brevity.
            text = text[-3:]
            utils.clear_screen()
            utils.print_transcript(text)

    def on_message(self, ws, message):
        """
        Callback function called when a message is received from the server.

        It updates various attributes of the client based on the received message, including
        recording status, language detection, and server messages. If a disconnect message
        is received, it sets the recording status to False.

        Args:
            ws (websocket.WebSocketApp): The WebSocket client instance.
            message (str): The received message from the server.

        """
        message = json.loads(message)

        if self.uid != message.get("uid"):
            print("[ERROR]: invalid client uid")
            return

        if "status" in message.keys():
            self.handle_status_messages(message)
            return

        if "message" in message.keys() and message["message"] == "DISCONNECT":
            print("[INFO]: Server disconnected due to overtime.")
            self.recording = False

        if "message" in message.keys() and message["message"] == "SERVER_READY":
            self.last_response_received = time.time()
            self.recording = True
            self.server_backend = message["backend"]
            print(f"[INFO]: Server Running with backend {self.server_backend}")
            return

        if "language" in message.keys():
            self.language = message.get("language")
            lang_prob = message.get("language_prob")
            print(
                f"[INFO]: Server detected language {self.language} with probability {lang_prob}"
            )
            return

        if "segments" in message.keys():
            self.process_segments(message["segments"])

    def on_error(self, ws, error):
        print(f"[ERROR] WebSocket Error: {error}")
        self.server_error = True
        self.error_message = error

    def on_close(self, ws, close_status_code, close_msg):
        print(f"[INFO]: Websocket connection closed: {close_status_code}: {close_msg}")
        self.recording = False
        self.waiting = False

    def on_open(self, ws):
        """
        Callback function called when the WebSocket connection is successfully opened.

        Sends an initial configuration message to the server, including client UID,
        language selection, and task type.

        Args:
            ws (websocket.WebSocketApp): The WebSocket client instance.

        """
        print("[INFO]: Opened connection")
        ws.send(
            json.dumps(
                {
                    "uid": self.uid,
                    "language": self.language,
                    "max_clients": self.max_clients,
                    "max_connection_time": self.max_connection_time,
                }
            )
        )

    def send_packet_to_server(self, message):
        """
        Send an audio packet to the server using WebSocket.

        Args:
            message (bytes): The audio data packet in bytes to be sent to the server.

        """
        try:
            self.client_socket.send(message, websocket.ABNF.OPCODE_BINARY)
        except Exception as e:
            print(e)

    def close_websocket(self):
        """
        Close the WebSocket connection and join the WebSocket thread.

        First attempts to close the WebSocket connection using `self.client_socket.close()`. After
        closing the connection, it joins the WebSocket thread to ensure proper termination.

        """
        try:
            self.client_socket.close()
        except Exception as e:
            print("[ERROR]: Error closing WebSocket:", e)

        try:
            self.ws_thread.join()
        except Exception as e:
            print("[ERROR]: Error joining WebSocket thread:", e)

    def get_client_socket(self):
        """
        Get the WebSocket client socket instance.

        Returns:
            WebSocketApp: The WebSocket client socket instance currently in use by the client.
        """
        return self.client_socket

    def wait_before_disconnect(self):
        """Waits a bit before disconnecting in order to process pending responses."""
        assert self.last_response_received
        while time.time() - self.last_response_received < self.disconnect_if_no_response_for:
            continue


class TranscriptionTeeClient:
    """
    Client for handling audio recording, streaming, and transcription tasks via one or more
    WebSocket connections.

    Acts as a high-level client for audio transcription tasks using a WebSocket connection. It can be used
    to send audio data for transcription to one or more servers, and receive transcribed text segments.

    Args:
        clients (list): one or more previously initialized Client instances

    Attributes:
        clients (list): the underlying Client instances responsible for handling WebSocket connections.
    """

    def __init__(self, clients, save_output_recording=False, output_recording_filename="./output_recording.wav",
                 mute_audio_playback=False):
        self.clients = clients
        if not self.clients:
            raise Exception("At least one client is required.")
        self.chunk = 4096
        self.format = pyaudio.paInt16
        self.channels = 1
        self.rate = 16000
        self.record_seconds = 60000
        self.save_output_recording = save_output_recording
        self.output_recording_filename = output_recording_filename
        self.mute_audio_playback = mute_audio_playback
        self.frames = b""
        self.p = pyaudio.PyAudio()
        try:
            self.stream = self.p.open(
                format=self.format,
                channels=self.channels,
                rate=self.rate,
                input=True,
                frames_per_buffer=self.chunk,
            )
        except OSError as error:
            print(f"[WARN]: Unable to access microphone. {error}")
            self.stream = None

    def __call__(self, audio=None, rtsp_url=None, hls_url=None, save_file=None):
        """
        Start the transcription process.

        Initiates the transcription process by connecting to the server via a WebSocket. It waits for the server
        to be ready to receive audio data and then sends audio for transcription. If an audio file is provided, it
        will be played and streamed to the server; otherwise, it will perform live recording.

        Args:
            audio (str, optional): Path to an audio file for transcription. Default is None, which triggers live recording.

        """
        assert sum(
            source is not None for source in [audio, rtsp_url, hls_url]
        ) <= 1, 'You must provide only one selected source'

        print("[INFO]: Waiting for server ready ...")
        for client in self.clients:
            while not client.recording:
                if client.waiting or client.server_error:
                    self.close_all_clients()
                    return

        print("[INFO]: Server Ready!")
        if hls_url is not None:
            self.process_hls_stream(hls_url, save_file)
        elif audio is not None:
            resampled_file = utils.resample(audio)
            self.play_file(resampled_file)
        elif rtsp_url is not None:
            self.process_rtsp_stream(rtsp_url)
        else:
            self.record()

    def close_all_clients(self):
        """Closes all client websockets."""
        for client in self.clients:
            client.close_websocket()

    def multicast_packet(self, packet, unconditional=False):
        """
        Sends an identical packet via all clients.

        Args:
            packet (bytes): The audio data packet in bytes to be sent.
            unconditional (bool, optional): If true, send regardless of whether clients are recording. Default is False.
        """
        for client in self.clients:
            if (unconditional or client.recording):
                client.send_packet_to_server(packet)

    def play_file(self, filename):
        """
        Play an audio file and send it to the server for processing.

        Reads an audio file, plays it through the audio output, and simultaneously sends
        the audio data to the server for processing. It uses PyAudio to create an audio
        stream for playback. The audio data is read from the file in chunks, converted to
        floating-point format, and sent to the server using WebSocket communication.
        This method is typically used when you want to process pre-recorded audio and send it
        to the server in real-time.

        Args:
            filename (str): The path to the audio file to be played and sent to the server.
        """

        # read audio and create pyaudio stream
        with wave.open(filename, "rb") as wavfile:
            self.stream = self.p.open(
                format=self.p.get_format_from_width(wavfile.getsampwidth()),
                channels=wavfile.getnchannels(),
                rate=wavfile.getframerate(),
                input=True,
                output=True,
                frames_per_buffer=self.chunk,
            )
            chunk_duration = self.chunk / float(wavfile.getframerate())
            try:
                while any(client.recording for client in self.clients):
                    data = wavfile.readframes(self.chunk)
                    if data == b"":
                        break

                    audio_array = self.bytes_to_float_array(data)
                    self.multicast_packet(audio_array.tobytes())
                    if self.mute_audio_playback:
                        time.sleep(chunk_duration)
                    else:
                        self.stream.write(data)

                wavfile.close()

                for client in self.clients:
                    client.wait_before_disconnect()
                self.multicast_packet(Client.END_OF_AUDIO.encode('utf-8'), True)
                self.stream.close()
                self.close_all_clients()

            except KeyboardInterrupt:
                wavfile.close()
                self.stream.stop_stream()
                self.stream.close()
                self.p.terminate()
                self.close_all_clients()
                print("[INFO]: Keyboard interrupt.")

    def process_rtsp_stream(self, rtsp_url):
        """
        Connect to an RTSP source, process the audio stream, and send it for transcription.

        Args:
            rtsp_url (str): The URL of the RTSP stream source.
        """
        print("[INFO]: Connecting to RTSP stream...")
        try:
            container = av.open(rtsp_url, format="rtsp", options={"rtsp_transport": "tcp"})
            self.process_av_stream(container, stream_type="RTSP")
        except Exception as e:
            print(f"[ERROR]: Failed to process RTSP stream: {e}")
        finally:
            for client in self.clients:
                client.wait_before_disconnect()
            self.multicast_packet(Client.END_OF_AUDIO.encode('utf-8'), True)
            self.close_all_clients()
        print("[INFO]: RTSP stream processing finished.")

    def process_hls_stream(self, hls_url, save_file=None):
        """
        Connect to an HLS source, process the audio stream, and send it for transcription.

        Args:
            hls_url (str): The URL of the HLS stream source.
            save_file (str, optional): Local path to save the network stream.
        """
        print("[INFO]: Connecting to HLS stream...")
        try:
            container = av.open(hls_url, format="hls")
            self.process_av_stream(container, stream_type="HLS", save_file=save_file)
        except Exception as e:
            print(f"[ERROR]: Failed to process HLS stream: {e}")
        finally:
            for client in self.clients:
                client.wait_before_disconnect()
            self.multicast_packet(Client.END_OF_AUDIO.encode('utf-8'), True)
            self.close_all_clients()
        print("[INFO]: HLS stream processing finished.")

    def process_av_stream(self, container, stream_type, save_file=None):
        """
        Process an AV container stream and send audio packets to the server.

        Args:
            container (av.container.InputContainer): The input container to process.
            stream_type (str): The type of stream being processed ("RTSP" or "HLS").
            save_file (str, optional): Local path to save the stream. Default is None.
        """
        audio_stream = next((s for s in container.streams if s.type == "audio"), None)
        if not audio_stream:
            print(f"[ERROR]: No audio stream found in {stream_type} source.")
            return

        output_container = None
        if save_file:
            output_container = av.open(save_file, mode="w")
            output_audio_stream = output_container.add_stream(codec_name="pcm_s16le", rate=self.rate)

        try:
            for packet in container.demux(audio_stream):
                for frame in packet.decode():
                    audio_data = frame.to_ndarray().tobytes()
                    self.multicast_packet(audio_data)

                    if save_file:
                        output_container.mux(frame)
        except Exception as e:
            print(f"[ERROR]: Error during {stream_type} stream processing: {e}")
        finally:
            # Wait for server to send any leftover transcription.
            time.sleep(5)
            self.multicast_packet(Client.END_OF_AUDIO.encode('utf-8'), True)
            if output_container:
                output_container.close()
            container.close()

    def save_chunk(self, n_audio_file):
        """
        Saves the current audio frames to a WAV file in a separate thread.

        Args:
            n_audio_file (int): The index of the audio file which determines the filename.
                This helps in maintaining the order and uniqueness of each chunk.
        """
        t = threading.Thread(
            target=self.write_audio_frames_to_file,
            args=(self.frames[:], f"chunks/{n_audio_file}.wav",),
        )
        t.start()

    def finalize_recording(self, n_audio_file):
        """
        Finalizes the recording process by saving any remaining audio frames,
        closing the audio stream, and terminating the process.

        Args:
            n_audio_file (int): The file index to be used if there are remaining audio frames to be saved.
                This index is incremented before use if the last chunk is saved.
        """
        if self.save_output_recording and len(self.frames):
            self.write_audio_frames_to_file(
                self.frames[:], f"chunks/{n_audio_file}.wav"
            )
            n_audio_file += 1
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()
        self.close_all_clients()
        if self.save_output_recording:
            self.write_output_recording(n_audio_file)

    def record(self):
        """
        Record audio data from the input stream and save it to a WAV file.

        Continuously records audio data from the input stream, sends it to the server via a WebSocket
        connection, and simultaneously saves it to multiple WAV files in chunks. It stops recording when
        the `RECORD_SECONDS` duration is reached or when the `RECORDING` flag is set to `False`.

        Audio data is saved in chunks to the "chunks" directory. Each chunk is saved as a separate WAV file.
        The recording will continue until the specified duration is reached or until the `RECORDING` flag is set to `False`.
        The recording process can be interrupted by sending a KeyboardInterrupt (e.g., pressing Ctrl+C). After recording,
        the method combines all the saved audio chunks into the specified `out_file`.
        """
        n_audio_file = 0
        if self.save_output_recording:
            if os.path.exists("chunks"):
                shutil.rmtree("chunks")
            os.makedirs("chunks")
        try:
            for _ in range(0, int(self.rate / self.chunk * self.record_seconds)):
                if not any(client.recording for client in self.clients):
                    break
                data = self.stream.read(self.chunk, exception_on_overflow=False)
                self.frames += data

                audio_array = self.bytes_to_float_array(data)

                self.multicast_packet(audio_array.tobytes())

                # save frames if more than a minute
                if len(self.frames) > 60 * self.rate:
                    if self.save_output_recording:
                        self.save_chunk(n_audio_file)
                        n_audio_file += 1
                    self.frames = b""

        except KeyboardInterrupt:
            self.finalize_recording(n_audio_file)

    def write_audio_frames_to_file(self, frames, file_name):
        """
        Write audio frames to a WAV file.

        The WAV file is created or overwritten with the specified name. The audio frames should be
        in the correct format and match the specified channel, sample width, and sample rate.

        Args:
            frames (bytes): The audio frames to be written to the file.
            file_name (str): The name of the WAV file to which the frames will be written.

        """
        with wave.open(file_name, "wb") as wavfile:
            wavfile: wave.Wave_write
            wavfile.setnchannels(self.channels)
            wavfile.setsampwidth(2)
            wavfile.setframerate(self.rate)
            wavfile.writeframes(frames)

    def write_output_recording(self, n_audio_file):
        """
        Combine and save recorded audio chunks into a single WAV file.

        The individual audio chunk files are expected to be located in the "chunks" directory. Reads each chunk
        file, appends its audio data to the final recording, and then deletes the chunk file. After combining
        and saving, the final recording is stored in the specified `out_file`.

        Args:
            n_audio_file (int): The number of audio chunk files to combine.
            out_file (str): The name of the output WAV file to save the final recording.

        """
        input_files = [
            f"chunks/{i}.wav"
            for i in range(n_audio_file)
            if os.path.exists(f"chunks/{i}.wav")
        ]
        with wave.open(self.output_recording_filename, "wb") as wavfile:
            wavfile: wave.Wave_write
            wavfile.setnchannels(self.channels)
            wavfile.setsampwidth(2)
            wavfile.setframerate(self.rate)
            for in_file in input_files:
                with wave.open(in_file, "rb") as wav_in:
                    while True:
                        data = wav_in.readframes(self.chunk)
                        if data == b"":
                            break
                        wavfile.writeframes(data)
                # remove this file
                os.remove(in_file)
        wavfile.close()
        # clean up temporary directory to store chunks
        if os.path.exists("chunks"):
            shutil.rmtree("chunks")

    @staticmethod
    def bytes_to_float_array(audio_bytes):
        """
        Convert audio data from bytes to a NumPy float array.

        It assumes that the audio data is in 16-bit PCM format. The audio data is normalized to
        have values between -1 and 1.

        Args:
            audio_bytes (bytes): Audio data in bytes.

        Returns:
            np.ndarray: A NumPy array containing the audio data as float values normalized between -1 and 1.
        """
        raw_data = np.frombuffer(buffer=audio_bytes, dtype=np.int16)
        return raw_data.astype(np.float32) / 32768.0


class TranscriptionClient(TranscriptionTeeClient):
    """
    Client for handling audio transcription tasks via a single WebSocket connection.

    Acts as a high-level client for audio transcription tasks using a WebSocket connection. It can be used
    to send audio data for transcription to a server and receive transcribed text segments.

    Args:
        host (str): The hostname or IP address of the server.
        port (int): The port number to connect to on the server.
        lang (str, optional): The primary language for transcription. Default is None, which defaults to English ('en').
        save_output_recording (bool, optional): Whether to save the microphone recording. Default is False.
        output_recording_filename (str, optional): Path to save the output recording WAV file. Default is "./output_recording.wav".
        output_transcription_path (str, optional): File path to save the output transcription (SRT file). Default is "./output.srt".
        log_transcription (bool, optional): Whether to log transcription output to the console. Default is True.
        max_clients (int, optional): Maximum number of client connections allowed. Default is 4.
        max_connection_time (int, optional): Maximum allowed connection time in seconds. Default is 600.
        mute_audio_playback (bool, optional): If True, mutes audio playback during file playback. Default is False.

    Attributes:
        client (Client): An instance of the underlying Client class responsible for handling the WebSocket connection.

    Example:
        To create a TranscriptionClient and start transcription on microphone audio:
        ```python
        transcription_client = TranscriptionClient(host="localhost", port=9090)
        transcription_client()
        ```
    """

    def __init__(
        self,
        host,
        port,
        lang=None,
        save_output_recording=False,
        output_recording_filename="./output_recording.wav",
        log_transcription=True,
        max_clients=4,
        max_connection_time=600,
        mute_audio_playback=False,
    ):
        self.client = Client(
            host, port, lang, log_transcription=log_transcription, max_clients=max_clients,
            max_connection_time=max_connection_time
        )

        if save_output_recording and not output_recording_filename.endswith(".wav"):
            raise ValueError(f"Please provide a valid `output_recording_filename`: {output_recording_filename}")

        TranscriptionTeeClient.__init__(
            self,
            [self.client],
            save_output_recording=save_output_recording,
            output_recording_filename=output_recording_filename,
            mute_audio_playback=mute_audio_playback
        )
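Across all input paths the client ships raw little-endian float32 PCM at 16 kHz: TranscriptionTeeClient.bytes_to_float_array rescales 16-bit integer samples into [-1, 1], multicast_packet sends the float bytes, and the server turns them back into an array with np.frombuffer(..., dtype=np.float32) in get_audio_from_websocket. A self-contained sketch of that round trip (not part of the uploaded files):

```python
import numpy as np

# 16-bit PCM as produced by PyAudio or wave.readframes()
pcm_bytes = np.array([0, 16384, -16384, 32767], dtype=np.int16).tobytes()

# Client side: the same normalization as bytes_to_float_array()
float_frame = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0

# Wire payload sent by multicast_packet()
payload = float_frame.tobytes()

# Server side: get_audio_from_websocket() recovers the frame directly
recovered = np.frombuffer(payload, dtype=np.float32)
assert np.allclose(recovered, float_frame)
```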
transcribe/server.py
ADDED
@@ -0,0 +1,684 @@
import functools
import json
import logging
import pathlib
import threading
import time
from enum import Enum
from typing import List, Optional

import librosa
import numpy as np
import soundfile
from pywhispercpp.model import Model
from websockets.exceptions import ConnectionClosed
from websockets.sync.server import serve

from transcribe.vad import VoiceActivityDetector

logging.basicConfig(level=logging.INFO)


class ClientManager:
    def __init__(self, max_clients=4, max_connection_time=600):
        """
        Initializes the ClientManager with specified limits on client connections and connection durations.

        Args:
            max_clients (int, optional): The maximum number of simultaneous client connections allowed. Defaults to 4.
            max_connection_time (int, optional): The maximum duration (in seconds) a client can stay connected. Defaults
                to 600 seconds (10 minutes).
        """
        self.clients = {}
        self.start_times = {}
        self.max_clients = max_clients
        self.max_connection_time = max_connection_time

    def add_client(self, websocket, client):
        """
        Adds a client and their connection start time to the tracking dictionaries.

        Args:
            websocket: The websocket associated with the client to add.
            client: The client object to be added and tracked.
        """
        self.clients[websocket] = client
        self.start_times[websocket] = time.time()

    def get_client(self, websocket):
        """
        Retrieves a client associated with the given websocket.

        Args:
            websocket: The websocket associated with the client to retrieve.

        Returns:
            The client object if found, False otherwise.
        """
        if websocket in self.clients:
            return self.clients[websocket]
        return False

    def remove_client(self, websocket):
        """
        Removes a client and their connection start time from the tracking dictionaries. Performs cleanup on the
        client if necessary.

        Args:
            websocket: The websocket associated with the client to be removed.
        """
        client = self.clients.pop(websocket, None)
        if client:
            client.cleanup()
        self.start_times.pop(websocket, None)

    def get_wait_time(self):
        """
        Calculates the estimated wait time for new clients based on the remaining connection times of current clients.

        Returns:
            The estimated wait time in minutes for new clients to connect. Returns 0 if there are available slots.
        """
        wait_time = None
        for start_time in self.start_times.values():
            current_client_time_remaining = self.max_connection_time - (time.time() - start_time)
            if wait_time is None or current_client_time_remaining < wait_time:
                wait_time = current_client_time_remaining
        return wait_time / 60 if wait_time is not None else 0

    def is_server_full(self, websocket, options):
        """
        Checks if the server is at its maximum client capacity and sends a wait message to the client if necessary.

        Args:
            websocket: The websocket of the client attempting to connect.
            options: A dictionary of options that may include the client's unique identifier.

        Returns:
            True if the server is full, False otherwise.
        """
        if len(self.clients) >= self.max_clients:
            wait_time = self.get_wait_time()
            response = {"uid": options["uid"], "status": "WAIT", "message": wait_time}
            websocket.send(json.dumps(response))
            return True
        return False

    def is_client_timeout(self, websocket):
        """
        Checks if a client has exceeded the maximum allowed connection time and disconnects them if so, issuing a warning.

        Args:
            websocket: The websocket associated with the client to check.

        Returns:
            True if the client's connection time has exceeded the maximum limit, False otherwise.
        """
        elapsed_time = time.time() - self.start_times[websocket]
        if elapsed_time >= self.max_connection_time:
            self.clients[websocket].disconnect()
            logging.warning(f"Client with uid '{self.clients[websocket].client_uid}' disconnected due to overtime.")
            return True
        return False


class BackendType(Enum):
    PYWHISPERCPP = "pywhispercpp"

    @staticmethod
    def valid_types() -> List[str]:
        return [backend_type.value for backend_type in BackendType]

    @staticmethod
    def is_valid(backend: str) -> bool:
        return backend in BackendType.valid_types()

    def is_pywhispercpp(self) -> bool:
        return self == BackendType.PYWHISPERCPP


class TranscriptionServer:
    RATE = 16000

    def __init__(self):
        self.client_manager = None
        self.no_voice_activity_chunks = 0
        self.single_model = False

    def initialize_client(
        self, websocket, options
    ):
        client: Optional[ServeClientBase] = None

        if self.backend.is_pywhispercpp():
            client = ServeClientWhisperCPP(
                websocket,
                language=options["language"],
                client_uid=options["uid"],
                single_model=self.single_model,
            )
            logging.info("Running pywhispercpp backend.")

        if client is None:
            raise ValueError(f"Backend type {self.backend.value} not recognised or not handled.")

        self.client_manager.add_client(websocket, client)

    def get_audio_from_websocket(self, websocket):
        """
        Receives audio buffer from websocket and creates a numpy array out of it.

        Args:
            websocket: The websocket to receive audio from.

        Returns:
            A numpy array containing the audio.
        """
        frame_data = websocket.recv()
        if frame_data == b"END_OF_AUDIO":
            return False
        return np.frombuffer(frame_data, dtype=np.float32)

    def handle_new_connection(self, websocket):
        try:
            logging.info("New client connected")
            options = websocket.recv()
            options = json.loads(options)

            if self.client_manager is None:
                max_clients = options.get('max_clients', 4)
                max_connection_time = options.get('max_connection_time', 600)
                self.client_manager = ClientManager(max_clients, max_connection_time)

            if self.client_manager.is_server_full(websocket, options):
                websocket.close()
                return False  # Indicates that the connection should not continue

            if self.backend.is_pywhispercpp():
                self.vad_detector = VoiceActivityDetector(frame_rate=self.RATE)

            self.initialize_client(websocket, options)

            return True
        except json.JSONDecodeError:
            logging.error("Failed to decode JSON from client")
            return False
        except ConnectionClosed:
            logging.info("Connection closed by client")
            return False
        except Exception as e:
            logging.error(f"Error during new connection initialization: {str(e)}")
            return False

    def process_audio_frames(self, websocket):
        frame_np = self.get_audio_from_websocket(websocket)
        client = self.client_manager.get_client(websocket)

        # TODO: VAD has some problem, it will be blocking the process loop
        # if frame_np is False:
        #     if self.backend.is_pywhispercpp():
        #         client.set_eos(True)
        #     return False

        # if self.backend.is_pywhispercpp():
        #     voice_active = self.voice_activity(websocket, frame_np)
        #     if voice_active:
        #         self.no_voice_activity_chunks = 0
        #         client.set_eos(False)
        #     if self.use_vad and not voice_active:
        #         return True

        client.add_frames(frame_np)
        return True

    def recv_audio(self,
                   websocket,
                   backend: BackendType = BackendType.PYWHISPERCPP):

        self.backend = backend
        if not self.handle_new_connection(websocket):
            return

        try:
            while not self.client_manager.is_client_timeout(websocket):
                if not self.process_audio_frames(websocket):
                    break
        except ConnectionClosed:
            logging.info("Connection closed by client")
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
        finally:
            if self.client_manager.get_client(websocket):
                self.cleanup(websocket)
                websocket.close()
            del websocket

    def run(self,
            host,
            port=9090,
            backend="pywhispercpp"):
        """
        Run the transcription server.

        Args:
            host (str): The host address to bind the server.
            port (int): The port number to bind the server.
        """

        if not BackendType.is_valid(backend):
            raise ValueError(f"{backend} is not a valid backend type. Choose backend from {BackendType.valid_types()}")

        with serve(
            functools.partial(
                self.recv_audio,
                backend=BackendType(backend),
            ),
            host,
            port
        ) as server:
            server.serve_forever()

    def voice_activity(self, websocket, frame_np):
        """
        Evaluates the voice activity in a given audio frame and manages the state of voice activity detection.

        This method uses the configured voice activity detection (VAD) model to assess whether the given audio frame
        contains speech. If the VAD model detects no voice activity for more than three consecutive frames,
        it sets an end-of-speech (EOS) flag for the associated client. This method aims to efficiently manage
        speech detection to improve subsequent processing steps.

        Args:
            websocket: The websocket associated with the current client. Used to retrieve the client object
                from the client manager for state management.
            frame_np (numpy.ndarray): The audio frame to be analyzed. This should be a NumPy array containing
                the audio data for the current frame.

        Returns:
            bool: True if voice activity is detected in the current frame, False otherwise. When returning False
                after detecting no voice activity for more than three consecutive frames, it also triggers the
                end-of-speech (EOS) flag for the client.
        """
        if not self.vad_detector(frame_np):
            self.no_voice_activity_chunks += 1
            if self.no_voice_activity_chunks > 3:
                client = self.client_manager.get_client(websocket)
                if not client.eos:
                    client.set_eos(True)
                time.sleep(0.1)  # Sleep 100 ms; wait for some voice activity.
            return False
        return True

    def cleanup(self, websocket):
        """
        Cleans up resources associated with a given client's websocket.

        Args:
            websocket: The websocket associated with the client to be cleaned up.
        """
        if self.client_manager.get_client(websocket):
            self.client_manager.remove_client(websocket)


class ServeClientBase(object):
    RATE = 16000
    SERVER_READY = "SERVER_READY"
    DISCONNECT = "DISCONNECT"

    def __init__(self, client_uid, websocket):
        self.client_uid = client_uid
        self.websocket = websocket
        self.frames = b""
        self.timestamp_offset = 0.0
        self.frames_np = None
        self.frames_offset = 0.0
        self.text = []
        self.current_out = ''
        self.prev_out = ''
        self.t_start = None
        self.exit = False
        self.same_output_count = 0
        self.show_prev_out_thresh = 5  # if pause (no output from whisper) show previous output for 5 seconds
        self.add_pause_thresh = 3  # add a blank to segment list as a pause (no speech) for 3 seconds
        self.transcript = []
        self.send_last_n_segments = 10

        # text formatting
        self.pick_previous_segments = 2

        # threading
        self.lock = threading.Lock()

    def speech_to_text(self):
        raise NotImplementedError

    def transcribe_audio(self):
        raise NotImplementedError

    def handle_transcription_output(self):
        raise NotImplementedError

    def add_frames(self, frame_np):
        """
        Add audio frames to the ongoing audio stream buffer.

        This method is responsible for maintaining the audio stream buffer, allowing the continuous addition
        of audio frames as they are received. It also ensures that the buffer does not exceed a specified size
        to prevent excessive memory usage.

        If the buffer size exceeds a threshold (45 seconds of audio data), it discards the oldest 30 seconds
        of audio data to maintain a reasonable buffer size. If the buffer is empty, it initializes it with the provided
        audio frame. The audio stream buffer is used for real-time processing of audio data for transcription.

        Args:
            frame_np (numpy.ndarray): The audio frame data as a NumPy array.

        """
        self.lock.acquire()
        if self.frames_np is not None and self.frames_np.shape[0] > 45 * self.RATE:
            self.frames_offset += 30.0
            self.frames_np = self.frames_np[int(30 * self.RATE):]
            # check timestamp offset (should be >= self.frames_offset)
            # this basically means that there is no speech as the timestamp offset hasn't updated
            # and is less than frames_offset
            if self.timestamp_offset < self.frames_offset:
self.text = []
|
| 335 |
+
self.current_out = ''
|
| 336 |
+
self.prev_out = ''
|
| 337 |
+
self.t_start = None
|
| 338 |
+
self.exit = False
|
| 339 |
+
self.same_output_count = 0
|
| 340 |
+
self.show_prev_out_thresh = 5 # if pause(no output from whisper) show previous output for 5 seconds
|
| 341 |
+
self.add_pause_thresh = 3 # add a blank to segment list as a pause(no speech) for 3 seconds
|
| 342 |
+
self.transcript = []
|
| 343 |
+
self.send_last_n_segments = 10
|
| 344 |
+
|
| 345 |
+
# text formatting
|
| 346 |
+
self.pick_previous_segments = 2
|
| 347 |
+
|
| 348 |
+
# threading
|
| 349 |
+
self.lock = threading.Lock()
|
| 350 |
+
|
| 351 |
+
def speech_to_text(self):
|
| 352 |
+
raise NotImplementedError
|
| 353 |
+
|
| 354 |
+
def transcribe_audio(self):
|
| 355 |
+
raise NotImplementedError
|
| 356 |
+
|
| 357 |
+
def handle_transcription_output(self):
|
| 358 |
+
raise NotImplementedError
|
| 359 |
+
|
| 360 |
+
def add_frames(self, frame_np):
|
| 361 |
+
"""
|
| 362 |
+
Add audio frames to the ongoing audio stream buffer.
|
| 363 |
+
|
| 364 |
+
This method is responsible for maintaining the audio stream buffer, allowing the continuous addition
|
| 365 |
+
of audio frames as they are received. It also ensures that the buffer does not exceed a specified size
|
| 366 |
+
to prevent excessive memory usage.
|
| 367 |
+
|
| 368 |
+
If the buffer size exceeds a threshold (45 seconds of audio data), it discards the oldest 30 seconds
|
| 369 |
+
of audio data to maintain a reasonable buffer size. If the buffer is empty, it initializes it with the provided
|
| 370 |
+
audio frame. The audio stream buffer is used for real-time processing of audio data for transcription.
|
| 371 |
+
|
| 372 |
+
Args:
|
| 373 |
+
frame_np (numpy.ndarray): The audio frame data as a NumPy array.
|
| 374 |
+
|
| 375 |
+
"""
|
| 376 |
+
self.lock.acquire()
|
| 377 |
+
if self.frames_np is not None and self.frames_np.shape[0] > 45 * self.RATE:
|
| 378 |
+
self.frames_offset += 30.0
|
| 379 |
+
self.frames_np = self.frames_np[int(30 * self.RATE):]
|
| 380 |
+
# check timestamp offset(should be >= self.frame_offset)
|
| 381 |
+
# this basically means that there is no speech as timestamp offset hasnt updated
|
| 382 |
+
# and is less than frame_offset
|
| 383 |
+
if self.timestamp_offset < self.frames_offset:
|
| 384 |
+
self.timestamp_offset = self.frames_offset
|
| 385 |
+
if self.frames_np is None:
|
| 386 |
+
self.frames_np = frame_np.copy()
|
| 387 |
+
else:
|
| 388 |
+
self.frames_np = np.concatenate((self.frames_np, frame_np), axis=0)
|
| 389 |
+
self.lock.release()
|
| 390 |
+
|
| 391 |
+
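To make the trimming rule concrete, here is the same arithmetic as a standalone sketch on a dummy buffer (the constants are copied from the class; the buffer contents are fabricated):

import numpy as np

RATE = 16000
frames_offset = 0.0
buffer = np.zeros(50 * RATE, dtype=np.float32)   # pretend 50 s of audio have been buffered

if buffer.shape[0] > 45 * RATE:                  # over the 45 s threshold
    frames_offset += 30.0                        # the buffer now starts 30 s later in absolute time
    buffer = buffer[int(30 * RATE):]             # drop the oldest 30 s, leaving 20 s

print(buffer.shape[0] / RATE, frames_offset)     # -> 20.0 30.0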
    def clip_audio_if_no_valid_segment(self):
        """
        Update the timestamp offset based on audio buffer status.
        Clip audio if the current unprocessed chunk exceeds 25 seconds; this implies that
        whisper has produced no valid segment for that stretch of audio.
        """
        with self.lock:
            if self.frames_np[int((self.timestamp_offset - self.frames_offset) * self.RATE):].shape[0] > 25 * self.RATE:
                duration = self.frames_np.shape[0] / self.RATE
                self.timestamp_offset = self.frames_offset + duration - 5

    def get_audio_chunk_for_processing(self):
        """
        Retrieves the next chunk of audio data for processing based on the current offsets.

        Calculates which part of the audio data should be processed next, based on
        the difference between the current timestamp offset and the frame's offset, scaled by
        the audio sample rate (RATE). It then returns this chunk of audio data along with its
        duration in seconds.

        Returns:
            tuple: A tuple containing:
                - input_bytes (np.ndarray): The next chunk of audio data to be processed.
                - duration (float): The duration of the audio chunk in seconds.
        """
        with self.lock:
            samples_take = max(0, (self.timestamp_offset - self.frames_offset) * self.RATE)
            input_bytes = self.frames_np[int(samples_take):].copy()
        duration = input_bytes.shape[0] / self.RATE
        return input_bytes, duration
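The offset bookkeeping is easier to follow with numbers; this standalone sketch repeats the slice computation with made-up offsets:

import numpy as np

RATE = 16000
frames_offset = 30.0        # buffer starts 30 s into the stream (after trimming)
timestamp_offset = 42.5     # everything before 42.5 s has already been transcribed
frames_np = np.zeros(20 * RATE, dtype=np.float32)   # 20 s currently buffered (30 s .. 50 s)

samples_take = max(0, (timestamp_offset - frames_offset) * RATE)   # 12.5 s worth of samples
input_bytes = frames_np[int(samples_take):]                        # the remaining 7.5 s
print(input_bytes.shape[0] / RATE)                                 # -> 7.5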
    def prepare_segments(self, last_segment=None):
        """
        Prepares the segments of transcribed text to be sent to the client.

        This method compiles the recent segments of transcribed text, ensuring that only the
        specified number of the most recent segments are included. It also appends the most
        recent segment of text if provided (which is considered incomplete because of the possibility
        of the last word being truncated in the audio chunk).

        Args:
            last_segment (str, optional): The most recent segment of transcribed text to be added
                to the list of segments. Defaults to None.

        Returns:
            list: A list of transcribed text segments to be sent to the client.
        """
        segments = []
        if len(self.transcript) >= self.send_last_n_segments:
            segments = self.transcript[-self.send_last_n_segments:].copy()
        else:
            segments = self.transcript.copy()
        if last_segment is not None:
            segments = segments + [last_segment]
        return segments

    def get_audio_chunk_duration(self, input_bytes):
        """
        Calculates the duration of the provided audio chunk.

        Args:
            input_bytes (numpy.ndarray): The audio chunk for which to calculate the duration.

        Returns:
            float: The duration of the audio chunk in seconds.
        """
        return input_bytes.shape[0] / self.RATE

    def send_transcription_to_client(self, segments):
        """
        Sends the specified transcription segments to the client over the websocket connection.

        This method formats the transcription segments into a JSON object and attempts to send
        this object to the client. If an error occurs during the send operation, it logs the error.

        Args:
            segments (list): A list of transcription segments to be sent to the client.
        """
        try:
            self.websocket.send(
                json.dumps({
                    "uid": self.client_uid,
                    "segments": segments,
                })
            )
        except Exception as e:
            logging.error(f"[ERROR]: Sending data to client: {e}")
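On the wire, each update therefore looks like the following JSON message; the uid and text values here are fabricated for illustration, and each segment is a dict with a "text" key as built by handle_transcription_output:

{
    "uid": "0a1b2c3d",
    "segments": [
        {"text": "ask not what your country can do for you "},
        {"text": "ask what you can do for your country"}
    ]
}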
    def disconnect(self):
        """
        Notify the client of disconnection and send a disconnect message.

        This method sends a disconnect message to the client via the WebSocket connection to notify them
        that the transcription service is disconnecting gracefully.
        """
        self.websocket.send(json.dumps({
            "uid": self.client_uid,
            "message": self.DISCONNECT
        }))

    def cleanup(self):
        """
        Perform cleanup tasks before exiting the transcription service.

        This method performs necessary cleanup tasks, including stopping the transcription thread, marking
        the exit flag to indicate the transcription thread should exit gracefully, and destroying resources
        associated with the transcription process.
        """
        logging.info("Cleaning up.")
        self.exit = True


class ServeClientWhisperCPP(ServeClientBase):
    SINGLE_MODEL = None
    SINGLE_MODEL_LOCK = threading.Lock()

    def __init__(self, websocket, language=None, client_uid=None,
                 single_model=False):
        """
        Initialize a ServeClient instance.
        The Whisper model is initialized based on the client's language and device availability.
        The transcription thread is started upon initialization. A "SERVER_READY" message is sent
        to the client to indicate that the server is ready.

        Args:
            websocket (WebSocket): The WebSocket connection for the client.
            language (str, optional): The language for transcription. Defaults to None.
            client_uid (str, optional): A unique identifier for the client. Defaults to None.
            single_model (bool, optional): Whether to share a single model instance across all client
                connections instead of instantiating a new one per client. Defaults to False.
        """
        super().__init__(client_uid, websocket)
        self.language = language
        self.eos = False

        if single_model:
            if ServeClientWhisperCPP.SINGLE_MODEL is None:
                self.create_model()
                ServeClientWhisperCPP.SINGLE_MODEL = self.transcriber
            else:
                self.transcriber = ServeClientWhisperCPP.SINGLE_MODEL
        else:
            self.create_model()

        # threading
        logging.info('Create a thread to process audio.')
        self.trans_thread = threading.Thread(target=self.speech_to_text)
        self.trans_thread.start()

        self.websocket.send(json.dumps({
            "uid": self.client_uid,
            "message": self.SERVER_READY,
            "backend": "pywhispercpp"
        }))

    def create_model(self, warmup=True):
        """
        Instantiates a new model, sets it as the transcriber, and warms it up if desired.
        """
        model = 'medium-q5_0'
        here = pathlib.Path(__file__)
        models_dir = f'{here.parent.parent / "moyoyo_asr_models"}'
        self.transcriber = Model(model=model, models_dir=models_dir)
        if warmup:
            self.warmup()

    def warmup(self, warmup_steps=1):
        """
        Warm up the whisper.cpp engine, since the first few inferences are slow.

        Args:
            warmup_steps (int): Number of steps to warm up the model for.
        """
        logging.info("[INFO:] Warming up whisper.cpp engine..")
        mel, _ = soundfile.read("assets/jfk.flac")
        for i in range(warmup_steps):
            self.transcriber.transcribe(mel, print_progress=False)
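The create_model()/warmup() call path can also be exercised outside the server. A minimal sketch, assuming `Model` here is pywhispercpp's `pywhispercpp.model.Model` and that the ggml model and sample audio shipped in this commit are present at these paths:

# Standalone sketch of the same call path; import path and file locations are assumptions.
import soundfile
from pywhispercpp.model import Model

model = Model(model='medium-q5_0', models_dir='moyoyo_asr_models')
audio, _ = soundfile.read('assets/jfk.flac')          # 16 kHz mono float samples
for segment in model.transcribe(audio, print_progress=False):
    print(segment.text)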
    def set_eos(self, eos):
        """
        Sets the End of Speech (EOS) flag.

        Args:
            eos (bool): The value to set for the EOS flag.
        """
        self.lock.acquire()
        self.eos = eos
        self.lock.release()

    def handle_transcription_output(self, last_segment, duration):
        """
        Handle the transcription output, updating the transcript and sending data to the client.

        Args:
            last_segment (str): The last segment from the whisper output, which is considered incomplete
                because the final word may be truncated in the audio chunk.
            duration (float): Duration of the transcribed audio chunk.
        """
        segments = self.prepare_segments({"text": last_segment})
        self.send_transcription_to_client(segments)
        if self.eos:
            self.update_timestamp_offset(last_segment, duration)

    def transcribe_audio(self, input_bytes):
        """
        Transcribe the audio chunk and send the results to the client.

        Args:
            input_bytes (np.array): The audio chunk to transcribe.
        """
        if ServeClientWhisperCPP.SINGLE_MODEL:
            ServeClientWhisperCPP.SINGLE_MODEL_LOCK.acquire()
        logging.info(f"[pywhispercpp:] Processing audio with duration: {input_bytes.shape[0] / self.RATE}")
        mel = input_bytes
        duration = librosa.get_duration(y=input_bytes, sr=self.RATE)

        if self.language == "zh":
            # "The following is a sentence in simplified Chinese Mandarin."
            prompt = '以下是简体中文普通话的句子。'
        else:
            prompt = 'The following is an English sentence.'

        segments = self.transcriber.transcribe(mel, language='zh', initial_prompt=prompt, print_progress=False)
        text = []
        for segment in segments:
            content = segment.text
            text.append(content)
        last_segment = ' '.join(text)

        logging.info(f"[pywhispercpp:] Last segment: {last_segment}")

        if ServeClientWhisperCPP.SINGLE_MODEL:
            ServeClientWhisperCPP.SINGLE_MODEL_LOCK.release()
        if last_segment:
            self.handle_transcription_output(last_segment, duration)

    def update_timestamp_offset(self, last_segment, duration):
        """
        Update timestamp offset and transcript.

        Args:
            last_segment (str): Last transcribed audio from the whisper model.
            duration (float): Duration of the last audio chunk.
        """
        if not len(self.transcript):
            self.transcript.append({"text": last_segment + " "})
        elif self.transcript[-1]["text"].strip() != last_segment:
            self.transcript.append({"text": last_segment + " "})

        logging.info(f'Transcript list context: {self.transcript}')

        with self.lock:
            self.timestamp_offset += duration

    def speech_to_text(self):
        """
        Process an audio stream in an infinite loop, continuously transcribing the speech.

        This method continuously receives audio frames, performs real-time transcription, and sends
        transcribed segments to the client via a WebSocket connection.

        If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
        It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results.
        Segments are sent to the client in real time, and a history of segments is maintained to provide context.
        Pauses in speech (no output from Whisper) are handled by showing the previous output for a set duration.
        A blank segment is added if there is no speech for a specified duration to indicate a pause.

        Raises:
            Exception: If there is an issue with audio processing or WebSocket communication.
        """
        while True:
            if self.exit:
                logging.info("Exiting speech to text thread")
                break

            if self.frames_np is None:
                time.sleep(0.02)    # wait for any audio to arrive
                continue

            self.clip_audio_if_no_valid_segment()

            input_bytes, duration = self.get_audio_chunk_for_processing()
            if duration < 1:
                continue

            try:
                input_sample = input_bytes.copy()
                logging.info(f"[pywhispercpp:] Processing audio with duration: {duration}")
                self.transcribe_audio(input_sample)

            except Exception as e:
                logging.error(f"[ERROR]: {e}")

transcribe/utils.py
ADDED
@@ -0,0 +1,81 @@
import os
import textwrap
from pathlib import Path

import av


def clear_screen():
    """Clears the console screen."""
    os.system("cls" if os.name == "nt" else "clear")


def print_transcript(text):
    """Prints formatted transcript text."""
    wrapper = textwrap.TextWrapper(width=60)
    for line in wrapper.wrap(text="".join(text)):
        print(line)


def format_time(s):
    """Convert seconds (float) to SRT time format, e.g. 3661.5 -> "01:01:01,500"."""
    hours = int(s // 3600)
    minutes = int((s % 3600) // 60)
    seconds = int(s % 60)
    milliseconds = int((s - int(s)) * 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def create_srt_file(segments, resampled_file):
    """Write the given segments to an SRT subtitle file at the path given by resampled_file."""
    with open(resampled_file, 'w', encoding='utf-8') as srt_file:
        segment_number = 1
        for segment in segments:
            start_time = format_time(float(segment['start']))
            end_time = format_time(float(segment['end']))
            text = segment['text']

            srt_file.write(f"{segment_number}\n")
            srt_file.write(f"{start_time} --> {end_time}\n")
            srt_file.write(f"{text}\n\n")

            segment_number += 1

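A short usage sketch for these helpers; the segment dictionaries are fabricated but use the start/end/text keys that create_srt_file expects:

# Usage sketch; the segment values are fabricated.
segments = [
    {"start": 0.0, "end": 2.5, "text": "ask not what your country can do for you"},
    {"start": 2.5, "end": 5.0, "text": "ask what you can do for your country"},
]
create_srt_file(segments, "jfk.srt")
# jfk.srt now contains numbered cues such as:
# 1
# 00:00:00,000 --> 00:00:02,500
# ask not what your country can do for you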
def resample(file: str, sr: int = 16000):
    """
    Resample the audio file to 16kHz mono.

    Args:
        file (str): The audio file to open
        sr (int): The sample rate to resample the audio to, if necessary

    Returns:
        resampled_file (str): The path of the resampled audio file
    """
    container = av.open(file)
    stream = next(s for s in container.streams if s.type == 'audio')

    resampler = av.AudioResampler(
        format='s16',
        layout='mono',
        rate=sr,
    )

    resampled_file = Path(file).stem + "_resampled.wav"
    output_container = av.open(resampled_file, mode='w')
    output_stream = output_container.add_stream('pcm_s16le', rate=sr)
    output_stream.layout = 'mono'

    for frame in container.decode(audio=0):
        frame.pts = None
        resampled_frames = resampler.resample(frame)
        if resampled_frames is not None:
            for resampled_frame in resampled_frames:
                for packet in output_stream.encode(resampled_frame):
                    output_container.mux(packet)

    # Flush any packets still buffered in the encoder.
    for packet in output_stream.encode(None):
        output_container.mux(packet)

    output_container.close()
    return resampled_file
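Usage is a single call; for example (input filename fabricated):

# Decodes talk.mp3 and writes talk_resampled.wav as 16 kHz mono PCM in the working directory.
wav_path = resample("talk.mp3")
print(wav_path)   # -> "talk_resampled.wav"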
transcribe/vad.py
ADDED
@@ -0,0 +1,160 @@
import os
import subprocess
import warnings

import numpy as np
import onnxruntime
import torch


class VoiceActivityDetection():

    def __init__(self, force_onnx_cpu=True):
        path = self.download()

        opts = onnxruntime.SessionOptions()
        opts.log_severity_level = 3

        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
        else:
            self.session = onnxruntime.InferenceSession(path, providers=['CUDAExecutionProvider'], sess_options=opts)

        self.reset_states()
        if '16k' in path:
            warnings.warn('This model supports only a 16000 Hz sampling rate!')
            self.sample_rates = [16000]
        else:
            self.sample_rates = [8000, 16000]
    def _validate_input(self, x, sr: int):
        if x.dim() == 1:
            x = x.unsqueeze(0)
        if x.dim() > 2:
            raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}")

        if sr != 16000 and (sr % 16000 == 0):
            step = sr // 16000
            x = x[:, ::step]
            sr = 16000

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiples of 16000)")
        if sr / x.shape[1] > 31.25:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        self._state = torch.zeros((2, batch_size, 128)).float()
        self._context = torch.zeros(0)
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):
        x, sr = self._validate_input(x, sr)
        # The ONNX model expects fixed-size chunks: 512 samples (32 ms) at 16 kHz, 256 at 8 kHz.
        num_samples = 512 if sr == 16000 else 256

        if x.shape[-1] != num_samples:
            raise ValueError(
                f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")

        batch_size = x.shape[0]
        context_size = 64 if sr == 16000 else 32

        if not self._last_batch_size:
            self.reset_states(batch_size)
        if (self._last_sr) and (self._last_sr != sr):
            self.reset_states(batch_size)
        if (self._last_batch_size) and (self._last_batch_size != batch_size):
            self.reset_states(batch_size)

        if not len(self._context):
            self._context = torch.zeros(batch_size, context_size)

        x = torch.cat([self._context, x], dim=1)
        if sr in [8000, 16000]:
            ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr, dtype='int64')}
            ort_outs = self.session.run(None, ort_inputs)
            out, state = ort_outs
            self._state = torch.from_numpy(state)
        else:
            raise ValueError()

        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        out = torch.from_numpy(out)
        return out
    def audio_forward(self, x, sr: int):
        outs = []
        x, sr = self._validate_input(x, sr)
        self.reset_states()
        num_samples = 512 if sr == 16000 else 256

        # Pad the audio so it splits evenly into fixed-size chunks.
        if x.shape[1] % num_samples:
            pad_num = num_samples - (x.shape[1] % num_samples)
            x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0)

        for i in range(0, x.shape[1], num_samples):
            wavs_batch = x[:, i:i + num_samples]
            out_chunk = self.__call__(wavs_batch, sr)
            outs.append(out_chunk)

        stacked = torch.cat(outs, dim=1)
        return stacked.cpu()

    @staticmethod
    def download(model_url="https://github.com/snakers4/silero-vad/raw/v5.0/files/silero_vad.onnx"):
        target_dir = os.path.expanduser("~/.cache/silero-vad/")

        # Ensure the target directory exists
        os.makedirs(target_dir, exist_ok=True)

        # Define the target file path
        model_filename = os.path.join(target_dir, "silero_vad.onnx")

        # Check if the model file already exists
        if not os.path.exists(model_filename):
            # If it doesn't exist, download the model with curl
            try:
                # subprocess.run(["wget", "-O", model_filename, model_url], check=True)
                subprocess.run(["curl", "-sL", "-o", model_filename, model_url], check=True)
            except subprocess.CalledProcessError:
                print("Failed to download the model with curl.")
        return model_filename

class VoiceActivityDetector:
    def __init__(self, threshold=0.5, frame_rate=16000):
        """
        Initializes the VoiceActivityDetector with a voice activity detection model and a threshold.

        Args:
            threshold (float, optional): The probability threshold for detecting voice activity. Defaults to 0.5.
            frame_rate (int, optional): The sampling rate of the incoming audio frames. Defaults to 16000.
        """
        self.model = VoiceActivityDetection()
        self.threshold = threshold
        self.frame_rate = frame_rate

    def __call__(self, audio_frame):
        """
        Determines if the given audio frame contains speech by comparing the detected speech probability against
        the threshold.

        Args:
            audio_frame (np.ndarray): The audio frame to be analyzed for voice activity. It is expected to be a
                NumPy array of audio samples.

        Returns:
            bool: True if the speech probability exceeds the threshold, indicating the presence of voice activity;
                False otherwise.
        """
        speech_probs = self.model.audio_forward(torch.from_numpy(audio_frame.copy()), self.frame_rate)[0]
        return torch.any(speech_probs > self.threshold).item()
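A minimal usage sketch for the detector, assuming the package is importable as transcribe.vad; the input here is synthetic silence, so no voice activity should be reported (the first call also downloads the ONNX model to ~/.cache/silero-vad/):

# Usage sketch with synthetic audio; real input would be a 16 kHz mono float32 frame.
import numpy as np
from transcribe.vad import VoiceActivityDetector

vad = VoiceActivityDetector(threshold=0.5, frame_rate=16000)
silence = np.zeros(16000, dtype=np.float32)      # one second of silence
print(vad(silence))                               # -> False (no voice activity expected)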