import base64
import io
import os
from typing import Any, Dict

import soundfile as sf
import torch
from IPython.display import Audio, display
from kokoro import KPipeline

# Kokoro synthesizes audio at a fixed 24 kHz sample rate (was hard-coded
# twice below as 24000).
SAMPLE_RATE = 24000
# Kokoro's documented default voice; used when the request omits "voice".
# NOTE(review): confirm this matches the deployed kokoro version's default.
DEFAULT_VOICE = "af_heart"


class EndpointHandler():
    """Inference endpoint handler that turns text into a base64-encoded WAV.

    Expects request payloads shaped like
    ``{"inputs": {"text": "...", "voice": "af_heart"}}`` and returns an
    API-Gateway-style response dict whose ``body`` is the synthesized WAV
    file, base64 encoded.
    """

    def __init__(self, model_dir: str):
        # lang_code='a' selects the American-English pipeline. model_dir is
        # accepted for the hosting contract but unused: KPipeline manages its
        # own model weights.
        self.pipeline = KPipeline(lang_code='a')

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Synthesize speech for ``data["inputs"]["text"]``.

        Args:
            data: Request payload. ``inputs.text`` is required;
                ``inputs.voice`` is optional and defaults to
                ``DEFAULT_VOICE``.

        Returns:
            Response dict with base64 WAV ``body``, HTTP-style headers,
            ``statusCode``, ``isBase64Encoded``, and
            ``audio_length_seconds``.

        Raises:
            ValueError: If ``text`` is missing/empty, or synthesis yielded
                no audio (previously this surfaced as an opaque
                ``torch.cat`` RuntimeError on an empty list).
        """
        inputs = data.get("inputs", {})
        text = inputs.get("text")
        if not text:
            raise ValueError('Request must include a non-empty "inputs.text" field.')
        # Fall back to a concrete voice rather than passing None through,
        # which the kokoro pipeline does not accept as a voice name.
        voice = inputs.get("voice") or DEFAULT_VOICE

        # The pipeline yields (graphemes, phonemes, audio) triples, one per
        # chunk of the input text. torch.as_tensor avoids copying segments
        # that are already tensors (torch.tensor would copy and warn).
        segments = [
            torch.as_tensor(audio)
            for _, _, audio in self.pipeline(text, voice)
        ]
        if not segments:
            raise ValueError("Speech synthesis produced no audio segments.")

        full_audio = torch.cat(segments)
        audio_length_seconds = len(full_audio) / SAMPLE_RATE

        # Encode the concatenated waveform as an in-memory WAV file; no
        # temporary file on disk is needed.
        buffer = io.BytesIO()
        sf.write(buffer, full_audio.numpy(), SAMPLE_RATE, format='WAV')
        audio_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

        return {
            "headers": {
                "Content-Disposition": "attachment; filename=output.wav",
                "Content-Type": "audio/wav"
            },
            "body": audio_b64,
            "statusCode": 200,
            "isBase64Encoded": True,
            "audio_length_seconds": audio_length_seconds
        }