| | from typing import Dict |
| | from pathlib import Path |
| | import tempfile |
| | import torch |
| | import torchaudio |
| | import librosa |
| |
|
| | SAMPLE_RATE = 16000 |
| |
|
| | class EndpointHandler(): |
| | def __init__(self, path=""): |
| | |
| | self.mars5, self.config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True) |
| |
|
| | def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]: |
| | """ |
| | Args: |
| | data (Dict[str, bytes]): |
| | Includes the text, audio file path, and transcript. |
| | Returns: |
| | Dict[str, str]: Path to the synthesized audio file. |
| | """ |
| | |
| | text = data["text"] |
| | audio_file = data["audio_file"] |
| | transcript = data["transcript"] |
| |
|
| | |
| | wav, sr = librosa.load(audio_file, sr=self.mars5.sr, mono=True) |
| | wav = torch.from_numpy(wav) |
| |
|
| | |
| | deep_clone = True |
| | cfg = self.config_class(deep_clone=deep_clone, rep_penalty_window=100, top_k=100, temperature=0.7, freq_penalty=3) |
| |
|
| | |
| | ar_codes, wav_out = self.mars5.tts(text, wav, transcript, cfg=cfg) |
| |
|
| | |
| | output_path = Path(tempfile.mktemp(suffix=".wav")) |
| | torchaudio.save(output_path, wav_out.unsqueeze(0), self.mars5.sr) |
| |
|
| | return {"synthesized_audio": str(output_path)} |
| |
|