Spaces:
Runtime error
Runtime error
| from typing import List, Union, Callable | |
| import torch | |
| import torchaudio | |
# Sample rate (Hz) handed to every transform configured below.
SAMPLING_RATE = 16_000

# STFT framing expressed in samples: a 25 ms window and a 10 ms hop.
win_length = SAMPLING_RATE * 25 // 1_000  # 400
hop_length = SAMPLING_RATE * 10 // 1_000  # 160

# Use the GPU when torch reports one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Mel-frequency cepstral coefficients: 128 coefficients per frame.
MFCC_FN = torchaudio.transforms.MFCC(
    sample_rate=SAMPLING_RATE,
    n_mfcc=128,
    melkwargs=dict(n_fft=512, win_length=win_length, hop_length=hop_length),
).to(device)

# Linear-frequency cepstral coefficients, framed the same way as the MFCCs.
LFCC_FN = torchaudio.transforms.LFCC(
    sample_rate=SAMPLING_RATE,
    n_lfcc=128,
    speckwargs=dict(n_fft=512, win_length=win_length, hop_length=hop_length),
).to(device)

# Mel filterbank with 80 bands; n_stft=257 equals 512 // 2 + 1, i.e. it
# matches the n_fft=512 used by the two cepstral transforms above.
MEL_SCALE_FN = torchaudio.transforms.MelScale(
    n_mels=80,
    n_stft=257,
    sample_rate=SAMPLING_RATE,
).to(device)

# Delta operator shared by both front-ends. It is not moved to `device`.
# NOTE(review): in torchaudio, ComputeDeltas' win_length is the delta window
# (library default 5), not an STFT window; 400 looks copied from the STFT
# setting above. Left unchanged because it defines the features downstream
# consumers were presumably trained on -- confirm before touching.
delta_fn = torchaudio.transforms.ComputeDeltas(win_length=400, mode="replicate")
def get_frontend(
    frontends: List[str],
) -> Union[torchaudio.transforms.MFCC, torchaudio.transforms.LFCC, Callable]:
    """Pick the feature-extraction callable for the requested front-ends.

    Args:
        frontends: Names of the requested front-ends. Only ``"mfcc"`` and
            ``"lfcc"`` are recognised; ``"mfcc"`` wins when both are present.

    Returns:
        ``prepare_mfcc_double_delta`` or ``prepare_lfcc_double_delta``.

    Raises:
        ValueError: If neither ``"mfcc"`` nor ``"lfcc"`` is in ``frontends``.
    """
    # Ordered by priority: the first name found in `frontends` is used.
    candidates = (
        ("mfcc", prepare_mfcc_double_delta),
        ("lfcc", prepare_lfcc_double_delta),
    )
    for name, extractor in candidates:
        if name in frontends:
            return extractor

    raise ValueError(f"{frontends} frontend is not supported!")
def prepare_lfcc_double_delta(input):
    """Compute LFCCs plus their first- and second-order deltas.

    Args:
        input: Tensor passed to ``LFCC_FN``. Presumably a batch of waveforms
            shaped (batch, samples) -- TODO confirm against callers. The
            parameter name shadows the builtin but is kept for callers that
            pass it by keyword.

    Returns:
        The LFCCs, deltas and delta-deltas concatenated along axis 2, with
        axis 3 truncated to at most 3000 entries.
    """
    # Anything with fewer than 4 dims gets an extra axis inserted at position 1
    # (presumably a channel axis -- TODO confirm).
    if input.ndim < 4:
        input = input.unsqueeze(1)

    # Start from the static coefficients, then apply the delta operator twice;
    # each pass differentiates the most recent entry.
    stack = [LFCC_FN(input)]
    for _ in range(2):
        stack.append(delta_fn(stack[-1]))

    # Axis 2 is the coefficient axis for the 4-D case: 128 -> 128 * 3.
    features = torch.cat(stack, dim=2)

    # Cap axis 3 (frames, for the 4-D case) at 3000 entries.
    return features[:, :, :, :3000]
def prepare_mfcc_double_delta(input):
    """Compute MFCCs plus their first- and second-order deltas.

    Args:
        input: Tensor passed to ``MFCC_FN``. Presumably a batch of waveforms
            shaped (batch, samples) -- TODO confirm against callers. The
            parameter name shadows the builtin but is kept for callers that
            pass it by keyword.

    Returns:
        The MFCCs, deltas and delta-deltas concatenated along axis 2, with
        axis 3 truncated to at most 3000 entries.
    """
    # Anything with fewer than 4 dims gets an extra axis inserted at position 1
    # (presumably a channel axis -- TODO confirm).
    if input.ndim < 4:
        input = input.unsqueeze(1)

    # Start from the static coefficients, then apply the delta operator twice;
    # each pass differentiates the most recent entry.
    stack = [MFCC_FN(input)]
    for _ in range(2):
        stack.append(delta_fn(stack[-1]))

    # Axis 2 is the coefficient axis for the 4-D case: 128 -> 128 * 3.
    features = torch.cat(stack, dim=2)

    # Cap axis 3 (frames, for the 4-D case) at 3000 entries.
    return features[:, :, :, :3000]