# File: MagpieTTS_Internal_Demo/tests/collections/audio/test_audio_maxine_models.py
# Uploader: subhankarg
# Commit message: Upload folder using huggingface_hub
# Commit: 0558aa4 (verified)
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from omegaconf import DictConfig
# Probe for the optional torchaudio dependency; tests that need it are
# skipped (via HAVE_TORCHAUDIO) when the module cannot be found.
try:
    import importlib

    importlib.import_module('torchaudio')
except ModuleNotFoundError:
    HAVE_TORCHAUDIO = False
else:
    HAVE_TORCHAUDIO = True
from nemo.collections.audio.models.maxine import BNR2
@pytest.fixture()
def maxine_model_fixture():
    """Build a BNR2 model from a config assembled in memory.

    Returns:
        A ``BNR2`` instance constructed with ``cfg`` set to a ``DictConfig``
        holding the signal parameters, the loss section and the optimizer
        section defined below.
    """
    # Signal/feature parameters shared by the model config and the loss config.
    signal_params = {
        'sample_rate': 16000,
        'fft_length': 1920,
        'hop_length': 480,
        'num_mels': 320,
    }

    # NOTE(review): 'gamma' and 'step_size' sit next to 'sched' rather than
    # inside it; kept exactly as-is -- confirm this is the intended layout.
    optim_cfg = {
        'name': 'adam',
        'lr': 0.0005,
        'sched': {
            'name': 'StepLR',
        },
        'gamma': 0.999,
        'step_size': 2,
    }

    loss_cfg = {
        '_target_': 'nemo.collections.audio.losses.maxine.CombinedLoss',
        **signal_params,
        'sisnr_loss_weight': 1,
        'spectral_loss_weight': 15,
        'asr_loss_weight': 1,
        'use_asr_loss': True,
        'use_mel_spec': True,
    }

    model_cfg = DictConfig(
        {
            'type': "bnr",
            **signal_params,
            'skip_nan_grad': False,
            'num_outputs': 1,
            'segment': 4,
            'loss': DictConfig(loss_cfg),
            'optim': DictConfig(optim_cfg),
        }
    )

    return BNR2(cfg=model_cfg)
class TestBNR2Model:
    """Unit tests for the BNR2 model."""

    @pytest.mark.unit
    @pytest.mark.skipif(not HAVE_TORCHAUDIO, reason="Modules in this test require torchaudio")
    def test_constructor(self, maxine_model_fixture):
        """Check that a BNR2 instance can be rebuilt from an exported config dict."""
        exported_cfg = maxine_model_fixture.train().to_config_dict()
        rebuilt = BNR2.from_config_dict(exported_cfg)
        assert isinstance(rebuilt, BNR2)

    @pytest.mark.unit
    @pytest.mark.skipif(not HAVE_TORCHAUDIO, reason="Modules in this test require torchaudio")
    @pytest.mark.parametrize(
        "batch_size, sample_len",
        [
            # Note: Must be a multiple of 10ms @ 16kHz
            (4, 16),  # Example 1
            (2, 8),  # Example 2
            (1, 32),  # Example 3
        ],
    )
    def test_forward_infer(self, maxine_model_fixture, batch_size, sample_len):
        """Check that batched forward output matches per-example forward output.

        A random signal of shape ``(batch_size, 1, sample_len * sample_rate)``
        is pushed through the model once as a whole batch and once one example
        at a time; both results must have the same shape and differ by at most
        ``1e-5`` in maximum absolute value.
        """
        model = maxine_model_fixture.eval()
        sampling_rate = model.to_config_dict()['sample_rate']
        num_samples = sample_len * sampling_rate
        input_signal = torch.randn(size=(batch_size, 1, num_samples))
        abs_tol = 1e-5

        with torch.no_grad():
            # Run each example on its own (batch size 1), then stitch the
            # results back together along the batch dimension.
            per_example = [model.forward(input_signal=single) for single in input_signal.split(1, dim=0)]
            output_instance = torch.cat(per_example, 0)

            # Run the full batch in a single call.
            output_batch = model.forward(input_signal=input_signal)

            # Both paths must agree in shape and (up to tolerance) in value.
            assert output_instance.shape == output_batch.shape
            max_abs_err = (output_instance - output_batch).abs().max()
            assert max_abs_err <= abs_tol