Spaces:
Runtime error
Runtime error
| # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import contextlib | |
| import glob | |
| import json | |
| import os | |
| from dataclasses import dataclass, field, is_dataclass | |
| from pathlib import Path | |
| from typing import List, Optional | |
| import lightning.pytorch as pl | |
| import torch | |
| from omegaconf import OmegaConf | |
| from nemo.collections.audio.models import AudioToAudioModel | |
| from nemo.core.config import hydra_runner | |
| from nemo.utils import logging, model_utils | |
| """ | |
| Process audio file on a single CPU/GPU. Useful for processing of moderate amounts of audio data. | |
| # Arguments | |
| model_path: path to .nemo checkpoint for an AudioToAudioModel | |
| pretrained_name: name of a pretrained AudioToAudioModel model (from NGC registry) | |
| audio_dir: path to directory with audio files | |
| dataset_manifest: path to dataset JSON manifest file (in NeMo format) | |
| max_utts: maximum number of utterances to process | |
| input_channel_selector: list of channels to take from audio files, defaults to `None` and takes all available channels | |
| input_key: key for audio filepath in the manifest file, defaults to `audio_filepath` | |
| output_dir: Directory where processed files will be saved | |
| output_filename: Output filename where manifest pointing to processed files will be written | |
| batch_size: batch size during inference | |
| cuda: Optional int to enable or disable execution of model on certain CUDA device. | |
| amp: Bool to decide if Automatic Mixed Precision should be used during inference | |
| audio_type: Str filetype of the audio. Supported = wav, flac, mp3 | |
| overwrite_output: Bool which when set allowes repeated processing runs to overwrite previous results. | |
| # Usage | |
| AudioToAudioModel can be specified by either `model_path` or `pretrained_name`. | |
| Data for processing can be defined with either `audio_dir` or `dataset_manifest`. | |
| Processed audio is saved in `output_dir`, and a manifest for processed files is saved | |
| in `output_filename`. | |
| ``` | |
| python process_audio.py \ | |
| model_path=null \ | |
| pretrained_name=null \ | |
| audio_dir="" \ | |
| dataset_manifest="" \ | |
| input_channel_selector=[] \ | |
| output_dir="" \ | |
| output_filename="" \ | |
| batch_size=1 \ | |
| cuda=0 \ | |
| amp=True | |
| ``` | |
| """ | |
| class ProcessConfig: | |
| # Required configs | |
| model_path: Optional[str] = None # Path to a .nemo file | |
| pretrained_name: Optional[str] = None # Name of a pretrained model | |
| audio_dir: Optional[str] = None # Path to a directory which contains audio files | |
| dataset_manifest: Optional[str] = None # Path to dataset's JSON manifest | |
| max_utts: Optional[int] = None # max number of utterances to process | |
| # Audio configs | |
| input_channel_selector: Optional[List] = None # Union types not supported Optional[Union[List, int]] | |
| input_key: Optional[str] = None # Can be used with a manifest | |
| # General configs | |
| output_dir: Optional[str] = None | |
| output_filename: Optional[str] = None | |
| batch_size: int = 1 | |
| num_workers: int = 0 | |
| # Override model config | |
| override_config_path: Optional[str] = None # path to a yaml config that will override the internal config file | |
| # Override sampler config | |
| # For example, to set number of steps, use `++sampler.num_samples=42` | |
| sampler: dict = field(default_factory=dict) | |
| # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA | |
| # device anyway, and do inference on CPU only if CUDA device is not found. | |
| # If `cuda` is a negative number, inference will be on CPU only. | |
| cuda: Optional[int] = None | |
| amp: bool = False | |
| audio_type: str = "wav" | |
| # Recompute model predictions, even if the output folder exists. | |
| overwrite_output: bool = False | |
| def main(cfg: ProcessConfig) -> ProcessConfig: | |
| logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') | |
| if is_dataclass(cfg): | |
| cfg = OmegaConf.structured(cfg) | |
| if cfg.model_path is None and cfg.pretrained_name is None: | |
| raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!") | |
| if cfg.audio_dir is None and cfg.dataset_manifest is None: | |
| raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!") | |
| # setup GPU | |
| if cfg.cuda is None: | |
| if torch.cuda.is_available(): | |
| device = [0] # use 0th CUDA device | |
| accelerator = 'gpu' | |
| else: | |
| device = 1 | |
| accelerator = 'cpu' | |
| else: | |
| device = [cfg.cuda] | |
| accelerator = 'gpu' | |
| map_location = torch.device('cuda:{}'.format(device[0]) if accelerator == 'gpu' else 'cpu') | |
| # setup model | |
| if cfg.model_path is not None: | |
| # restore model from .nemo file path | |
| model_cfg = AudioToAudioModel.restore_from(restore_path=cfg.model_path, return_config=True) | |
| classpath = model_cfg.target # original class path | |
| imported_class = model_utils.import_class_by_path(classpath) # type: AudioToAudioModel | |
| logging.info(f"Restoring model : {imported_class.__name__}") | |
| audio_to_audio_model = imported_class.restore_from( | |
| restore_path=cfg.model_path, override_config_path=cfg.override_config_path, map_location=map_location | |
| ) # type: AudioToAudioModel | |
| model_name = os.path.splitext(os.path.basename(cfg.model_path))[0] | |
| else: | |
| # restore model by name | |
| audio_to_audio_model = AudioToAudioModel.from_pretrained( | |
| model_name=cfg.pretrained_name, map_location=map_location | |
| ) # type: AudioToAudioModel | |
| model_name = cfg.pretrained_name | |
| trainer = pl.Trainer(devices=device, accelerator=accelerator) | |
| audio_to_audio_model.set_trainer(trainer) | |
| audio_to_audio_model = audio_to_audio_model.eval() | |
| # override sampler if necessary | |
| if cfg.sampler: | |
| logging.info('Overriding sampler with %s', cfg.sampler) | |
| if hasattr(audio_to_audio_model, 'sampler'): | |
| for key, value in cfg.sampler.items(): | |
| if not hasattr(audio_to_audio_model.sampler, key): | |
| raise RuntimeError(f'Model sampler does not have attribute {key}') | |
| logging.debug('Try to set model.sampler.%s to %s', key, value) | |
| setattr(audio_to_audio_model.sampler, key, value) | |
| if getattr(audio_to_audio_model.sampler, key) != value: | |
| raise RuntimeError(f'Failed to set model sampler attribute {key} to {value}') | |
| logging.info('model.sampler.%s was set to %s', key, value) | |
| else: | |
| raise RuntimeError('Model does not have a sampler') | |
| if cfg.audio_dir is not None: | |
| input_dir = cfg.audio_dir | |
| filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True)) | |
| else: | |
| # get filenames from manifest | |
| filepaths = [] | |
| if os.stat(cfg.dataset_manifest).st_size == 0: | |
| raise RuntimeError(f"The input dataset_manifest {cfg.dataset_manifest} is empty.") | |
| input_key = 'audio_filepath' if cfg.input_key is None else cfg.input_key | |
| manifest_dir = Path(cfg.dataset_manifest).parent | |
| with open(cfg.dataset_manifest, 'r') as f: | |
| for line in f: | |
| item = json.loads(line) | |
| audio_file = Path(item[input_key]) | |
| if not audio_file.is_file() and not audio_file.is_absolute(): | |
| audio_file = manifest_dir / audio_file | |
| filepaths.append(str(audio_file.absolute())) | |
| # common path for all files | |
| common_path = os.path.commonpath(filepaths) | |
| if Path(common_path).is_relative_to(manifest_dir): | |
| # if all paths are relative to the manifest, use manifest dir as input dir | |
| input_dir = manifest_dir | |
| else: | |
| # use the parent of the common path as input dir | |
| input_dir = Path(common_path).parent | |
| if cfg.max_utts is not None: | |
| # Limit the number of utterances to process | |
| filepaths = filepaths[: cfg.max_utts] | |
| logging.info(f"\nProcessing {len(filepaths)} files...\n") | |
| # setup AMP (optional) | |
| if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): | |
| logging.info("AMP enabled!\n") | |
| autocast = torch.cuda.amp.autocast | |
| else: | |
| def autocast(): | |
| yield | |
| # Compute output filename | |
| if cfg.output_dir is None: | |
| # create default output filename | |
| if cfg.audio_dir is not None: | |
| cfg.output_dir = os.path.dirname(os.path.join(cfg.audio_dir, '.')) + f'_processed_{model_name}' | |
| else: | |
| cfg.output_dir = os.path.dirname(cfg.dataset_manifest) + f'_processed_{model_name}' | |
| # Compute output filename | |
| if cfg.output_filename is None: | |
| # create default output filename | |
| cfg.output_filename = cfg.output_dir.rstrip('/') + '_manifest.json' | |
| # if transcripts should not be overwritten, and already exists, skip re-transcription step and return | |
| if not cfg.overwrite_output and os.path.exists(cfg.output_dir): | |
| raise RuntimeError( | |
| f"Previous output found at {cfg.output_dir}, and flag `overwrite_output`" | |
| f"is {cfg.overwrite_output}. Returning without processing." | |
| ) | |
| # Process audio | |
| with autocast(): | |
| with torch.no_grad(): | |
| paths2processed_files = audio_to_audio_model.process( | |
| paths2audio_files=filepaths, | |
| output_dir=cfg.output_dir, | |
| batch_size=cfg.batch_size, | |
| num_workers=cfg.num_workers, | |
| input_channel_selector=cfg.input_channel_selector, | |
| input_dir=input_dir, | |
| ) | |
| logging.info(f"Finished processing {len(filepaths)} files!") | |
| logging.info(f"Processed audio is available in the output directory: {cfg.output_dir}") | |
| # Prepare new/updated manifest with a new key for processed audio | |
| with open(cfg.output_filename, 'w', encoding='utf-8') as f: | |
| if cfg.dataset_manifest is not None: | |
| with open(cfg.dataset_manifest, 'r') as fr: | |
| for idx, line in enumerate(fr): | |
| item = json.loads(line) | |
| item['processed_audio_filepath'] = paths2processed_files[idx] | |
| f.write(json.dumps(item) + "\n") | |
| if cfg.max_utts is not None and idx >= cfg.max_utts - 1: | |
| break | |
| else: | |
| for idx, processed_file in enumerate(paths2processed_files): | |
| item = {'processed_audio_filepath': processed_file} | |
| f.write(json.dumps(item) + "\n") | |
| return cfg | |
| if __name__ == '__main__': | |
| main() # noqa pylint: disable=no-value-for-parameter | |