subhankarg's picture
Upload folder using huggingface_hub
0558aa4 verified
# Environment options for the evaluation run.
env:
  save_git_hash: True
# Engine section: model selection fields, inference settings, the test
# dataset (with optional augmentation) and extra transcription overrides.
# NOTE(review): nesting reconstructed from a flattened listing; confirm
# against the script that consumes this config.
engine:
  model_path: null
  pretrained_name: null
  output_filename: null
  # Anchored so the augmentors below can reuse the same value via *random_seed.
  random_seed: &random_seed 42

  inference:
    # Choose from offline, chunked or offline_by_chunked.
    mode: offline
    # Need to specify if using buffered inference
    # (default for offline_by_chunked is 20).
    chunk_len_in_secs: 1.6  # null
    # Need to specify if using buffered inference
    # (default for offline_by_chunked is 22).
    total_buffer_in_secs: 4  # null
    # Model downsampling factor: 8 for Citrinet and FastConformer models,
    # and 4 for Conformer models.
    model_stride: 8
    # Used for hybrid CTC RNNT model only. Specify decoder_type *ctc* or
    # *rnnt* for hybrid CTC RNNT model.
    decoder_type: null

  test_ds:
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    num_workers: 4

    augmentor:
      silence:
        prob: 0.8
        min_start_silence_secs: 0
        max_start_silence_secs: 5
        min_end_silence_secs: 0
        max_end_silence_secs: 5
        rng: *random_seed  # alias of engine.random_seed (42)

      noise:
        manifest_path: null
        prob: 0.8
        min_snr_db: 0
        max_snr_db: 15
        rng: *random_seed  # alias of engine.random_seed (42)

  transcribe_params:
    # Put additional overrides for params in TranscriptionConfig used by
    # transcribe_speech.py here.
    # Don't put the following fields here: 'calculate_wer', 'model_path',
    # 'pretrained_name', 'dataset_manifest', 'output_filename', 'batch_size',
    # 'num_workers', 'random_seed', 'eval_config_yaml', 'decoder_type'
    # Only set True if your audio is too long and has 'offset' in the manifest.
    allow_partial_transcribe: False
# Analyst section: metric calculation options plus per-metadata-field
# grouping settings (each field has its own enable/slot/save_wer_per_class).
# NOTE(review): nesting reconstructed from a flattened listing; confirm
# against the script that consumes this config.
analyst:
  metric_calculator:
    # Specifying a previously generated manifest will skip the engine.
    exist_pred_manifest: null
    clean_groundtruth_text: True
    # Specify the language used to clean text. Note: use text normalization
    # in NeMo for better performance.
    langid: "en"
    output_filename: null
    use_cer: False
    ignore_capitalization: False
    ignore_punctuation: False
    # A string of punctuations to remove when ignore_punctuation=True.
    # If not set, defaults to '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
    punctuations: null
    # Strip spaces before punctuations, e.g. "I do ." -> "I do."
    strip_punc_space: False

  metadata:
    duration:
      enable: True
      # A slot accepts List[List[str]] or List[List[float]],
      # i.e. 1.8s belongs to slot [0,2].
      slot: [[0, 2], [2, 5], [5, 10], [10, 20], [20, 100000]]
      # Whether to save WER for each presented class.
      save_wer_per_class: False

    gender:
      enable: False
      # One could also report only one group/class even though there are
      # multiple classes in the data.
      slot: [["female"]]
      save_wer_per_class: True

    speaker:
      enable: True
      save_wer_per_class: False

    age:
      enable: False
      slot: null
      save_wer_per_class: False

    emotion:
      enable: True
      slot: [['happy', 'laugh'], ['neutral'], ['sad']]
      save_wer_per_class: False
# Writer section: where the evaluation report is written.
writer:
  report_filename: null