env:
    save_git_hash: True

engine:    
    model_path: null  
    pretrained_name: null
    
    output_filename: null
    random_seed: &random_seed 42

    inference: 
        mode: offline # choose from offline, chunked, or offline_by_chunked
        chunk_len_in_secs: 1.6 # must be set when using buffered inference (default for offline_by_chunked is 20)
        total_buffer_in_secs: 4 # must be set when using buffered inference (default for offline_by_chunked is 22)
        model_stride: 8 # model downsampling factor: 8 for Citrinet and FastConformer models, 4 for Conformer models
        decoder_type: null # for hybrid CTC-RNNT models only; set to *ctc* or *rnnt*
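        # Buffered-inference sketch (my reading of symmetric-context chunked decoding;
        # verify against the NeMo buffered inference code): each step transcribes
        # chunk_len_in_secs of new audio inside a total_buffer_in_secs window, leaving
        # (total_buffer_in_secs - chunk_len_in_secs) / 2 = (4 - 1.6) / 2 = 1.2 s of
        # context on each side with the values above.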
   
    test_ds:
        manifest_filepath: null
        sample_rate: 16000
        batch_size: 32
        num_workers: 4
        
        augmentor:
          silence:
            prob: 0.8
            min_start_silence_secs: 0
            max_start_silence_secs: 5
            min_end_silence_secs: 0
            max_end_silence_secs: 5
            rng: *random_seed
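            # With the settings above, each utterance is augmented with probability 0.8:
            # a uniformly sampled 0-5 s of silence is prepended and 0-5 s appended,
            # seeded by *random_seed for reproducibility.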
            
          noise:
            manifest_path: null
            prob: 0.8
            min_snr_db: 0
            max_snr_db: 15
            rng: *random_seed
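            # Illustrative noise manifest line (NeMo JSON-lines audio manifest;
            # the path and values below are placeholders):
            #   {"audio_filepath": "/path/to/noise.wav", "duration": 10.0, "offset": 0.0}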

    transcribe_params:
        # Put additional overrides for params in TranscriptionConfig used by transcribe_speech.py here
        # Don't put the following fields here: 'calculate_wer', 'model_path', 'pretrained_name', 'dataset_manifest', 
        # 'output_filename', 'batch_size', 'num_workers', 'random_seed', 'eval_config_yaml', 'decoder_type'
        allow_partial_transcribe: False  # set to True only if your audio is very long and the manifest contains 'offset' fields
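        # Illustrative overrides (assumed fields; verify they exist in TranscriptionConfig
        # inside transcribe_speech.py before use):
        #   cuda: 0    # GPU index to run on
        #   amp: True  # enable automatic mixed precision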

analyst:
    metric_calculator:
        exist_pred_manifest: null # point this at a previously generated prediction manifest to skip the engine stage
        clean_groundtruth_text: True
        langid: "en" # language used to clean text; note: NeMo text normalization gives better performance
        output_filename: null
        use_cer: False
        ignore_capitalization: False
        ignore_punctuation: False
        punctuations: null  # string of punctuation marks to remove when ignore_punctuation=True; if not set, defaults to '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
        strip_punc_space: False # strip spaces before punctuation, e.g., "I do ." -> "I do."
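        # e.g., with ignore_capitalization=True and ignore_punctuation=True,
        # "Hello, World!" and "hello world" are scored as an exact match.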

    metadata:
        duration: 
            enable: True
            slot: [[0,2],[2,5],[5,10],[10,20],[20,100000]] # a slot accepts List[List[str]] or List[List[float]], e.g., a 1.8 s utterance falls into slot [0,2]
            save_wer_per_class: False # whether to save WER for each class present
            
        gender:
            enable: False
            slot: [["female"]] # you may report on a single group/class even when the data contains multiple classes
            save_wer_per_class: True
            
        speaker:
            enable: True
            save_wer_per_class: False
            
        age:
            enable: False
            slot: null
            save_wer_per_class: False
            
        emotion: 
            enable: True
            slot: [['happy','laugh'],['neutral'],['sad']] 
            save_wer_per_class: False
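        # Illustrative manifest line carrying the metadata fields enabled above
        # (key names assumed to match this section's entry names):
        #   {"audio_filepath": "a.wav", "text": "hello", "duration": 1.8,
        #    "speaker": "spk_0", "emotion": "happy"}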
                 
writer:
    report_filename: null