niobures committed on
Commit
2eb0625
·
verified ·
1 Parent(s): e20b933

Sherpa-CTC (en)

Browse files
.gitattributes CHANGED
@@ -39,3 +39,5 @@ en/sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav filter=lfs diff=lfs m
39
  en/sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/1.wav filter=lfs diff=lfs merge=lfs -text
40
  en/sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav filter=lfs diff=lfs merge=lfs -text
41
  en/sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
39
  en/sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/1.wav filter=lfs diff=lfs merge=lfs -text
40
  en/sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav filter=lfs diff=lfs merge=lfs -text
41
  en/sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav filter=lfs diff=lfs merge=lfs -text
42
+ en/sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/0.wav filter=lfs diff=lfs merge=lfs -text
43
+ en/sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/1.wav filter=lfs diff=lfs merge=lfs -text
en/sherpa-onnx-nemo-ctc-en-conformer-large/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
en/sherpa-onnx-nemo-ctc-en-conformer-large/README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ # Introduction
6
+
7
+ This repo contains the ONNX model exported from `stt_en_conformer_ctc_large` from NeMo.
8
+
9
+ See https://registry.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_conformer_ctc_large
10
+
11
+ The following code is used to obtain `model.onnx` and `tokens.txt`:
12
+
13
+ ```python3
14
import nemo.collections.asr as nemo_asr

# Download the pretrained NeMo model and export it to ONNX.
m = nemo_asr.models.EncDecCTCModelBPE.from_pretrained('stt_en_conformer_ctc_large')
m.export('model.onnx')

# The vocabulary contains non-ASCII pieces (e.g. "▁the"), so write the file
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open('tokens.txt', 'w', encoding='utf-8') as f:
    for i, s in enumerate(m.decoder.vocabulary):
        f.write(f"{s} {i}\n")
    # The blank symbol gets the ID right after the last vocabulary entry.
    f.write(f"<blk> {i+1}\n")
21
+ ```
en/sherpa-onnx-nemo-ctc-en-conformer-large/add-model-metadata.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (c) 2023 Xiaomi Corporation
4
+ # Author: Fangjun Kuang
5
+
6
+ from typing import Dict
7
+
8
+ import numpy as np
9
+ import onnx
10
+
11
+
12
def get_vocab_size():
    """Return the number of lines in ``tokens.txt``.

    The file is opened relative to the current working directory. Every
    line counts as one vocabulary entry.

    Returns:
      The number of lines in ``tokens.txt`` as an int.
    """
    # tokens.txt holds non-ASCII symbols such as "▁", so decode it as UTF-8
    # explicitly rather than with the platform default encoding.
    with open("tokens.txt", encoding="utf-8") as f:
        # Count lines while streaming instead of materializing them all.
        return sum(1 for _ in f)
15
+
16
+
17
def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Add meta data to an ONNX model. It is changed in-place.

    Keys that already exist in the model's metadata are overwritten rather
    than appended a second time, so running this function repeatedly on the
    same file does not create duplicate entries.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs.
    """
    model = onnx.load(filename)

    # Index the entries already stored in the model so they can be updated.
    existing = {prop.key: prop for prop in model.metadata_props}

    for key, value in meta_data.items():
        prop = existing.get(key)
        if prop is None:
            prop = model.metadata_props.add()
            prop.key = key
        prop.value = value

    onnx.save(model, filename)
    print(f"Updated {filename}")
34
+
35
+
36
def main():
    """Write the model metadata into ``model.onnx`` in the current directory.

    The vocabulary size is taken from ``tokens.txt``; the remaining values
    are fixed for this model.
    """
    vocab_size = get_vocab_size()
    # 8 for citrinet
    # 4 for conformer ctc
    subsampling_factor = 4

    meta_data = {
        "vocab_size": str(vocab_size),
        "normalize_type": "per_feature",
        "subsampling_factor": str(subsampling_factor),
        "model_type": "EncDecCTCModelBPE",
        "version": "1",
        "model_author": "nemo",
        # This directory holds the *large* conformer model, so link to its
        # own model card (the URL previously pointed at the small variant).
        "comment": "https://registry.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_conformer_ctc_large",
    }
    add_meta_data("model.onnx", meta_data)
52
+
53
+
54
+ if __name__ == "__main__":
55
+ main()
en/sherpa-onnx-nemo-ctc-en-conformer-large/model.int8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7055703cc9c5dc787706d10d4c0260a903695d4958933a97f64352902320c5c5
3
+ size 169392184
en/sherpa-onnx-nemo-ctc-en-conformer-large/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44e0d51556e5d99fdfff481660bef8d5a1dcd3cd1b2f34ec55c2f421a296a66a
3
+ size 532287873
en/sherpa-onnx-nemo-ctc-en-conformer-large/quantize-model.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import onnx
4
+ from onnxruntime.quantization import QuantType, quantize_dynamic
5
+
6
+
7
def main():
    """Quantize ``model.onnx`` dynamically and write ``model.int8.onnx``.

    Both files are resolved relative to the current working directory.
    """
    # quantize_dynamic is given the model path, so the model is not loaded
    # into an otherwise unused local variable first.
    quantize_dynamic(
        model_input="model.onnx",
        model_output="model.int8.onnx",
        per_channel=True,
        weight_type=QuantType.QUInt8,
    )
15
+
16
+
17
+ if __name__ == "__main__":
18
+ main()
en/sherpa-onnx-nemo-ctc-en-conformer-large/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-large
en/sherpa-onnx-nemo-ctc-en-conformer-large/test.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (c) 2023 Xiaomi Corporation
4
+ # Author: Fangjun Kuang
5
+
6
+ import kaldi_native_fbank as knf
7
+ import itertools
8
+ import librosa
9
+ import numpy as np
10
+ import onnxruntime as ort
11
+
12
+
13
def compute_feat(filename):
    """Compute mean/variance-normalized 80-bin fbank features of a wave file.

    Args:
      filename:
        Path of the audio file. It is loaded with librosa at 16 kHz.

    Returns:
      A 2-D array with one row per frame and one column per mel bin. Each
      column is normalized with its own mean and standard deviation, which
      are computed over all frames.
    """
    target_sr = 16000
    waveform, _ = librosa.load(filename, sr=target_sr)

    fbank_opts = knf.FbankOptions()
    fbank_opts.mel_opts.num_bins = 80
    fbank_opts.frame_opts.samp_freq = target_sr
    fbank_opts.frame_opts.snip_edges = False
    fbank_opts.frame_opts.dither = 0

    # The samples are scaled by 32768 before being handed to the extractor.
    scaled = (waveform * 32768).tolist()

    fbank = knf.OnlineFbank(fbank_opts)
    fbank.accept_waveform(target_sr, scaled)
    fbank.input_finished()

    frames = []
    for idx in range(fbank.num_frames_ready):
        frames.append(fbank.get_frame(idx))
    stacked = np.stack(frames)

    assert stacked.data.contiguous is True
    assert stacked.dtype == np.float32, stacked.dtype

    # Per-bin statistics over the time axis; the epsilon avoids dividing by 0.
    per_bin_mean = stacked.mean(axis=0, keepdims=True)
    per_bin_std = stacked.std(axis=0, keepdims=True)
    return (stacked - per_bin_mean) / (per_bin_std + 1e-5)
35
+
36
+
37
def load_tokens():
    """Load the ID-to-symbol table from ``tokens.txt``.

    Each non-blank line is expected to look like ``<symbol> <id>``. The file
    is opened relative to the current working directory and decoded as UTF-8.

    Returns:
      A dict mapping the integer ID to its symbol.

    Raises:
      ValueError: If a non-blank line has no ID column or the ID is not an
        integer.
    """
    ans = {}
    with open("tokens.txt", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. an extra newline at end of file.
                continue
            # Split on the last run of whitespace only, so the ID is always
            # the final field.
            sym, idx = line.rsplit(maxsplit=1)
            ans[int(idx)] = sym
    return ans
44
+
45
+
46
def main():
    """Decode ``./test_wavs/0.wav`` with ``model.onnx`` using greedy CTC search.

    The script computes the features, runs the ONNX model, takes the argmax
    per frame, merges consecutive repeats, drops the blank symbol and prints
    the resulting token string. Shapes and intermediate values are printed
    along the way for manual inspection.
    """
    filename = "./test_wavs/0.wav"
    features = compute_feat(filename)  # (T, C)
    features = np.expand_dims(features, axis=0)  # (N, T, C)
    features = features.transpose(0, 2, 1)  # (N, C, T)
    print(features.shape)  # (N, C, T), (1, 80, 663)
    features_length = np.array([features.shape[2]], dtype=np.int64)
    print(features_length)

    sess = ort.InferenceSession("model.onnx")

    # Example of what the two loops below print:
    #   audio_signal tensor(float) ['audio_signal_dynamic_axes_1', 80, 'audio_signal_dynamic_axes_2']
    #   length tensor(int64) ['length_dynamic_axes_1']
    #   logprobs tensor(float) ['logprobs_dynamic_axes_1', 'logprobs_dynamic_axes_2', 1025]
    # NOTE(review): the 1025 in these notes looks like it was recorded with a
    # different vocabulary; tokens.txt in this directory has 129 entries, so
    # the last dimension is presumably 129 here -- TODO confirm.
    model_inputs = sess.get_inputs()
    model_outputs = sess.get_outputs()

    for n in model_inputs:
        print(n.name, n.type, n.shape)

    for n in model_outputs:
        print(n.name, n.type, n.shape)

    inputs = {
        model_inputs[0].name: features,
        model_inputs[1].name: features_length,
    }

    outputs = sess.run([model_outputs[0].name], input_feed=inputs)
    # outputs[0] contains log_probs

    print(outputs[0].shape)  # (N, T, vocab_size)
    print(outputs[0].dtype)  # float32
    print(np.exp(outputs[0]).sum(axis=-1).reshape(-1)[:10])  # validate it is log_probs
    indexes = outputs[0].argmax(axis=-1)
    print(indexes.shape)
    # Select the single utterance explicitly. squeeze() would also drop the
    # time axis when there is only one output frame, turning the result into
    # a scalar that groupby() cannot iterate over.
    indexes = indexes[0].tolist()
    unique_indexes = [k for k, _ in itertools.groupby(indexes)]
    print(indexes)
    print(unique_indexes)

    tokens = load_tokens()
    # The blank symbol is the last entry of tokens.txt.
    blank_id = len(tokens) - 1
    text = "".join([tokens[i] for i in unique_indexes if i != blank_id])
    print(text)
90
+
91
+
92
+ if __name__ == "__main__":
93
+ main()
en/sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/0.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bc58a4efdf20daac252b6b1502632601a71efe0308f6757dc1eda34891a7e4f
3
+ size 212044
en/sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5143a6ba93c4b274e2c4ac22deb75c2c48936c853f0519add1de828b6c79cc5a
3
+ size 534924
en/sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/8k.wav ADDED
Binary file (77.2 kB). View file
 
en/sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/trans.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 0.wav AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
2
+ 1.wav GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
3
+ 8k.wav YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
en/sherpa-onnx-nemo-ctc-en-conformer-large/tokens.txt ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <unk> 0
2
+ ▁ 1
3
+ s 2
4
+ t 3
5
+ e 4
6
+ d 5
7
+ o 6
8
+ ▁the 7
9
+ a 8
10
+ i 9
11
+ ▁a 10
12
+ u 11
13
+ y 12
14
+ m 13
15
+ l 14
16
+ n 15
17
+ p 16
18
+ re 17
19
+ c 18
20
+ h 19
21
+ r 20
22
+ ▁s 21
23
+ g 22
24
+ ▁to 23
25
+ er 24
26
+ ing 25
27
+ f 26
28
+ ▁and 27
29
+ an 28
30
+ ▁i 29
31
+ k 30
32
+ ▁that 31
33
+ ' 32
34
+ ▁of 33
35
+ ▁in 34
36
+ w 35
37
+ ▁p 36
38
+ ed 37
39
+ or 38
40
+ al 39
41
+ ar 40
42
+ ▁f 41
43
+ en 42
44
+ in 43
45
+ b 44
46
+ ▁you 45
47
+ ▁w 46
48
+ ▁b 47
49
+ le 48
50
+ ll 49
51
+ es 50
52
+ ▁it 51
53
+ ve 52
54
+ ur 53
55
+ ▁we 54
56
+ ▁re 55
57
+ ▁be 56
58
+ ly 57
59
+ ▁is 58
60
+ ▁he 59
61
+ ▁o 60
62
+ ▁c 61
63
+ it 62
64
+ ▁n 63
65
+ ▁on 64
66
+ un 65
67
+ ▁t 66
68
+ on 67
69
+ se 68
70
+ th 69
71
+ ce 70
72
+ ▁do 71
73
+ ic 72
74
+ ▁for 73
75
+ ▁th 74
76
+ ion 75
77
+ ch 76
78
+ ▁was 77
79
+ ri 78
80
+ ent 79
81
+ ▁g 80
82
+ ver 81
83
+ ▁co 82
84
+ li 83
85
+ ▁ha 84
86
+ ▁ma 85
87
+ la 86
88
+ ro 87
89
+ v 88
90
+ us 89
91
+ ▁ca 90
92
+ ▁di 91
93
+ ▁this 92
94
+ ra 93
95
+ ▁st 94
96
+ ▁e 95
97
+ ▁not 96
98
+ ▁so 97
99
+ ▁de 98
100
+ ▁have 99
101
+ ter 100
102
+ ir 101
103
+ ▁go 102
104
+ ation 103
105
+ ▁with 104
106
+ ate 105
107
+ ▁me 106
108
+ ▁mo 107
109
+ ment 108
110
+ ▁con 109
111
+ ▁but 110
112
+ vi 111
113
+ ▁pro 112
114
+ ▁ho 113
115
+ j 114
116
+ ▁com 115
117
+ ight 116
118
+ ▁know 117
119
+ ▁what 118
120
+ ect 119
121
+ ▁ex 120
122
+ ▁some 121
123
+ ▁would 122
124
+ ▁like 123
125
+ x 124
126
+ ▁his 125
127
+ q 126
128
+ z 127
129
+ <blk> 128