OnurDursun committed on
Commit
ce0a898
·
verified ·
1 Parent(s): 30c0f00

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +121 -0
  2. best_model.pth +3 -0
  3. train.py +185 -0
app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import requests
4
+ import torch
5
+ import torch.nn as nn
6
+ import torchaudio
7
+ import moviepy.editor as mpy
8
+ import gradio as gr
9
+
10
+ # ─── 1. AccentCNN definition (same as your training code) ───────────────────
11
class AccentCNN(nn.Module):
    """Four-stage CNN that maps a single-channel spectrogram image to class logits.

    Every stage is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2),
    so height and width are halved four times. The first linear layer expects a
    128 x 14 x 14 feature map, which corresponds to a (1, 224, 224) input.

    Args:
        num_classes: Number of output logits.
    """

    # (in_channels, out_channels) of the convolution stages, in order.
    _STAGES = ((1, 16), (16, 32), (32, 64), (64, 128))

    def __init__(self, num_classes: int):
        super().__init__()
        stage_layers = []
        for in_ch, out_ch in self._STAGES:
            stage_layers += [
                nn.Conv2d(in_ch, out_ch, 3, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
        # The layers stay in one flat Sequential so the state_dict keys remain
        # features.0.*, features.1.*, ... and saved weights keep loading.
        self.features = nn.Sequential(*stage_layers)
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Flatten(),
            nn.Linear(128 * 14 * 14, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return the class logits for the batch ``x``."""
        feature_maps = self.features(x)
        return self.classifier(feature_maps)
30
+
31
+ # ─── 2. Audio→Mel transforms ─────────────────────────────────────────────────
32
# Audio front-end settings. They repeat the values train.py passes to
# AccentDataset (16 kHz, 3-second clips, 128 mel bands).
SAMPLE_RATE = 16000  # Hz
DURATION = 3.0  # seconds
MAX_LEN = int(SAMPLE_RATE * DURATION)  # clip length in samples

# Waveform -> mel spectrogram -> dB scale.
mel_spec = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_mels=128,
)
to_db = torchaudio.transforms.AmplitudeToDB()

# Shift/scale applied to the resized spectrogram (same numbers as the
# Normalize transform in train.py).
MEAN, STD = 0.485, 0.229
40
+
41
+ import torch.nn.functional as F
42
+ # … remove “resize = torchaudio.transforms.Resize((224,224))” …
43
+
44
def preprocess_wav(wav: torch.Tensor) -> torch.Tensor:
    """Convert a mono waveform into the normalized spectrogram image fed to the model.

    The waveform is zero-padded or truncated to ``MAX_LEN`` samples, turned
    into a dB-scaled mel spectrogram, bilinearly resized to 224 x 224 and
    finally shifted and scaled with ``MEAN`` and ``STD``.

    Args:
        wav: Waveform of shape ``(1, N)``.

    Returns:
        Tensor of shape ``(1, 224, 224)``.
    """
    missing = MAX_LEN - wav.shape[1]
    if missing > 0:
        # Too short: append zeros on the right of the time axis.
        wav = F.pad(wav, (0, missing))
    else:
        wav = wav[:, :MAX_LEN]

    spec_db = to_db(mel_spec(wav))  # (1, 128, T), log scale

    # F.interpolate needs a batch dimension: (1, 128, T) -> (1, 1, 128, T).
    # NOTE(review): train.py resizes with torchvision's Resize, which may
    # apply antialiasing depending on the torchvision version; confirm that
    # both paths produce matching inputs.
    batched = spec_db.unsqueeze(0)
    resized = F.interpolate(
        batched,
        size=(224, 224),
        mode="bilinear",
        align_corners=False,
    )
    image = resized.squeeze(0)  # back to (1, 224, 224)

    return (image - MEAN) / STD
65
+
66
+ # ─── 3. Load labels & model ─────────────────────────────────────────────────
67
# Class names in output-index order. train.py assigns indices from the sorted
# class sub-directory names, so this list has to stay alphabetically sorted
# and match the folders used for training.
LABELS = ["american", "english", "indian", "irish", "scottish", "southafrican"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: torch.load unpickles the file; only load checkpoints you trust.
state_dict = torch.load("best_model.pth", map_location=device)

model = AccentCNN(len(LABELS)).to(device)
model.load_state_dict(state_dict)
model.eval()  # inference mode: disables dropout, uses BatchNorm running stats
72
+
73
+ # ─── 4. Inference pipeline ─────────────────────────────────────────────────
74
def predict_from_url(mp4_url: str):
    """Download an MP4, extract its audio track and classify the accent.

    Args:
        mp4_url: Publicly reachable URL of an MP4 video.

    Returns:
        A ``(pred_label, results)`` tuple: the most probable entry of
        ``LABELS`` and a dict mapping every label to its softmax probability.

    Raises:
        gr.Error: If the URL is empty or the video has no audio track.
        requests.RequestException: If the download fails or times out.
    """
    url = (mp4_url or "").strip()
    if not url:
        raise gr.Error("Please provide a URL to an MP4 file.")

    # Both intermediate files live in one scratch directory that is removed on
    # every exit path, so a failed download or decode does not leave files behind.
    with tempfile.TemporaryDirectory() as workdir:
        video_path = os.path.join(workdir, "input.mp4")
        audio_path = os.path.join(workdir, "audio.wav")

        # 4a. Download the video in 1 MiB chunks. The timeout keeps a stalled
        # server from blocking the request forever.
        with requests.get(url, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            with open(video_path, "wb") as video_file:
                for chunk in resp.iter_content(1024 * 1024):
                    video_file.write(chunk)

        # 4b. Extract the audio track as a wav file at the model's sample rate.
        clip = mpy.VideoFileClip(video_path)
        try:
            if clip.audio is None:
                raise gr.Error("The video has no audio track.")
            clip.audio.write_audiofile(audio_path, fps=SAMPLE_RATE, logger=None)
        finally:
            clip.close()

        # 4c. Load the waveform while the scratch directory still exists.
        wav, _sample_rate = torchaudio.load(audio_path)

    wav = wav.mean(dim=0, keepdim=True)  # stereo -> mono
    spec = preprocess_wav(wav)

    # 4d. Model forward & postprocess
    with torch.no_grad():
        batch = spec.unsqueeze(0).to(device)  # add batch dim
        logits = model(batch)
        probs = torch.softmax(logits, dim=1).squeeze(0).cpu()

    results = {label: float(prob) for label, prob in zip(LABELS, probs)}
    pred_label = LABELS[int(probs.argmax())]
    return pred_label, results
106
+
107
+ # ─── 5. Gradio UI ────────────────────────────────────────────────────────────
108
# predict_from_url returns (label, probabilities); the two output components
# below receive them in that order.
iface = gr.Interface(
    predict_from_url,
    gr.Textbox(label="Public MP4 URL"),
    [
        gr.Label(num_top_classes=1, label="Predicted Accent"),
        gr.JSON(label="Class Probabilities"),
    ],
    title="Accent Classification from Video",
    description="Enter a public MP4 link; the app downloads, extracts audio, and predicts accent.",
    allow_flagging="never",
)


if __name__ == "__main__":
    # Start the web UI only when the file is executed as a script.
    iface.launch()
best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cfd6be64d456ed8b7a9704246325324dcb53b07ac335cc06864aafc6576cc82
3
+ size 26100706
train.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ from glob import glob
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.optim as optim
8
+ import torchaudio
9
+ from torch.utils.data import Dataset, DataLoader, random_split
10
+ from torchvision.transforms import Resize, Normalize
11
+
12
+ import os
13
+ import torchaudio
14
+
15
+
16
# Report which audio I/O backends this torchaudio build can use.
print("Backends before:", torchaudio.list_audio_backends())

# Imported so a missing soundfile package fails here rather than later; the
# name itself is not referenced by the code in this file.
import soundfile  # noqa: F401,E402

# set_audio_backend is a deprecated global switch and is not present in every
# torchaudio release, so it is only called where it still exists. On builds
# without it the backend is chosen per call by torchaudio itself.
if hasattr(torchaudio, "set_audio_backend"):
    torchaudio.set_audio_backend("soundfile")

print("Backends after:", torchaudio.list_audio_backends())
23
+
24
+
25
class AccentDataset(Dataset):
    """
    Dataset that loads ``.wav`` files and converts them to normalized
    Mel-spectrogram images of shape (1, 224, 224).

    Expects one sub-directory per class under ``root_dir``, for example:
        dataset/
            american/
            english/
            indian/
            irish/
            scottish/

    Class indices follow the alphabetical order of the sub-directory names.

    Args:
        root_dir: Directory containing one folder of ``.wav`` files per class.
        sample_rate: Sample rate (Hz) every clip is converted to.
        n_mels: Number of mel bands in the spectrogram.
        duration: Clip length in seconds; clips are padded or cut to it.
    """

    def __init__(self, root_dir, sample_rate=16000, n_mels=128, duration=3.0):
        super().__init__()
        self.sample_rate = sample_rate
        self.max_len = int(sample_rate * duration)
        self.labels = self._discover_labels(root_dir)
        # Constant-time label -> index lookup for __getitem__.
        self._label_to_idx = {label: idx for idx, label in enumerate(self.labels)}

        self.filepaths = []
        for label in self.labels:
            # Sorted so the sample order does not depend on the file system.
            files = sorted(glob(os.path.join(root_dir, label, '*.wav')))
            self.filepaths += [(fp, label) for fp in files]

        # Audio -> MelSpectrogram -> dB
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_mels=n_mels
        )
        self.to_db = torchaudio.transforms.AmplitudeToDB()

        # Resize spectrogram to 224x224, then shift/scale with fixed constants
        self.resize = Resize((224, 224))
        self.normalize = Normalize(mean=[0.485], std=[0.229])

    @staticmethod
    def _discover_labels(root_dir):
        """Return the sorted names of the visible sub-directories of ``root_dir``.

        Plain files and hidden entries (for example ``.DS_Store`` or
        ``.ipynb_checkpoints``) are skipped so they are not counted as classes.
        """
        return sorted(
            entry
            for entry in os.listdir(root_dir)
            if not entry.startswith('.')
            and os.path.isdir(os.path.join(root_dir, entry))
        )

    def _fit_length(self, wav):
        """Zero-pad or truncate ``wav`` (channels, samples) to ``self.max_len`` samples."""
        missing = self.max_len - wav.size(1)
        if missing > 0:
            return nn.functional.pad(wav, (0, missing))
        return wav[:, :self.max_len]

    def __len__(self):
        """Return the number of audio files found."""
        return len(self.filepaths)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label_index)`` for the file at position ``idx``."""
        path, label = self.filepaths[idx]
        wav, sr = torchaudio.load(path)
        wav = wav.mean(dim=0, keepdim=True)  # mono
        if sr != self.sample_rate:
            # max_len and the mel transform assume self.sample_rate; without
            # resampling, clips recorded at another rate would be cut to the
            # wrong duration and mapped onto the wrong frequency bands.
            wav = torchaudio.functional.resample(wav, sr, self.sample_rate)
        wav = self._fit_length(wav)

        spec = self.mel_spec(wav)            # (1, n_mels, time)
        spec_db = self.to_db(spec)           # log scale
        spec_resized = self.resize(spec_db)  # (1, 224, 224)
        spec_norm = self.normalize(spec_resized)

        return spec_norm, self._label_to_idx[label]
77
+
78
+
79
class AccentCNN(nn.Module):
    """CNN classifier for (1, 224, 224) spectrogram images.

    Four convolution stages each halve the spatial size
    (224 -> 112 -> 56 -> 28 -> 14) while the channel count grows
    1 -> 16 -> 32 -> 64 -> 128. A two-layer head with dropout turns the
    flattened 128 x 14 x 14 map into ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        widths = [1, 16, 32, 64, 128]
        conv_layers = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            conv_layers.extend(self._stage(c_in, c_out))
        # One flat Sequential keeps the parameter names features.<index>.*
        self.features = nn.Sequential(*conv_layers)

        head_layers = [
            nn.Dropout(0.5),
            nn.Flatten(),
            nn.Linear(128 * 14 * 14, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    @staticmethod
    def _stage(c_in, c_out):
        """Return the layers of one conv stage: conv, batch norm, ReLU, 2x2 max pool."""
        return [
            nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(c_out),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]

    def forward(self, x):
        """Compute the logits for the batch ``x``."""
        return self.classifier(self.features(x))
117
+
118
+
119
def build_model(num_classes):
    """Create a freshly initialized ``AccentCNN`` with ``num_classes`` outputs."""
    network = AccentCNN(num_classes)
    return network
122
+
123
+
124
def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimization pass over ``loader`` and return the mean loss per sample.

    Args:
        model: Network to train; it is switched to training mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        The loss averaged over the samples actually processed, or ``0.0`` if
        the loader produced no batches.
    """
    model.train()
    total_loss = 0.0
    samples_seen = 0
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        batch_size = inputs.size(0)
        # The criterion returns a per-batch value; weight it by the batch size
        # so a smaller final batch does not distort the average.
        total_loss += loss.item() * batch_size
        samples_seen += batch_size

    # Dividing by the samples seen (not len(loader.dataset)) stays correct
    # with drop_last or a subsampling sampler and avoids dividing by zero.
    return total_loss / samples_seen if samples_seen else 0.0
136
+
137
+
138
def evaluate(model, loader, device):
    """Return the classification accuracy of ``model`` on ``loader``.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        device: Device the batches are moved to.

    Returns:
        Fraction of correctly predicted samples in ``[0, 1]``, or ``0.0`` if
        the loader produced no samples (for example an empty validation split).
    """
    model.eval()
    correct = 0
    samples_seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            preds = model(inputs).argmax(dim=1)
            correct += (preds == targets).sum().item()
            samples_seen += targets.size(0)

    # Count the samples actually evaluated instead of len(loader.dataset), so
    # an empty split returns 0.0 rather than raising ZeroDivisionError.
    return correct / samples_seen if samples_seen else 0.0
147
+
148
+
149
def main(
    root_dir="content/dataset2",
    epochs=100,
    batch_size=32,
    lr=1e-4,
    weight_decay=1e-4,
    val_fraction=0.1,
    checkpoint_path="best_model.pth",
):
    """Train ``AccentCNN`` on the dataset in ``root_dir`` and keep the best weights.

    All arguments are optional; the defaults reproduce the original
    hard-coded configuration.

    Args:
        root_dir: Dataset directory with one sub-folder of wav files per class.
        epochs: Number of passes over the training split.
        batch_size: Batch size for both data loaders.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        val_fraction: Share of the dataset held out for validation.
        checkpoint_path: File the best ``state_dict`` is saved to.

    Raises:
        ValueError: If no ``.wav`` files are found under ``root_dir``.
    """
    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Dataset & DataLoaders
    full_ds = AccentDataset(
        root_dir=root_dir,
        sample_rate=16000,
        n_mels=128,
        duration=3.0
    )
    if len(full_ds) == 0:
        # Fail with a clear message instead of an obscure error from the sampler.
        raise ValueError(f"No .wav files found under {root_dir!r}")

    num_classes = len(full_ds.labels)
    val_size = int(len(full_ds) * val_fraction)
    train_size = len(full_ds) - val_size
    train_ds, val_ds = random_split(full_ds, [train_size, val_size])
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # Model, Loss, Optimizer
    model = build_model(num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Training loop. best_acc starts below any reachable accuracy so the first
    # epoch always writes a checkpoint, even if validation accuracy stays at 0.
    best_acc = -1.0
    for epoch in range(1, epochs + 1):
        loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
        acc = evaluate(model, val_loader, device)
        print(f"Epoch {epoch}/{epochs} — Loss: {loss:.4f} — Val Acc: {acc*100:.1f}%")
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), checkpoint_path)
    print(f"Training complete. Best val accuracy: {max(best_acc, 0.0)*100:.1f}%")


if __name__ == "__main__":
    main()