Spaces:

OnurDursun
/

English-Accent-Detection

Sleeping

App Files Files Community

OnurDursun committed on May 24, 2025

Commit

ce0a898

verified ·

1 Parent(s): 30c0f00

Upload 3 files

Browse files

Files changed (3) hide show

app.py +121 -0
best_model.pth +3 -0
train.py +185 -0

app.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import os
+import tempfile
+import requests
+import torch
+import torch.nn as nn
+import torchaudio
+import moviepy.editor as mpy
+import gradio as gr
+# ─── 1. AccentCNN definition (same as your training code) ───────────────────
+class AccentCNN(nn.Module):
+    def __init__(self, num_classes: int):
+        super().__init__()
+        self.features = nn.Sequential(
+            nn.Conv2d(1, 16, 3, padding=1), nn.BatchNorm2d(16), nn.ReLU(), nn.MaxPool2d(2),
+            nn.Conv2d(16, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2),
+            nn.Conv2d(32, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2),
+            nn.Conv2d(64,128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(2),
+        )
+        self.classifier = nn.Sequential(
+            nn.Dropout(0.5),
+            nn.Flatten(),
+            nn.Linear(128*14*14, 256),
+            nn.ReLU(),
+            nn.Dropout(0.5),
+            nn.Linear(256, num_classes),
+        )
+    def forward(self, x):
+        return self.classifier(self.features(x))
+# ─── 2. Audio→Mel transforms ─────────────────────────────────────────────────
+# Target sample rate in Hz; predict_from_url writes the extracted audio at this rate.
+SAMPLE_RATE = 16000
+# Length, in seconds, of the audio window that is analysed.
+DURATION    = 3.0
+# The same window expressed in samples; preprocess_wav pads or truncates to this length.
+MAX_LEN     = int(SAMPLE_RATE * DURATION)
+# Waveform -> 128-band Mel spectrogram (all other MelSpectrogram parameters keep their defaults).
+mel_spec    = torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_mels=128)
+# Converts the Mel spectrogram to a decibel (log) scale.
+to_db       = torchaudio.transforms.AmplitudeToDB()
+# No resize transform object is created here: the 224x224 resize is performed
+# with F.interpolate inside preprocess_wav.
+# Normalization constants; train.py uses the same values in Normalize(mean=[0.485], std=[0.229]).
+MEAN = 0.485
+STD  = 0.229
+# Functional API; preprocess_wav calls F.interpolate from it.
+import torch.nn.functional as F
+def preprocess_wav(wav: torch.Tensor) -> torch.Tensor:
+    # wav: (1, N)
+    if wav.shape[1] < MAX_LEN:
+        wav = nn.functional.pad(wav, (0, MAX_LEN - wav.shape[1]))
+    else:
+        wav = wav[:, :MAX_LEN]
+    spec = mel_spec(wav)        # (1, 128, T)
+    spec = to_db(spec)          # log scale
+    # --- NEW: resize via interpolate instead of torchaudio.Resize ---
+    # spec.unsqueeze(0): (B=1, C=1, H=128, W=T)
+    spec = F.interpolate(
+        spec.unsqueeze(0),
+        size=(224, 224),
+        mode="bilinear",
+        align_corners=False
+    ).squeeze(0)               # back to (1, 224, 224)
+    spec = (spec - MEAN) / STD
+    return spec      # ImageNet‐style norm
+# ImageNet-style norm
+# ─── 3. Load labels & model ─────────────────────────────────────────────────
+LABELS = ["american","english","indian","irish","scottish","southafrican"]
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = AccentCNN(len(LABELS)).to(device)
+model.load_state_dict(torch.load("best_model.pth", map_location=device))
+model.eval()
+# ─── 4. Inference pipeline ─────────────────────────────────────────────────
+def predict_from_url(mp4_url: str):
+    # 4a. Download mp4 to temp file
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_video:
+        resp = requests.get(mp4_url, stream=True)
+        resp.raise_for_status()
+        for chunk in resp.iter_content(1024*1024):
+            tmp_video.write(chunk)
+        video_path = tmp_video.name
+    # 4b. Extract audio track as wav
+    clip = mpy.VideoFileClip(video_path)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
+        clip.audio.write_audiofile(tmp_audio.name, fps=SAMPLE_RATE, logger=None)
+        audio_path = tmp_audio.name
+    clip.close()
+    os.unlink(video_path)
+    # 4c. Load & preprocess
+    wav, sr = torchaudio.load(audio_path)
+    os.unlink(audio_path)
+    wav = wav.mean(dim=0, keepdim=True)  # stereo→mono
+    spec = preprocess_wav(wav)
+    # 4d. Model forward & postprocess
+    with torch.no_grad():
+        inp = spec.unsqueeze(0).to(device)  # add batch dim
+        logits = model(inp)
+        probs  = torch.softmax(logits, dim=1).cpu().squeeze()
+    # Prepare output
+    results = {lbl: float(probs[i]) for i,lbl in enumerate(LABELS)}
+    pred_label = LABELS[int(probs.argmax())]
+    return pred_label, results
+# ─── 5. Gradio UI ────────────────────────────────────────────────────────────
+iface = gr.Interface(
+    fn=predict_from_url,
+    inputs=gr.Textbox(label="Public MP4 URL"),
+    outputs=[
+        gr.Label(num_top_classes=1, label="Predicted Accent"),
+        gr.JSON(label="Class Probabilities")
+    ],
+    title="Accent Classification from Video",
+    description="Enter a public MP4 link; the app downloads, extracts audio, and predicts accent.",
+    allow_flagging="never",
+)
+if __name__ == "__main__":
+    iface.launch()

best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cfd6be64d456ed8b7a9704246325324dcb53b07ac335cc06864aafc6576cc82
+size 26100706

train.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import os
+import argparse
+from glob import glob
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchaudio
+from torch.utils.data import Dataset, DataLoader, random_split
+from torchvision.transforms import Resize, Normalize
+import os
+import torchaudio
+# Print the audio I/O backends torchaudio reports before and after the backend selection below.
+print("Backends before:", torchaudio.list_audio_backends())
+# soundfile is not referenced by name anywhere else in this script; presumably it
+# is imported so that a missing install fails here rather than later — TODO confirm.
+import soundfile
+# NOTE(review): set_audio_backend is deprecated in recent torchaudio releases —
+# confirm this call is still needed for the torchaudio version in use.
+torchaudio.set_audio_backend("soundfile")
+print("Backends after:", torchaudio.list_audio_backends())
+class AccentDataset(Dataset):
+    """
+    Custom Dataset for loading audio files and converting to Mel-spectrograms.
+    Expects directory structure:
+      dataset/
+        american/
+        english/
+        indian/
+        irish/
+        scottish/
+    """
+    def __init__(self, root_dir, sample_rate=16000, n_mels=128, duration=3.0):
+        super().__init__()
+        self.sample_rate = sample_rate
+        self.max_len = int(sample_rate * duration)
+        self.labels = sorted(os.listdir(root_dir))
+        self.filepaths = []
+        for label in self.labels:
+            files = glob(os.path.join(root_dir, label, '*.wav'))
+            self.filepaths += [(fp, label) for fp in files]
+        # Audio → MelSpectrogram → dB
+        self.mel_spec = torchaudio.transforms.MelSpectrogram(
+            sample_rate=sample_rate,
+            n_mels=n_mels
+        )
+        self.to_db = torchaudio.transforms.AmplitudeToDB()
+        # Resize spectrogram to 224×224 and normalize like ImageNet
+        self.resize = Resize((224, 224))
+        self.normalize = Normalize(mean=[0.485], std=[0.229])
+    def __len__(self):
+        return len(self.filepaths)
+    def __getitem__(self, idx):
+        path, label = self.filepaths[idx]
+        wav, sr = torchaudio.load(path)
+        wav = wav.mean(dim=0, keepdim=True)  # mono
+        if wav.size(1) < self.max_len:
+            pad = self.max_len - wav.size(1)
+            wav = nn.functional.pad(wav, (0, pad))
+        else:
+            wav = wav[:, :self.max_len]
+        spec = self.mel_spec(wav)            # (1, n_mels, time)
+        spec_db = self.to_db(spec)           # log scale
+        spec_resized = self.resize(spec_db)  # (1, 224, 224)
+        spec_norm = self.normalize(spec_resized)
+        label_idx = self.labels.index(label)
+        return spec_norm, label_idx
+class AccentCNN(nn.Module):
+    def __init__(self, num_classes):
+        super().__init__()
+        self.features = nn.Sequential(
+            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(16),
+            nn.ReLU(),
+            nn.MaxPool2d(2),  # 112x112
+            nn.Conv2d(16, 32, kernel_size=3, padding=1),
+            nn.BatchNorm2d(32),
+            nn.ReLU(),
+            nn.MaxPool2d(2),  # 56x56
+            nn.Conv2d(32, 64, kernel_size=3, padding=1),
+            nn.BatchNorm2d(64),
+            nn.ReLU(),
+            nn.MaxPool2d(2),  # 28x28
+            nn.Conv2d(64, 128, kernel_size=3, padding=1),
+            nn.BatchNorm2d(128),
+            nn.ReLU(),
+            nn.MaxPool2d(2)   # 14x14
+        )
+        self.classifier = nn.Sequential(
+            nn.Dropout(0.5),
+            nn.Flatten(),
+            nn.Linear(128 * 14 * 14, 256),
+            nn.ReLU(),
+            nn.Dropout(0.5),
+            nn.Linear(256, num_classes)
+        )
+    def forward(self, x):
+        x = self.features(x)
+        x = self.classifier(x)
+        return x
+def build_model(num_classes):
+    return AccentCNN(num_classes)
+def train_one_epoch(model, loader, criterion, optimizer, device):
+    model.train()
+    running_loss = 0.0
+    for inputs, targets in loader:
+        inputs, targets = inputs.to(device), targets.to(device)
+        optimizer.zero_grad()
+        outputs = model(inputs)
+        loss = criterion(outputs, targets)
+        loss.backward()
+        optimizer.step()
+        running_loss += loss.item() * inputs.size(0)
+    return running_loss / len(loader.dataset)
+def evaluate(model, loader, device):
+    model.eval()
+    correct = 0
+    with torch.no_grad():
+        for inputs, targets in loader:
+            inputs, targets = inputs.to(device), targets.to(device)
+            preds = model(inputs).argmax(dim=1)
+            correct += (preds == targets).sum().item()
+    return correct / len(loader.dataset)
+def main():
+    # Device
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    # Dataset & DataLoaders
+    full_ds = AccentDataset(
+        root_dir="content/dataset2",
+        sample_rate=16000,
+        n_mels=128,
+        duration=3.0
+    )
+    num_classes = len(full_ds.labels)
+    val_size = int(len(full_ds) * 0.1)
+    train_size = len(full_ds) - val_size
+    train_ds, val_ds = random_split(full_ds, [train_size, val_size])
+    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
+    val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False)
+    # Model, Loss, Optimizer
+    model = build_model(num_classes).to(device)
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
+    # Training loop
+    best_acc = 0.0
+    for epoch in range(1, 101):
+        loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
+        acc  = evaluate(model, val_loader, device)
+        print(f"Epoch {epoch}/100 — Loss: {loss:.4f} — Val Acc: {acc*100:.1f}%")
+        if acc > best_acc:
+            best_acc = acc
+            torch.save(model.state_dict(), "best_model.pth")
+    print(f"Training complete. Best val accuracy: {best_acc*100:.1f}%")
+if __name__ == "__main__":
+    main()