English-Accent-Detection / preprocess.py
OnurDursun's picture
Upload preprocess.py
d9fe6a0 verified
import kagglehub
import os
from pathlib import Path
import shutil
# Fetch the VCTK corpus through kagglehub; the returned value is used below
# as the root directory of the downloaded files.
path = kagglehub.dataset_download("pratt3000/vctk-corpus")
print("Path to dataset files:", path)

# 1. Paths
speaker_info = Path(f"{path}/speaker-info.txt")  # speaker metadata table
recordings_dir = Path(f"{path}/wav48")  # your base audio folder
print(recordings_dir)

# Output root for the per-accent copies, placed in the current working directory.
dataset_dir = Path.cwd().joinpath("dataset2")
dataset_dir.mkdir(exist_ok=True)
# 2. Which accents we care about
desired = {"english", "american", "scottish", "irish", "indian", "southafrican"}

# Upper bound on speakers kept per accent; the first ones listed in the file win.
MAX_SPEAKERS_PER_ACCENT = 4

# 3. Parse speaker-info.txt → build accent → [ids]
accent_ids = {acc: [] for acc in desired}
with speaker_info.open("r") as f:
    next(f, None)  # skip column names; the default keeps an empty file from raising
    for line in f:
        parts = line.split()
        if len(parts) < 4:
            # Blank or truncated row: there is no accent column to read.
            continue
        # Column 1 is the speaker id, column 4 the accent; the two in between are unused.
        spk_id, accent = parts[0], parts[3]
        key = accent.lower()
        # Checking the cap before appending keeps exactly the first
        # MAX_SPEAKERS_PER_ACCENT speakers of each wanted accent.
        if key in desired and len(accent_ids[key]) < MAX_SPEAKERS_PER_ACCENT:
            accent_ids[key].append(spk_id)
print(accent_ids)
# 4. Copy every selected speaker's .wav files into dataset_dir/<accent>/.
for accent, ids in accent_ids.items():
    target_dir = dataset_dir / accent
    target_dir.mkdir(parents=True, exist_ok=True)
    for spk_id in ids:
        # Speaker folders are named "p<id>", so address the folder directly
        # instead of scanning all of recordings_dir once per speaker.
        folder = recordings_dir / f"p{spk_id}"
        if not folder.is_dir():
            # Report the gap rather than silently producing a smaller class.
            print(f"No recordings folder for speaker {spk_id}: {folder}")
            continue
        print(folder)
        for file in folder.glob("*.wav"):
            print(file)
            # copy2 also carries over file metadata such as timestamps.
            shutil.copy2(file, target_dir / file.name)
            print(f"Copied {file.name} → {target_dir}/")
print("All done!")
# Report how many files ended up in each accent folder, as "<count> : <folder>".
# The folders are the ones created under dataset_dir by the copy step; the
# previously referenced `content_folder` is not defined anywhere in this script.
for lang in desired:
    lang_folder = os.path.join(dataset_dir, lang)
    print(len(os.listdir(lang_folder)), ":", lang_folder)
# Print the bare file count of each accent folder under dataset_dir.
for accent in accent_ids:
    print(len(os.listdir(dataset_dir / accent)))