"""Build a small per-accent subset of the VCTK corpus.

Downloads the "pratt3000/vctk-corpus" dataset through kagglehub, selects up
to ``MAX_SPEAKERS_PER_ACCENT`` speakers for each accent in ``desired`` from
``speaker-info.txt``, and copies their ``.wav`` recordings into
``<cwd>/dataset2/<accent>/``.
"""

import os
import shutil
from pathlib import Path

import kagglehub

# Upper bound on how many speakers are kept for each accent.
MAX_SPEAKERS_PER_ACCENT = 4

path = kagglehub.dataset_download("pratt3000/vctk-corpus")
print("Path to dataset files:", path)

# 1. Paths
speaker_info = Path(path) / "speaker-info.txt"
recordings_dir = Path(path) / "wav48"  # base audio folder; speaker folders are looked up below
print(recordings_dir)
dataset_dir = Path.cwd() / "dataset2"
dataset_dir.mkdir(exist_ok=True)

# 2. Which accents we care about
desired = {"english", "american", "scottish", "irish", "indian", "southafrican"}

# 3. Parse speaker-info.txt → build accent → [ids]
# Sorted so the accent order (and therefore the output) is the same on every run.
accent_ids = {acc: [] for acc in sorted(desired)}
with speaker_info.open("r", encoding="utf-8") as f:
    next(f, None)  # skip column names; the default avoids StopIteration on an empty file
    for line in f:
        parts = line.split()
        if len(parts) < 4:
            # Blank or truncated row: there is no accent column to read.
            continue
        spk_id, accent = parts[0], parts[3]
        selected = accent_ids.get(accent.lower())
        # Keep only the first MAX_SPEAKERS_PER_ACCENT speakers seen per accent.
        if selected is not None and len(selected) < MAX_SPEAKERS_PER_ACCENT:
            selected.append(spk_id)
print(accent_ids)

# 4. Copy every selected speaker's recordings into dataset2/<accent>/
for accent, ids in accent_ids.items():
    target_dir = dataset_dir / accent
    target_dir.mkdir(exist_ok=True)
    for spk_id in ids:
        # Speaker folders are named "p<id>", so address the folder directly
        # instead of rescanning the whole recordings directory per speaker.
        folder = recordings_dir / f"p{spk_id}"
        if not folder.is_dir():
            print(f"No recordings folder for speaker {spk_id}: {folder}")
            continue
        for file in sorted(folder.glob("*.wav")):
            shutil.copy2(file, target_dir / file.name)
            print(f"Copied {file.name} → {target_dir}/")
print("All done!")

# 5. Summary: number of files now present in each accent folder
for accent in accent_ids:
    accent_dir = dataset_dir / accent
    print(len(os.listdir(accent_dir)), ":", accent_dir)