Spaces:
Sleeping
Sleeping
| import kagglehub | |
| import os | |
| from pathlib import Path | |
| import shutil | |
# 1. Paths
# The value returned by kagglehub is used below as the root folder of the
# downloaded dataset (speaker metadata and recordings are looked up under it).
path = kagglehub.dataset_download("pratt3000/vctk-corpus")
print("Path to dataset files:", path)

# Speaker metadata file and the base audio folder, both directly under `path`.
speaker_info = Path(f"{path}/speaker-info.txt")
recordings_dir = Path(f"{path}/wav48")  # your base audio folder
print(recordings_dir)

# Output folder for the per-accent copies, created next to wherever the
# script is run from (current working directory).
dataset_dir = Path.cwd().joinpath("dataset2")
dataset_dir.mkdir(exist_ok=True)
# 2. Which accents we care about
desired = {"english", "american", "scottish", "irish", "indian", "southafrican"}

# Keep only the first few speakers listed for each accent.
MAX_SPEAKERS_PER_ACCENT = 4

# 3. Parse speaker-info.txt -> build accent -> [ids]
# Rows are whitespace-separated; the first field is taken as the speaker id
# and the fourth as the accent (the two fields in between are ignored).
accent_ids = {acc: [] for acc in desired}
with speaker_info.open("r") as f:
    next(f, None)  # skip column names; tolerate an empty file
    for line in f:
        parts = line.split()
        if len(parts) < 4:
            # Blank or truncated row: nothing to unpack, so skip it instead
            # of crashing on the tuple assignment.
            continue
        spk_id, accent = parts[0], parts[3]
        key = accent.lower()  # match accents case-insensitively
        # Enforce the per-accent cap while collecting, so the lists never
        # grow past the limit and no later truncation pass is needed.
        if key in desired and len(accent_ids[key]) < MAX_SPEAKERS_PER_ACCENT:
            accent_ids[key].append(spk_id)
print(accent_ids)
# 4. Copy every .wav of each selected speaker into dataset_dir/<accent>/
for accent, ids in accent_ids.items():
    target_dir = dataset_dir / accent
    target_dir.mkdir(parents=True, exist_ok=True)
    for spk_id in ids:
        # Speaker folders are named "p<id>", so address the folder directly
        # instead of rescanning the whole recordings directory per speaker.
        folder = recordings_dir / f"p{spk_id}"
        if not folder.is_dir():
            # Report the gap rather than silently leaving the accent short.
            print(f"No recordings folder for speaker {spk_id}: {folder}")
            continue
        print(folder)
        for file in folder.glob("*.wav"):
            print(file)
            # copy2 also carries over file metadata (timestamps etc.).
            shutil.copy2(file, target_dir / file.name)
            # NOTE(review): the "β" below looks like a mis-encoded arrow;
            # left unchanged because it is runtime output.
            print(f"Copied {file.name} β {target_dir}/")
print("All done!")
# 5. Report how many files ended up in each accent folder.
# The per-accent folders live under dataset_dir (that is where the copy step
# writes them), so count there; the previously referenced `content_folder`
# was never defined and raised NameError.
for lang in desired:
    lang_folder = os.path.join(dataset_dir, lang)
    print(len(os.listdir(lang_folder)), ":", lang_folder)

# Same counts again as bare numbers, in accent_ids order.
for accent in accent_ids:
    target_dir = dataset_dir / accent
    print(len(os.listdir(target_dir)))