import kagglehub
import os
from pathlib import Path
import shutil

# 1. Paths
# Fetch the corpus through kagglehub; the returned value is used as the dataset root.
path = kagglehub.dataset_download("pratt3000/vctk-corpus")
print("Path to dataset files:", path)

corpus_root = Path(path)
speaker_info = corpus_root / "speaker-info.txt"  # speaker metadata table
recordings_dir = corpus_root / "wav48"  # your base audio folder
print(recordings_dir)

# Output root is created under the current working directory.
dataset_dir = Path.cwd() / "dataset2"
dataset_dir.mkdir(exist_ok=True)

# 2. Which accents we care about
desired = {"english", "american", "scottish", "irish", "indian", "southafrican"}
MAX_SPEAKERS_PER_ACCENT = 4  # only the first speakers listed for each accent are kept

# 3. Parse speaker-info.txt → build accent → [ids]
# Rows are whitespace-separated; column 0 is the speaker id and column 3 the accent.
accent_ids = {acc: [] for acc in desired}
with speaker_info.open("r") as f:
    next(f, None)  # skip column names; the default avoids StopIteration on an empty file
    for line in f:
        parts = line.split()
        # Blank or truncated rows have no accent column, so skip them instead of crashing.
        if len(parts) < 4:
            continue
        spk_id, accent = parts[0], parts[3]
        key = accent.lower()
        if key in desired and len(accent_ids[key]) < MAX_SPEAKERS_PER_ACCENT:
            accent_ids[key].append(spk_id)

print(accent_ids)

# 4. Copy every selected speaker's .wav files into dataset_dir/<accent>/
# Fail early with a clear message if the audio root is missing.
if not recordings_dir.is_dir():
    raise FileNotFoundError(f"Recordings folder not found: {recordings_dir}")

for accent, ids in accent_ids.items():
    target_dir = dataset_dir / accent
    target_dir.mkdir(exist_ok=True)
    for spk_id in ids:
        # Speaker folders are named "p<id>", so look the folder up directly
        # rather than scanning the whole recordings directory for each speaker.
        speaker_dir = recordings_dir / f"p{spk_id}"
        if not speaker_dir.is_dir():
            print(f"No recordings folder for speaker {spk_id}: {speaker_dir}")
            continue
        for file in speaker_dir.glob("*.wav"):
            # copy2 also carries over file metadata such as timestamps.
            shutil.copy2(file, target_dir / file.name)
            print(f"Copied {file.name} → {target_dir}/")


print("All done!")


# 5. Report how many files ended up in each accent folder
for accent in accent_ids:
    accent_folder = dataset_dir / accent
    print(len(os.listdir(accent_folder)), ":", accent_folder)