Spaces:
Runtime error
Runtime error
File size: 5,585 Bytes
2c0f55c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
from pathlib import Path
import pandas as pd
if __name__ == "__main__":
import sys
sys.path.append(str(Path(__file__).parent.parent.parent.absolute()))
from src.datasets.base_dataset import SimpleAudioFakeDataset
# Attack identifiers (A01-A19) in the order the split configuration lists
# them. Every fold below is given the same set of attacks.
_ASVSPOOF_ATTACKS = (
    "A01", "A07", "A08", "A02", "A09", "A10", "A03", "A04", "A05", "A06",
    "A11", "A12", "A13", "A14", "A15", "A16", "A17", "A18", "A19",
)

# Split configuration read by ASVSpoofDataset: the "train"/"test"/"val"
# entries hold the attacks allowed for that fold, while "partition_ratio" and
# "seed" are copied onto the dataset instance.
# NOTE(review): the ratios presumably drive the inherited split_samples
# (train/test fractions, remainder for val) - confirm against the base class.
ASVSPOOF_SPLIT = {
    # Each fold gets its own list object so mutating one cannot affect another.
    "train": list(_ASVSPOOF_ATTACKS),
    "test": list(_ASVSPOOF_ATTACKS),
    "val": list(_ASVSPOOF_ATTACKS),
    "partition_ratio": [0.7, 0.15],
    "seed": 45,
}
class ASVSpoofDataset(SimpleAudioFakeDataset):
    """ASVspoof2019 LA samples pooled from the train, dev and eval directories.

    The protocol file of every directory named in ``subsets`` is read. Bona
    fide lines (attack column ``-``) and spoofed lines whose attack is listed
    in ``ASVSPOOF_SPLIT[subset]`` are each passed through the inherited
    ``split_samples`` before being collected into ``self.samples``.
    """

    protocol_folder_name = "ASVspoof2019_LA_cm_protocols"
    subset_dir_prefix = "ASVspoof2019_LA_"
    subsets = ("train", "dev", "eval")

    def __init__(self, path, subset="train", transform=None):
        """Index the dataset stored under ``path``.

        Args:
            path: Root directory holding the protocol folder and the
                ``ASVspoof2019_LA_<subset>`` directories.
            subset: Key of ``ASVSPOOF_SPLIT`` selecting the allowed attacks;
                it is also forwarded to the base-class constructor.
            transform: Forwarded to the base-class constructor and stored on
                the instance.

        Raises:
            KeyError: If ``subset`` is not a key of ``ASVSPOOF_SPLIT``.
            FileNotFoundError: If a protocol file or an audio file is missing.
        """
        super().__init__(subset, transform)
        self.path = path

        self.allowed_attacks = ASVSPOOF_SPLIT[subset]
        self.partition_ratio = ASVSPOOF_SPLIT["partition_ratio"]
        self.seed = ASVSPOOF_SPLIT["seed"]

        # Start from an empty frame so self.samples exists while the protocol
        # files are being read.
        self.samples = pd.DataFrame()

        # Collect the per-directory frames and concatenate once instead of
        # re-copying the accumulated frame on every iteration. The loop name
        # no longer shadows the ``subset`` argument.
        frames = []
        for subset_name in self.subsets:
            subset_dir = Path(self.path) / f"{self.subset_dir_prefix}{subset_name}"
            protocol_path = self.get_protocol_path(subset_name)
            frames.append(self.read_protocol(subset_dir, protocol_path))
        if frames:
            self.samples = pd.concat(frames)

        self.transform = transform

    def get_protocol_path(self, subset):
        """Return the first ``*.txt`` protocol whose stem contains ``subset``.

        Raises:
            FileNotFoundError: If no protocol file matches. Failing here gives
                a clear message instead of returning ``None`` and breaking
                later inside ``open``.
        """
        protocol_dir = Path(self.path) / self.protocol_folder_name
        for candidate in protocol_dir.glob("*.txt"):
            if subset in candidate.stem:
                return candidate
        raise FileNotFoundError(
            f"No '*.txt' protocol file containing '{subset}' in {protocol_dir}"
        )

    def read_protocol(self, subset_dir, protocol_path):
        """Build a DataFrame of the samples listed in one protocol file.

        Args:
            subset_dir: Directory whose ``flac`` sub-folder holds the audio.
            protocol_path: Protocol text file with space-separated columns;
                the fourth column is the attack type (``-`` for bona fide).

        Returns:
            DataFrame with columns ``user_id``, ``sample_name``,
            ``attack_type``, ``label`` and ``path``; spoofed rows come first,
            bona fide rows after them.
        """
        real_lines = []
        fake_lines = []
        with open(protocol_path, "r") as file:
            for line in file:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                attack_type = line.strip().split(" ")[3]
                if attack_type == "-":
                    real_lines.append(line)
                elif attack_type in self.allowed_attacks:
                    fake_lines.append(line)
                # Lines with any other attack type are ignored.

        samples = {
            "user_id": [],
            "sample_name": [],
            "attack_type": [],
            "label": [],
            "path": [],
        }

        # Spoofed lines are split (and added) before bona fide ones, keeping
        # the original order of the split_samples calls.
        for line in self.split_samples(fake_lines):
            samples = self.add_line_to_samples(samples, line, subset_dir)

        for line in self.split_samples(real_lines):
            samples = self.add_line_to_samples(samples, line, subset_dir)

        return pd.DataFrame(samples)

    @staticmethod
    def add_line_to_samples(samples, line, subset_dir):
        """Parse one protocol line and append its fields to ``samples``.

        Args:
            samples: Dict of column name -> list; mutated in place.
            line: Protocol line with five space-separated fields (the third
                one is unused).
            subset_dir: Directory whose ``flac`` sub-folder holds the audio.

        Returns:
            The same ``samples`` dict, for convenient reassignment.

        Raises:
            FileNotFoundError: If the referenced ``.flac`` file is missing.
                An explicit raise is used because ``assert`` statements are
                removed when Python runs with ``-O``.
        """
        user_id, sample_name, _, attack_type, label = line.strip().split(" ")
        audio_path = subset_dir / "flac" / f"{sample_name}.flac"
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        samples["user_id"].append(user_id)
        samples["sample_name"].append(sample_name)
        samples["attack_type"].append(attack_type)
        samples["label"].append(label)
        samples["path"].append(audio_path)
        return samples
class ASVSpoof2019DatasetOriginal(ASVSpoofDataset):
    """ASVspoof2019 LA dataset that loads a single official subset directory.

    ``fold_subset`` ("train"/"test"/"val") is mapped through ``subsets`` to
    one directory ("train"/"dev"/"eval"). All lines of that directory's
    protocol are kept, and an attack that is not listed for the directory in
    ``subset_dirs_attacks`` is treated as an error.
    """

    # Fold name -> official ASVspoof2019 directory suffix.
    subsets = {"train": "train", "test": "dev", "val": "eval"}

    protocol_folder_name = "ASVspoof2019_LA_cm_protocols"
    subset_dir_prefix = "ASVspoof2019_LA_"

    # Attacks expected in the protocol of each directory.
    subset_dirs_attacks = {
        "train": ["A01", "A02", "A03", "A04", "A05", "A06"],
        "dev": ["A01", "A02", "A03", "A04", "A05", "A06"],
        "eval": [
            "A07", "A08", "A09", "A10", "A11", "A12", "A13", "A14", "A15",
            "A16", "A17", "A18", "A19"
        ]
    }

    def __init__(self, path, fold_subset="train"):
        """
        Initialise object. Skip __init__ of ASVSpoofDataset due to different
        logic, but follow SimpleAudioFakeDataset constructor.

        Args:
            path: Root directory holding the protocol folder and the
                ``ASVspoof2019_LA_<subset>`` directories.
            fold_subset: Key of ``subsets`` ("train", "test" or "val").

        Raises:
            KeyError: If ``fold_subset`` is not a key of ``subsets``.
            ValueError: If the protocol lists an unexpected attack.
        """
        # ASVSpoofDataset calls this same constructor as (subset, transform).
        # The previous call passed (float('inf'), fold_subset), which put the
        # subset name into the transform slot.
        # NOTE(review): base-class signature is not visible here - confirm.
        super(ASVSpoofDataset, self).__init__(fold_subset, None)
        self.path = path

        subset = self.subsets[fold_subset]
        self.allowed_attacks = self.subset_dirs_attacks[subset]

        subset_dir = Path(self.path) / f"{self.subset_dir_prefix}{subset}"
        subset_protocol_path = self.get_protocol_path(subset)
        self.samples = self.read_protocol(subset_dir, subset_protocol_path)

    def read_protocol(self, subset_dir, protocol_path):
        """Build a DataFrame from every line of one protocol file.

        Unlike the parent implementation, the lines are not passed through
        ``split_samples`` and an unlisted attack raises instead of being
        skipped.

        Args:
            subset_dir: Directory whose ``flac`` sub-folder holds the audio.
            protocol_path: Protocol text file with space-separated columns;
                the fourth column is the attack type (``-`` for bona fide).

        Returns:
            DataFrame with columns ``user_id``, ``sample_name``,
            ``attack_type``, ``label`` and ``path``; spoofed rows come first,
            bona fide rows after them.

        Raises:
            ValueError: If a line carries an attack outside
                ``self.allowed_attacks``.
        """
        real_lines = []
        fake_lines = []
        with open(protocol_path, "r") as file:
            for line in file:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                attack_type = line.strip().split(" ")[3]
                if attack_type == "-":
                    real_lines.append(line)
                elif attack_type in self.allowed_attacks:
                    fake_lines.append(line)
                else:
                    raise ValueError(
                        "Tried to load attack that shouldn't be here!"
                    )

        samples = {
            "user_id": [],
            "sample_name": [],
            "attack_type": [],
            "label": [],
            "path": [],
        }

        for line in fake_lines:
            samples = self.add_line_to_samples(samples, line, subset_dir)

        for line in real_lines:
            samples = self.add_line_to_samples(samples, line, subset_dir)

        return pd.DataFrame(samples)
|