Spaces:
Sleeping
Sleeping
Zen0
committed on
Commit
·
fc15652
1
Parent(s):
35bed4c
Fix schema mismatch: load files individually and strip metadata
Browse files
Files have different metadata columns (e.g., correction_applied in the E8 file).
Load each category file separately, remove metadata columns, then concatenate.
Strategy:
1. Load each of 6 category files individually
2. Keep only essential columns (task_id, category, description, options, etc.)
3. Remove varying metadata (correction_applied, source_date, etc.)
4. Concatenate into single dataset
This handles schema differences between corrected and uncorrected files.
app.py
CHANGED
|
@@ -77,12 +77,45 @@ def load_benchmark_dataset(subset="australian", num_samples=200):
|
|
| 77 |
global dataset_cache
|
| 78 |
|
| 79 |
if dataset_cache is None:
|
| 80 |
-
# Load data files
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
# Proportional sampling
|
| 88 |
import random
|
|
|
|
| 77 |
global dataset_cache
|
| 78 |
|
| 79 |
if dataset_cache is None:
|
| 80 |
+
# Load data files individually to handle different schemas per file
|
| 81 |
+
from datasets import concatenate_datasets
|
| 82 |
+
|
| 83 |
+
# Get list of category files for the subset
|
| 84 |
+
import glob
|
| 85 |
+
from huggingface_hub import hf_hub_download
|
| 86 |
+
|
| 87 |
+
# Manually specify the categories to avoid globbing issues
|
| 88 |
+
categories = [
|
| 89 |
+
"knowledge_terminology",
|
| 90 |
+
"knowledge_threat_intelligence",
|
| 91 |
+
"regulatory_essential_eight",
|
| 92 |
+
"regulatory_ism_controls",
|
| 93 |
+
"regulatory_privacy_act",
|
| 94 |
+
"regulatory_soci_act"
|
| 95 |
+
]
|
| 96 |
+
|
| 97 |
+
datasets_list = []
|
| 98 |
+
for category in categories:
|
| 99 |
+
try:
|
| 100 |
+
ds = load_dataset(
|
| 101 |
+
"json",
|
| 102 |
+
data_files=f"hf://datasets/Zen0/AusCyberBench/data/{subset}/{category}.jsonl",
|
| 103 |
+
split="train"
|
| 104 |
+
)
|
| 105 |
+
# Remove metadata columns that may differ between files
|
| 106 |
+
cols_to_remove = [col for col in ds.column_names if col not in [
|
| 107 |
+
'task_id', 'category', 'subcategory', 'title', 'description',
|
| 108 |
+
'task_type', 'difficulty', 'answer', 'options', 'context',
|
| 109 |
+
'australian_focus', 'regulatory_references'
|
| 110 |
+
]]
|
| 111 |
+
if cols_to_remove:
|
| 112 |
+
ds = ds.remove_columns(cols_to_remove)
|
| 113 |
+
datasets_list.append(ds)
|
| 114 |
+
except Exception as e:
|
| 115 |
+
print(f"Warning: Could not load {category}: {e}")
|
| 116 |
+
|
| 117 |
+
# Concatenate all datasets
|
| 118 |
+
dataset_cache = concatenate_datasets(datasets_list)
|
| 119 |
|
| 120 |
# Proportional sampling
|
| 121 |
import random
|