Zen0 committed on
Commit
fc15652
·
1 Parent(s): 35bed4c

Fix schema mismatch: load files individually and strip metadata

Browse files

Files have different metadata columns (e.g., correction_applied in the Essential Eight (E8) file).
Load each category file separately, remove metadata columns, then concatenate.

Strategy:
1. Load each of 6 category files individually
2. Keep only essential columns (task_id, category, description, options, etc.)
3. Remove varying metadata (correction_applied, source_date, etc.)
4. Concatenate into single dataset

This handles schema differences between corrected and uncorrected files.

Files changed (1) hide show
  1. app.py +39 -6
app.py CHANGED
@@ -77,12 +77,45 @@ def load_benchmark_dataset(subset="australian", num_samples=200):
77
  global dataset_cache
78
 
79
  if dataset_cache is None:
80
- # Load data files directly as JSON to avoid schema mismatch issues
81
- dataset_cache = load_dataset(
82
- "json",
83
- data_files=f"hf://datasets/Zen0/AusCyberBench/data/{subset}/*.jsonl",
84
- split="train"
85
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  # Proportional sampling
88
  import random
 
77
  global dataset_cache
78
 
79
  if dataset_cache is None:
80
+ # Load data files individually to handle different schemas per file
81
+ from datasets import concatenate_datasets
82
+
83
+ # Get list of category files for the subset
84
+ import glob
85
+ from huggingface_hub import hf_hub_download
86
+
87
+ # Manually specify the categories to avoid globbing issues
88
+ categories = [
89
+ "knowledge_terminology",
90
+ "knowledge_threat_intelligence",
91
+ "regulatory_essential_eight",
92
+ "regulatory_ism_controls",
93
+ "regulatory_privacy_act",
94
+ "regulatory_soci_act"
95
+ ]
96
+
97
+ datasets_list = []
98
+ for category in categories:
99
+ try:
100
+ ds = load_dataset(
101
+ "json",
102
+ data_files=f"hf://datasets/Zen0/AusCyberBench/data/{subset}/{category}.jsonl",
103
+ split="train"
104
+ )
105
+ # Remove metadata columns that may differ between files
106
+ cols_to_remove = [col for col in ds.column_names if col not in [
107
+ 'task_id', 'category', 'subcategory', 'title', 'description',
108
+ 'task_type', 'difficulty', 'answer', 'options', 'context',
109
+ 'australian_focus', 'regulatory_references'
110
+ ]]
111
+ if cols_to_remove:
112
+ ds = ds.remove_columns(cols_to_remove)
113
+ datasets_list.append(ds)
114
+ except Exception as e:
115
+ print(f"Warning: Could not load {category}: {e}")
116
+
117
+ # Concatenate all datasets
118
+ dataset_cache = concatenate_datasets(datasets_list)
119
 
120
  # Proportional sampling
121
  import random