@@ -1,5 +1,4 @@
 """
-
 CyberMetric: A Benchmark Dataset based on Retrieval-Augmented Generation for Evaluating LLMs in Cybersecurity Knowledge
 
 Norbert Tihanyi, Mohamed Amine Ferrag, Ridhi Jain, Tamas Bisztray, Merouane Debbah
@@ -16,63 +15,94 @@
 
 """
 
+import json
 from pathlib import Path
 from typing import Any
 
-from datasets import Dataset # type: ignore
+import requests
 from inspect_ai import Task, task
-from inspect_ai.dataset import Sample, json_dataset
+from inspect_ai.dataset import Dataset, Sample, json_dataset
 from inspect_ai.scorer import choice
 from inspect_ai.solver import multiple_choice, system_message
+from platformdirs import user_cache_dir
 
-DATASET_DIR = Path(__file__).parent / "datasets"
+GITHUB_DATASET_URL_TEMPLATE = "https://raw.githubusercontent.com/cybermetric/CyberMetric/205262cdf5022ba890e792efd176fb19d42913fa/{name}-v1.json"
 SYSTEM_MESSAGE = "You are a security expert who answers questions."
+CACHE_DIR = Path(user_cache_dir("inspect_evals")) / "cybermetric" / "data"
 
 
 @task
 def cybermetric_80() -> Task:
     """Evaluates model on CyberMetric-80"""
-    return create_task(80)
+    return create_task("CyberMetric-80")
 
 
 @task
 def cybermetric_500() -> Task:
     """Evaluates model on CyberMetric-500"""
-    return create_task(500)
+    return create_task("CyberMetric-500")
 
 
 @task
 def cybermetric_2000() -> Task:
     """Evaluates model on CyberMetric-2000"""
-    return create_task(2000)
+    return create_task("CyberMetric-2000")
 
 
 @task
 def cybermetric_10000() -> Task:
     """Evaluates model on CyberMetric-10000"""
-    return create_task(10000)
+    return create_task("CyberMetric-10000")
 
 
-def create_task(dataset_size: int) -> Task:
-    name = f"CyberMetric-{dataset_size}"
+def create_task(dataset_name: str) -> Task:
     return Task(
-        name=name,
-        dataset=load_dataset(name),
+        name=dataset_name,
+        dataset=load_dataset(dataset_name),
         solver=[system_message(SYSTEM_MESSAGE), multiple_choice()],
         scorer=choice(),
     )
 
 
 def load_dataset(name: str) -> Dataset:
+    """
+    Load a dataset from the CyberMetric repository on GitHub.
+
+    The JSON files on GitHub have the questions nested under a "questions" key, which makes them incompatible with the `json_dataset` loader. This function reads the JSON file, extracts the questions, and writes them to a JSONL file, which can then be read by `json_dataset`.
+    """
+    dataset_url = GITHUB_DATASET_URL_TEMPLATE.format(name=name)
+    json_file = CACHE_DIR / f"{name}.json"
+    jsonl_file = CACHE_DIR / f"{name}.jsonl"
+
+    if not jsonl_file.exists():
+        _ensure_data(json_file, dataset_url)
+
+        # Read data from the json file and un-nest the questions
+        with open(json_file, "r") as f:
+            data = json.load(f)["questions"]
+
+        # Write data back to a jsonl file
+        with jsonl_file.open("w") as f:
+            f.writelines(json.dumps(record) + "\n" for record in data)
+
     dataset = json_dataset(
-        json_file=f"{DATASET_DIR}/{name.lower()}.json",
+        json_file=str(jsonl_file),
         name=name,
         sample_fields=record_to_sample,
         auto_id=True,
     )
     return dataset
 
 
+def _ensure_data(json_file: Path, dataset_url: str) -> None:
+    if not json_file.exists():
+        json_file.parent.mkdir(parents=True, exist_ok=True)
+        response = requests.get(dataset_url)
+        response.raise_for_status()
+        with open(json_file, "wb") as f:
+            f.write(response.content)
+
+
 def record_to_sample(record: dict[str, Any]) -> Sample:
     options = ", ".join([f"{key}) {value}" for key, value in record["answers"].items()])
     input = f"Question: {record['question']}\nOptions: {options}\n\n"
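For reference, a minimal sketch of the transformation that `load_dataset` performs. The `question` and `answers` fields come from `record_to_sample` above; the concrete values and the `solution` answer-key field are invented for illustration and do not appear in this diff:

import json

# Hypothetical CyberMetric-style payload: the files on GitHub nest all
# records under a top-level "questions" key, as the load_dataset docstring
# describes. The "solution" field is an assumption, not shown in this diff.
raw = {
    "questions": [
        {
            "question": "Which protocol encrypts web traffic?",
            "answers": {"A": "HTTP", "B": "TLS", "C": "FTP", "D": "Telnet"},
            "solution": "B",
        }
    ]
}

# Un-nest and emit one JSON object per line, mirroring load_dataset(), so
# that inspect_ai's json_dataset loader can read the result as JSONL.
for record in raw["questions"]:
    print(json.dumps(record))

Given such a record, `record_to_sample` above would render the prompt as "Question: Which protocol encrypts web traffic?" followed by "Options: A) HTTP, B) TLS, C) FTP, D) Telnet".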
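And a hedged usage sketch for the tasks themselves, assuming the module is importable under the illustrative name `cybermetric`; the model identifier is likewise only an example:

from inspect_ai import eval

from cybermetric import cybermetric_80  # hypothetical import path

# Run the 80-question benchmark end to end and collect the eval logs.
logs = eval(cybermetric_80(), model="openai/gpt-4o")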
|