@@ -1,5 +1,4 @@
 """
-
 CyberMetric: A Benchmark Dataset based on Retrieval-Augmented Generation for Evaluating LLMs in Cybersecurity Knowledge
 
 Norbert Tihanyi, Mohamed Amine Ferrag, Ridhi Jain, Tamas Bisztray, Merouane Debbah
@@ -16,63 +15,94 @@
 
 """
 
+import json
 from pathlib import Path
 from typing import Any
 
-from datasets import Dataset # type: ignore
+import requests
 from inspect_ai import Task, task
-from inspect_ai.dataset import Sample, json_dataset
+from inspect_ai.dataset import Dataset, Sample, json_dataset
 from inspect_ai.scorer import choice
 from inspect_ai.solver import multiple_choice, system_message
+from platformdirs import user_cache_dir
 
-DATASET_DIR = Path(__file__).parent / "datasets"
+GITHUB_DATASET_URL_TEMPLATE = "https://raw.githubusercontent.com/cybermetric/CyberMetric/205262cdf5022ba890e792efd176fb19d42913fa/{name}-v1.json"
 SYSTEM_MESSAGE = "You are a security expert who answers questions."
+CACHE_DIR = Path(user_cache_dir("inspect_evals")) / "cybermetric" / "data"
 
 
 @task
 def cybermetric_80() -> Task:
     """Evaluates model on CyberMetric-80"""
-    return create_task(80)
+    return create_task("CyberMetric-80")
 
 
 @task
 def cybermetric_500() -> Task:
     """Evaluates model on CyberMetric-500"""
-    return create_task(500)
+    return create_task("CyberMetric-500")
 
 
 @task
 def cybermetric_2000() -> Task:
     """Evaluates model on CyberMetric-2000"""
-    return create_task(2000)
+    return create_task("CyberMetric-2000")
 
 
 @task
 def cybermetric_10000() -> Task:
     """Evaluates model on CyberMetric-10000"""
-    return create_task(10000)
+    return create_task("CyberMetric-10000")
 
 
-def create_task(dataset_size: int) -> Task:
-    name = f"CyberMetric-{dataset_size}"
+def create_task(dataset_name: str) -> Task:
     return Task(
-        name=name,
-        dataset=load_dataset(name),
+        name=dataset_name,
+        dataset=load_dataset(dataset_name),
         solver=[system_message(SYSTEM_MESSAGE), multiple_choice()],
         scorer=choice(),
     )
 
 
 def load_dataset(name: str) -> Dataset:
+    """
+    Load a dataset from the CyberMetric repository on GitHub.
+
+    The JSON files on GitHub have the questions nested under a "questions" key, which makes them incompatible with the `json_dataset` loader. This function reads the JSON file, extracts the questions, and writes them to a JSONL file, which can then be read by `json_dataset`.
+    """
+    dataset_url = GITHUB_DATASET_URL_TEMPLATE.format(name=name)
+    json_file = CACHE_DIR / f"{name}.json"
+    jsonl_file = CACHE_DIR / f"{name}.jsonl"
+
+    if not jsonl_file.exists():
+        _ensure_data(json_file, dataset_url)
+
+        # Read data from the json file and un-nest the questions
+        with open(json_file, "r") as f:
+            data = json.load(f)["questions"]
+
+        # Write data back to a jsonl file
+        with jsonl_file.open("w") as f:
+            f.writelines(json.dumps(record) + "\n" for record in data)
+
     dataset = json_dataset(
-        json_file=f"{DATASET_DIR}/{name.lower()}.json",
+        json_file=str(jsonl_file),
         name=name,
         sample_fields=record_to_sample,
         auto_id=True,
     )
     return dataset
 
 
+def _ensure_data(json_file: Path, dataset_url: str) -> None:
+    if not json_file.exists():
+        json_file.parent.mkdir(parents=True, exist_ok=True)
+        response = requests.get(dataset_url)
+        response.raise_for_status()
+        with open(json_file, "wb") as f:
+            f.write(response.content)
+
+
 def record_to_sample(record: dict[str, Any]) -> Sample:
     options = ", ".join([f"{key}) {value}" for key, value in record["answers"].items()])
     input = f"Question: {record['question']}\nOptions: {options}\n\n"
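For reference, a minimal sketch of the transformation that `load_dataset` performs. The `question` and `answers` fields come from `record_to_sample` above; the concrete values and the `solution` answer-key field are invented for illustration and do not appear in this diff:

import json

# Hypothetical CyberMetric-style payload: the files on GitHub nest all
# records under a top-level "questions" key, as the load_dataset docstring
# describes. The "solution" field is an assumption, not shown in this diff.
raw = {
    "questions": [
        {
            "question": "Which protocol encrypts web traffic?",
            "answers": {"A": "HTTP", "B": "TLS", "C": "FTP", "D": "Telnet"},
            "solution": "B",
        }
    ]
}

# Un-nest and emit one JSON object per line, mirroring load_dataset(), so
# that inspect_ai's json_dataset loader can read the result as JSONL.
for record in raw["questions"]:
    print(json.dumps(record))

Given such a record, `record_to_sample` above would render the prompt as "Question: Which protocol encrypts web traffic?" followed by "Options: A) HTTP, B) TLS, C) FTP, D) Telnet".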
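And a hedged usage sketch for the tasks themselves, assuming the module is importable under the illustrative name `cybermetric`; the model identifier is likewise only an example:

from inspect_ai import eval

from cybermetric import cybermetric_80  # hypothetical import path

# Run the 80-question benchmark end to end and collect the eval logs.
logs = eval(cybermetric_80(), model="openai/gpt-4o")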
|