Skip to content

Commit a9c1e4a

Browse files
Merge pull request rllm-org#126 from matthewreed26/sec_qa_benchmark
SecQA Benchmark Implementation | ASET - Arcadia Impact
2 parents 4445b86 + 6b8ef83 commit a9c1e4a

File tree

6 files changed

+337
-0
lines changed

6 files changed

+337
-0
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,13 @@ Inspect supports many model providers including OpenAI, Anthropic, Google, Mistr
140140
inspect eval inspect_evals/gdm_in_house_ctf
141141
```
142142

143+
- ### [SecQA: A Concise Question-Answering Dataset for Evaluating Large Language Models in Computer Security](src/inspect_evals/sec_qa)
144+
LLMs should be capable of understanding computer security principles and applying best practices to their responses. SecQA has “v1” and “v2” datasets of multiple-choice questions that aim to provide two levels of cybersecurity evaluation criteria. The questions are generated by GPT-4 based on the “Computer Systems Security: Planning for Success” textbook and vetted by humans.
145+
<sub><sup>Contributed by: [@matthewreed26](https://github.com/matthewreed26)</sup></sub>
146+
```
147+
inspect eval inspect_evals/secqa_v1
148+
inspect eval inspect_evals/secqa_v2
149+
```
143150

144151
## Safeguards
145152

src/inspect_evals/_registry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
from .piqa import piqa
5656
from .pubmedqa import pubmedqa
5757
from .race_h import race_h
58+
from .sec_qa import sec_qa_v1, sec_qa_v1_5_shot, sec_qa_v2, sec_qa_v2_5_shot
5859
from .squad import squad
5960
from .swe_bench import swe_bench
6061
from .truthfulqa import truthfulqa
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from .sec_qa import (
2+
DATASET_PATH,
3+
format_system_message_fewshot,
4+
record_to_sample,
5+
sample_to_fewshot,
6+
sec_qa_v1,
7+
sec_qa_v1_5_shot,
8+
sec_qa_v2,
9+
sec_qa_v2_5_shot,
10+
)
11+
12+
__all__ = [
13+
"DATASET_PATH",
14+
"format_system_message_fewshot",
15+
"record_to_sample",
16+
"sample_to_fewshot",
17+
"sec_qa_v1",
18+
"sec_qa_v1_5_shot",
19+
"sec_qa_v2",
20+
"sec_qa_v2_5_shot",
21+
]

src/inspect_evals/sec_qa/sec_qa.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
"""
2+
SecQA: A Concise Question-Answering Dataset for Evaluating Large Language Models in Computer Security
3+
4+
Zefang Liu
5+
https://arxiv.org/pdf/2312.15838
6+
"""
7+
8+
from typing import Any
9+
10+
from inspect_ai import Task, task
11+
from inspect_ai.dataset import Dataset, Sample, hf_dataset
12+
from inspect_ai.scorer import choice
13+
from inspect_ai.solver import multiple_choice, system_message
14+
15+
# setup for problem + instructions for providing answer
# Template for the 5-shot system message; {examples} is filled by
# format_system_message_fewshot with newline-separated worked examples.
SYSTEM_MESSAGE_FEWSHOT = """
The following are multiple choice questions about Computer Security. Some examples are provided below.

{examples}
""".strip()

# Hugging Face dataset coordinates for SecQA.
DATASET_PATH = "zefang-liu/secqa"
# Subset (config) names for the two difficulty levels of the benchmark.
DATASET_SUBSET_NAME_V1 = "secqa_v1"
DATASET_SUBSET_NAME_V2 = "secqa_v2"
# Split used for scoring; the small "dev" split supplies the few-shot examples.
DATASET_TEST_SPLIT = "test"
DATASET_FEWSHOT_SPLIT = "dev"
27+
28+
29+
@task
def sec_qa_v1() -> Task:
    """Inspect Task implementing the SecQA benchmark v1 0-shot"""
    # Score the "test" split of the v1 subset with no in-context examples.
    test_split = retrieve_hf_dataset(DATASET_SUBSET_NAME_V1, DATASET_TEST_SPLIT)
    return Task(dataset=test_split, solver=[multiple_choice()], scorer=choice())
38+
39+
40+
@task
def sec_qa_v1_5_shot() -> Task:
    """Inspect Task implementing the SecQA benchmark v1 5-shot"""
    # The "dev" split supplies the worked examples embedded in the system message.
    examples = retrieve_hf_dataset(DATASET_SUBSET_NAME_V1, DATASET_FEWSHOT_SPLIT)
    test_split = retrieve_hf_dataset(DATASET_SUBSET_NAME_V1, DATASET_TEST_SPLIT)
    solver_chain = [
        system_message(format_system_message_fewshot(examples)),
        multiple_choice(),
    ]
    return Task(dataset=test_split, solver=solver_chain, scorer=choice())
53+
54+
55+
@task
def sec_qa_v2() -> Task:
    """Inspect Task implementing the SecQA benchmark v2 0-shot"""
    # Score the "test" split of the v2 subset with no in-context examples.
    test_split = retrieve_hf_dataset(DATASET_SUBSET_NAME_V2, DATASET_TEST_SPLIT)
    return Task(dataset=test_split, solver=[multiple_choice()], scorer=choice())
64+
65+
66+
@task
def sec_qa_v2_5_shot() -> Task:
    """Inspect Task implementing the SecQA benchmark v2 5-shot"""
    # The "dev" split supplies the worked examples embedded in the system message.
    examples = retrieve_hf_dataset(DATASET_SUBSET_NAME_V2, DATASET_FEWSHOT_SPLIT)
    test_split = retrieve_hf_dataset(DATASET_SUBSET_NAME_V2, DATASET_TEST_SPLIT)
    solver_chain = [
        system_message(format_system_message_fewshot(examples)),
        multiple_choice(),
    ]
    return Task(dataset=test_split, solver=solver_chain, scorer=choice())
79+
80+
81+
def retrieve_hf_dataset(name: str, split: str) -> Dataset:
    """Load one SecQA subset/split from Hugging Face as Inspect samples."""
    loader_kwargs = {
        "path": DATASET_PATH,
        "name": name,
        "split": split,
        "sample_fields": record_to_sample,
        # trust=True allows the dataset repo's loading code to run — required
        # for this dataset; review the repo before bumping versions.
        "trust": True,
    }
    return hf_dataset(**loader_kwargs)
89+
90+
91+
def record_to_sample(record: dict[str, Any]) -> Sample:
    """Map a raw SecQA record (Question/A/B/C/D/Answer) onto an Inspect Sample."""
    options = [record[label] for label in ("A", "B", "C", "D")]
    return Sample(input=record["Question"], choices=options, target=record["Answer"])
97+
98+
99+
def format_system_message_fewshot(fewshot_samples: Dataset) -> str:
    """Build the few-shot system prompt from the given example samples."""
    rendered = (sample_to_fewshot(sample=example) for example in fewshot_samples)
    return SYSTEM_MESSAGE_FEWSHOT.format(examples="\n\n".join(rendered))
105+
106+
107+
def sample_to_fewshot(sample: Sample) -> str:
    """Render one sample in the QUESTION / CHOICES / ANSWER few-shot format."""
    question_block = f"QUESTION:\n{sample.input}"
    # Each option line is "X) text"; the block starts and ends with a newline,
    # which yields the blank line before ANSWER in the final rendering.
    option_lines = "\n"
    for index, label in enumerate(["A", "B", "C", "D"]):
        option_text = sample.choices[index] if sample.choices is not None else None
        option_lines += f"{label}) {option_text}\n"
    choices_block = f"CHOICES:{option_lines}"
    answer_block = f"ANSWER: {sample.target}"
    return f"{question_block}\n\n{choices_block}\n\n{answer_block}"

tests/sec_qa/test_sec_qa.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from typing import Any
2+
3+
from inspect_ai.dataset import Sample
4+
5+
from inspect_evals.sec_qa import record_to_sample
6+
from inspect_evals.sec_qa.sec_qa import format_system_message_fewshot, sample_to_fewshot
7+
8+
# A representative raw record in the SecQA schema (Question, A-D options, Answer).
EXAMPLE_RECORD: dict[str, Any] = {
    "Question": "Which of the following is a common indicator of an SQL injection attack?",
    "A": "Frequent changes in user account permissions.",
    "B": "Decreased performance of endpoint protection systems.",
    "C": "Unusually high data upload rates to a web server.",
    "D": "Sudden uptick in SQL queries, far beyond the usual baseline for the application.",
    "Answer": "D",
}

# Expected few-shot rendering of EXAMPLE_RECORD; the "\n\n\n" before ANSWER
# comes from the trailing newline of the choices block plus the joiner.
EXPECTED_FORMATTED_SAMPLE_TO_FEWSHOT = "QUESTION:\nWhich of the following is a common indicator of an SQL injection attack?\n\nCHOICES:\nA) Frequent changes in user account permissions.\nB) Decreased performance of endpoint protection systems.\nC) Unusually high data upload rates to a web server.\nD) Sudden uptick in SQL queries, far beyond the usual baseline for the application.\n\n\nANSWER: D"
18+
19+
20+
def test_record_to_sample():
    """Test that the record is mapped correctly to Inspect Sample"""
    expected = Sample(
        input="Which of the following is a common indicator of an SQL injection attack?",
        choices=[
            "Frequent changes in user account permissions.",
            "Decreased performance of endpoint protection systems.",
            "Unusually high data upload rates to a web server.",
            "Sudden uptick in SQL queries, far beyond the usual baseline for the application.",
        ],
        target="D",
    )
    assert record_to_sample(EXAMPLE_RECORD) == expected
33+
34+
35+
def test_format_system_message_fewshot():
    """Test that the system message is formatted correctly for some samples"""
    # Two copies of the same example: the header and each rendered example
    # should be joined with blank lines.
    samples = [record_to_sample(EXAMPLE_RECORD), record_to_sample(EXAMPLE_RECORD)]
    expected = "\n\n".join(
        [
            "The following are multiple choice questions about Computer Security. Some examples are provided below.",
            EXPECTED_FORMATTED_SAMPLE_TO_FEWSHOT,
            EXPECTED_FORMATTED_SAMPLE_TO_FEWSHOT,
        ]
    )
    assert format_system_message_fewshot(samples) == expected
48+
49+
50+
def test_sample_to_fewshot():
    """Test that the sample is formatted correctly"""
    rendered = sample_to_fewshot(record_to_sample(EXAMPLE_RECORD))
    assert rendered == EXPECTED_FORMATTED_SAMPLE_TO_FEWSHOT
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
from urllib.parse import quote
2+
3+
import requests
4+
5+
from inspect_evals.sec_qa.sec_qa import (
6+
DATASET_PATH,
7+
)
8+
9+
# Expected `/is-valid` response: every dataset-viewer capability enabled.
HUGGING_FACE_IS_VALID_RESPONSE = {
    "preview": True,
    "viewer": True,
    "search": True,
    "filter": True,
    "statistics": True,
}

# Expected split metadata for the secqa_v1 subset; the 5-example "dev" split
# feeds the 5-shot tasks, so its presence/size matters most.
HUGGING_FACE_EXPECTED_SPLITS_V1 = {
    "dev": {
        "name": "dev",
        "num_bytes": 2538,
        "num_examples": 5,
        "dataset_name": "secqa",
    },
    "val": {
        "name": "val",
        "num_bytes": 6364,
        "num_examples": 12,
        "dataset_name": "secqa",
    },
    "test": {
        "name": "test",
        "num_bytes": 53627,
        "num_examples": 110,
        "dataset_name": "secqa",
    },
}

# Expected split metadata for the secqa_v2 subset (same structure as v1).
HUGGING_FACE_EXPECTED_SPLITS_V2 = {
    "dev": {
        "name": "dev",
        "num_bytes": 3387,
        "num_examples": 5,
        "dataset_name": "secqa",
    },
    "val": {
        "name": "val",
        "num_bytes": 7494,
        "num_examples": 10,
        "dataset_name": "secqa",
    },
    "test": {
        "name": "test",
        "num_bytes": 68458,
        "num_examples": 100,
        "dataset_name": "secqa",
    },
}
58+
59+
60+
def test_dataset_is_valid():
    """Test that the SecQA dataset URL is valid."""
    # timeout so a network stall fails the test instead of hanging CI forever
    response = requests.get(
        "https://datasets-server.huggingface.co/is-valid?dataset=" + quote(DATASET_PATH),
        timeout=30,
    )
    assert (
        response.status_code == 200
    ), f"Hugging Face dataset `/is-valid` returned status code {response.status_code} check documentation at `https://huggingface.co/docs/dataset-viewer/en/valid`"
    assert response.json() == HUGGING_FACE_IS_VALID_RESPONSE
69+
70+
71+
def test_required_splits():
    """Test that the SecQA dataset has the required splits."""
    # timeout so a network stall fails the test instead of hanging CI forever
    response = requests.get(
        "https://datasets-server.huggingface.co/info?dataset=" + quote(DATASET_PATH),
        timeout=30,
    )
    assert (
        response.status_code == 200
    ), f"Hugging Face dataset `/info` returned status code {response.status_code} check documentation at `https://huggingface.co/docs/dataset-viewer/en/info`"

    data = response.json()
    assert (
        "dataset_info" in data
    ), f"Hugging Face dataset `/info` returned unexpected json {data} check documentation at `https://huggingface.co/docs/dataset-viewer/en/info`"

    dataset_info = data["dataset_info"]
    assert (
        "secqa_v1" in dataset_info
    ), f"SecQA V1 subset missing from {DATASET_PATH} dataset"
    assert (
        "secqa_v2" in dataset_info
    ), f"SecQA V2 subset missing from {DATASET_PATH} dataset"

    assert (
        dataset_info["secqa_v1"]["splits"] == HUGGING_FACE_EXPECTED_SPLITS_V1
    ), "Unexpected dataset splits for SecQA V1 and especially check dev which is used in 5-shot evaluation"
    assert (
        dataset_info["secqa_v2"]["splits"] == HUGGING_FACE_EXPECTED_SPLITS_V2
    ), "Unexpected dataset splits for SecQA V2 and especially check dev which is used in 5-shot evaluation"
99+
100+
101+
def test_required_columns():
    """Test that the SecQA dataset has the required columns."""
    # timeout so a network stall fails the test instead of hanging CI forever
    response = requests.get(
        "https://datasets-server.huggingface.co/info?dataset=" + quote(DATASET_PATH),
        timeout=30,
    )
    assert (
        response.status_code == 200
    ), f"Hugging Face dataset `/info` returned status code {response.status_code} check documentation at `https://huggingface.co/docs/dataset-viewer/en/info`"

    data = response.json()
    assert (
        "dataset_info" in data
    ), f"Hugging Face dataset `/info` returned unexpected json {data} check documentation at `https://huggingface.co/docs/dataset-viewer/en/info`"

    dataset_info = data["dataset_info"]
    # Both subsets must exist and expose the record fields record_to_sample reads.
    for subset, label in (("secqa_v1", "SecQA V1"), ("secqa_v2", "SecQA V2")):
        assert subset in dataset_info, f"{label} subset missing from {DATASET_PATH} dataset"
        features = dataset_info[subset]["features"]
        for column in ("Question", "A", "B", "C", "D", "Answer"):
            assert column in features, f"{column} column missing from {label}"

0 commit comments

Comments
 (0)