Commit 8647315

initial SecQA v1 benchmark implementation
1 parent ca01674 commit 8647315

File tree

- README.md
- src/inspect_evals/_registry.py
- src/inspect_evals/sec_qa/__init__.py
- src/inspect_evals/sec_qa/sec_qa.py

4 files changed (+57, -0 lines)


README.md

Lines changed: 7 additions & 0 deletions
@@ -129,6 +129,13 @@ Inspect supports many model providers including OpenAI, Anthropic, Google, Mistr
 inspect eval inspect_evals/gdm_in_house_ctf
 ```

+- ### [SecQA: A Concise Question-Answering Dataset for Evaluating Large Language Models in Computer Security](src/inspect_evals/sec_qa)
+LLMs should be capable of understanding computer security principles and applying best practices to their responses. SecQA provides “v1” and “v2” datasets of multiple-choice questions offering two levels of cybersecurity evaluation criteria. The questions were generated by GPT-4 from the textbook “Computer Systems Security: Planning for Success” and vetted by humans.
+<sub><sup>Contributed by: [@matthewreed26](https://github.com/matthewreed26)</sup></sub>
+```
+inspect eval inspect_evals/sec_qa_v1
+inspect eval inspect_evals/sec_qa_v2
+```

 ## Safeguards
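As a usage sketch, the registered tasks can also be invoked from Python via inspect_ai's eval() function rather than the CLI (the model string below is only illustrative):

```python
from inspect_ai import eval

# run the SecQA v1 task by its registry name; the model name here is an example
eval("inspect_evals/sec_qa_v1", model="openai/gpt-4o")
```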

src/inspect_evals/_registry.py

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@
 from .piqa import piqa
 from .pubmedqa import pubmedqa
 from .race_h import race_h
+from .sec_qa import sec_qa_v1, sec_qa_v2
 from .squad import squad
 from .swe_bench import swe_bench
 from .truthfulqa import truthfulqa
src/inspect_evals/sec_qa/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from .sec_qa import sec_qa_v1, sec_qa_v2
+
+__all__ = ["sec_qa_v1", "sec_qa_v2"]

src/inspect_evals/sec_qa/sec_qa.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+"""
+SecQA: A Concise Question-Answering Dataset for Evaluating Large Language Models in Computer Security
+
+Zefang Liu
+https://arxiv.org/pdf/2312.15838
+"""
+
+from typing import Any
+
+from inspect_ai import Task, task
+from inspect_ai.dataset import Sample, hf_dataset
+from inspect_ai.scorer import choice
+from inspect_ai.solver import multiple_choice, system_message
+
+# setup for problem + instructions for providing answer
+SYSTEM_MESSAGE = """
+The following are multiple choice questions about Computer Security.
+""".strip()
+
+
+@task
+def sec_qa_v1() -> Task:
+    """Inspect Task implementing the SecQA benchmark v1"""
+    # dataset: the v1 subset of zefang-liu/secqa on Hugging Face (trust=True allows its loading script to run)
+    dataset = hf_dataset(
+        path="zefang-liu/secqa",
+        name="secqa_v1",  # aka subset
+        split="dev",
+        sample_fields=record_to_sample,
+        trust=True,
+    )
+
+    # define task: system prompt + multiple-choice solver, scored against the target letter
+    return Task(
+        dataset=dataset,
+        solver=[system_message(SYSTEM_MESSAGE), multiple_choice()],
+        scorer=choice(),
+    )
+
+
+def record_to_sample(record: dict[str, Any]) -> Sample:
+    return Sample(
+        input=record["Question"],
+        choices=[record["A"], record["B"], record["C"], record["D"]],
+        target=record["Answer"],
+    )
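Note that _registry.py and the package __init__.py both import sec_qa_v2, while this file defines only sec_qa_v1, so importing the package as committed would fail. A minimal sketch of the missing v2 task, assuming the secqa_v2 subset shares the v1 column schema and "dev" split:

```python
@task
def sec_qa_v2() -> Task:
    """Inspect Task implementing the SecQA benchmark v2"""
    # assumption: the v2 subset uses the same columns and split naming as v1
    dataset = hf_dataset(
        path="zefang-liu/secqa",
        name="secqa_v2",
        split="dev",
        sample_fields=record_to_sample,
        trust=True,
    )
    return Task(
        dataset=dataset,
        solver=[system_message(SYSTEM_MESSAGE), multiple_choice()],
        scorer=choice(),
    )
```

For context on the record shape, record_to_sample expects rows with Question, A through D, and Answer fields; the field values below are invented for illustration:

```python
record = {
    "Question": "Which control most directly limits the impact of a compromised account?",
    "A": "Password reuse",
    "B": "Least-privilege access",
    "C": "Disabling audit logs",
    "D": "Shared administrator credentials",
    "Answer": "B",
}
sample = record_to_sample(record)  # Sample with the question as input and target "B"
```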
