Skip to content

Commit a9c1e4a

Browse files
Merge pull request rllm-org#126 from matthewreed26/sec_qa_benchmark
SecQA Benchmark Implementation | ASET - Arcadia Impact
2 parents 4445b86 + 6b8ef83 commit a9c1e4a

File tree

6 files changed

+337
-0
lines changed

6 files changed

+337
-0
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,13 @@ Inspect supports many model providers including OpenAI, Anthropic, Google, Mistr
140140
inspect eval inspect_evals/gdm_in_house_ctf
141141
```
142142

143+
- ### [SecQA: A Concise Question-Answering Dataset for Evaluating Large Language Models in Computer Security](src/inspect_evals/sec_qa)
144+
LLMs should be capable of understanding computer security principles and applying best practices to their responses. SecQA has “v1” and “v2” datasets of multiple-choice questions that aim to provide two levels of cybersecurity evaluation criteria. The questions are generated by GPT-4 based on the “Computer Systems Security: Planning for Success” textbook and vetted by humans.
145+
<sub><sup>Contributed by: [@matthewreed26](https://github.com/matthewreed26)</sup></sub>
146+
```
147+
inspect eval inspect_evals/secqa_v1
148+
inspect eval inspect_evals/secqa_v2
149+
```
143150

144151
## Safeguards
145152

src/inspect_evals/_registry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
from .piqa import piqa
5656
from .pubmedqa import pubmedqa
5757
from .race_h import race_h
58+
from .sec_qa import sec_qa_v1, sec_qa_v1_5_shot, sec_qa_v2, sec_qa_v2_5_shot
5859
from .squad import squad
5960
from .swe_bench import swe_bench
6061
from .truthfulqa import truthfulqa
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from .sec_qa import (
2+
DATASET_PATH,
3+
format_system_message_fewshot,
4+
record_to_sample,
5+
sample_to_fewshot,
6+
sec_qa_v1,
7+
sec_qa_v1_5_shot,
8+
sec_qa_v2,
9+
sec_qa_v2_5_shot,
10+
)
11+
12+
__all__ = [
13+
"DATASET_PATH",
14+
"format_system_message_fewshot",
15+
"record_to_sample",
16+
"sample_to_fewshot",
17+
"sec_qa_v1",
18+
"sec_qa_v1_5_shot",
19+
"sec_qa_v2",
20+
"sec_qa_v2_5_shot",
21+
]

src/inspect_evals/sec_qa/sec_qa.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
"""
2+
SecQA: A Concise Question-Answering Dataset for Evaluating Large Language Models in Computer Security
3+
4+
Zefang Liu
5+
https://arxiv.org/pdf/2312.15838
6+
"""
7+
8+
from typing import Any
9+
10+
from inspect_ai import Task, task
11+
from inspect_ai.dataset import Dataset, Sample, hf_dataset
12+
from inspect_ai.scorer import choice
13+
from inspect_ai.solver import multiple_choice, system_message
14+
15+
# setup for problem + instructions for providing answer
# Template for the 5-shot system message; {examples} is filled by
# format_system_message_fewshot with newline-separated worked examples.
SYSTEM_MESSAGE_FEWSHOT = """
The following are multiple choice questions about Computer Security. Some examples are provided below.

{examples}
""".strip()

# Hugging Face dataset coordinates for SecQA.
DATASET_PATH = "zefang-liu/secqa"
# Subset (config) names for the two difficulty levels of the benchmark.
DATASET_SUBSET_NAME_V1 = "secqa_v1"
DATASET_SUBSET_NAME_V2 = "secqa_v2"
# Split used for scoring; the small "dev" split supplies the few-shot examples.
DATASET_TEST_SPLIT = "test"
DATASET_FEWSHOT_SPLIT = "dev"
27+
28+
29+
@task
def sec_qa_v1() -> Task:
    """Inspect Task implementing the SecQA benchmark v1 0-shot"""
    # Score the "test" split of the v1 subset with no in-context examples.
    test_split = retrieve_hf_dataset(DATASET_SUBSET_NAME_V1, DATASET_TEST_SPLIT)
    return Task(dataset=test_split, solver=[multiple_choice()], scorer=choice())
38+
39+
40+
@task
def sec_qa_v1_5_shot() -> Task:
    """Inspect Task implementing the SecQA benchmark v1 5-shot"""
    # The "dev" split supplies the worked examples embedded in the system message.
    examples = retrieve_hf_dataset(DATASET_SUBSET_NAME_V1, DATASET_FEWSHOT_SPLIT)
    test_split = retrieve_hf_dataset(DATASET_SUBSET_NAME_V1, DATASET_TEST_SPLIT)
    solver_chain = [
        system_message(format_system_message_fewshot(examples)),
        multiple_choice(),
    ]
    return Task(dataset=test_split, solver=solver_chain, scorer=choice())
53+
54+
55+
@task
def sec_qa_v2() -> Task:
    """Inspect Task implementing the SecQA benchmark v2 0-shot"""
    # Score the "test" split of the v2 subset with no in-context examples.
    test_split = retrieve_hf_dataset(DATASET_SUBSET_NAME_V2, DATASET_TEST_SPLIT)
    return Task(dataset=test_split, solver=[multiple_choice()], scorer=choice())
64+
65+
66+
@task
def sec_qa_v2_5_shot() -> Task:
    """Inspect Task implementing the SecQA benchmark v2 5-shot"""
    # The "dev" split supplies the worked examples embedded in the system message.
    examples = retrieve_hf_dataset(DATASET_SUBSET_NAME_V2, DATASET_FEWSHOT_SPLIT)
    test_split = retrieve_hf_dataset(DATASET_SUBSET_NAME_V2, DATASET_TEST_SPLIT)
    solver_chain = [
        system_message(format_system_message_fewshot(examples)),
        multiple_choice(),
    ]
    return Task(dataset=test_split, solver=solver_chain, scorer=choice())
79+
80+
81+
def retrieve_hf_dataset(name: str, split: str) -> Dataset:
    """Load one SecQA subset/split from Hugging Face as Inspect samples."""
    loader_kwargs = {
        "path": DATASET_PATH,
        "name": name,
        "split": split,
        "sample_fields": record_to_sample,
        # trust=True allows the dataset repo's loading code to run — required
        # for this dataset; review the repo before bumping versions.
        "trust": True,
    }
    return hf_dataset(**loader_kwargs)
89+
90+
91+
def record_to_sample(record: dict[str, Any]) -> Sample:
    """Map a raw SecQA record (Question/A/B/C/D/Answer) onto an Inspect Sample."""
    options = [record[label] for label in ("A", "B", "C", "D")]
    return Sample(input=record["Question"], choices=options, target=record["Answer"])
97+
98+
99+
def format_system_message_fewshot(fewshot_samples: Dataset) -> str:
    """Build the few-shot system prompt from the given example samples."""
    rendered = (sample_to_fewshot(sample=example) for example in fewshot_samples)
    return SYSTEM_MESSAGE_FEWSHOT.format(examples="\n\n".join(rendered))
105+
106+
107+
def sample_to_fewshot(sample: Sample) -> str:
    """Render one sample in the QUESTION / CHOICES / ANSWER few-shot format."""
    question_block = f"QUESTION:\n{sample.input}"
    # Each option line is "X) text"; the block starts and ends with a newline,
    # which yields the blank line before ANSWER in the final rendering.
    option_lines = "\n"
    for index, label in enumerate(["A", "B", "C", "D"]):
        option_text = sample.choices[index] if sample.choices is not None else None
        option_lines += f"{label}) {option_text}\n"
    choices_block = f"CHOICES:{option_lines}"
    answer_block = f"ANSWER: {sample.target}"
    return f"{question_block}\n\n{choices_block}\n\n{answer_block}"

tests/sec_qa/test_sec_qa.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from typing import Any
2+
3+
from inspect_ai.dataset import Sample
4+
5+
from inspect_evals.sec_qa import record_to_sample
6+
from inspect_evals.sec_qa.sec_qa import format_system_message_fewshot, sample_to_fewshot
7+
8+
# A representative raw record in the SecQA schema (Question, A-D options, Answer).
EXAMPLE_RECORD: dict[str, Any] = {
    "Question": "Which of the following is a common indicator of an SQL injection attack?",
    "A": "Frequent changes in user account permissions.",
    "B": "Decreased performance of endpoint protection systems.",
    "C": "Unusually high data upload rates to a web server.",
    "D": "Sudden uptick in SQL queries, far beyond the usual baseline for the application.",
    "Answer": "D",
}

# Expected few-shot rendering of EXAMPLE_RECORD; the "\n\n\n" before ANSWER
# comes from the trailing newline of the choices block plus the joiner.
EXPECTED_FORMATTED_SAMPLE_TO_FEWSHOT = "QUESTION:\nWhich of the following is a common indicator of an SQL injection attack?\n\nCHOICES:\nA) Frequent changes in user account permissions.\nB) Decreased performance of endpoint protection systems.\nC) Unusually high data upload rates to a web server.\nD) Sudden uptick in SQL queries, far beyond the usual baseline for the application.\n\n\nANSWER: D"
18+
19+
20+
def test_record_to_sample():
    """Test that the record is mapped correctly to Inspect Sample"""
    expected = Sample(
        input="Which of the following is a common indicator of an SQL injection attack?",
        choices=[
            "Frequent changes in user account permissions.",
            "Decreased performance of endpoint protection systems.",
            "Unusually high data upload rates to a web server.",
            "Sudden uptick in SQL queries, far beyond the usual baseline for the application.",
        ],
        target="D",
    )
    assert record_to_sample(EXAMPLE_RECORD) == expected
33+
34+
35+
def test_format_system_message_fewshot():
    """Test that the system message is formatted correctly for some samples"""
    # Two copies of the same example: the header and each rendered example
    # should be joined with blank lines.
    samples = [record_to_sample(EXAMPLE_RECORD), record_to_sample(EXAMPLE_RECORD)]
    expected = "\n\n".join(
        [
            "The following are multiple choice questions about Computer Security. Some examples are provided below.",
            EXPECTED_FORMATTED_SAMPLE_TO_FEWSHOT,
            EXPECTED_FORMATTED_SAMPLE_TO_FEWSHOT,
        ]
    )
    assert format_system_message_fewshot(samples) == expected
48+
49+
50+
def test_sample_to_fewshot():
    """Test that the sample is formatted correctly"""
    rendered = sample_to_fewshot(record_to_sample(EXAMPLE_RECORD))
    assert rendered == EXPECTED_FORMATTED_SAMPLE_TO_FEWSHOT
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
from urllib.parse import quote
2+
3+
import requests
4+
5+
from inspect_evals.sec_qa.sec_qa import (
6+
DATASET_PATH,
7+
)
8+
9+
# Expected `/is-valid` response: every dataset-viewer capability enabled.
HUGGING_FACE_IS_VALID_RESPONSE = {
    "preview": True,
    "viewer": True,
    "search": True,
    "filter": True,
    "statistics": True,
}

# Expected split metadata for the secqa_v1 subset; the 5-example "dev" split
# feeds the 5-shot tasks, so its presence/size matters most.
HUGGING_FACE_EXPECTED_SPLITS_V1 = {
    "dev": {
        "name": "dev",
        "num_bytes": 2538,
        "num_examples": 5,
        "dataset_name": "secqa",
    },
    "val": {
        "name": "val",
        "num_bytes": 6364,
        "num_examples": 12,
        "dataset_name": "secqa",
    },
    "test": {
        "name": "test",
        "num_bytes": 53627,
        "num_examples": 110,
        "dataset_name": "secqa",
    },
}

# Expected split metadata for the secqa_v2 subset (same structure as v1).
HUGGING_FACE_EXPECTED_SPLITS_V2 = {
    "dev": {
        "name": "dev",
        "num_bytes": 3387,
        "num_examples": 5,
        "dataset_name": "secqa",
    },
    "val": {
        "name": "val",
        "num_bytes": 7494,
        "num_examples": 10,
        "dataset_name": "secqa",
    },
    "test": {
        "name": "test",
        "num_bytes": 68458,
        "num_examples": 100,
        "dataset_name": "secqa",
    },
}
58+
59+
60+
def test_dataset_is_valid():
    """Test that the SecQA dataset URL is valid."""
    # timeout so a network stall fails the test instead of hanging CI forever
    response = requests.get(
        "https://datasets-server.huggingface.co/is-valid?dataset=" + quote(DATASET_PATH),
        timeout=30,
    )
    assert (
        response.status_code == 200
    ), f"Hugging Face dataset `/is-valid` returned status code {response.status_code} check documentation at `https://huggingface.co/docs/dataset-viewer/en/valid`"
    assert response.json() == HUGGING_FACE_IS_VALID_RESPONSE
69+
70+
71+
def test_required_splits():
    """Test that the SecQA dataset has the required splits."""
    # timeout so a network stall fails the test instead of hanging CI forever
    response = requests.get(
        "https://datasets-server.huggingface.co/info?dataset=" + quote(DATASET_PATH),
        timeout=30,
    )
    assert (
        response.status_code == 200
    ), f"Hugging Face dataset `/info` returned status code {response.status_code} check documentation at `https://huggingface.co/docs/dataset-viewer/en/info`"

    data = response.json()
    assert (
        "dataset_info" in data
    ), f"Hugging Face dataset `/info` returned unexpected json {data} check documentation at `https://huggingface.co/docs/dataset-viewer/en/info`"

    dataset_info = data["dataset_info"]
    assert (
        "secqa_v1" in dataset_info
    ), f"SecQA V1 subset missing from {DATASET_PATH} dataset"
    assert (
        "secqa_v2" in dataset_info
    ), f"SecQA V2 subset missing from {DATASET_PATH} dataset"

    assert (
        dataset_info["secqa_v1"]["splits"] == HUGGING_FACE_EXPECTED_SPLITS_V1
    ), "Unexpected dataset splits for SecQA V1 and especially check dev which is used in 5-shot evaluation"
    assert (
        dataset_info["secqa_v2"]["splits"] == HUGGING_FACE_EXPECTED_SPLITS_V2
    ), "Unexpected dataset splits for SecQA V2 and especially check dev which is used in 5-shot evaluation"
99+
100+
101+
def test_required_columns():
    """Test that the SecQA dataset has the required columns."""
    # timeout so a network stall fails the test instead of hanging CI forever
    response = requests.get(
        "https://datasets-server.huggingface.co/info?dataset=" + quote(DATASET_PATH),
        timeout=30,
    )
    assert (
        response.status_code == 200
    ), f"Hugging Face dataset `/info` returned status code {response.status_code} check documentation at `https://huggingface.co/docs/dataset-viewer/en/info`"

    data = response.json()
    assert (
        "dataset_info" in data
    ), f"Hugging Face dataset `/info` returned unexpected json {data} check documentation at `https://huggingface.co/docs/dataset-viewer/en/info`"

    dataset_info = data["dataset_info"]
    # Both subsets must exist and expose the record fields record_to_sample reads.
    for subset, label in (("secqa_v1", "SecQA V1"), ("secqa_v2", "SecQA V2")):
        assert subset in dataset_info, f"{label} subset missing from {DATASET_PATH} dataset"
        features = dataset_info[subset]["features"]
        for column in ("Question", "A", "B", "C", "D", "Answer"):
            assert column in features, f"{column} column missing from {label}"

0 commit comments

Comments
 (0)