
Commit f8ed793

Merge branch 'main' into main
2 parents: 28fde8b + a9c1e4a

File tree

16 files changed: +128555 −4 lines

README.md

Lines changed: 18 additions & 0 deletions
````diff
@@ -114,6 +114,17 @@ Inspect supports many model providers including OpenAI, Anthropic, Google, Mistr
 ```
 
 
+- ### [CyberMetric: A Benchmark Dataset based on Retrieval-Augmented Generation for Evaluating LLMs in Cybersecurity Knowledge](src/inspect_evals/cybermetric)
+  Datasets containing 80, 500, 2000 and 10000 multiple-choice questions, designed to evaluate understanding across nine domains within cybersecurity
+  <sub><sup>Contributed by: [@neilshaabi](https://github.com/neilshaabi)</sub></sup>
+  ```
+  inspect eval inspect_evals/cybermetric_80
+  inspect eval inspect_evals/cybermetric_500
+  inspect eval inspect_evals/cybermetric_2000
+  inspect eval inspect_evals/cybermetric_10000
+  ```
+
+
 - ### [InterCode: Capture the Flag](src/inspect_evals/gdm_capabilities/intercode_ctf)
   Measure expertise in coding, cryptography (i.e. binary exploitation, forensics), reverse engineering, and recognizing security vulnerabilities. Demonstrates tool use and sandboxing untrusted model code.
   <sub><sup>Contributed by: [@jjallaire](https://github.com/jjallaire)</sub></sup>
@@ -139,6 +150,13 @@ Inspect supports many model providers including OpenAI, Anthropic, Google, Mistr
 inspect eval inspect_evals/sevenllm_qa_en
 ```
 
+- ### [SecQA: A Concise Question-Answering Dataset for Evaluating Large Language Models in Computer Security](src/inspect_evals/sec_qa)
+  LLMs should be capable of understanding computer security principles and applying best practices to their responses. SecQA has “v1” and “v2” datasets of multiple-choice questions that aim to provide two levels of cybersecurity evaluation criteria. The questions are generated by GPT-4 based on the “Computer Systems Security: Planning for Success” textbook and vetted by humans.
+  <sub><sup>Contributed by: [@matthewreed26](https://github.com/matthewreed26)</sub></sup>
+  ```
+  inspect eval inspect_evals/secqa_v1
+  inspect eval inspect_evals/secqa_v2
+  ```
 
 ## Safeguards
 
````

src/inspect_evals/_registry.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -18,6 +18,12 @@
 from .boolq import boolq
 from .commonsense_qa import commonsense_qa
 from .cybench import cybench
+from .cybermetric import (
+    cybermetric_80,
+    cybermetric_500,
+    cybermetric_2000,
+    cybermetric_10000,
+)
 from .drop import drop
 from .ds1000 import ds1000
 from .gaia import gaia, gaia_level1, gaia_level2, gaia_level3
@@ -49,6 +55,7 @@
 from .piqa import piqa
 from .pubmedqa import pubmedqa
 from .race_h import race_h
+from .sec_qa import sec_qa_v1, sec_qa_v1_5_shot, sec_qa_v2, sec_qa_v2_5_shot
 from .sevenllm import sevenllm_mcq_en, sevenllm_mcq_zh, sevenllm_qa_en, sevenllm_qa_zh
 from .squad import squad
 from .swe_bench import swe_bench
```
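The additions to `_registry.py` are what expose the new tasks under the `inspect_evals/` prefix used by the CLI. As a rough sketch of what this enables (hypothetical usage; assumes the package is installed as described in the README below):

```python
# Sketch: once registered, the tasks are ordinary importable constructors.
from inspect_evals.cybermetric import cybermetric_80

task = cybermetric_80()  # the CyberMetric-80 Task defined in cybermetric.py below
```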
src/inspect_evals/cybermetric/README.md

Lines changed: 62 additions & 0 deletions
# CyberMetric

[CyberMetric](https://arxiv.org/abs/2402.07688) is a collection of four datasets containing 80, 500, 2000 and 10000 multiple-choice questions, designed to evaluate understanding across nine domains within cybersecurity: Disaster Recovery and BCP, Identity and Access Management (IAM), IoT Security, Cryptography, Wireless Security, Network Security, Cloud Security, Penetration Testing, and Compliance/Audit.

Contributed by [@neilshaabi](https://github.com/neilshaabi)

## Usage

First, install the `inspect_ai` and `inspect_evals` Python packages with:
```bash
pip install inspect_ai
pip install git+https://github.com/UKGovernmentBEIS/inspect_evals
```

Then, evaluate against one or more models with:
```bash
inspect eval inspect_evals/cybermetric_80 --model openai/gpt-4o
inspect eval inspect_evals/cybermetric_500 --model openai/gpt-4o
inspect eval inspect_evals/cybermetric_2000 --model openai/gpt-4o
inspect eval inspect_evals/cybermetric_10000 --model openai/gpt-4o
```

After running evaluations, you can view their logs using the `inspect view` command:

```bash
inspect view
```

If you don't want to specify the `--model` each time you run an evaluation, create a `.env` configuration file in your working directory that defines the `INSPECT_EVAL_MODEL` environment variable along with your API key. For example:

```bash
INSPECT_EVAL_MODEL=anthropic/claude-3-5-sonnet-20240620
ANTHROPIC_API_KEY=<anthropic-api-key>
```

## Options

You can control a variety of options from the command line. For example:
```bash
inspect eval inspect_evals/cybermetric_80 --limit 10
inspect eval inspect_evals/cybermetric_500 --max-connections 10
inspect eval inspect_evals/cybermetric_2000 --temperature 0.5
```

See `inspect eval --help` for all available options.

## Dataset

Here is an example prompt from the dataset (after it has been further processed by Inspect):

>Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A,B,C,D.
>
>In cryptography, what is the purpose of using a key-derivation function (KDF)?
>
>A) Generate public keys, B) Authenticate digital signatures, C) Encrypt data using a password, D) KDF are algorithms used to transform a secret into crucial parameters like keys and Initialization Vectors (IVs).

The model is then tasked to pick the correct choice.

## Scoring

A simple accuracy is calculated over the datapoints.
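For reference, each raw record in the JSON files follows the shape consumed by `record_to_sample()` in `cybermetric.py` (shown further below). A hypothetical example, with field names (`question`, `answers`, `solution`) taken from the loader and values reconstructed from the prompt above:

```python
# Hypothetical CyberMetric record; field names match what
# record_to_sample() in cybermetric.py reads.
record = {
    "question": "In cryptography, what is the purpose of using a key-derivation function (KDF)?",
    "answers": {
        "A": "Generate public keys",
        "B": "Authenticate digital signatures",
        "C": "Encrypt data using a password",
        "D": "KDF are algorithms used to transform a secret into crucial parameters like keys and Initialization Vectors (IVs).",
    },
    "solution": "D",
}
```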
src/inspect_evals/cybermetric/__init__.py

Lines changed: 8 additions & 0 deletions
```python
from .cybermetric import (
    cybermetric_80,
    cybermetric_500,
    cybermetric_2000,
    cybermetric_10000,
)

__all__ = ["cybermetric_80", "cybermetric_500", "cybermetric_2000", "cybermetric_10000"]
```
src/inspect_evals/cybermetric/cybermetric.py

Lines changed: 81 additions & 0 deletions
```python
"""
CyberMetric: A Benchmark Dataset based on Retrieval-Augmented Generation for Evaluating LLMs in Cybersecurity Knowledge

Norbert Tihanyi, Mohamed Amine Ferrag, Ridhi Jain, Tamas Bisztray, Merouane Debbah
https://arxiv.org/abs/2402.07688

# Run all subsets
inspect eval cybermetric.py

# Run specific subsets
inspect eval cybermetric.py@cybermetric_80
inspect eval cybermetric.py@cybermetric_500
inspect eval cybermetric.py@cybermetric_2000
inspect eval cybermetric.py@cybermetric_10000
"""

from pathlib import Path
from typing import Any

from datasets import Dataset  # type: ignore
from inspect_ai import Task, task
from inspect_ai.dataset import Sample, json_dataset
from inspect_ai.scorer import choice
from inspect_ai.solver import multiple_choice, system_message

DATASET_DIR = Path(__file__).parent / "datasets"
SYSTEM_MESSAGE = "You are a security expert who answers questions."


@task
def cybermetric_80() -> Task:
    """Evaluates model on CyberMetric-80"""
    return create_task(80)


@task
def cybermetric_500() -> Task:
    """Evaluates model on CyberMetric-500"""
    return create_task(500)


@task
def cybermetric_2000() -> Task:
    """Evaluates model on CyberMetric-2000"""
    return create_task(2000)


@task
def cybermetric_10000() -> Task:
    """Evaluates model on CyberMetric-10000"""
    return create_task(10000)


def create_task(dataset_size: int) -> Task:
    # All four tasks share the same solver and scorer; only the dataset differs
    name = f"CyberMetric-{dataset_size}"
    return Task(
        name=name,
        dataset=load_dataset(name),
        solver=[system_message(SYSTEM_MESSAGE), multiple_choice()],
        scorer=choice(),
    )


def load_dataset(name: str) -> Dataset:
    # Loads e.g. datasets/cybermetric-80.json from the local dataset directory
    dataset = json_dataset(
        json_file=f"{DATASET_DIR}/{name.lower()}.json",
        name=name,
        sample_fields=record_to_sample,
        auto_id=True,
    )
    return dataset


def record_to_sample(record: dict[str, Any]) -> Sample:
    # Flatten the answer options into the question text; the choices passed to
    # the solver are the option letters, and the target is the correct letter
    options = ", ".join([f"{key}) {value}" for key, value in record["answers"].items()])
    input = f"Question: {record['question']}\nOptions: {options}\n\n"
    return Sample(
        input=input, choices=list(record["answers"].keys()), target=record["solution"]
    )
```
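These tasks can also be driven from Python rather than the CLI. A minimal sketch, assuming `inspect_ai` and the evals package are installed and a model API key is configured as in the README above:

```python
# Minimal sketch: run CyberMetric-80 on a handful of samples via the Python API.
from inspect_ai import eval

from inspect_evals.cybermetric import cybermetric_80

# limit=10 keeps the run small; the model name is illustrative.
logs = eval(cybermetric_80(), model="openai/gpt-4o", limit=10)
```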
