Skip to content

Commit f6047fe

Browse files
authored
Merge pull request rllm-org#154 from its-emile/main
CyberSecEval2 cleanup - renamed tasks for convention, removed temp json file
2 parents 599994f + 5fefa23 commit f6047fe

File tree

19 files changed

+276
-131
lines changed

19 files changed

+276
-131
lines changed

.github/workflows/build.yml

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,17 @@ jobs:
1717
python-version: ["3.10", "3.11"]
1818
steps:
1919
- uses: actions/checkout@v4
20+
- name: Display Ruff version
21+
uses: astral-sh/ruff-action@v3
22+
# Installs ruff for use in later steps
23+
with:
24+
version: "0.7.2"
25+
args: --version
2026
- name: Lint with Ruff
21-
uses: chartboost/ruff-action@v1
27+
run: ruff check
2228
- name: Format with Ruff
23-
uses: chartboost/ruff-action@v1
24-
with:
25-
args: "format --check"
29+
run: ruff format --check
30+
2631

2732
mypy:
2833
runs-on: ubuntu-latest
@@ -48,7 +53,7 @@ jobs:
4853
python -m pip install .[test]
4954
- name: Run mypy
5055
run: |
51-
mypy .
56+
mypy --version && mypy .
5257
5358
test:
5459
runs-on: ubuntu-latest

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ dev = [
143143
"ruff==0.7.2", # match version specified in .pre-commit-config.yaml
144144
"types-Pillow",
145145
"types-PyYAML",
146-
"types-requests"
146+
"types-requests",
147147
]
148148
test = [
149149
"inspect_evals[dev]",

src/inspect_evals/cyberseceval_2/README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ pip install git+https://github.com/UKGovernmentBEIS/inspect_evals
2424
Then, evaluate against one or more models with:
2525

2626
```bash
27-
inspect eval inspect_evals/interpreter_abuse --model openai/gpt-4o
28-
inspect eval inspect_evals/prompt_injection --model openai/gpt-4o
29-
inspect eval inspect_evals/vulnerability_exploit --model openai/gpt-4o
27+
inspect eval inspect_evals/cse2_interpreter_abuse --model openai/gpt-4o
28+
inspect eval inspect_evals/cse2_prompt_injection --model openai/gpt-4o
29+
inspect eval inspect_evals/cse2_vulnerability_exploit --model openai/gpt-4o
3030
```
3131

3232
After running evaluations, you can view their logs using the `inspect view` command:
@@ -49,9 +49,9 @@ ANTHROPIC_API_KEY=<anthropic-api-key>
4949
You can control a variety of options from the command line. For example:
5050

5151
```bash
52-
inspect eval inspect_evals/interpreter_abuse --limit 10
53-
inspect eval inspect_evals/prompt_injection --max-connections 10
54-
inspect eval inspect_evals/vulnerability_exploit --temperature 0.5
52+
inspect eval inspect_evals/cse2_interpreter_abuse --limit 10
53+
inspect eval inspect_evals/cse2_interpreter_abuse --max-connections 10
54+
inspect eval inspect_evals/cse2_interpreter_abuse --temperature 0
5555
```
5656

5757
See `inspect eval --help` for all available options.
@@ -67,4 +67,4 @@ The model's willingness to comply and generate malicious code is then evaluated
6767

6868
## Scoring
6969

70-
A simple average is calculated over the test cases, indicating the rate at which the model behavior could be classified as harmful (lower scores are better).
70+
A simple average is calculated over the test cases, indicating the rate at which the model behavior could be classified as harmful (lower scores are better).
Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1-
from .interpreter_abuse.task import interpreter_abuse
2-
from .prompt_injection.task import prompt_injection
3-
from .vulnerability_exploit.task import vulnerability_exploit
1+
from .interpreter_abuse.task import cse2_interpreter_abuse
2+
from .prompt_injection.task import cse2_prompt_injection
3+
from .vulnerability_exploit.task import cse2_vulnerability_exploit
44

5-
__all__ = ["prompt_injection", "interpreter_abuse", "vulnerability_exploit"]
5+
__all__ = [
6+
"cse2_prompt_injection",
7+
"cse2_interpreter_abuse",
8+
"cse2_vulnerability_exploit",
9+
]

src/inspect_evals/cyberseceval_2/interpreter_abuse/dataset.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,15 @@ def read_dataset(challenges: str | List[str] | None = None) -> Dataset:
2525

2626
# get challenges and validate them
2727
challenges = challenges if isinstance(challenges, list) else [challenges]
28-
valid_challenges = [sample.metadata["attack_type"] for sample in dataset if sample.metadata is not None]
28+
valid_challenges = [
29+
sample.metadata["attack_type"]
30+
for sample in dataset
31+
if sample.metadata is not None
32+
]
2933
valid_challenges = list(set(chain.from_iterable(valid_challenges)))
30-
invalid_challenges = [challenge for challenge in challenges if challenge not in valid_challenges]
34+
invalid_challenges = [
35+
challenge for challenge in challenges if challenge not in valid_challenges
36+
]
3137
if invalid_challenges:
3238
challenge_names = ", ".join(invalid_challenges)
3339
raise ValueError(f"Invalid challenge names: {challenge_names}")

src/inspect_evals/cyberseceval_2/interpreter_abuse/task.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
Based on: https://github.com/meta-llama/PurpleLlama/tree/70e7a376f2a310fb89f7ad207685baa69f78107f/CybersecurityBenchmarks
1111
1212
# eval for default epochs (4)
13-
inspect eval inspect_evals/interpreter_abuse -
13+
inspect eval inspect_evals/cse2_interpreter_abuse -
1414
1515
# eval with 1 epoch
16-
inspect eval inspect_evals/interpreter_abuse --epochs 1
16+
inspect eval inspect_evals/cse2_interpreter_abuse --epochs 1
1717
"""
1818

1919
from typing import List
@@ -52,7 +52,7 @@
5252

5353

5454
@task
55-
def interpreter_abuse(
55+
def cse2_interpreter_abuse(
5656
challenges: str | List[str] | None = None,
5757
epochs: int | None = DEFAULT_EPOCHS,
5858
judge_llm: str | None = JUDGE_LLM,

src/inspect_evals/cyberseceval_2/prompt_injection/dataset.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
DATASET_URL = "https://raw.githubusercontent.com/meta-llama/PurpleLlama/70e7a376f2a310fb89f7ad207685baa69f78107f/CybersecurityBenchmarks/datasets/prompt_injection/prompt_injection.json"
66

7+
78
def record_to_sample(record: dict[str, Any]) -> Sample:
89
return Sample(
910
input=record["user_input"],
@@ -34,7 +35,9 @@ def read_dataset(
3435
for sample in dataset
3536
if sample.metadata is not None
3637
]
37-
invalid_challenges = [challenge for challenge in challenges if challenge not in challenge_ids]
38+
invalid_challenges = [
39+
challenge for challenge in challenges if challenge not in challenge_ids
40+
]
3841
if invalid_challenges:
3942
challenge_names = ", ".join(invalid_challenges)
4043
raise ValueError(f"Invalid challenge names: {challenge_names}")

src/inspect_evals/cyberseceval_2/prompt_injection/task.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@
1111
Based on: https://github.com/meta-llama/PurpleLlama/tree/70e7a376f2a310fb89f7ad207685baa69f78107f/CybersecurityBenchmarks
1212
1313
# eval for default epochs (4)
14-
inspect eval inspect_evals/prompt_injection
14+
inspect eval inspect_evals/cse2_prompt_injection
1515
1616
# eval with 1 epoch
17-
inspect eval inspect_evals/prompt_injection --epochs 1
17+
inspect eval inspect_evals/cse2_prompt_injection --epochs 1
1818
"""
1919

2020
from typing import List
@@ -49,7 +49,7 @@
4949

5050

5151
@task
52-
def prompt_injection(
52+
def cse2_prompt_injection(
5353
challenges: str | List[str] | None = None,
5454
shuffle: bool = False,
5555
epochs: int | None = DEFAULT_EPOCHS,

src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/base_test_generator.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ def __init__(self) -> None:
1717

1818
def generate_test_case(self) -> tuple[str, str, str, str]:
1919
"""Generate a test case based on constraints"""
20-
return NotImplemented
20+
raise NotImplementedError
2121

2222
def generate_prompt(self, code: str) -> str:
2323
"""Generate a prompt based on the test case based on constraints"""
24-
return NotImplemented
24+
raise NotImplementedError
2525

2626
async def scoring_command(
2727
self, test_code_path: str, input: str
@@ -31,8 +31,8 @@ async def scoring_command(
3131

3232
def score_from_output(self, output: str) -> float:
3333
"""Score the output"""
34-
return NotImplemented
34+
raise NotImplementedError
3535

3636
def generate_test_case_as_dict(self) -> Dict[str, str]:
3737
"""Generate a test case based and return in dict"""
38-
return NotImplemented
38+
raise NotImplementedError

src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/constraint_satisfaction/base_constraint_satisfaction_generator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,11 @@ def is_fibonacci(n: int) -> bool:
9191

9292
def generate_test_case(self) -> tuple[str, str, str, str]:
9393
"""Generate a test case based on constraints"""
94-
return NotImplemented
94+
raise NotImplementedError
9595

9696
def generate_prompt(self, code: str) -> str:
9797
"""Generate a prompt based on the test case based on constraints"""
98-
return NotImplemented
98+
raise NotImplementedError
9999

100100
async def scoring_command(
101101
self, test_code_path: str, input: str

0 commit comments

Comments (0)