
Commit 4a39c66

comment fixes and merge conflicts

1 parent: 75864ff

5 files changed, +19 -19 lines changed

src/inspect_evals/cyberseceval_2/interpreter_abuse/dataset.py
Lines changed: 0 additions & 1 deletion

@@ -4,7 +4,6 @@
 
 
 def record_to_sample(record: dict[str, Any]) -> Sample:
-    """Map records to inspect samples"""
     return Sample(
         input=record["mutated_prompt"],
         metadata={
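
For context, record_to_sample follows Inspect's standard mapper pattern: one raw JSON record in, one Sample out, with everything besides the prompt riding along as metadata. A minimal sketch of how such a mapper is wired up; the metadata field and file name are illustrative, since the hunk truncates at metadata={:

from typing import Any

from inspect_ai.dataset import Sample, json_dataset

def record_to_sample(record: dict[str, Any]) -> Sample:
    # Keep the prompt as the sample input; other record fields become metadata
    return Sample(
        input=record["mutated_prompt"],
        metadata={"attack_type": record.get("attack_type")},  # illustrative field
    )

dataset = json_dataset(
    json_file="interpreter_abuse.json",  # hypothetical local path
    sample_fields=record_to_sample,
)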

src/inspect_evals/cyberseceval_2/prompt_injection/dataset.py
Lines changed: 2 additions & 5 deletions

@@ -2,8 +2,8 @@
 
 from inspect_ai.dataset import Dataset, Sample, json_dataset
 
+DATASET_URL = "https://raw.githubusercontent.com/meta-llama/PurpleLlama/70e7a376f2a310fb89f7ad207685baa69f78107f/CybersecurityBenchmarks/datasets/prompt_injection/prompt_injection.json"
 
-# map records to inspect samples
 def record_to_sample(record: dict[str, Any]) -> Sample:
     return Sample(
         input=record["user_input"],
@@ -22,14 +22,12 @@ def read_dataset(
 ) -> Dataset:
     """Reads a dataset from a challenges.json file."""
     dataset = json_dataset(
-        json_file="https://raw.githubusercontent.com/meta-llama/PurpleLlama/70e7a376f2a310fb89f7ad207685baa69f78107f/CybersecurityBenchmarks/datasets/prompt_injection/prompt_injection.json",
+        json_file=DATASET_URL,
         sample_fields=record_to_sample,
         shuffle=shuffle,
     )
 
-    # filter if requested
     if challenges:
-        # get challenges and validate them
         challenges = challenges if isinstance(challenges, list) else [challenges]
         challenge_ids = [
             sample.metadata["injection_variant"]
@@ -41,7 +39,6 @@ def read_dataset(
         challenge_names = ", ".join(invalid_challenges)
         raise ValueError(f"Invalid challenge names: {challenge_names}")
 
-    # apply filter
     dataset = dataset.filter(
         lambda sample: sample.metadata is not None
        and sample.metadata["injection_variant"] in challenges
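
Extracting DATASET_URL removes a duplicated raw-URL literal, and the filter path normalizes a bare string into a list before validating names against the injection_variant metadata. A usage sketch under those assumptions; the variant name is illustrative, not taken from the dataset:

# Full dataset, shuffled
dataset = read_dataset(shuffle=True)

# Single variant: a bare string is wrapped in a list, and an unknown
# name raises ValueError("Invalid challenge names: ...")
subset = read_dataset(challenges="ignore_previous_instructions")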

src/inspect_evals/cyberseceval_2/vulnerability_exploit/scorers.py
Lines changed: 3 additions & 3 deletions

@@ -19,9 +19,9 @@
 @scorer(metrics=[accuracy(), mean(), std()])
 def vul_exploit_scorer() -> Scorer:
     async def scorer(state: TaskState, target: Target) -> Score:
-        # Test state's input proposed by model against the challenge's
-        # target code to yield a score
-        # Model's answer (exploit)
+        """Test state's input proposed by model against the challenge's
+        target code to yield a score"""
+
         answer = ""
         try:
             answer = re.sub("```(json)*", "", state.output.completion).strip()
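
The comment-to-docstring swap is behavior-neutral; the line worth noting is the fence-stripping regex, which removes a leading ```json marker and the closing ``` before the completion is used as the exploit answer. A quick check with a hypothetical completion:

import re

completion = '```json\n{"input": "AAAA"}\n```'  # hypothetical model output
answer = re.sub("```(json)*", "", completion).strip()
assert answer == '{"input": "AAAA"}'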

tools/listing.py
Lines changed: 11 additions & 7 deletions

@@ -30,13 +30,14 @@ def listing_md(listing: dict[str, Any]) -> str:
     output: list[str] = []
     output.append(f"- ### {link_md(listing['title'], os.path.join(listing['path']))}")
     output.append(f"  {listing['description']}{contributors}")
-    output.append("  ```")
+    output.append("")
+    output.append("  ```bash")
     for index, task in enumerate(listing["tasks"]):
         if index > 3:
             break
         output.append(f"    inspect eval inspect_evals/{task}")
 
-    output.append("  ```\n")
+    output.append("  ```")
     return "\n".join(output)
 
 
@@ -53,7 +54,7 @@ def readme_contents(file: Path, key: str) -> Contents:
 
     # Read the file lines
     readme_lines = []
-    with open(file, "r") as readme_file:
+    with open(file, "r", encoding="utf-8") as readme_file:
         readme_lines = readme_file.readlines()
 
     # Go through line by line and collect the prefix and suffix
@@ -84,7 +85,7 @@ def rewrite_task_readme(path: str, key: str, contents: list[str]) -> None:
     readme_path = Path(__file__).parent / ".." / path / "README.md"
     parsed = readme_contents(readme_path, key)
     if parsed.contains_key:
-        with open(readme_path, "w") as readme_file:
+        with open(readme_path, "w", encoding="utf-8") as readme_file:
             readme_file.write("\n".join(parsed.prefix + contents + parsed.suffix))
 
 
@@ -98,6 +99,7 @@ def generate_options(task_metadata: dict[str, Any]) -> None:
     contents.append(
         "You can control a variety of options from the command line. For example:"
     )
+    contents.append("")
     contents.append("```bash")
     contents.append(f"inspect eval inspect_evals/{task_names[0]} --limit 10")
     contents.append(f"inspect eval inspect_evals/{task_names[1]} --max-connections 10")
@@ -118,6 +120,7 @@ def generate_usage(task_metadata: dict[str, Any]) -> None:
     contents.append(
         "First, install the `inspect_ai` and `inspect_evals` Python packages with:"
     )
+    contents.append("")
     contents.append("```bash")
     contents.append("pip install inspect_ai")
     if dependency is None:
@@ -132,6 +135,7 @@ def generate_usage(task_metadata: dict[str, Any]) -> None:
     contents.append("")
 
     contents.append("Then, evaluate against one or more models with:")
+    contents.append("")
     contents.append("```bash")
     for index, task in enumerate(task_metadata["tasks"]):
         if index > 3:
@@ -180,7 +184,7 @@ def generate_readme() -> None:
 
     # read the listings
     listings_raw: list[dict[str, Any]] = []
-    with open(listing_file, "r") as yaml_file:
+    with open(listing_file, "r", encoding="utf-8") as yaml_file:
         listings_raw = yaml.safe_load(yaml_file)
 
     # read them
@@ -213,7 +217,7 @@ def generate_readme() -> None:
     contents = readme_contents(readme_path, EVAL_KEY)
 
     # rewrite the readme with prefix and suffix content
-    with open(readme_path, "w") as readme_file:
+    with open(readme_path, "w", encoding="utf-8") as readme_file:
         readme_file.write("\n".join(contents.prefix + content + contents.suffix))
 
     for listing_raw in listings_raw:
@@ -223,4 +227,4 @@ def generate_readme() -> None:
 
 
 if __name__ == "__main__":
-    generate_readme()
\ No newline at end of file
+    generate_readme()
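
The listing_md changes add a blank line before the fence and tag it as bash; without the blank line, many markdown renderers treat a fence glued to a list-item paragraph as part of the paragraph rather than as a code block. With a hypothetical listing entry, the function now emits roughly the following (a contributors field may also be expected; link_md is assumed to render a standard markdown link):

listing = {  # hypothetical entry, shaped the way listing_md expects
    "title": "Example Eval",
    "path": "src/inspect_evals/example",
    "description": "A one-line description.",
    "tasks": ["example_task"],
}
print(listing_md(listing))
# - ### [Example Eval](src/inspect_evals/example)
#   A one-line description.
#
#   ```bash
#     inspect eval inspect_evals/example_task
#   ```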

tools/listing.yaml
Lines changed: 3 additions & 3 deletions

The Cybench and ∞Bench hunks are whitespace-only (trailing spaces stripped from the description lines); the GSM8K hunk fixes a typo.

@@ -58,7 +58,7 @@
 
 - title: "Cybench: A Framework for Evaluating Cybersecurity Capabilities and Risks of Language Models"
   description: |
-    40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties. 
+    40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties.
   path: src/inspect_evals/cybench
   group: Cybersecurity
   contributors: ["sinman-aisi", "sam-deverett-dsit", "kola-aisi", "pgiav"]
@@ -118,7 +118,7 @@
 
 - title: "GSM8K: Training Verifiers to Solve Math Word Problems"
   description: |
-    Dataset of 8.5K high quality linguistically diverse grade school math word problems. Demostrates fewshot prompting.
+    Dataset of 8.5K high quality linguistically diverse grade school math word problems. Demonstrates fewshot prompting.
   path: src/inspect_evals/gsm8k
   arxiv: https://arxiv.org/abs/2110.14168
   group: Mathematics
@@ -183,7 +183,7 @@
 
 - title: "∞Bench: Extending Long Context Evaluation Beyond 100K Tokens"
   description: |
-    LLM benchmark featuring an average data length surpassing 100K tokens. Comprises synthetic and realistic tasks spanning diverse domains in English and Chinese. 
+    LLM benchmark featuring an average data length surpassing 100K tokens. Comprises synthetic and realistic tasks spanning diverse domains in English and Chinese.
   path: src/inspect_evals/infinite_bench
   arxiv: https://arxiv.org/abs/2402.13718
   group: Reasoning
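
Since two of the three hunks exist only to strip trailing spaces, a small hypothetical guard could catch regressions; it assumes the description fields are literal block scalars (|), which preserve trailing spaces when parsed:

import yaml

with open("tools/listing.yaml", "r", encoding="utf-8") as yaml_file:
    listings = yaml.safe_load(yaml_file)

for listing in listings:
    for line in listing["description"].splitlines():
        # rstrip() only changes a line that carries trailing whitespace
        assert line == line.rstrip(), f"trailing whitespace in {listing['title']!r}"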
