comment fixes and merge conflicts

ourmony · ourmony · commit 4a39c6672db6 · 2025-01-05T19:47:23.000-05:00
diff --git a/src/inspect_evals/cyberseceval_2/interpreter_abuse/dataset.py b/src/inspect_evals/cyberseceval_2/interpreter_abuse/dataset.py
@@ -4,7 +4,6 @@
 
 
 def record_to_sample(record: dict[str, Any]) -> Sample:
-    """Map records to inspect samples"""
     return Sample(
         input=record["mutated_prompt"],
         metadata={
diff --git a/src/inspect_evals/cyberseceval_2/prompt_injection/dataset.py b/src/inspect_evals/cyberseceval_2/prompt_injection/dataset.py
@@ -2,8 +2,8 @@
 
 from inspect_ai.dataset import Dataset, Sample, json_dataset
 
+DATASET_URL = "https://raw.githubusercontent.com/meta-llama/PurpleLlama/70e7a376f2a310fb89f7ad207685baa69f78107f/CybersecurityBenchmarks/datasets/prompt_injection/prompt_injection.json"
 
-# map records to inspect samples
 def record_to_sample(record: dict[str, Any]) -> Sample:
     return Sample(
         input=record["user_input"],
@@ -22,14 +22,12 @@ def read_dataset(
 ) -> Dataset:
     """Reads a dataset from a challenges.json file."""
     dataset = json_dataset(
-        json_file="https://raw.githubusercontent.com/meta-llama/PurpleLlama/70e7a376f2a310fb89f7ad207685baa69f78107f/CybersecurityBenchmarks/datasets/prompt_injection/prompt_injection.json",
+        json_file=DATASET_URL,
         sample_fields=record_to_sample,
         shuffle=shuffle,
     )
 
-    # filter if requested
     if challenges:
-        # get challenges and validate them
         challenges = challenges if isinstance(challenges, list) else [challenges]
         challenge_ids = [
             sample.metadata["injection_variant"]
@@ -41,7 +39,6 @@ def read_dataset(
             challenge_names = ", ".join(invalid_challenges)
             raise ValueError(f"Invalid challenge names: {challenge_names}")
 
-        # apply filter
         dataset = dataset.filter(
             lambda sample: sample.metadata is not None
             and sample.metadata["injection_variant"] in challenges
diff --git a/src/inspect_evals/cyberseceval_2/vulnerability_exploit/scorers.py b/src/inspect_evals/cyberseceval_2/vulnerability_exploit/scorers.py
@@ -19,9 +19,9 @@
 @scorer(metrics=[accuracy(), mean(), std()])
 def vul_exploit_scorer() -> Scorer:
     async def scorer(state: TaskState, target: Target) -> Score:
-        # Test state's input proposed by model against the challenge's
-        # target code to yield a score
-        # Model's answer (exploit)
+        """Test state's input proposed by model against the challenge's
+        target code to yield a score"""
+
         answer = ""
         try:
             answer = re.sub("```(json)*", "", state.output.completion).strip()
diff --git a/tools/listing.py b/tools/listing.py
@@ -30,13 +30,14 @@ def listing_md(listing: dict[str, Any]) -> str:
     output: list[str] = []
     output.append(f"- ### {link_md(listing['title'], os.path.join(listing['path']))}")
     output.append(f"  {listing['description']}{contributors}")
-    output.append("   ```")
+    output.append("")
+    output.append("   ```bash")
     for index, task in enumerate(listing["tasks"]):
         if index > 3:
             break
         output.append(f"   inspect eval inspect_evals/{task}")
 
-    output.append("   ```\n")
+    output.append("   ```")
     return "\n".join(output)
 
 
@@ -53,7 +54,7 @@ def readme_contents(file: Path, key: str) -> Contents:
 
     # Read the file lines
     readme_lines = []
-    with open(file, "r") as readme_file:
+    with open(file, "r", encoding="utf-8") as readme_file:
         readme_lines = readme_file.readlines()
 
     # Go through line by line and collect the prefix and suffix
@@ -84,7 +85,7 @@ def rewrite_task_readme(path: str, key: str, contents: list[str]) -> None:
     readme_path = Path(__file__).parent / ".." / path / "README.md"
     parsed = readme_contents(readme_path, key)
     if parsed.contains_key:
-        with open(readme_path, "w") as readme_file:
+        with open(readme_path, "w", encoding="utf-8") as readme_file:
             readme_file.write("\n".join(parsed.prefix + contents + parsed.suffix))
 
 
@@ -98,6 +99,7 @@ def generate_options(task_metadata: dict[str, Any]) -> None:
     contents.append(
         "You can control a variety of options from the command line. For example:"
     )
+    contents.append("")
     contents.append("```bash")
     contents.append(f"inspect eval inspect_evals/{task_names[0]} --limit 10")
     contents.append(f"inspect eval inspect_evals/{task_names[1]} --max-connections 10")
@@ -118,6 +120,7 @@ def generate_usage(task_metadata: dict[str, Any]) -> None:
     contents.append(
         "First, install the `inspect_ai` and `inspect_evals` Python packages with:"
     )
+    contents.append("")
     contents.append("```bash")
     contents.append("pip install inspect_ai")
     if dependency is None:
@@ -132,6 +135,7 @@ def generate_usage(task_metadata: dict[str, Any]) -> None:
     contents.append("")
 
     contents.append("Then, evaluate against one or more models with:")
+    contents.append("")
     contents.append("```bash")
     for index, task in enumerate(task_metadata["tasks"]):
         if index > 3:
@@ -180,7 +184,7 @@ def generate_readme() -> None:
 
     # read the listings
     listings_raw: list[dict[str, Any]] = []
-    with open(listing_file, "r") as yaml_file:
+    with open(listing_file, "r", encoding="utf-8") as yaml_file:
         listings_raw = yaml.safe_load(yaml_file)
 
     # read them
@@ -213,7 +217,7 @@ def generate_readme() -> None:
     contents = readme_contents(readme_path, EVAL_KEY)
 
     # rewrite the readme with prefix and suffix content
-    with open(readme_path, "w") as readme_file:
+    with open(readme_path, "w", encoding="utf-8") as readme_file:
         readme_file.write("\n".join(contents.prefix + content + contents.suffix))
 
     for listing_raw in listings_raw:
@@ -223,4 +227,4 @@ def generate_readme() -> None:
 
 
 if __name__ == "__main__":
-    generate_readme()
+    generate_readme()
diff --git a/tools/listing.yaml b/tools/listing.yaml
@@ -58,7 +58,7 @@
 
 - title: "Cybench: A Framework for Evaluating Cybersecurity Capabilities and Risks of Language Models"
   description: |
-    40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties. 
+    40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties.
   path: src/inspect_evals/cybench
   group: Cybersecurity
   contributors: ["sinman-aisi", "sam-deverett-dsit", "kola-aisi", "pgiav"]
@@ -118,7 +118,7 @@
 
 - title: "GSM8K: Training Verifiers to Solve Math Word Problems"
   description: |
-    Dataset of 8.5K high quality linguistically diverse grade school math word problems. Demostrates fewshot prompting.
+    Dataset of 8.5K high quality linguistically diverse grade school math word problems. Demonstrates fewshot prompting.
   path: src/inspect_evals/gsm8k
   arxiv: https://arxiv.org/abs/2110.14168
   group: Mathematics
@@ -183,7 +183,7 @@
 
 - title: "∞Bench: Extending Long Context Evaluation Beyond 100K Tokens"
   description: |
-    LLM benchmark featuring an average data length surpassing 100K tokens. Comprises synthetic and realistic tasks spanning diverse domains in English and Chinese. 
+    LLM benchmark featuring an average data length surpassing 100K tokens. Comprises synthetic and realistic tasks spanning diverse domains in English and Chinese.
   path: src/inspect_evals/infinite_bench
   arxiv: https://arxiv.org/abs/2402.13718
   group: Reasoning