`README.md` (5 additions, 4 deletions)
````diff
@@ -25,11 +25,12 @@ You will also need to install any packages required to interact with the models
 export OPENAI_API_KEY=<openai-api-key>
 pip install openai
 ```
-Furthermore, some of the evaluations require additional dependencies. If your eval needs extra dependency, instructions for installing them are provided the [list of evals](#list-of-evals). subsection (or the README for that evaluation). For example, to install the dependencies of `SWE-Bench` evaluation you should run:
+
+Furthermore, some of the evaluations require additional dependencies. If your eval needs extra dependency, instructions for installing them are provided the [list of evals](#list-of-evals) subsection (or the README for that evaluation). For example, to install the dependencies of `SWE-Bench` evaluation you should run:
````
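The install command this sentence leads into falls outside the lines shown in this hunk. As a rough sketch of the usual pattern for optional extras (the `swe_bench` extra name is an assumption inferred from the eval's name, not confirmed by this diff):

```bash
# Hypothetical sketch: install the optional dependencies for the SWE-Bench eval.
# The "swe_bench" extra name is an assumption; check that eval's README for the
# exact command.
pip install "inspect_evals[swe_bench]"
```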
````diff
@@ -41,8 +42,8 @@
-If you don't want to specify the `--model` each time you run an evaluation, create a `.env` configuration file in your working direcotry that defines the `INSPECT_EVAL_MODEL` environment variable along with your API key. For example:
+If you don't want to specify the `--model` each time you run an evaluation, create a `.env` configuration file in your working directory that defines the `INSPECT_EVAL_MODEL` environment variable along with your API key. For example:
 
 ```bash
 INSPECT_EVAL_MODEL=openai/gpt-4o
 OPENAI_API_KEY=<openai-api-key>
 ```
 
-Inspect supports many model providers including OpenAI, Anthropic, Google, Mistral, AzureAI, AWS Bedrock, TogetherAI, Groq, HuggingFace, vLLM, Ollama, and more. See the [Model Providers](https://inspect.ai-safety-institute.org.uk/models.html) documentation for additional details.
+Inspect supports many model providers including OpenAI, Anthropic, Google, Mistral, Azure AI, AWS Bedrock, Together AI, Groq, Hugging Face, vLLM, Ollama, and more. See the [Model Providers](https://inspect.ai-safety-institute.org.uk/models.html) documentation for additional details.
````
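With the `.env` file in place, an evaluation can then be launched without repeating `--model`. A minimal sketch, assuming the `inspect_evals/gsm8k` task name suggested by the listing below:

```bash
# INSPECT_EVAL_MODEL and OPENAI_API_KEY are picked up from .env in the
# working directory, so no --model flag is needed.
inspect eval inspect_evals/gsm8k
```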
`tools/listing.yaml` (3 additions, 4 deletions)
````diff
@@ -58,7 +58,7 @@
 
 - title: "Cybench: A Framework for Evaluating Cybersecurity Capabilities and Risks of Language Models"
   description: |
-    40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties.
+    40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties.
@@ -119,6 +119,6 @@
 - title: "GSM8K: Training Verifiers to Solve Math Word Problems"
   description: |
-    Dataset of 8.5K high quality linguistically diverse grade school math word problems. Demostrates fewshot prompting.
+    Dataset of 8.5K high quality linguistically diverse grade school math word problems. Demonstrates fewshot prompting.
   path: src/inspect_evals/gsm8k
   arxiv: https://arxiv.org/abs/2110.14168
   group: Mathematics
@@ -180,10 +180,9 @@
   contributors: ["seddy-aisi"]
   tasks: ["piqa"]
 
-
 - title: "∞Bench: Extending Long Context Evaluation Beyond 100K Tokens"
   description: |
-    LLM benchmark featuring an average data length surpassing 100K tokens. Comprises synthetic and realistic tasks spanning diverse domains in English and Chinese.
+    LLM benchmark featuring an average data length surpassing 100K tokens. Comprises synthetic and realistic tasks spanning diverse domains in English and Chinese.
````
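Pieced together from the fields visible across these hunks, a complete `listing.yaml` entry appears to take the following shape (a sketch: the `contributors` and `tasks` values for GSM8K are illustrative assumptions modeled on the PIQA entry above):

```yaml
# Sketch of a full listing.yaml entry, assembled from fields seen in this diff.
- title: "GSM8K: Training Verifiers to Solve Math Word Problems"
  description: |
    Dataset of 8.5K high quality linguistically diverse grade school math word problems. Demonstrates fewshot prompting.
  path: src/inspect_evals/gsm8k
  arxiv: https://arxiv.org/abs/2110.14168
  group: Mathematics
  contributors: ["<github-handle>"] # assumption: format mirrors the PIQA entry
  tasks: ["gsm8k"] # assumption: task id inferred from the path
```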