
Commit 3e73bf9

Readme and loading for chat tasks, float scoring for logviewer render (rllm-org#143)
* Readme and loading for chat tasks, float scoring for logviewer render
* ruff

1 parent: 13cc10f · commit: 3e73bf9

3 files changed: +30 −23 lines changed

src/inspect_evals/agentharm/README.md

Lines changed: 9 additions & 2 deletions

@@ -11,7 +11,7 @@ Eric Winsor<sup>2</sup>, Jerome Wynne<sup>2</sup>, Yarin Gal<sup>2,♯</sup>, Xa
 
 **Paper**: <https://arxiv.org/abs/2410.09024>
 
-**Benchmark**: <https://huggingface.co/datasets/ai-safety-institute/AgentHarm>
+**Dataset**: <https://huggingface.co/datasets/ai-safety-institute/AgentHarm>
 
 *Note*: At the moment, we release 44 out of 66 public test base behaviors (176 augmented ones) and 8 out of 11 validation base behaviors (32 augmented ones). We hope to release additional behaviors in the future. The average harm score for direct requests (i.e., without jailbreaks) on this initial set of behaviors slightly increases, e.g., from 48.4% to 54.9% on GPT-4o and from 82.2% to 83.6% on Mistral.
 
@@ -29,8 +29,9 @@ First, install the inspect_ai and inspect_evals Python packages with:
 pip install inspect_ai
 pip install git+https://github.com/UKGovernmentBEIS/inspect_evals
 ```
+In VsCode, make sure to download the Inspect AI extension for viewing logs. For more details see https://inspect.ai-safety-institute.org.uk/log-viewer.html.
 
-Examples of running evaluations on two latest proprietary models:
+Examples of running evaluations on two proprietary models:
 ```bash
 inspect eval inspect_evals/agentharm --model openai/gpt-4o-2024-08-06
 inspect eval inspect_evals/agentharm --model anthropic/claude-3-5-sonnet-20240620
@@ -40,6 +41,12 @@ The same but on benign behaviors:
 inspect eval inspect_evals/agentharm_benign --model openai/gpt-4o-2024-08-06
 inspect eval inspect_evals/agentharm_benign --model anthropic/claude-3-5-sonnet-20240620
 ```
+
+Running the chat-only version of the harmful tasks:
+```bash
+inspect eval inspect_evals/agentharm_benign --model openai/gpt-4o-2024-08-06 -T chat_dataset=True
+```
+
 You can also pass arguments to a task to specify using the validation split
 ```bash
 inspect eval inspect_evals/agentharm --model openai/gpt-4o-2024-08-06 -T split=val

src/inspect_evals/agentharm/scorer.py

Lines changed: 1 addition & 1 deletion

@@ -113,7 +113,7 @@ async def compute_score(state: TaskState, target: Target) -> Score:
     explanation = f"Grading breakdown: {score_dict}, Refusal explanation: {refusal_explanation}"
 
     combined_score = Score(
-        value={"score": score, "refusal": refusal},
+        value={"score": score, "refusal": 1.0 if refusal else 0.0},
         explanation=explanation,
         metadata={"category": state.metadata["category"]},
     )
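For context on the change above: storing `refusal` as `1.0`/`0.0` keeps every entry in the `Score` value numeric, which is what lets the log viewer render it and lets metrics aggregate it without special-casing booleans. A minimal sketch of the kind of aggregation this enables; the helper name and standalone setting are illustrative, not part of the commit:

```python
# Illustrative only: averaging works directly on 1.0/0.0 refusal values.
def mean_refusal_rate(score_values: list[dict[str, float]]) -> float:
    """Average the 'refusal' field across per-sample score value dicts."""
    if not score_values:
        return 0.0
    return sum(v["refusal"] for v in score_values) / len(score_values)


print(mean_refusal_rate([
    {"score": 0.5, "refusal": 1.0},
    {"score": 0.9, "refusal": 0.0},
]))  # -> 0.5
```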

src/inspect_evals/agentharm/utils.py

Lines changed: 20 additions & 20 deletions

@@ -36,19 +36,19 @@ def load_dataset(
     AGENTHARM_DATASET_LOCATION = (
         Path(user_cache_dir("inspect_evals")) / "agentharm_dataset" / "AgentHarm"
     )
-    snapshot_download(repo_id="ai-safety-institute/AgentHarm", allow_patterns="*.json")
-    if not os.path.exists(AGENTHARM_DATASET_LOCATION):
-        AGENTHARM_DATASET_LOCATION.mkdir(parents=True, exist_ok=True)
-        try:
-            snapshot_download(
-                repo_id="ai-safety-institute/AgentHarm",
-                repo_type="dataset",
-                local_dir=AGENTHARM_DATASET_LOCATION,
-                allow_patterns="*.json",
-            )
-        except Exception as ex:
-            shutil.rmtree(AGENTHARM_DATASET_LOCATION, True)
-            raise ex
+    AGENTHARM_DATASET_LOCATION.mkdir(parents=True, exist_ok=True)
+    try:
+        snapshot_download(
+            repo_id="ai-safety-institute/AgentHarm",
+            repo_type="dataset",
+            local_dir=AGENTHARM_DATASET_LOCATION,
+            allow_patterns="**/*.json",
+            resume_download=True,
+        )
+    except Exception as ex:
+        print(f"Error downloading dataset: {ex}")
+        shutil.rmtree(AGENTHARM_DATASET_LOCATION, ignore_errors=True)
+        raise ex
 
     split_dict = {
         "harmful": {
@@ -73,24 +73,24 @@ def load_dataset(
     if split not in split_dict[name]:
         raise ValueError(f"Split {split} is not available.")
 
-    input_json_file = os.path.join(
-        AGENTHARM_DATASET_LOCATION.as_posix(), split_dict[name][split]
-    )
+    input_json_file = AGENTHARM_DATASET_LOCATION / split_dict[name][split]
 
-    # Path to the output JSONL file
-    output_jsonl_file = input_json_file + "l"
+    # Verify the file exists
+    if not input_json_file.exists():
+        raise FileNotFoundError(f"File not found: {input_json_file}")
 
     # Read the JSON file
     with open(input_json_file, "r") as json_file:
-        data = json.load(json_file)  # This assumes the JSON file contains a JSON array
+        data = json.load(json_file)
 
     # Write to the JSONL file
+    output_jsonl_file = input_json_file.with_suffix(".jsonl")
     with open(output_jsonl_file, "w") as jsonl_file:
         for record in data["behaviors"]:
             jsonl_file.write(json.dumps(record) + "\n")
 
     dataset = json_dataset(
-        output_jsonl_file,
+        str(output_jsonl_file),
         FieldSpec(
             input="prompt",
             id="id",
