
Commit 299f69c

Merge pull request rllm-org#65 from mjbroerman/feature/worldsense
WorldSense Benchmark Implementation
2 parents 74ba667 + b8f4b6c commit 299f69c

7 files changed (+326 −0 lines)

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -123,6 +123,7 @@ inspect_evals = "inspect_evals._registry"
 [project.optional-dependencies]
 swe_bench = ["swebench>=2.1.0","docker"]
 mathematics = ["sympy","antlr4-python3-runtime==4.13.2"]
+worldsense = ["pandas"]

 dev = [
     "inspect_ai@git+https://github.com/UKGovernmentBEIS/inspect_ai",

src/inspect_evals/_registry.py

Lines changed: 1 addition & 0 deletions
@@ -64,4 +64,5 @@
 from .vstar_bench import vstar_bench_ar, vstar_bench_srr
 from .winogrande import winogrande
 from .wmdp import wmdp_bio, wmdp_chem, wmdp_cyber
+from .worldsense import worldsense
 from .xstest import xstest
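
Adding this import is what exposes the task through the `inspect_evals = "inspect_evals._registry"` entry point shown in the pyproject.toml hunk above. As a minimal sketch (assuming the package is installed and a model provider is configured; the model identifier is a placeholder), the registered task can then be referenced by name rather than imported directly:

```python
from inspect_ai import eval

# Resolve the task by its registry name instead of importing it.
# "openai/gpt-4o" is a placeholder; substitute any configured provider/model.
logs = eval("inspect_evals/worldsense", model="openai/gpt-4o", limit=10)
```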
src/inspect_evals/worldsense/.gitignore

Lines changed: 9 additions & 0 deletions

trials.jsonl
example.jsonl
__pycache__/
/.quarto/
_output/
_quarto.yml
*.qmd
*.quarto_ipynb
img/
src/inspect_evals/worldsense/README.md

Lines changed: 37 additions & 0 deletions

# WorldSense

[WorldSense](https://arxiv.org/pdf/2311.15930) is a benchmark that measures reasoning over a world model while controlling for dataset bias.

## Dataset

Here is an example prompt ("description") from the dataset (`problemname == "Compl.trivial"`):

> "Alice is enrolled in 3 courses per week: Alice takes history before computer science and economics before history.
> Choose one of the following alternatives: (1) Alice takes history in between economics and computer science, (2) Alice takes history outside of the time range between economics and computer science, or (3) it is impossible to decide.
> Think carefully, and only respond with one of these possible options (1), (2), or (3)."

There are three problem types:

1. "Infer" (inference): determine whether a given statement about a description is true or false.
2. "Compl" (completion): select which of three statements is true about a description, including an option for when it is not possible to decide.
3. "Consist" (consistency): determine whether a statement about a description is possible or impossible.

In addition there are two grades:

1. "trivial": can be solved from the statements alone
2. "normal": requires a world model to solve

A problemname is formed by concatenating "<type>.<grade>", e.g. "Compl.trivial" in the example above.

## Scoring

- Simple accuracy
- Standard error
- Weighted accuracy by `tuple_ID`, `problemname`, and `problemsize` (as reported in the paper)
- Weighted bias by `tuple_ID`, `problemname`, and `problemsize` (as reported in the paper)

Beyond the built-in metrics, the main results are weighted accuracy and bias. The primary unit is a `tuple_ID`, which corresponds to one "description" (scenario) as above. For each description, up to three answer option sets are provided so that every possible correct answer across the options is selected exactly once. Weights are used to average over multiple `tuple_ID`s.

All answer options are coded as positive or negative (in the example above, option 1: +1, option 2: +1, option 3: −1), and all option sets share the same arrangement of option codings. Bias is calculated from these codings, weighted accordingly.

Problem size corresponds to the number of entities in the description; in the example above, `problemsize == 3`. Within a `problemname`, the score is the grand average over problem sizes. If multiple `problemname`s are specified (or none, meaning all), the score is the grand average over those as well.
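
To make the weighting concrete, here is a minimal sketch (not part of the commit) of the per-description arithmetic for the "Compl" example above; the option weights and bias codings mirror those hard-coded in `_utils.py`, and the model answers are hypothetical.

```python
# Option weights and bias codings for "Compl" problems, as used in _utils.py:
# options "1" and "2" are the positive alternatives (weight 0.25, bias +1),
# option "3" ("impossible to decide") is negative (weight 0.5, bias -1).
weights = {"1": 0.25, "2": 0.25, "3": 0.5}
bias_codes = {"1": +1, "2": +1, "3": -1}

# Hypothetical model answers to the three option sets of one tuple_ID,
# with 1.0/0.0 marking whether the pattern scorer judged each one correct.
answers = ["1", "2", "3"]
correct = [1.0, 1.0, 1.0]

total_weight = sum(weights[a] for a in answers)  # 1.0
weighted_accuracy = (
    sum(c * weights[a] for a, c in zip(answers, correct)) / total_weight
)  # 1.0
weighted_bias = (
    sum(bias_codes[a] * weights[a] for a in answers) / total_weight
)  # 0.25 + 0.25 - 0.5 = 0.0: balanced codings cancel for an unbiased responder

print(weighted_accuracy, weighted_bias)
```

Accuracy weights the 0/1 scores the same way, so each description contributes equally regardless of how many option sets it has; a model that always answers "1"/"TRUE"/"POSSIBLE" would push the bias toward +1, and one that always answers "3"/"FALSE"/"IMPOSSIBLE" toward −1.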
src/inspect_evals/worldsense/__init__.py

Lines changed: 3 additions & 0 deletions

from .worldsense import worldsense

__all__ = ["worldsense"]
src/inspect_evals/worldsense/_utils.py

Lines changed: 97 additions & 0 deletions

import pandas as pd
from inspect_ai.scorer import Score, ValueToFloat, value_to_float


def preprocess_scores(
    scores: list[Score], to_float: ValueToFloat = value_to_float()
) -> pd.DataFrame:
    """Preprocess a list of Score objects into a DataFrame with computed weights and biases."""
    processed_scores = [
        {
            "value": to_float(score.value),
            "answer": score.answer,
            "tuple_ID": score.metadata.get("tuple_ID"),
            "problemname": score.metadata.get("problemname"),
            "problemsize": score.metadata.get("problemsize"),
        }
        for score in scores
        if score.metadata
        and None
        not in (
            to_float(score.value),
            score.answer,
            score.metadata.get("tuple_ID"),
            score.metadata.get("problemname"),
            score.metadata.get("problemsize"),
        )
    ]

    score_df = pd.DataFrame(processed_scores)

    # Mappings for bias and weight
    bias_mapping = {
        "1": 1,
        "2": 1,
        "TRUE": 1,
        "POSSIBLE": 1,
        "3": -1,
        "FALSE": -1,
        "IMPOSSIBLE": -1,
    }
    weight_mapping = {
        "1": 0.25,
        "2": 0.25,
        "3": 0.5,
        "TRUE": 0.5,
        "POSSIBLE": 0.5,
        "FALSE": 0.5,
        "IMPOSSIBLE": 0.5,
    }

    # Calculate weighted values and biases
    score_df["weight"] = score_df["answer"].map(weight_mapping).astype(float)
    score_df["bias"] = score_df["answer"].map(bias_mapping).astype(float)
    score_df["value"] = score_df["value"].astype(float) * score_df["weight"]
    score_df["bias"] *= score_df["weight"]

    # Group by description (tuple_ID) and normalize by total weight
    grouped_scores = (
        score_df.groupby(["tuple_ID", "problemname", "problemsize"])
        .agg({"value": "sum", "bias": "sum", "weight": "sum"})
        .reset_index()
    )

    grouped_scores["value"] = grouped_scores["value"] / grouped_scores["weight"].where(
        grouped_scores["weight"] != 0, 1
    )
    grouped_scores["bias"] = grouped_scores["bias"] / grouped_scores["weight"].where(
        grouped_scores["weight"] != 0, 1
    )

    return grouped_scores


def compute_accuracy(grouped_scores: pd.DataFrame) -> float:
    """Compute the weighted accuracy from preprocessed scores."""
    problem_summary = (
        grouped_scores.groupby(["problemname", "problemsize"])
        .agg({"value": "mean"})
        .reset_index()
    )
    final_summary = (
        problem_summary.groupby("problemname").agg({"value": "mean"}).reset_index()
    )
    return float(final_summary["value"].mean())


def compute_bias(grouped_scores: pd.DataFrame) -> float:
    """Compute the weighted bias from preprocessed scores."""
    problem_summary = (
        grouped_scores.groupby(["problemname", "problemsize"])
        .agg({"bias": "mean"})
        .reset_index()
    )
    final_summary = (
        problem_summary.groupby("problemname").agg({"bias": "mean"}).reset_index()
    )
    return float(final_summary["bias"].mean())
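
For orientation, here is a minimal usage sketch (not part of the commit), assuming the helpers live at `inspect_evals.worldsense._utils`; the scores below are hand-built and hypothetical.

```python
from inspect_ai.scorer import Score

from inspect_evals.worldsense._utils import (
    compute_accuracy,
    compute_bias,
    preprocess_scores,
)

# Two hypothetical scores for the same description (tuple_ID 1): the model
# answered one option set with "1" (scored correct, "C") and another with
# "3" (scored incorrect, "I").
meta = {"tuple_ID": 1, "problemname": "Compl.trivial", "problemsize": 3}
scores = [
    Score(value="C", answer="1", metadata=dict(meta)),
    Score(value="I", answer="3", metadata=dict(meta)),
]

grouped = preprocess_scores(scores)
# value: (1.0 * 0.25 + 0.0 * 0.5) / 0.75 ≈ 0.33
# bias:  (+1 * 0.25 + -1 * 0.5) / 0.75 ≈ -0.33
print(compute_accuracy(grouped))
print(compute_bias(grouped))
```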
src/inspect_evals/worldsense/worldsense.py

Lines changed: 178 additions & 0 deletions

"""
WorldSense: A Synthetic Benchmark for Grounded Reasoning
in Large Language Models

Youssef Benchekroun, Megi Dervishi, Mark Ibrahim, Jean-Baptiste Gaya, Xavier Martinet, Grégoire Mialon, Thomas Scialom, Emmanuel Dupoux, Dieuwke Hupkes, Pascal Vincent
https://arxiv.org/pdf/2311.15930

# eval all problemnames w/ 500 randomly selected samples
inspect eval worldsense --limit 500

# add chain of thought
inspect eval worldsense --limit 500

# eval selected problem
inspect eval worldsense -T problemnames=Compl.normal
inspect eval worldsense -T problemnames=Consist.normal
inspect eval worldsense -T problemnames=Infer.normal,Infer.trivial
"""  # noqa: D205

import bz2
import json
from typing import Any, Callable, Dict

import requests
from inspect_ai import Task, task
from inspect_ai.dataset import Dataset, MemoryDataset, Sample
from inspect_ai.model import GenerateConfig
from inspect_ai.scorer import (
    Metric,
    Score,
    Scorer,
    Target,
    accuracy,
    metric,
    pattern,
    scorer,
    stderr,
)
from inspect_ai.solver import TaskState, generate

from ._utils import compute_accuracy, compute_bias, preprocess_scores


@task
def worldsense(problemnames: str | list[str] = []) -> Task:
    """
    Task for evaluating reasoning related to world descriptions. There are three problem types ("Infer", "Compl", "Consist") and two grades ("trivial", "normal"). A problemname is formed by concatenation "<type>.<grade>". See README for details.

    Args:
        problemnames (str | list[str], optional): A string or list of strings specifying the names of problems or tasks to filter the dataset. If provided, it filters the dataset to samples that contain matching metadata for the specified problem names.

    Returns:
        Task: A task object configured with a dataset filtered by problem names (if specified), a solver, a scoring pattern for evaluating task responses, and custom metrics.
    """
    # filter dataset if requested
    problemnames = problemnames if isinstance(problemnames, list) else [problemnames]
    if len(problemnames) > 0:
        task_dataset = dataset.filter(
            name=f"{dataset.name}-{'-'.join(problemnames)}",
            predicate=lambda sample: sample.metadata is not None
            and sample.metadata.get("problemname") in problemnames,
        )
    else:
        task_dataset = dataset

    return Task(
        dataset=task_dataset,
        solver=generate(),
        scorer=pattern_with_metadata(
            r"^\(?\s*(1|2|3|TRUE|FALSE|IMPOSSIBLE|POSSIBLE)\s*\)?"
        ),
        metrics=[accuracy(), stderr(), ws_accuracy(), ws_bias()],
        config=GenerateConfig(temperature=0.0),
    )


def record_to_sample(record: Dict[str, Any]) -> Sample:
    # The dataset obfuscates gold responses with author names; map them back
    # to the actual answer labels.
    goldresp_mapping: Dict[str, str] = {
        "Emmanuel": "TRUE",
        "Megi": "FALSE",
        "Dieuwke": "POSSIBLE",
        "Pascal": "IMPOSSIBLE",
        "Mark": "1",
        "Youssef": "2",
        "Yoda": "3",
    }

    return Sample(
        input=record["text"],
        choices=record["expectedresp"],
        target=goldresp_mapping.get(record["goldresp_obfusc"], "None"),
        metadata={
            "tuple_ID": record["tuple_ID"],
            "problemname": record["problemname"],
            "problemsize": record["problemsize"],
        },
    )


def load_worldsense_dataset(
    sample_fields: Callable[[dict[str, Any]], Sample],
    shuffle: bool = True,
) -> Dataset:
    """
    Load the worldsense dataset from a bz2 file directly into memory and return a Dataset.

    Args:
        sample_fields (Callable): Function to map records to samples.
        shuffle (bool): Whether to shuffle the dataset. Default is True.

    Returns:
        Dataset: The loaded and decompressed dataset in memory.
    """
    url = "https://github.com/facebookresearch/worldsense/raw/bd81d945077f169cf95ff39207f788f86e4645e9/data/worldsense/test_set/trials.jsonl.bz2"

    # Download and decompress the bz2 data directly into memory
    response = requests.get(url)
    response.raise_for_status()
    decompressed_data = bz2.decompress(response.content).decode("utf-8")

    # Parse the decompressed data into records
    lines = decompressed_data.strip().splitlines()
    samples = []
    for line in lines:
        record = json.loads(line)
        sample = sample_fields(record)
        if isinstance(sample, list):
            samples.extend(sample)
        else:
            samples.append(sample)

    # Create a MemoryDataset
    dataset = MemoryDataset(samples=samples, name="worldsense", location=url)

    # Shuffle if needed
    if shuffle:
        dataset.shuffle()

    return dataset


dataset = load_worldsense_dataset(sample_fields=record_to_sample, shuffle=True)


@scorer(metrics=[accuracy(), stderr()])
def pattern_with_metadata(
    pattern_str: str, ignore_case: bool = True, match_all: bool = False
) -> Scorer:
    # Wrap the built-in pattern scorer so that sample metadata (tuple_ID,
    # problemname, problemsize) is carried on each Score for the custom metrics.
    base_scorer = pattern(pattern_str, ignore_case, match_all)

    async def score(state: TaskState, target: Target) -> Score:
        base_score = await base_scorer(state, target)
        base_score.metadata = state.metadata
        return base_score

    return score


@metric
def ws_accuracy() -> Metric:
    """Compute weighted accuracy metric."""

    def metric(scores: list[Score]) -> float:
        grouped_scores = preprocess_scores(scores)
        return compute_accuracy(grouped_scores)

    return metric


@metric
def ws_bias() -> Metric:
    """Compute weighted bias metric."""

    def metric(scores: list[Score]) -> float:
        grouped_scores = preprocess_scores(scores)
        return compute_bias(grouped_scores)

    return metric
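
Finally, a minimal sketch (not part of the commit) of invoking the task from Python via `inspect_ai.eval`, assuming the package with its `worldsense` extra (pandas) is installed and a model provider is configured; the model identifier is a placeholder.

```python
from inspect_ai import eval

from inspect_evals.worldsense import worldsense

# Evaluate only the inference problems on a small random subset.
# "openai/gpt-4o" is a placeholder; substitute any configured provider/model.
logs = eval(
    worldsense(problemnames=["Infer.trivial", "Infer.normal"]),
    model="openai/gpt-4o",
    limit=100,
)
```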
