
Commit b8f4b6c

Refactor metric, improve documentation, cleanup
- Refactored the metric computation in `_utils.py`:
  - Split the `ws_metric` function into three separate functions for better modularity:
    - `preprocess_scores`: Preprocesses scores into a DataFrame with computed weights and biases.
    - `compute_accuracy`: Computes weighted accuracy from preprocessed scores.
    - `compute_bias`: Computes weighted bias from preprocessed scores.
  - Updated `ws_accuracy` and `ws_bias` in `worldsense.py` to utilize the new functions.
- Improved documentation:
  - Added detailed explanations of problem types and grades in `README.md`, clarifying how `problemname` is formed.
  - Included a comprehensive docstring for the `worldsense` task in `worldsense.py`, explaining the task's purpose and usage.
1 parent 38e6af4 commit b8f4b6c

3 files changed: 87 additions & 76 deletions

src/inspect_evals/worldsense/README.md

Lines changed: 15 additions & 4 deletions
@@ -4,20 +4,31 @@
 ## Dataset
-Here is an example prompt from the dataset:
+Here is an example prompt ("description") from the dataset (`problemname == "Compl.trivial"`):

 >"Alice is enrolled in 3 courses per week: Alice takes history before computer science and economics before history.
 >Choose one of the following alternatives: (1) Alice takes history in between economics and computer science, (2) Alice takes history outside of the time range between economics and computer science, or (3) it is impossible to decide.
 >Think carefully, and only respond with one of these possible options (1), (2), or (3).

-The model is then tasked to pick the correct choice.
+There are three problem types:
+
+1. "Infer" (inference): Determine whether a given statement about a description is true or false.
+2. "Compl" (completion): Select which of three statements is true about a description, including an option for when it is not possible to decide.
+3. "Consist" (consistency): Determine whether a statement about a description is possible or impossible.
+
+In addition, there are two grades:
+
+1. "trivial": can be solved from the statements alone
+2. "normal": requires a world model to solve
+
+A problemname is formed by the concatenation "<type>.<grade>".

 ## Scoring

 - Simple accuracy
 - Standard error
-- Weighted accuracy by `tuple_ID`, `problemname`, and `problemsize` (reported in the paper)
-- Weighted bias by `tuple_ID`, `problemname`, and `problemsize` (reported in the paper)
+- Weighted accuracy by `tuple_ID`, `problemname`, and `problemsize` (as reported in the paper)
+- Weighted bias by `tuple_ID`, `problemname`, and `problemsize` (as reported in the paper)

 In addition to built-in metrics, the main results are weighted accuracy and bias. Here the primary unit is a `tuple_ID`, which corresponds to one "description" or scenario as above. For each, up to three answer option sets are provided to ensure that all possible correct answers for the options are selected exactly once. Weights are used to average over multiple `tuple_ID`s.
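
The weighting scheme in the paragraph above is easiest to see on toy data. The following sketch is an illustration only (not from the repository; the `tuple_ID`s, weights, and correctness values are made up) and mirrors the per-`tuple_ID` normalization that `_utils.py` applies within a single `problemname`/`problemsize` bucket:

```python
import pandas as pd

# Toy data: two descriptions (tuple_IDs); weights and correctness are made up.
toy = pd.DataFrame(
    {
        "tuple_ID": ["t1", "t1", "t2"],
        "problemname": ["Compl.trivial"] * 3,
        "problemsize": [3, 3, 3],
        "value": [1.0, 0.0, 1.0],   # 1.0 = answered correctly, 0.0 = not
        "weight": [0.5, 0.5, 1.0],  # per-answer-option weights (illustrative)
    }
)

# Weight each response, then normalize within a description (tuple_ID) ...
toy["weighted"] = toy["value"] * toy["weight"]
per_description = (
    toy.groupby(["tuple_ID", "problemname", "problemsize"])
    .agg(value=("weighted", "sum"), weight=("weight", "sum"))
    .reset_index()
)
per_description["value"] /= per_description["weight"]

# ... then average across descriptions: (0.5 + 1.0) / 2 = 0.75
print(per_description["value"].mean())
```

Normalizing by the summed weight before averaging keeps each description's contribution equal, no matter how many answer option sets it has.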

src/inspect_evals/worldsense/_utils.py

Lines changed: 58 additions & 69 deletions
@@ -2,48 +2,33 @@
 from inspect_ai.scorer import Score, ValueToFloat, value_to_float


-def ws_metric(
-    scores: list[Score],
-    kind: str,
-    to_float: ValueToFloat = value_to_float(),
-) -> float:
-    """Compute the weighted accuracy or bias metric.
-
-    Args:
-        scores (list[Score]): List of Score objects containing evaluation data.
-        kind (str): Type of metric to compute ('acc' for accuracy or 'bias').
-        to_float (ValueToFloat, optional): Function to convert `Score` values to floats. Defaults to `value_to_float()`.
-
-    Returns:
-        float: The computed metric value.
-    """
-    # Build DataFrame from the list of Score objects
-    data = []
-    for score in scores:
-        value = to_float(score.value)
-        answer = score.answer
-        metadata = score.metadata or {}
-
-        tuple_ID = metadata.get("tuple_ID")
-        problemname = metadata.get("problemname")
-        problemsize = metadata.get("problemsize")
-
-        if None in (value, answer, tuple_ID, problemname, problemsize):
-            continue
-
-        data.append(
-            {
-                "value": value,
-                "answer": answer,
-                "tuple_ID": tuple_ID,
-                "problemsize": problemsize,
-                "problemname": problemname,
-            }
+def preprocess_scores(
+    scores: list[Score], to_float: ValueToFloat = value_to_float()
+) -> pd.DataFrame:
+    """Preprocesses a list of Score objects into a DataFrame with computed weights and biases."""
+    processed_scores = [
+        {
+            "value": to_float(score.value),
+            "answer": score.answer,
+            "tuple_ID": score.metadata.get("tuple_ID"),
+            "problemname": score.metadata.get("problemname"),
+            "problemsize": score.metadata.get("problemsize"),
+        }
+        for score in scores
+        if score.metadata
+        and None
+        not in (
+            to_float(score.value),
+            score.answer,
+            score.metadata.get("tuple_ID"),
+            score.metadata.get("problemname"),
+            score.metadata.get("problemsize"),
         )
+    ]

-    df = pd.DataFrame(data)
+    score_df = pd.DataFrame(processed_scores)

-    # Define mappings for bias and weight
+    # Mappings for bias and weight
     bias_mapping = {
         "1": 1,
         "2": 1,
@@ -63,46 +48,50 @@ def ws_metric(
         "IMPOSSIBLE": 0.5,
     }

-    # Calculate weight and bias values
-    df["weight"] = df["answer"].map(weight_mapping).astype(float)
-    df["bias"] = df["answer"].map(bias_mapping).astype(float)
-    df["value"] = df["value"].astype(float)
-    df["value"] *= df["weight"]
-    df["bias"] *= df["weight"]
+    # Calculate weighted values and biases
+    score_df["weight"] = score_df["answer"].map(weight_mapping).astype(float)
+    score_df["bias"] = score_df["answer"].map(bias_mapping).astype(float)
+    score_df["value"] = score_df["value"].astype(float) * score_df["weight"]
+    score_df["bias"] *= score_df["weight"]

-    # Step 5.1: Group by 'tuple_ID', 'problemname', 'problemsize' and sum
-    grouped = (
-        df.groupby(["tuple_ID", "problemname", "problemsize"])
+    # Group and normalize
+    grouped_scores = (
+        score_df.groupby(["tuple_ID", "problemname", "problemsize"])
         .agg({"value": "sum", "bias": "sum", "weight": "sum"})
         .reset_index()
     )

-    # Step 5.2: Normalize 'value' and 'bias' by dividing by total 'weight'
-    grouped["value"] = grouped["value"] / grouped["weight"].where(
-        grouped["weight"] != 0, 1
+    grouped_scores["value"] = grouped_scores["value"] / grouped_scores["weight"].where(
+        grouped_scores["weight"] != 0, 1
     )
-    grouped["bias"] = grouped["bias"] / grouped["weight"].where(
-        grouped["weight"] != 0, 1
+    grouped_scores["bias"] = grouped_scores["bias"] / grouped_scores["weight"].where(
+        grouped_scores["weight"] != 0, 1
     )

-    # Step 6: Compute mean 'acc' and 'bias' grouped by 'problemname' and 'problemsize'
-    summaries = (
-        grouped.groupby(["problemname", "problemsize"])
-        .agg({"value": "mean", "bias": "mean"})
+    return grouped_scores
+
+
+def compute_accuracy(grouped_scores: pd.DataFrame) -> float:
+    """Compute the weighted accuracy from preprocessed scores."""
+    problem_summary = (
+        grouped_scores.groupby(["problemname", "problemsize"])
+        .agg({"value": "mean"})
         .reset_index()
     )
+    final_summary = (
+        problem_summary.groupby("problemname").agg({"value": "mean"}).reset_index()
+    )
+    return float(final_summary["value"].mean())
+

-    # Step 7: Compute overall mean 'acc' and 'bias' grouped by 'problemname'
-    final_summaries = (
-        summaries.groupby("problemname")
-        .agg({"value": "mean", "bias": "mean"})
+def compute_bias(grouped_scores: pd.DataFrame) -> float:
+    """Compute the weighted bias from preprocessed scores."""
+    problem_summary = (
+        grouped_scores.groupby(["problemname", "problemsize"])
+        .agg({"bias": "mean"})
         .reset_index()
     )
-
-    # Compute the final metric
-    if kind == "acc":
-        return float(final_summaries["value"].mean())
-    elif kind == "bias":
-        return float(final_summaries["bias"].mean())
-    else:
-        raise ValueError("Invalid kind argument, must be 'acc' or 'bias'")
+    final_summary = (
+        problem_summary.groupby("problemname").agg({"bias": "mean"}).reset_index()
+    )
+    return float(final_summary["bias"].mean())
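
For orientation, here is a hedged sketch of how the refactored helpers compose, replacing the old single `ws_metric(scores, kind=...)` entry point. It is not part of the commit: the `Score` values and metadata are toy data, and it assumes `Score` can be constructed with the `value`/`answer`/`metadata` fields that `preprocess_scores` reads.

```python
from inspect_ai.scorer import Score

from inspect_evals.worldsense._utils import (
    compute_accuracy,
    compute_bias,
    preprocess_scores,
)

# Two toy scores for the same description (tuple_ID); the metadata keys match
# what preprocess_scores() reads above.
scores = [
    Score(
        value="C",  # value_to_float() maps "C" (correct) to 1.0
        answer="1",
        metadata={"tuple_ID": 1, "problemname": "Compl.trivial", "problemsize": 3},
    ),
    Score(
        value="I",  # and "I" (incorrect) to 0.0
        answer="2",
        metadata={"tuple_ID": 1, "problemname": "Compl.trivial", "problemsize": 3},
    ),
]

# Previously: ws_metric(scores, kind="acc") / ws_metric(scores, kind="bias").
grouped = preprocess_scores(scores)  # one row per (tuple_ID, problemname, problemsize)
print(compute_accuracy(grouped))     # weighted accuracy
print(compute_bias(grouped))         # weighted bias
```

Separating the preprocessing from the two reductions removes the `kind` flag and keeps each metric a small, single-purpose function.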

src/inspect_evals/worldsense/worldsense.py

Lines changed: 14 additions & 3 deletions
@@ -38,11 +38,20 @@
 )
 from inspect_ai.solver import TaskState, generate

-from ._utils import ws_metric
+from ._utils import compute_accuracy, compute_bias, preprocess_scores


 @task
 def worldsense(problemnames: str | list[str] = []) -> Task:
+    """
+    Task for evaluating reasoning related to world descriptions. There are three problem types ("Infer", "Compl", "Consist") and two grades ("trivial", "normal"). A problemname is formed by concatenation "<type>.<grade>". See README for details.
+
+    Args:
+        problemnames (str | list[str], optional): A string or list of strings specifying the names of problems or tasks to filter the dataset. If provided, it filters the dataset to samples that contain matching metadata for the specified problem names.
+
+    Returns:
+        Task: A task object configured with a dataset filtered by problem names (if specified), a solver, a scoring pattern for evaluating task responses, and custom metrics.
+    """
     # filter dataset if requested
     problemnames = problemnames if isinstance(problemnames, list) else [problemnames]
     if len(problemnames) > 0:
@@ -152,7 +161,8 @@ def ws_accuracy() -> Metric:
     """Compute weighted accuracy metric."""

     def metric(scores: list[Score]) -> float:
-        return ws_metric(scores, kind="acc")
+        grouped_scores = preprocess_scores(scores)
+        return compute_accuracy(grouped_scores)

     return metric

@@ -162,6 +172,7 @@ def ws_bias() -> Metric:
     """Compute weighted bias metric."""

     def metric(scores: list[Score]) -> float:
-        return ws_metric(scores, kind="bias")
+        grouped_scores = preprocess_scores(scores)
+        return compute_bias(grouped_scores)

     return metric
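
As a usage note, the sketch below (not part of this commit) shows the `problemnames` filter described in the new docstring, run programmatically through `inspect_ai`'s `eval()`; the model string is illustrative only.

```python
from inspect_ai import eval

from inspect_evals.worldsense.worldsense import worldsense

# Restrict the dataset to the trivial completion problems described in the README;
# the model name is an example, not a requirement.
logs = eval(worldsense(problemnames="Compl.trivial"), model="openai/gpt-4o")
```

Passing a single string works because the task wraps it into a list before filtering, as shown in the diff above.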
