@@ -36,6 +36,7 @@
 from aenum import extend_enum

 from lighteval.metrics.metrics import Metrics, SampleLevelMetric
+from lighteval.metrics.metrics_sample import SampleLevelComputation
 from lighteval.models.model_output import ModelResponse
 from lighteval.tasks.extended.lcb.codegen_metrics import (
     codegen_metrics,
@@ -80,38 +81,39 @@ def lcb_codegeneration_prompt_fn(line, task_name: str = "lcb:codegeneration") -> |
 )


-def codegen_metric(model_response: ModelResponse, doc: Doc, **kwargs) -> float:
-    """Estimates the Pass@1 metric for the code generation task.
-    Extract the code from each prediction, Runs it for each sample and generations,
-    and computes the Pass@1 over the outputs.
-    """
-    assert doc.specific is not None, "Doc specific field is required for codegen_metric"
-
-    predictions = model_response.final_text
-    # Extract generated code snippets
-    generated_code_snippets = [[extract_code(pred) for pred in predictions]]  # noqa: F841
-    evaluation_sample = {  # noqa: F841
-        "inputs": doc.specific["inputs"],
-        "outputs": doc.specific["outputs"],
-        "fn_name": doc.specific["fn_name"],
-    }
-    # This is a list of lists because
-    evaluation_sample = [{"input_output": json.dumps(evaluation_sample)}]
-
-    metrics, _ = codegen_metrics(
-        evaluation_sample,
-        generated_code_snippets,
-        k_list=[1],  # Only run for Pass@1
-        num_process_evaluate=8,
-    )
-    return metrics["pass@1"]
+class CodegenMetric(SampleLevelComputation):
+    def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> dict:
+        """Estimates the Pass@1 metric for the code generation task.
+        Extracts the code from each prediction, runs it for each sample and generation,
+        and computes Pass@1 over the outputs.
+        """
+        assert doc.specific is not None, "Doc specific field is required for codegen_metric"
+
+        predictions = model_response.final_text
+        # Extract generated code snippets
+        generated_code_snippets = [[extract_code(pred) for pred in predictions]]  # noqa: F841
+        evaluation_sample = {  # noqa: F841
+            "inputs": doc.specific["inputs"],
+            "outputs": doc.specific["outputs"],
+            "fn_name": doc.specific["fn_name"],
+        }
+        # Wrapped in a list because codegen_metrics scores a batch of samples
+        evaluation_sample = [{"input_output": json.dumps(evaluation_sample)}]
+
+        metrics, _ = codegen_metrics(
+            evaluation_sample,
+            generated_code_snippets,
+            k_list=[1],  # Only run for Pass@1
+            num_process_evaluate=8,
+        )
+        return metrics["pass@1"]


 lcb_codegen_metric = SampleLevelMetric(
     metric_name="codegen_pass@1:16",  # This is how the number of generations is currently specified
     category=SamplingMethod.GENERATIVE,
     higher_is_better=True,
-    sample_level_fn=codegen_metric,
+    sample_level_fn=CodegenMetric(),
     corpus_level_fn=np.mean,
     batched_compute=False,
 )
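
For context, here is a minimal, untested sketch of how the refactored metric could be exercised directly, outside the task pipeline. It is not part of the commit: it assumes Doc is importable from lighteval.tasks.requests, that Doc and ModelResponse accept the constructor fields shown, and that ModelResponse.final_text exposes the generations passed as text; exact signatures may differ between lighteval versions.

    # Hypothetical usage sketch; the Doc/ModelResponse constructor fields
    # below are assumptions, not confirmed lighteval API.
    from lighteval.models.model_output import ModelResponse
    from lighteval.tasks.requests import Doc

    doc = Doc(
        task_name="lcb:codegeneration",
        query="Read two integers from stdin and print their sum.",
        choices=[],
        gold_index=0,
        specific={
            "inputs": ["1 2\n"],   # one stdin fixture per test case
            "outputs": ["3\n"],    # expected stdout for each fixture
            "fn_name": None,       # None for stdin/stdout-style problems
        },
    )
    response = ModelResponse(text=["```python\nprint(sum(map(int, input().split())))\n```"])

    # compute() extracts the code, runs it against the fixtures, and returns the pass@1 score
    score = CodegenMetric().compute(model_response=response, doc=doc)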