Skip to content

Commit 0835b12

Browse files
addressed issues in class_eval.py
1 parent ccbbc37 commit 0835b12

File tree

2 files changed

+20
-22
lines changed

2 files changed

+20
-22
lines changed

src/inspect_evals/class_eval/class_eval.py

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -33,23 +33,11 @@
3333

3434
from utils import construct_prompt
3535

36-
37-
INSTRUCTION = """
38-
39-
You are an expert Python programmer. You will be given a task, and the tests that your code must pass.
40-
41-
"""
42-
4336
# Timeout for scoring.
4437
VERIFY_TIMEOUT = 30
4538

4639
@task
47-
def class_eval(
48-
k_shot: int = 1,
49-
solver: Solver | None = None,
50-
instruction_prompt: str = INSTRUCTION,
51-
scorer: Scorer | list[Scorer] | None = None,
52-
) -> Task:
40+
def class_eval(few_shot: int = 1, few_shot_seed: int = 42) -> Task:
5341
"""Inspect Task implementation of ClassEval.
5442
5543
Args:
@@ -63,24 +51,35 @@ def class_eval(
6351
dataset = hf_dataset(
6452
path="FudanSELab/ClassEval",
6553
split="test",
66-
sample_fields=record_to_sample,
54+
sample_fields=record_to_sample
6755
)
6856

69-
solver = [system_message(instruction_prompt), generate()]
57+
INSTRUCTION = """
58+
59+
You are an expert Python programmer. You will be given a task, and the tests that your code must pass.
60+
61+
"""
62+
63+
solver = [system_message(INSTRUCTION), generate()]
7064

7165
scorer = class_eval_scorer()
7266

7367
return Task(
7468
dataset=dataset,
7569
solver=solver or generate(),
7670
scorer=scorer,
77-
epochs = Epochs(k_shot, [f"pass_at_{k_shot}"]),
71+
epochs = Epochs(few_shot, [f"pass_at_{few_shot}"]),
7872
sandbox="docker",
7973
)
8074

8175

8276
@scorer(metrics=[mean(), std()])
8377
def class_eval_scorer() -> Scorer:
78+
'''
79+
This is the scorer for class eval. It will first identify the python code in the output of
80+
the model. Then we append the test cases to the code and execute it. If the code passes all the test cases,
81+
we return a CORRECT score. Otherwise, we return an INCORRECT score.
82+
'''
8483

8584
async def score(state:TaskState, target: Target) -> Score:
8685

@@ -105,10 +104,7 @@ async def score(state:TaskState, target: Target) -> Score:
105104
else:
106105
explanation += "Code did not pass all test cases.\n"
107106
if result.stderr:
108-
explanation += "See details below.\n"
109-
explanation += "```python\n"
110-
explanation += result.stderr + "\n"
111-
explanation += "```\n"
107+
explanation += f"See details below.\n ```python\n {result.stderr} \n ```\n"
112108
except TimeoutError:
113109
result = ExecResult(False, 1, "", "Verification timed out.")
114110
explanation += "Verification timed out."
@@ -131,10 +127,12 @@ def find_code(completion: str) -> str:
131127
return str(extracted_answer)
132128

133129

134-
# map class_eval record into inspect sample
135130
def record_to_sample(
136131
record: dict[str, Any],
137132
) -> Sample:
133+
'''
134+
Maps class_eval record into inspect sample
135+
'''
138136
return Sample(
139137
input = construct_prompt(record),
140138
target = record["solution_code"],

src/inspect_evals/class_eval/test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def test_find_code(self):
2525
self.assertEqual(find_code(raw_code), sample_code)
2626

2727
def test_task(self):
28-
task = class_eval(k_shot = 5)
28+
task = class_eval(few_shot = 5)
2929
self.assertEqual(task.dataset.name, 'FudanSELab/ClassEval')
3030
self.assertEqual(task.epochs, 5)
3131
self.assertEqual(task.sandbox.type, "docker")

0 commit comments

Comments
 (0)