"""
ClassEval: class-level Python code generation evaluation.

Based on the paper https://arxiv.org/pdf/2308.01861 .
The dataset is available at https://huggingface.co/datasets/FudanSELab/ClassEval
and at https://github.com/FudanSELab/ClassEval .

This is an inspect_ai implementation of the paper's benchmark.
"""

import re
from typing import Any

from inspect_ai import Epochs, Task, task
from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.scorer import (
    CORRECT,
    INCORRECT,
    Score,
    Scorer,
    Target,
    mean,
    scorer,
    std,
)
from inspect_ai.solver import (
    TaskState,
    generate,
    system_message,
)
from inspect_ai.util import ExecResult, sandbox

from .utils import construct_prompt

# Timeout (in seconds) for executing the generated code during scoring.
VERIFY_TIMEOUT = 30


@task
def class_eval(few_shot: int = 1, few_shot_seed: int = 42) -> Task:
    """Inspect Task implementation of ClassEval.

    Args:
        few_shot (int): Number of generations per sample (epochs); the task
            reports pass@few_shot over these generations.
        few_shot_seed (int): Seed for sampling few-shot examples (reserved;
            not currently used in this implementation).
    """

    dataset = hf_dataset(
        path="FudanSELab/ClassEval",
        split="test",
        sample_fields=record_to_sample,
    )

    INSTRUCTION = (
        "You are an expert Python programmer. You will be given a task, "
        "and the tests that your code must pass."
    )

    return Task(
        dataset=dataset,
        solver=[system_message(INSTRUCTION), generate()],
        scorer=class_eval_scorer(),
        epochs=Epochs(few_shot, [f"pass_at_{few_shot}"]),
        sandbox="docker",
    )

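# Note: Epochs(k, [f"pass_at_{k}"]) runs each sample k times and reduces the
# per-epoch scores with inspect_ai's pass_at reducer. For reference, the
# unbiased pass@k estimator from the Codex paper (also used by ClassEval),
# given n generations of which c pass, is sketched by:
#
#   from math import comb
#
#   def pass_at_k(n: int, c: int, k: int) -> float:
#       if n - c < k:
#           return 1.0
#       return 1.0 - comb(n - c, k) / comb(n, k)

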
@scorer(metrics=[mean(), std()])
def class_eval_scorer() -> Scorer:
    """Score a ClassEval completion.

    Extracts the Python code from the model output, appends the dataset's
    test cases, and executes the result in the sandbox. The score is CORRECT
    if all test cases pass and INCORRECT otherwise.
    """

    async def score(state: TaskState, target: Target) -> Score:
        generated_code = find_code(state.output.completion)
        # Append the dataset's test suite to the extracted class definition.
        code = generated_code + "\n" + state.metadata["test"]
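
        # For illustration (hypothetical class name; assumes the dataset's
        # `test` field contains a runnable unittest suite, as in ClassEval):
        #
        #   class StockPortfolio: ...                      # model-generated
        #
        #   import unittest
        #   class StockPortfolioTest(unittest.TestCase): ...
        #
        # The sandboxed `python -c` call below exits non-zero when a test
        # fails, which is what `result.success` reflects.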

        explanation = "The following code was executed:\n\n```python\n"
        explanation += code
        explanation += "\n```\n"

        try:
            result = await sandbox().exec(
                cmd=["python", "-c", code],
                timeout=VERIFY_TIMEOUT,
            )

            if result.success:
                explanation += "All test cases passed.\n"
            else:
                explanation += "Code did not pass all test cases.\n"
                if result.stderr:
                    explanation += (
                        f"See details below.\n```python\n{result.stderr}\n```\n"
                    )
        except TimeoutError:
            result = ExecResult(False, 1, "", "Verification timed out.")
            explanation += "Verification timed out."

        return Score(
            value=CORRECT if result.success else INCORRECT,
            answer=generated_code,
            explanation=explanation,
        )

    return score


def find_code(completion: str) -> str:
    """Extract the first fenced Python block from a completion.

    Falls back to the raw completion if no fenced block is found.
    """
    pattern = re.compile(r"```python\n(.*?)```", re.DOTALL)
    matches = pattern.findall(completion)
    extracted_answer = matches[0] if len(matches) >= 1 else completion
    return str(extracted_answer)
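
# Example: find_code("Here:\n```python\nx = 1\n```") returns "x = 1\n".
# If the completion contains no fenced block, it is returned unchanged.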


def record_to_sample(
    record: dict[str, Any],
) -> Sample:
    """Map a ClassEval dataset record to an inspect_ai Sample."""
    return Sample(
        input=construct_prompt(record),
        target=record["solution_code"],
        id=record["task_id"],
        metadata={
            "task_id": record["task_id"],
            "skeleton": record["skeleton"],
            "test": record["test"],
            "solution_code": record["solution_code"],
            "import_statement": record["import_statement"],
            "class_description": record["class_description"],
            "methods_info": record["methods_info"],
            "class_name": record["class_name"],
            "test_classes": record["test_classes"],
            "class_constructor": record["class_constructor"],
            "fields": record["fields"],
        },
    )