
Commit 74ba667

Merge pull request rllm-org#147 from zhenningdavidliu/class_eval
Class eval
2 parents b938c4f + b056e51 commit 74ba667

9 files changed: +381 −0 lines changed
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
# Use an official Python base image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Default command to keep the container running
CMD ["tail", "-f", "/dev/null"]

src/inspect_evals/class_eval/README.md

Whitespace-only changes.
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
from .class_eval import class_eval
Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
"""
ClassEval: Class-level Python code evaluation

Based on the paper https://arxiv.org/pdf/2308.01861 .
The dataset can be found either on https://huggingface.co/datasets/FudanSELab/ClassEval
or on https://github.com/FudanSELab/ClassEval

This is an inspect_ai implementation of the paper.
"""

import re
from typing import Any

from inspect_ai import Task, task, Epochs
from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.scorer import (
    CORRECT,
    INCORRECT,
    Score,
    Scorer,
    Target,
    scorer,
    mean,
    std,
)
from inspect_ai.solver import (
    generate,
    system_message,
    TaskState,
)
from inspect_ai.util import ExecResult, sandbox

from .utils import construct_prompt

# Timeout for scoring.
VERIFY_TIMEOUT = 30


@task
def class_eval(few_shot: int = 1, few_shot_seed: int = 42) -> Task:
    """Inspect Task implementation of ClassEval.

    Args:
        few_shot (int): The number of few shots to include. This also sets the
            epoch count used for the pass_at_{few_shot} metric.
        few_shot_seed (int): The seed for generating few shots.
    """
    dataset = hf_dataset(
        path="FudanSELab/ClassEval",
        split="test",
        sample_fields=record_to_sample,
    )

    INSTRUCTION = """

You are an expert Python programmer. You will be given a task, and the tests that your code must pass.

"""

    return Task(
        dataset=dataset,
        solver=[system_message(INSTRUCTION), generate()],
        scorer=class_eval_scorer(),
        epochs=Epochs(few_shot, [f"pass_at_{few_shot}"]),
        sandbox="docker",
    )


@scorer(metrics=[mean(), std()])
def class_eval_scorer() -> Scorer:
    """Scorer for ClassEval.

    It first extracts the Python code from the model output, then appends the
    test cases to that code and executes the result in the sandbox. If the code
    passes all the test cases we return a CORRECT score; otherwise we return an
    INCORRECT score.
    """

    async def score(state: TaskState, target: Target) -> Score:
        generated_code = find_code(state.output.completion)
        code = generated_code + "\n" + state.metadata["test"]

        explanation = ""
        explanation += "The following code was executed:\n\n```python\n"
        explanation += code
        explanation += "\n```\n"

        try:
            result = await sandbox().exec(
                cmd=["python", "-c", code],
                timeout=VERIFY_TIMEOUT,
            )

            if result.success:
                explanation += "All test cases passed.\n"
            else:
                explanation += "Code did not pass all test cases.\n"
                if result.stderr:
                    explanation += f"See details below.\n ```python\n {result.stderr} \n ```\n"
        except TimeoutError:
            result = ExecResult(False, 1, "", "Verification timed out.")
            explanation += "Verification timed out."

        return Score(
            value=CORRECT if result.success else INCORRECT,
            answer=generated_code,
            explanation=explanation,
        )

    return score


def find_code(completion: str) -> str:
    """Remove Markdown formatting around generated code blocks."""
    pattern = re.compile(r"```python\n(.*?)```", re.DOTALL)
    matches = pattern.findall(completion)
    extracted_answer = matches[0] if len(matches) >= 1 else completion

    return str(extracted_answer)


def record_to_sample(record: dict[str, Any]) -> Sample:
    """Map a ClassEval record onto an inspect Sample."""
    return Sample(
        input=construct_prompt(record),
        target=record["solution_code"],
        id=record["task_id"],
        metadata={
            "task_id": record["task_id"],
            "skeleton": record["skeleton"],
            "test": record["test"],
            "solution_code": record["solution_code"],
            "import_statement": record["import_statement"],
            "class_description": record["class_description"],
            "methods_info": record["methods_info"],
            "class_name": record["class_name"],
            "test_classes": record["test_classes"],
            "class_constructor": record["class_constructor"],
            "fields": record["fields"],
        },
    )
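
A minimal sketch of launching this task through inspect_ai's eval() entry point. The import path inspect_evals.class_eval and the model string are assumptions based on the repository layout; adjust them to the installed package and configured provider:

from inspect_ai import eval

from inspect_evals.class_eval import class_eval  # assumed import path

# Run five epochs per sample so the pass_at_5 metric is reported.
logs = eval(class_eval(few_shot=5), model="openai/gpt-4")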
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
In inference_pipeline, line 106 holds the response generation for OpenAI; the original implementation uses a simple system prompt.
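
For illustration only, a hedged sketch of what such a simple system-prompt OpenAI call looks like. This is not the upstream inference_pipeline code; the model name and prompt text are placeholders:

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
prompt = "Please complete the class Add_numbers in the following code.\n..."  # placeholder task prompt
response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # placeholder model name
    messages=[
        {"role": "system", "content": "You are an expert Python programmer."},
        {"role": "user", "content": prompt},
    ],
)
completion = response.choices[0].message.content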
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
numpy
scipy
pandas
python-docx
openpyxl
bs4
pillow
gensim
PyPDF2
nltk
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
import unittest
from inspect_ai.solver import (
    generate,
    system_message
)

from utils import construct_prompt
from class_eval import record_to_sample, find_code, class_eval

from test_data import (
    record,
    sample,
    prompt,
    sample_code,
    raw_code,
    INSTRUCTION
)


class TestClassEval(unittest.TestCase):

    def test_record_to_sample(self):
        self.assertEqual(record_to_sample(record), sample)

    def test_find_code(self):
        self.assertEqual(find_code(raw_code), sample_code)

    def test_task(self):
        task = class_eval(few_shot=5)
        self.assertEqual(task.dataset.name, 'FudanSELab/ClassEval')
        self.assertEqual(task.epochs, 5)
        self.assertEqual(task.sandbox.type, "docker")


class TestUtils(unittest.TestCase):

    def test_construct_prompt(self):
        self.assertEqual(construct_prompt(record), prompt)
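
A minimal sketch of running these tests with the standard library runner; the module name test_class_eval is an assumption and should match the actual file name in the repository:

import unittest

suite = unittest.defaultTestLoader.loadTestsFromName("test_class_eval")
unittest.TextTestRunner(verbosity=2).run(suite)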
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
from inspect_ai.dataset import Sample

record = {
    "task_id": "ClassEval_demo",
    "skeleton": "import numpy as np\n\nclass Add_numbers:\n \"\"\"\n This class adds numbers.\n \"\"\"\n\n def __init__(self):\n pass\n\n def add(self, x, y):\n \"\"\"\n Add the two numbers\n :param x: float, first number\n :param y: float, second number\n :return: float, sum of the two numbers\n >>> adder = Add_numbers()\n >>> adder.add(1,2)",
    "solution_code": "No solution code provided.",
    "import_statement": [
        "import numpy as np"
    ],
    "class_description": " \"\"\"\n This class adds numbers.\n \"\"\"\n",
    "class_name": "Add_numbers",
    "test": "No test code provided.",
    "test_classes": [
        "AddTest"
    ],
    "class_constructor": "class Add_numbers: \n def __init__(self):\n pass\n\n",
    "fields": [],
    "methods_info": [
        {
            "method_name": "add",
            "method_description": "def filter(self, request):\n \"\"\"\n Filter the incoming request based on certain rules and conditions.\n :param request: dict, the incoming request details\n :return: bool, True if the request is allowed, False otherwise\n >>> filter = AccessGatewayFilter()\n >>> filter.filter({'path': '/login', 'method': 'POST'})\n True\n\n \"\"\"",
            "test_class": "AddTest",
            "test_code": "pass",
            "dependencies": {
                "Standalone": True,
                "lib_dependencies": [],
                "field_dependencies": [],
                "method_dependencies": []
            }
        }
    ]
}


sample = Sample(
    input='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n ### Instruction:\n Please complete the class Add_numbers in the following code.\nimport numpy as np\n\nclass Add_numbers:\n """\n This class adds numbers.\n """\n\n def __init__(self):\n pass\n\n def add(self, x, y):\n """\n Add the two numbers\n :param x: float, first number\n :param y: float, second number\n :return: float, sum of the two numbers\n >>> adder = Add_numbers()\n >>> adder.add(1,2)\n\n ### Response:\n ',
    choices=None,
    target='No solution code provided.',
    id='ClassEval_demo',
    metadata={'task_id': 'ClassEval_demo', 'skeleton': 'import numpy as np\n\nclass Add_numbers:\n """\n This class adds numbers.\n """\n\n def __init__(self):\n pass\n\n def add(self, x, y):\n """\n Add the two numbers\n :param x: float, first number\n :param y: float, second number\n :return: float, sum of the two numbers\n >>> adder = Add_numbers()\n >>> adder.add(1,2)', 'test': 'No test code provided.', 'solution_code': 'No solution code provided.', 'import_statement': ['import numpy as np'], 'class_description': ' """\n This class adds numbers.\n """\n', 'methods_info': [{'method_name': 'add', 'method_description': 'def filter(self, request):\n """\n Filter the incoming request based on certain rules and conditions.\n :param request: dict, the incoming request details\n :return: bool, True if the request is allowed, False otherwise\n >>> filter = AccessGatewayFilter()\n >>> filter.filter({\'path\': \'/login\', \'method\': \'POST\'})\n True\n\n """', 'test_class': 'AddTest', 'test_code': 'pass', 'dependencies': {'Standalone': True, 'lib_dependencies': [], 'field_dependencies': [], 'method_dependencies': []}}], 'class_name': 'Add_numbers', 'test_classes': ['AddTest'], 'class_constructor': 'class Add_numbers: \n def __init__(self):\n pass\n\n', 'fields': []},
    sandbox=None,
    files=None,
    setup=None,
)


prompt = r'''Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Please complete the class Add_numbers in the following code.
import numpy as np

class Add_numbers:
"""
This class adds numbers.
"""

def __init__(self):
pass

def add(self, x, y):
"""
Add the two numbers
:param x: float, first number
:param y: float, second number
:return: float, sum of the two numbers
>>> adder = Add_numbers()
>>> adder.add(1,2)

### Response:
'''

raw_code = r"""Test text
```python
print("Hello, World!")```
Test text
"""

sample_code = r"""print("Hello, World!")"""


INSTRUCTION = """

You are an expert Python programmer. You will be given a task, and the tests that your code must pass.

"""
Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
"""
The InferenceUtil is taken from https://github.com/FudanSELab/ClassEval/blob/master/generation/inference_util.py as we want to keep faith with the original implementation.
"""

from enum import Enum


class ModelName(Enum):
    Instruct_CodeGen = 0
    WizardCoder = 1
    Instruct_StarCoder = 2
    InCoder = 3
    PolyCoder = 4
    SantaCoder = 5
    Vicuna = 6
    ChatGLM = 7
    GPT_3_5 = 8
    GPT_4 = 9
    others = 10
    Magicoder = 11
    CodeGeeX2 = 12
    DeepSeekCoder_inst = 13
    Gemini_Pro = 14
    CodeLlama_13b_inst = 15


class GenerationStrategy(Enum):
    Holistic = 0
    Incremental = 1
    Compositional = 2


class InferenceUtil:

    @staticmethod
    def generate_prompt(instruction, model_name):
        if model_name == ModelName.DeepSeekCoder_inst.value or model_name == ModelName.Gemini_Pro.value:
            return instruction

        elif model_name == ModelName.Magicoder.value:
            return f"""You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.

@@ Instruction:
{instruction}

@@ Response:
"""
        else:
            return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""


# For now, we default to using the prompts that work for GPT-3.5 and the holistic strategy.
# Note: the checks below compare against ModelName.*.value, so the Enum-member default
# (ModelName.GPT_3_5) falls through to the generic instruction/response template.
def construct_prompt(info, model_name=ModelName.GPT_3_5, strategy=GenerationStrategy.Holistic):
    prompt = ""
    if strategy == GenerationStrategy.Holistic:
        if model_name == ModelName.PolyCoder.value or model_name == ModelName.SantaCoder.value:
            skeleton = info['skeleton']
            prompt = skeleton
        else:
            class_name = info['class_name']
            skeleton = info['skeleton']
            instruction = f"Please complete the class {class_name} in the following code."
            instruction = instruction + '\n' + skeleton
            prompt = InferenceUtil.generate_prompt(instruction, model_name)

    elif strategy == GenerationStrategy.Incremental:
        if model_name == ModelName.PolyCoder.value or model_name == ModelName.SantaCoder.value:
            prompt = info['skeleton']
        else:
            prompt = info['instruction'] + info['skeleton']
            prompt = InferenceUtil.generate_prompt(prompt, model_name)

    elif strategy == GenerationStrategy.Compositional:
        if model_name == ModelName.PolyCoder.value or model_name == ModelName.SantaCoder.value:
            prompt = info['skeleton']
        else:
            prompt = info['instruction'] + info['skeleton']
            prompt = InferenceUtil.generate_prompt(prompt, model_name)

    return prompt
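
As a quick illustration of the default (GPT-3.5, holistic) path, construct_prompt only reads the class_name and skeleton fields; a minimal sketch with a hypothetical record:

from utils import construct_prompt

# Hypothetical record containing only the fields read on the default path.
demo_record = {
    "class_name": "Add_numbers",
    "skeleton": "class Add_numbers:\n    def add(self, x, y):\n        ...",
}
# Prints the generic instruction/response template wrapped around the skeleton.
print(construct_prompt(demo_record))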
