
Commit 67c4409

Merge branch 'main' into mercury
2 parents: 49e6f24 + 84b96da

7 files changed: +212, -10 lines

bigcode_eval/tasks/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -4,8 +4,8 @@
 from . import (apps, codexglue_code_to_text, codexglue_text_to_text, conala,
                concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack,
                instruct_humaneval, instruct_wizard_humaneval, mbpp, mbppplus,
-               multiple, parity, python_bugs, quixbugs, recode, santacoder_fim,
-               mercury)
+               multiple, parity, python_bugs, quixbugs, recode, santacoder_fim,
+               studenteval, mercury)

 TASK_REGISTRY = {
     **apps.create_all_tasks(),
@@ -29,6 +29,7 @@
     **instruct_humaneval.create_all_tasks(),
     **recode.create_all_tasks(),
     **santacoder_fim.create_all_tasks(),
+    "studenteval": studenteval.StudentEval,
     "mercury": mercury.Mercury,
 }

bigcode_eval/tasks/humanevalpack.py

Lines changed: 5 additions & 1 deletion
@@ -228,15 +228,19 @@ def get_prompt(self, prompt_base, instruction, context=None):
         elif self.prompt == "codellama":
             # https://hf.co/codellama
             prompt = f"[INST] {inp.strip()} [/INST] {prompt_base}"
+        elif self.prompt == "deepseek":
+            prompt = f"You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n{inp.strip()}\n### Response:\n{prompt_base}"
         elif self.prompt in ["tulu", "gritlm"]:
             # https://hf.co/GritLM/GritLM-7B
             prompt = f"<|user|>\n{inp}\n<|assistant|>\n{prompt_base}"
         elif self.prompt == "zephyr":
             # https://hf.co/HuggingFaceH4/zephyr-7b-beta
             prompt = f"<|user|>\n{inp}</s>\n<|assistant|>\n{prompt_base}"
-        elif self.prompt == "yi":
+        elif self.prompt in ["yi", "starchat2", "codeqwen"]:
             # https://hf.co/01-ai/Yi-34B-Chat
             prompt = f"<|im_start|>user\n{inp}<|im_end|>\n<|im_start|>assistant\n{prompt_base}"
+        elif self.prompt == "codegemma":
+            prompt = f"<start_of_turn>user\n{inp}<end_of_turn>\n<start_of_turn>model\n{prompt_base}"
         elif self.prompt == "codellama-70b":
             prompt = f"Source: user\n\n {inp.strip()} Source: assistant\nDestination: user \n\n{prompt_base}"
         elif self.prompt == "aurora-m":
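
For orientation, a minimal hedged sketch (not part of the commit) of the two chat formats added above; `inp` and `prompt_base` are made-up placeholders for the instruction text and partial solution that the harness assembles earlier in `get_prompt`:

```python
# Hypothetical placeholder inputs, only to illustrate the template strings in the diff above.
inp = "Write a function that returns the sum of two integers."
prompt_base = "def add(a, b):\n"

# "yi" / "starchat2" / "codeqwen" style (ChatML-like turn tags), as added in the diff.
chatml_prompt = f"<|im_start|>user\n{inp}<|im_end|>\n<|im_start|>assistant\n{prompt_base}"

# "codegemma" style turn markers, as added in the diff.
codegemma_prompt = f"<start_of_turn>user\n{inp}<end_of_turn>\n<start_of_turn>model\n{prompt_base}"

print(chatml_prompt)
print(codegemma_prompt)
```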

bigcode_eval/tasks/humanevalplus.py

Lines changed: 3 additions & 3 deletions
@@ -29,11 +29,11 @@ class GeneralHumanEvalPlus(GeneralHumanEval):

     DATASET_PATH = "evalplus/humanevalplus"

-    def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=10.0):
-        if timeout < 10.0:
+    def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=20.0):
+        if timeout < 20.0:
             warn(
                 "It is suggested to have a longer timeout as HumanEval+ has lots of tests. "
-                f"The current timeout is {timeout}s while the suggested timeout is 10s."
+                f"The current timeout is {timeout}s while the suggested timeout is 20s."
             )
         super().__init__(strip_prompt, k, num_workers, timeout)

bigcode_eval/tasks/mbppplus.py

Lines changed: 1 addition & 4 deletions
@@ -4,7 +4,7 @@
 The MBPP+ dataset is created by the EvalPlus framework which extends the original MBPP dataset
 by adding more automatically generated test cases to each problem. Note MBPP+ only includes 399
 tasks which are a subset of the original MBPP dataset. The subset is selected from the sanitized
-MBPP (a subset of manually examined tasks by the original MBPP authors) and EvalPlus further 
+MBPP (a subset of manually examined tasks by the original MBPP authors) and EvalPlus further
 removes low-quality and ill-formed tasks for benchmark quality control.

 Homepage: https://github.com/evalplus/evalplus
@@ -56,9 +56,6 @@ def get_reference(self, doc):
     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
         dataset = self.dataset["test"]
-        assert (
-            len(dataset) == 399
-        ), "MBPP+ only has 399 problems. Please retry by deleting its old cache"
         return dataset

     def process_results(self, generations, references):

bigcode_eval/tasks/studenteval.py

Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
+"""
+StudentEval is a dataset of 1,749 prompts for 48 problems, authored by 80
+students who have only completed a one-semester Python programming class.
+Unlike many other benchmarks, it has multiple prompts per problem and multiple
+attempts by the same participant.
+
+Web page: https://huggingface.co/datasets/wellesley-easel/StudentEval
+"""
+
+from bigcode_eval.base import Task
+from datasets import load_dataset
+from multiprocessing import cpu_count
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+import tempfile
+import pandas as pd
+import numpy as np
+import subprocess
+
+_CITATION = """\
+@misc{babe2023studenteval,
+      title={StudentEval: A Benchmark of Student-Written Prompts for Large Language Models of Code},
+      author={Hannah McLean Babe and Sydney Nguyen and Yangtian Zi and Arjun Guha and Molly Q Feldman and Carolyn Jane Anderson},
+      year={2023},
+      eprint={2306.04556},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}"""
+
+EXECUTION_TIMEOUT = 15
+
+
+# Source: Chen et al. Evaluating Large Language Models of Code. 2021
+def _estimator(n: int, c: int, k: int) -> float:
+    """
+    Calculates 1 - comb(n - c, k) / comb(n, k).
+    """
+    assert c <= n, "c must be less than n"
+    if n - c < k:
+        return 1.0
+    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+
+def _run_assembled_program(item):
+    """
+    Runs the program with a timeout. The result dictionary has a "success" key
+    that is 1 on success and 0 on failure. It also includes keys necessary to
+    group results (problem, prompt, and group) and report results for each
+    subset of StudentEval.
+    """
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".py") as f:
+        f.write(item["program"])
+        f.flush()
+        try:
+            result = subprocess.run(
+                ["python3", f.name],
+                timeout=EXECUTION_TIMEOUT,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                stdin=subprocess.DEVNULL,
+            )
+            exit_code = result.returncode
+        except subprocess.TimeoutExpired:
+            exit_code = 1
+    return {
+        "problem": item["problem"],
+        "prompt": item["prompt"],
+        "group": item["group"],
+        "success": 1 if exit_code == 0 else 0,
+    }
+
+
+def _get_group(item):
+    """
+    These boolean flags are mutually exclusive in the dataset. We turn them into
+    a string for easy grouping with Pandas.
+    """
+    if item["is_first_success"]:
+        return "First Success"
+    if item["is_last_success"]:
+        return "Last Success"
+    if item["is_first_failure"]:
+        return "First Failure"
+    if item["is_last_failure"]:
+        return "Last Failure"
+    return None
+
+
+class StudentEval(Task):
+    DATASET_PATH = "wellesley-easel/StudentEval"
+
+    def __init__(self):
+        self.stop_words = ["\ndef", "\nclass", "\nif", "\nprint"]
+        self.requires_execution = True
+        self.dataset = load_dataset(path=self.DATASET_PATH)
+        # NOTE(Arjun Guha): Avoiding .filter so that we don't get a datasets
+        # cache item on disk.
+        self.dataset = [
+            item for item in self.dataset["test"] if _get_group(item) is not None
+        ]
+
+    def get_dataset(self):
+        return self.dataset
+
+    def get_prompt(self, doc):
+        return doc["prompt"].rstrip()
+
+    # For a task with tests, the reference solution is the suite of tests.
+    def get_reference(self, doc):
+        return {
+            "prompt": doc["prompt"],
+            "assertions": doc["assertions"],
+            "problem": doc["problem"],
+            "group": _get_group(doc),
+        }
+
+    def postprocess_generation(self, generation, idx):
+        """Defines the postprocessing for a LM generation.
+        :param generation: str
+            code generation from LM
+        :param idx: int
+            index of doc in the dataset to which the generation belongs
+            (not used for Humaneval-Task)
+        """
+        prompt = self.get_prompt(self.dataset[idx])
+        generation = generation[len(prompt) :]
+        return prompt + self._stop_at_stop_token(generation, self.stop_words)
+
+    def process_results(self, generations, references):
+        """Takes the list of LM generations and evaluates them against ground truth references,
+        returning the metric for the generations.
+        :param generations: list(list(str))
+            list of lists containing generations
+        :param references: list({ "assertions": list(str), "problem": str })
+            list of reference solutions
+        """
+
+        worklist = []
+        for generations, reference in zip(generations, references):
+            # NOTE(Arjun Guha): This can be more efficient. At low temperature, we get lots of
+            # repeated completions. So, this will end up running the same program repeatedly.
+            # The original StudentEval code runs each generation once.
+            for generation in generations:
+                item = {
+                    "program": generation + "\n\n" + reference["assertions"],
+                    "prompt": reference["prompt"],
+                    "problem": reference["problem"],
+                    "group": reference["group"],
+                }
+                worklist.append(item)
+
+        with ThreadPoolExecutor(max_workers=cpu_count() - 1) as executor:
+            results_df = pd.DataFrame(
+                list(
+                    tqdm(
+                        executor.map(_run_assembled_program, worklist),
+                        total=len(worklist),
+                    )
+                )
+            )
+
+        # Calculate pass@1 for each prompt
+        results_df = results_df.groupby(["problem", "prompt", "group"]).agg(
+            c=("success", np.sum), n=("success", "count")
+        )
+        results_df.reset_index(inplace=True)
+        results_df["pass1"] = results_df.apply(
+            lambda row: _estimator(row["n"], row["c"], 1), axis=1
+        )
+
+        # Calculate mean pass@1 for each group
+        results_df = results_df.groupby(["group"]).agg(pass1=("pass1", np.mean))
+
+        # Turn into JSON
+        results_df.reset_index(inplace=True)
+        results_df = results_df.to_dict(orient="records")
+        return results_df
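
As a side note, a small self-contained sketch (not part of the commit) of the unbiased pass@k estimator that `_estimator` above implements, following Chen et al. (2021); for k = 1 it reduces to the plain success rate c / n:

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    # Unbiased estimator 1 - C(n - c, k) / C(n, k), written as a product to avoid
    # large binomial coefficients; mirrors _estimator in the task file above.
    if n - c < k:
        return 1.0
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

# Example: 20 samples per prompt, 5 of which pass the assertions.
print(round(pass_at_k(20, 5, 1), 4))   # 0.25  (equals c / n when k = 1)
print(round(pass_at_k(20, 5, 10), 4))  # 0.9837
```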

docs/README.md

Lines changed: 21 additions & 0 deletions
@@ -382,6 +382,27 @@ accelerate launch main.py \
   --allow_code_execution
 ```

+### StudentEval
+
+[StudentEval](https://huggingface.co/datasets/wellesley-easel/StudentEval) is a
+dataset of 1,749 prompts for 48 problems, authored by 80 students who have only
+completed a one-semester Python programming class. Unlike many other benchmarks,
+it has multiple prompts per problem and multiple attempts by the same
+participant. Each problem is accompanied by a set of instructor-written test
+cases.
+
+```python
+accelerate launch main.py \
+  --model <MODEL_NAME> \
+  --max_length_generation 512 \
+  --tasks studenteval \
+  --temperature 0.2 \
+  --top_p 0.95 \
+  --do_sample True \
+  --n_samples 20 \
+  --batch_size 20 \
+  --allow_code_execution
+```

 ## Code generation benchmarks without unit tests

setup.py

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,8 @@
 ]

 setup(
+    name="bigcode_eval",
+    python_requires='>=3.7',
     description="A framework for the evaluation of autoregressive code generation language models.",
     long_description=readme,
     license="Apache 2.0",
