"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
https://openreview.net/forum?id=1qvx610Cu7

The MBPP+ dataset is created by the EvalPlus framework, which extends the original MBPP dataset
by adding more automatically generated test cases to each problem. Note that MBPP+ only includes
399 tasks, a subset of the original MBPP dataset. The subset is selected from the sanitized
MBPP (a subset of tasks manually examined by the original MBPP authors), and EvalPlus further
removes low-quality and ill-formed tasks for benchmark quality control.

Homepage: https://github.com/evalplus/evalplus
"""

import os

from bigcode_eval.tasks.mbpp import MBPP
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval

_CITATION = """
@inproceedings{evalplus,
    title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
    author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
    booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
    year = {2023},
    url = {https://openreview.net/forum?id=1qvx610Cu7},
}
"""


class MBPPPlus(MBPP):
    """A task represents an entire benchmark including its dataset, problems,
    answers, generation settings and evaluation methods.
    """

    DATASET_PATH = "evalplus/mbppplus"
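    # The dataset is fetched from the Hugging Face Hub via the path above. As a
    # quick sanity check (a sketch, not part of the harness itself), it can be
    # loaded directly:
    #   from datasets import load_dataset
    #   ds = load_dataset("evalplus/mbppplus", split="test")  # expected: 399 rows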

    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from.
        The MBPP prompt is built following the InCoder (Fried et al.) approach:
        prompt = docstring that includes one test.
        """
        description = doc["prompt"]  # the sanitized test set uses "prompt" instead of "text"
        test_example = doc["test_list"][0]
        prompt = f'"""\n{description}\n{test_example}\n"""\n'
        return prompt
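
    # Example of a prompt produced by get_prompt (illustration only; the task
    # text and assertion below are hypothetical, not taken from the dataset):
    #   """
    #   Write a function to add two numbers.
    #   assert add_numbers(1, 2) == 3
    #   """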

    # NOTE(@ganler): MBPP+ extends the original MBPP jsonl data with a "test" field
    # which includes the testing code ready for execution. Note that the "test"
    # field differs from HumanEval(+), which further requires a `check` function.
    def get_reference(self, doc):
        """Builds the reference solution for the doc (sample from the test dataset)."""
        use_mbpp_tests = os.getenv("MBBPPLUS_USE_MBPP_TESTS", "0")
        if use_mbpp_tests == "1":
            return "\n".join(doc["test_list"])
        return "\n" + doc["test"]
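
    # Sketch: to score against the original, weaker MBPP assertions instead of
    # the EvalPlus-extended tests, set the flag before launching evaluation, e.g.
    #   export MBBPPLUS_USE_MBPP_TESTS=1
    # (how the harness itself is invoked depends on your setup).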

    def get_dataset(self):
        """Returns the dataset for the task, or an iterable of any object that get_prompt can handle."""
        dataset = self.dataset["test"]
        assert (
            len(dataset) == 399
        ), "MBPP+ only has 399 problems. Please retry after deleting its old cache."
        return dataset

    def process_results(self, generations, references):
        """Takes the list of LM generations and evaluates them against ground-truth references,
        returning the metric for the generations.
        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        """
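        # compute_code_eval executes each candidate program against the reference
        # test code and aggregates pass@k. The returned dict is expected to look
        # roughly like {"pass@1": ...}; the exact keys depend on the k values and
        # the number of samples per problem (illustrative, not guaranteed).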
        results, _ = compute_code_eval(
            references=references,
            predictions=generations,
            timeout=10.0,  # 10s timeout
        )
        return results