
Commit 00967d1

Merge pull request #190 from ganler/mbppplus
Add mbpp+ evaluation task
2 parents 9cfa52b + 5d4bc98 · commit 00967d1

File tree

4 files changed: +112 −3 lines

- README.md
- bigcode_eval/tasks/__init__.py
- bigcode_eval/tasks/mbppplus.py
- docs/README.md

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -26,7 +26,7 @@ Below are the features and tasks of this framework:
 - We provide Multi-GPU text generation with `accelerate` and Dockerfiles for evaluating on Docker containers for security and reproducibility.
 
 - Tasks:
-    - 6 code generation **Python** tasks (with unit tests): [HumanEval](https://huggingface.co/datasets/openai_humaneval), [HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp) and [DS-1000](https://github.com/HKUNLP/DS-1000/) for both completion (left-to-right) and insertion (FIM) mode.
+    - 7 code generation **Python** tasks (with unit tests): [HumanEval](https://huggingface.co/datasets/openai_humaneval), [HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp), [MBPP+](https://huggingface.co/datasets/evalplus/mbppplus), and [DS-1000](https://github.com/HKUNLP/DS-1000/) for both completion (left-to-right) and insertion (FIM) mode.
     - [HumanEvalPack](https://huggingface.co/datasets/bigcode/humanevalpack) extends HumanEval to **3** scenarios across **6** languages via human translations and was released with [OctoPack](https://arxiv.org/abs/2308.07124).
     - [MultiPL-E](https://github.com/nuprl/MultiPL-E) evaluation suite (HumanEval translated into **18** programming languages).
    - [Recode](https://github.com/amazon-science/recode/tree/main) applied to the HumanEval benchmark. It evaluates the robustness of code-generation models.
```

bigcode_eval/tasks/__init__.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -3,8 +3,8 @@
 
 from . import (apps, codexglue_code_to_text, codexglue_text_to_text, conala,
                concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack,
-               instruct_humaneval, instruct_wizard_humaneval, mbpp, multiple,
-               parity, python_bugs, quixbugs, recode, santacoder_fim)
+               instruct_humaneval, instruct_wizard_humaneval, mbpp, mbppplus,
+               multiple, parity, python_bugs, quixbugs, recode, santacoder_fim)
 
 TASK_REGISTRY = {
     **apps.create_all_tasks(),
@@ -19,6 +19,7 @@
     **humanevalplus.create_all_tasks(),
     **humanevalpack.create_all_tasks(),
     "mbpp": mbpp.MBPP,
+    "mbppplus": mbppplus.MBPPPlus,
     "parity": parity.Parity,
     "python_bugs": python_bugs.PythonBugs,
     "quixbugs": quixbugs.QuixBugs,
```

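For orientation, here is a minimal sketch (not part of this commit) of how the new registry entry could be resolved at runtime. It assumes only what the diff shows, plus that `MBPPPlus` can be instantiated without arguments like its `MBPP` parent and that instantiation loads the dataset from the Hugging Face Hub:

```python
# Minimal sketch: look up the newly registered task by name.
from bigcode_eval.tasks import TASK_REGISTRY  # registry edited in the diff above

task_cls = TASK_REGISTRY["mbppplus"]  # -> mbppplus.MBPPPlus
task = task_cls()                     # assumed no-argument constructor, as with MBPP
print(task.DATASET_PATH)              # "evalplus/mbppplus"
```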
bigcode_eval/tasks/mbppplus.py

Lines changed: 77 additions & 0 deletions
```python
"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
https://openreview.net/forum?id=1qvx610Cu7

The MBPP+ dataset is created by the EvalPlus framework, which extends the original MBPP dataset
by adding more automatically generated test cases to each problem. Note that MBPP+ only includes 399
tasks, which are a subset of the original MBPP dataset. The subset is selected from the sanitized
MBPP (a subset of tasks manually examined by the original MBPP authors), and EvalPlus further
removes low-quality and ill-formed tasks for benchmark quality control.

Homepage: https://github.com/evalplus/evalplus
"""

import os

from bigcode_eval.tasks.mbpp import MBPP
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval

_CITATION = """
@inproceedings{evalplus,
  title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
  author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
  booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
  year = {2023},
  url = {https://openreview.net/forum?id=1qvx610Cu7},
}
"""


class MBPPPlus(MBPP):
    """A task represents an entire benchmark including its dataset, problems,
    answers, generation settings and evaluation methods.
    """

    DATASET_PATH = "evalplus/mbppplus"

    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from.
        The MBPP prompt is built following the InCoder (Fried et al.) approach:
        prompt = docstring that includes one test.
        """
        description = doc["prompt"]  # the sanitized test set uses "prompt" instead of "text"
        test_example = doc["test_list"][0]
        prompt = f'"""\n{description}\n{test_example}\n"""\n'
        return prompt

    # NOTE(@ganler): MBPP+ extends the original MBPP jsonl data with a "test" field which
    #                includes the testing code ready for execution. Note the "test" field
    #                is different from HumanEval(+), which further requires a `check` func.
    def get_reference(self, doc):
        """Builds the reference solution for the doc (sample from the test dataset)."""
        use_mbpp_tests = os.getenv("MBBPPLUS_USE_MBPP_TESTS", "0")
        if use_mbpp_tests == "1":
            return "\n".join(doc["test_list"])
        return "\n" + doc["test"]

    def get_dataset(self):
        """Returns the dataset for the task, or an iterable of any object that get_prompt can handle."""
        dataset = self.dataset["test"]
        assert (
            len(dataset) == 399
        ), "MBPP+ only has 399 problems. Please retry by deleting its old cache"
        return dataset

    def process_results(self, generations, references):
        """Takes the list of LM generations and evaluates them against ground-truth references,
        returning the metric for the generations.
        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        """
        results, _ = compute_code_eval(
            references=references,
            predictions=generations,
            timeout=10.0,  # 10s timeout
        )
        return results
```
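
To make the prompt and reference construction above concrete, the snippet below mirrors the logic of `get_prompt` and `get_reference` on a hypothetical record in the MBPP+ schema (the `add` task text and asserts are made up for illustration, not quoted from the dataset):

```python
import os

# Hypothetical MBPP+ record; only the field names mirror the code above.
doc = {
    "prompt": "Write a function to add two numbers.",
    "test_list": ["assert add(1, 2) == 3"],                     # original MBPP base test
    "test": "assert add(1, 2) == 3\nassert add(-1, 1) == 0\n",  # EvalPlus extended tests, ready to execute
}

# Mirrors MBPPPlus.get_prompt: the description plus the first base test, wrapped in a docstring.
prompt = f'"""\n{doc["prompt"]}\n{doc["test_list"][0]}\n"""\n'
print(prompt)

# Mirrors MBPPPlus.get_reference: the extended "test" field by default,
# or the base tests when MBBPPLUS_USE_MBPP_TESTS=1 is set in the environment.
if os.getenv("MBBPPLUS_USE_MBPP_TESTS", "0") == "1":
    reference = "\n".join(doc["test_list"])
else:
    reference = "\n" + doc["test"]
print(reference)
```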

docs/README.md

Lines changed: 31 additions & 0 deletions
````diff
@@ -206,6 +206,37 @@ accelerate launch main.py \
 
 Low temperatures generally work better for small $k$ in pass@k.
 
+### MBPP+
+[MBPP+](https://huggingface.co/datasets/evalplus/mbppplus): MBPP with additional unit tests (35x those of the original MBPP) for each of its 399 problems.
+
+Generation and evaluation follow the same approach as [MBPP](#mbpp); one only needs to change the task name to `mbppplus`, for example:
+
+> [!Note]
+> MBPP+ only includes **399** tasks, which are a subset of the original MBPP dataset (~1000 tasks).
+> The subset is selected from the sanitized MBPP (a subset of ~427 tasks manually examined by the original MBPP authors),
+> and EvalPlus further removes low-quality and ill-formed ones for benchmark quality control to get MBPP+.
+
+```bash
+accelerate launch main.py \
+  --model <MODEL_NAME> \
+  --max_length_generation <MAX_LENGTH> \
+  --tasks mbppplus \
+  --temperature 0.1 \
+  --n_samples 15 \
+  --batch_size 10 \
+  --allow_code_execution
+```
+
+By setting `MBBPPLUS_USE_MBPP_TESTS=1` when running MBPP+, one can run the 399 MBPP+ tasks (a subset of the 500 MBPP evaluation tasks) with the original MBPP base tests:
+
+```bash
+MBBPPLUS_USE_MBPP_TESTS=1 accelerate launch main.py \
+  --tasks mbppplus \
+  --allow_code_execution \
+  --load_generations_path generations_mbppplus.json \
+  --model <MODEL_NAME>
+```
+
 ### DS-1000
 [DS-1000](https://ds1000-code-gen.github.io/): Code generation benchmark with 1000 data science questions spanning seven Python libraries that (1) reflects diverse, realistic, and practical use cases, (2) has a reliable metric, (3) defends against memorization by perturbing questions.
 
````