
Commit 9cfa52b

Merge pull request #187 from ganler/heplus
Add humaneval+ evaluation task
2 parents: 3910745 + b62e7a6

4 files changed: 76 additions and 2 deletions

README.md

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ Below are the features and tasks of this framework:
 - We provide Multi-GPU text generation with `accelerate` and Dockerfiles for evaluating on Docker containers for security and reproducibility.

 - Tasks:
-  - 5 code generation **Python** tasks (with unit tests): [HumanEval](https://huggingface.co/datasets/openai_humaneval), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp) and [DS-1000](https://github.com/HKUNLP/DS-1000/) for both completion (left-to-right) and insertion (FIM) mode.
+  - 6 code generation **Python** tasks (with unit tests): [HumanEval](https://huggingface.co/datasets/openai_humaneval), [HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp) and [DS-1000](https://github.com/HKUNLP/DS-1000/) for both completion (left-to-right) and insertion (FIM) mode.
   - [HumanEvalPack](https://huggingface.co/datasets/bigcode/humanevalpack) extends HumanEval to **3** scenarios across **6** languages via human translations and was released with [OctoPack](https://arxiv.org/abs/2308.07124).
   - [MultiPL-E](https://github.com/nuprl/MultiPL-E) evaluation suite (HumanEval translated into **18** programming languages).
   - [Recode](https://github.com/amazon-science/recode/tree/main) applied to the HumanEval benchmark. It evaluates the robustness of code-generation models.

bigcode_eval/tasks/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -2,7 +2,7 @@
 from pprint import pprint

 from . import (apps, codexglue_code_to_text, codexglue_text_to_text, conala,
-               concode, ds1000, gsm, humaneval, humanevalpack,
+               concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack,
                instruct_humaneval, instruct_wizard_humaneval, mbpp, multiple,
                parity, python_bugs, quixbugs, recode, santacoder_fim)

@@ -16,6 +16,7 @@
     "concode": concode.Concode,
     **ds1000.create_all_tasks(),
     **humaneval.create_all_tasks(),
+    **humanevalplus.create_all_tasks(),
     **humanevalpack.create_all_tasks(),
     "mbpp": mbpp.MBPP,
     "parity": parity.Parity,
bigcode_eval/tasks/humanevalplus.py

Lines changed: 57 additions & 0 deletions

@@ -0,0 +1,57 @@
+"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
+https://openreview.net/forum?id=1qvx610Cu7
+
+The HumanEval+ dataset is created by the EvalPlus framework which extends the original HumanEval dataset
+by adding more automatically generated test cases to each problem.
+
+Homepage: https://github.com/evalplus/evalplus
+"""
+
+from warnings import warn
+
+from bigcode_eval.tasks.humaneval import GeneralHumanEval
+
+_CITATION = """
+@inproceedings{evalplus,
+  title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
+  author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
+  booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
+  year = {2023},
+  url = {https://openreview.net/forum?id=1qvx610Cu7},
+}
+"""
+
+
+class GeneralHumanEvalPlus(GeneralHumanEval):
+    """A task represents an entire benchmark including its dataset, problems,
+    answers, generation settings and evaluation methods.
+    """
+
+    DATASET_PATH = "evalplus/humanevalplus"
+
+    def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=10.0):
+        if timeout < 10.0:
+            warn(
+                "It is suggested to have a longer timeout as HumanEval+ has lots of tests. "
+                f"The current timeout is {timeout}s while the suggested timeout is 10s."
+            )
+        super().__init__(strip_prompt, k, num_workers, timeout)
+
+
+def create_task(strip_prompt):
+    class HumanEvalPlus(GeneralHumanEvalPlus):
+        def __init__(self, **kwargs):
+            super().__init__(strip_prompt, **kwargs)
+
+    return HumanEvalPlus
+
+
+def create_all_tasks():
+    """Creates a dictionary of tasks from a list of levels
+    :return: {task_name: task}
+        e.g. {multiple-py: Task, multiple-java: Task}
+    """
+    return {
+        "humanevalplus": create_task(True),
+        "humanevalplus-unstripped": create_task(False),
+    }
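To make the behaviour of the new module concrete, here is a short usage sketch that is not part of the commit. It assumes the harness is installed and that the `evalplus/humanevalplus` dataset can be fetched, since task classes in this harness typically load their dataset on instantiation.

```python
# Usage sketch (not from the commit): exercise the factory and the timeout check.
from bigcode_eval.tasks import humanevalplus

tasks = humanevalplus.create_all_tasks()
print(sorted(tasks))  # ['humanevalplus', 'humanevalplus-unstripped']

# The default timeout is 10.0s; anything shorter triggers the warning defined in
# GeneralHumanEvalPlus.__init__, since HumanEval+ runs many more tests per
# problem than the original HumanEval.
task = tasks["humanevalplus"](timeout=5.0)   # emits the "longer timeout" warning
print(task.DATASET_PATH)                     # evalplus/humanevalplus
```

The `-unstripped` variant differs only in whether the prompt is stripped before generation, mirroring the existing HumanEval task pair the class inherits from.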

docs/README.md

Lines changed: 16 additions & 0 deletions
@@ -49,6 +49,22 @@ accelerate launch main.py \

 If you want to evaluate only on the first $n$ samples instead of the whole test dataset, set the `limit` argument to $n$.

+### HumanEval+
+[HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus): HumanEval with additional unit tests (80x the original HumanEval) for each of the 164 problems.
+
+Generation and evaluation follow the same approach as [HumanEval](#humaneval); one only needs to change the task name to `humanevalplus` to run the evaluation on HumanEval+, for example:
+
+```bash
+accelerate launch main.py \
+  --model <MODEL_NAME> \
+  --max_length_generation <MAX_LENGTH> \
+  --tasks humanevalplus \
+  --temperature 0.2 \
+  --n_samples 200 \
+  --batch_size 10 \
+  --allow_code_execution
+```
+

 ### HumanEvalPack
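One note on the HumanEval+ command in the hunk above: `--n_samples 200` pairs with the task's default `k=[1, 10, 100]` because pass@k is estimated from n sampled completions per problem with the standard unbiased estimator (Chen et al., 2021), which needs n ≥ k. A small sketch of that estimator follows, added here for illustration and not part of the commit.

```python
# Unbiased pass@k estimator used by HumanEval-style evaluation:
#   n = completions generated per problem, c = completions passing all tests.
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Probability that at least one of k samples drawn from n passes."""
    if n - c < k:
        return 1.0
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

# Hypothetical numbers: 37 of 200 samples pass a problem's extended test suite.
print(pass_at_k(n=200, c=37, k=1))    # 0.185
print(pass_at_k(n=200, c=37, k=100))  # ~1.0
```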
