Merge branch 'main' into mercury

Elfsong · web-flow · commit 67c440996902 · 2024-05-26T12:49:54.000+08:00
diff --git a/bigcode_eval/tasks/__init__.py b/bigcode_eval/tasks/__init__.py
@@ -4,8 +4,8 @@
 from . import (apps, codexglue_code_to_text, codexglue_text_to_text, conala,
                concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack,
                instruct_humaneval, instruct_wizard_humaneval, mbpp, mbppplus,
-               multiple, parity, python_bugs, quixbugs, recode, santacoder_fim, 
-               mercury)
+               multiple, parity, python_bugs, quixbugs, recode, santacoder_fim,
+               studenteval, mercury)
 
 TASK_REGISTRY = {
     **apps.create_all_tasks(),
@@ -29,6 +29,7 @@
     **instruct_humaneval.create_all_tasks(),
     **recode.create_all_tasks(),
     **santacoder_fim.create_all_tasks(),
+    "studenteval": studenteval.StudentEval,
     "mercury": mercury.Mercury,
 }
 
diff --git a/bigcode_eval/tasks/humanevalpack.py b/bigcode_eval/tasks/humanevalpack.py
@@ -228,15 +228,19 @@ def get_prompt(self, prompt_base, instruction, context=None):
         elif self.prompt == "codellama":
             # https://hf.co/codellama             
             prompt = f"[INST] {inp.strip()} [/INST] {prompt_base}"
+        elif  self.prompt == "deepseek":
+            prompt = f"You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n{inp.strip()}\n### Response:\n{prompt_base}"
         elif self.prompt in ["tulu", "gritlm"]:
             # https://hf.co/GritLM/GritLM-7B
             prompt = f"<|user|>\n{inp}\n<|assistant|>\n{prompt_base}"
         elif self.prompt == "zephyr":
             # https://hf.co/HuggingFaceH4/zephyr-7b-beta
             prompt = f"<|user|>\n{inp}</s>\n<|assistant|>\n{prompt_base}"
-        elif self.prompt == "yi":
+        elif self.prompt in ["yi", "starchat2", "codeqwen"]:
             # https://hf.co/01-ai/Yi-34B-Chat     
             prompt = f"<|im_start|>user\n{inp}<|im_end|>\n<|im_start|>assistant\n{prompt_base}"
+        elif self.prompt == "codegemma":
+            prompt = f"<start_of_turn>user\n{inp}<end_of_turn>\n<start_of_turn>model\n{prompt_base}"
         elif self.prompt == "codellama-70b":
             prompt = f"Source: user\n\n {inp.strip()} Source: assistant\nDestination: user \n\n{prompt_base}"
         elif self.prompt == "aurora-m":
diff --git a/bigcode_eval/tasks/humanevalplus.py b/bigcode_eval/tasks/humanevalplus.py
@@ -29,11 +29,11 @@ class GeneralHumanEvalPlus(GeneralHumanEval):
 
     DATASET_PATH = "evalplus/humanevalplus"
 
-    def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=10.0):
-        if timeout < 10.0:
+    def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=20.0):
+        if timeout < 20.0:
             warn(
                 "It is suggested to have a longer timeout as HumanEval+ has lots of tests. "
-                f"The current timeout is {timeout}s while the suggested timeout is 10s."
+                f"The current timeout is {timeout}s while the suggested timeout is 20s."
             )
         super().__init__(strip_prompt, k, num_workers, timeout)
 
diff --git a/bigcode_eval/tasks/mbppplus.py b/bigcode_eval/tasks/mbppplus.py
@@ -4,7 +4,7 @@
 The MBPP+ dataset is created by the EvalPlus framework which extends the original MBPP dataset
 by adding more automatically generated test cases to each problem. Note MBPP+ only includes 399
 tasks which are a subset of the original MBPP dataset. The subset is selected from the sanitized
-MBPP (a subset of manually examined tasks by the original MBPP authors) and EvalPlus further 
+MBPP (a subset of manually examined tasks by the original MBPP authors) and EvalPlus further
 removes low-quality and ill-formed tasks for benchmark quality control.
 
 Homepage: https://github.com/evalplus/evalplus
@@ -56,9 +56,6 @@ def get_reference(self, doc):
     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
         dataset = self.dataset["test"]
-        assert (
-            len(dataset) == 399
-        ), "MBPP+ only has 399 problems. Please retry by deleting its old cache"
         return dataset
 
     def process_results(self, generations, references):
diff --git a/bigcode_eval/tasks/studenteval.py b/bigcode_eval/tasks/studenteval.py
@@ -0,0 +1,177 @@
+"""
+StudentEval is a dataset of 1,749 prompts for 48 problems, authored by 80
+students who have only completed a one-semester Python programming class.
+Unlike many other benchmarks, it has multiple prompts per problem and multiple
+attempts by the same participant.
+
+Web page: https://huggingface.co/datasets/wellesley-easel/StudentEval
+"""
+
+from bigcode_eval.base import Task
+from datasets import load_dataset
+from multiprocessing import cpu_count
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+import tempfile
+import pandas as pd
+import numpy as np
+import subprocess
+
+_CITATION = """\
+@misc{babe2023studenteval,
+      title={StudentEval: A Benchmark of Student-Written Prompts for Large Language Models of Code}, 
+      author={Hannah McLean Babe and Sydney Nguyen and Yangtian Zi and Arjun Guha and Molly Q Feldman and Carolyn Jane Anderson},
+      year={2023},
+      eprint={2306.04556},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}"""
+
+EXECUTION_TIMEOUT = 15
+
+
+# Source: Chen et al. Evaluating Large Language Models Trained on Code. 2021
+def _estimator(n: int, c: int, k: int) -> float:
+    """
+    Calculates 1 - comb(n - c, k) / comb(n, k).
+    """
+    assert c <= n, "c must be less than n"
+    if n - c < k:
+        return 1.0
+    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+
+def _run_assembled_program(item):
+    """
+    Runs the program with a timeout. The result dictionary has a "success" key
+    that is 1 on success and 0 on failure. It also includes keys necessary to
+    group results (problem, prompt, and group) and report results for each
+    subset of StudentEval.
+    """
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".py") as f:
+        f.write(item["program"])
+        f.flush()
+        try:
+            result = subprocess.run(
+                ["python3", f.name],
+                timeout=EXECUTION_TIMEOUT,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                stdin=subprocess.DEVNULL,
+            )
+            exit_code = result.returncode
+        except subprocess.TimeoutExpired:
+            exit_code = 1
+    return {
+        "problem": item["problem"],
+        "prompt": item["prompt"],
+        "group": item["group"],
+        "success": 1 if exit_code == 0 else 0,
+    }
+
+
+def _get_group(item):
+    """
+    These boolean flags are mutually exclusive in the dataset. We turn them into
+    a string for easy grouping with Pandas.
+    """
+    if item["is_first_success"]:
+        return "First Success"
+    if item["is_last_success"]:
+        return "Last Success"
+    if item["is_first_failure"]:
+        return "First Failure"
+    if item["is_last_failure"]:
+        return "Last Failure"
+    return None
+
+
+class StudentEval(Task):
+    DATASET_PATH = "wellesley-easel/StudentEval"
+
+    def __init__(self):
+        self.stop_words = ["\ndef", "\nclass", "\nif", "\nprint"]
+        self.requires_execution = True
+        self.dataset = load_dataset(path=self.DATASET_PATH)
+        # NOTE(Arjun Guha): Avoiding .filter so that we don't get a datasets
+        # cache item on disk.
+        self.dataset = [
+            item for item in self.dataset["test"] if _get_group(item) is not None
+        ]
+
+    def get_dataset(self):
+        return self.dataset
+
+    def get_prompt(self, doc):
+        return doc["prompt"].rstrip()
+
+    # For a task with tests, the reference solution is the suite of tests.
+    def get_reference(self, doc):
+        return {
+            "prompt": doc["prompt"],
+            "assertions": doc["assertions"],
+            "problem": doc["problem"],
+            "group": _get_group(doc),
+        }
+
+    def postprocess_generation(self, generation, idx):
+        """Defines the postprocessing for a LM generation.
+        :param generation: str
+            code generation from LM
+        :param idx: int
+            index of doc in the dataset to which the generation belongs
+            (used to look up that doc's prompt)
+        """
+        prompt = self.get_prompt(self.dataset[idx])
+        generation = generation[len(prompt) :]
+        return prompt + self._stop_at_stop_token(generation, self.stop_words)
+
+    def process_results(self, generations, references):
+        """Takes the list of LM generations and evaluates them against ground truth references,
+        returning the metric for the generations.
+        :param generations: list(list(str))
+            list of lists containing generations
+        :param references: list({ "prompt": str, "assertions": str, "problem": str, "group": str })
+            list of references as returned by get_reference (the tests, plus grouping keys)
+        """
+
+        worklist = []
+        for generations, reference in zip(generations, references):
+            # NOTE(Arjun Guha): This can be more efficient. At low temperature, we get lots of
+            # repeated completions. So, this will end up running the same program repeatedly.
+            # The original StudentEval code runs each generation once.
+            for generation in generations:
+                item = {
+                    "program": generation + "\n\n" + reference["assertions"],
+                    "prompt": reference["prompt"],
+                    "problem": reference["problem"],
+                    "group": reference["group"],
+                }
+                worklist.append(item)
+
+        with ThreadPoolExecutor(max_workers=cpu_count() - 1) as executor:
+            results_df = pd.DataFrame(
+                list(
+                    tqdm(
+                        executor.map(_run_assembled_program, worklist),
+                        total=len(worklist),
+                    )
+                )
+            )
+
+        # Calculate pass@1 for each prompt
+        results_df = results_df.groupby(["problem", "prompt", "group"]).agg(
+            c=("success", np.sum), n=("success", "count")
+        )
+        results_df.reset_index(inplace=True)
+        results_df["pass1"] = results_df.apply(
+            lambda row: _estimator(row["n"], row["c"], 1), axis=1
+        )
+
+        # Calculate mean pass@1 for each group
+        results_df = results_df.groupby(["group"]).agg(pass1=("pass1", np.mean))
+
+        # Turn into JSON
+        results_df.reset_index(inplace=True)
+        results_df = results_df.to_dict(orient="records")
+        return results_df
diff --git a/docs/README.md b/docs/README.md
@@ -382,6 +382,27 @@ accelerate launch  main.py \
   --allow_code_execution
 ```
 
+### StudentEval
+
+[StudentEval](https://huggingface.co/datasets/wellesley-easel/StudentEval) is a 
+dataset of 1,749 prompts for 48 problems, authored by 80 students who have only
+completed a one-semester Python programming class. Unlike many other benchmarks, 
+it has multiple prompts per problem and multiple attempts by the same
+participant. Each problem is accompanied by a set of instructor-written test 
+cases.
+
+```python
+accelerate launch main.py \
+  --model <MODEL_NAME> \
+  --max_length_generation 512 \
+  --tasks studenteval \
+  --temperature 0.2 \
+  --top_p 0.95 \
+  --do_sample True \
+  --n_samples 20 \
+  --batch_size 20 \
+  --allow_code_execution
+```
 
 ## Code generation benchmarks without unit tests
 
diff --git a/setup.py b/setup.py
@@ -28,6 +28,8 @@
 ]
 
 setup(
+    name="bigcode_eval",
+    python_requires='>=3.7',
     description="A framework for the evaluation of autoregressive code generation language models.",
     long_description=readme,
     license="Apache 2.0",

Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,8 @@`
`28`	`28`	`]`
`29`	`29`
`30`	`30`	`setup(`
	`31`	`+ name="bigcode_eval",`
	`32`	`+ python_requires='>=3.7',`
`31`	`33`	`description="A framework for the evaluation of autoregressive code generation language models.",`
`32`	`34`	`long_description=readme,`
`33`	`35`	`license="Apache 2.0",`