Commit 9d17482

Merge pull request #164 from bigcode-project/max/santacoder-fim

SantaCoder FIM task

2 parents 56ec144 + 8613c5c commit 9d17482

File tree

10 files changed: +178 −71 lines changed

README.md

Lines changed: 3 additions & 0 deletions

@@ -35,6 +35,9 @@ Below are the features and tasks of this framework:
 - [CoNaLa](https://huggingface.co/datasets/neulab/conala) for **Python** code generation (2-shot setting and evaluation with BLEU score).
 - [Concode](https://huggingface.co/datasets/code_x_glue_tc_text_to_code) for **Java** code generation (2-shot setting and evaluation with BLEU score).
 - 3 multilingual downstream classification tasks: [Java Complexity prediction](https://huggingface.co/datasets/codeparrot/codecomplex), [Java code equivalence prediction](https://huggingface.co/datasets/code_x_glue_cc_clone_detection_big_clone_bench), [C code defect prediction](https://huggingface.co/datasets/code_x_glue_cc_defect_detection).
+- [SantaCoder-FIM](https://huggingface.co/datasets/bigcode/santacoder-fim-task) for evaluating FIM on **Python** code using Exact Match. Further details are described in [SantaCoder](https://arxiv.org/abs/2301.03988). Includes two tasks:
+    - `StarCoderFIM`: which uses the default FIM tokens `"<fim_prefix>", "<fim_middle>", "<fim_suffix>"`, and
+    - `SantaCoderFIM`: which uses SantaCoder FIM tokens `"<fim-prefix>", "<fim-middle>", "<fim-suffix>"`
 
 More details about each task can be found in the documentation in [`docs/README.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md).
 ## Setup
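The two tasks differ only in which sentinel tokens are spliced around the code; a model trained with one sentinel vocabulary will not recognize the other. A minimal sketch of how either token set forms a prompt (`fim_prompt` is an illustrative helper, not a harness function):

```python
# Sentinel token sets: SantaCoder uses dashes, StarCoder uses underscores.
SANTA = ("<fim-prefix>", "<fim-middle>", "<fim-suffix>")
STAR = ("<fim_prefix>", "<fim_middle>", "<fim_suffix>")

def fim_prompt(prefix_code, suffix_code, tokens):
    # prefix token, code before the hole, suffix token, code after
    # the hole, then the middle token that cues the model to infill
    p, m, s = tokens
    return f"{p}{prefix_code}{s}{suffix_code}{m}"

assert fim_prompt("a", "b", STAR) == "<fim_prefix>a<fim_suffix>b<fim_middle>"
```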

bigcode_eval/base.py

Lines changed: 16 additions & 0 deletions

@@ -77,3 +77,19 @@ def process_results(self, generations, references):
         :return: dict[str: float]
         """
         pass
+
+    @staticmethod
+    def _stop_at_stop_token(decoded_string, stop_tokens):
+        """
+        Produces the prefix of decoded_string that ends at the first occurrence of
+        a stop_token.
+        WARNING: the decoded_string *must not* include the prompt, which may have stop tokens
+        itself.
+        """
+        min_stop_index = len(decoded_string)
+        for stop_token in stop_tokens:
+            stop_index = decoded_string.find(stop_token)
+            if stop_index != -1 and stop_index < min_stop_index:
+                min_stop_index = stop_index
+        return decoded_string[:min_stop_index]

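The helper hoisted into `base.py` can be exercised standalone; a minimal sketch reproducing its truncation logic:

```python
def stop_at_stop_token(decoded_string, stop_tokens):
    # Truncate at the earliest occurrence of any stop token.
    min_stop_index = len(decoded_string)
    for stop_token in stop_tokens:
        stop_index = decoded_string.find(stop_token)
        if stop_index != -1 and stop_index < min_stop_index:
            min_stop_index = stop_index
    return decoded_string[:min_stop_index]

# The string must already have the prompt stripped, or a stop token
# inside the prompt would cut the completion too early.
completion = "return x + y\n<|endoftext|>garbage"
assert stop_at_stop_token(completion, ["<|endoftext|>", "<|filename|>"]) == "return x + y\n"
```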
bigcode_eval/tasks/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -4,7 +4,7 @@
 from . import (apps, codexglue_code_to_text, codexglue_text_to_text, conala,
                concode, ds1000, gsm, humaneval, humanevalpack,
                instruct_humaneval, instruct_wizard_humaneval, mbpp, multiple,
-               parity, python_bugs, quixbugs, recode)
+               parity, python_bugs, quixbugs, recode, santacoder_fim)
 
 TASK_REGISTRY = {
     **apps.create_all_tasks(),
@@ -25,6 +25,7 @@
     **gsm.create_all_tasks(),
     **instruct_humaneval.create_all_tasks(),
     **recode.create_all_tasks(),
+    **santacoder_fim.create_all_tasks(),
 }
 
 ALL_TASKS = sorted(list(TASK_REGISTRY))

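Each task module contributes entries to `TASK_REGISTRY` through a `create_all_tasks` factory; a minimal standalone sketch of this registry pattern (the stub classes are illustrative, not the real `Task` subclasses):

```python
# Minimal sketch of the harness's task-registry pattern: each module
# exposes a create_all_tasks() factory returning {task_name: task_class}.
class SantaCoderFIM:  # stub standing in for the real Task subclass
    pass

class StarCoderFIM(SantaCoderFIM):
    pass

def create_all_tasks():
    return {
        "santacoder_fim": SantaCoderFIM,
        "starcoder_fim": StarCoderFIM,
    }

# Merging each module's dict builds the global registry.
TASK_REGISTRY = {**create_all_tasks()}
ALL_TASKS = sorted(list(TASK_REGISTRY))
assert ALL_TASKS == ["santacoder_fim", "starcoder_fim"]
```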
bigcode_eval/tasks/humaneval.py

Lines changed: 0 additions & 14 deletions

@@ -74,20 +74,6 @@ def get_reference(self, doc):
         entry_point = f"check({doc['entry_point']})"
         return "\n" + test_func + "\n" + entry_point
 
-    @staticmethod
-    def _stop_at_stop_token(decoded_string, stop_tokens):
-        """
-        Produces the prefix of decoded_string that ends at the first occurrence of
-        a stop_token.
-        WARNING: the decoded_string *must not* include the prompt, which may have stop tokens
-        itself.
-        """
-        min_stop_index = len(decoded_string)
-        for stop_token in stop_tokens:
-            stop_index = decoded_string.find(stop_token)
-            if stop_index != -1 and stop_index < min_stop_index:
-                min_stop_index = stop_index
-        return decoded_string[:min_stop_index]
 
     def postprocess_generation(self, generation, idx):
         """Defines the postprocessing for a LM generation.

bigcode_eval/tasks/instruct_humaneval.py

Lines changed: 0 additions & 14 deletions

@@ -55,20 +55,6 @@ def get_reference(self, doc):
         entry_point = f"check({doc['entry_point']})"
         return "\n" + test_func + "\n" + entry_point
 
-    @staticmethod
-    def _stop_at_stop_token(decoded_string, stop_tokens):
-        """
-        Produces the prefix of decoded_string that ends at the first occurrence of
-        a stop_token.
-        WARNING: the decoded_string *must not* include the prompt, which may have stop tokens
-        itself.
-        """
-        min_stop_index = len(decoded_string)
-        for stop_token in stop_tokens:
-            stop_index = decoded_string.find(stop_token)
-            if stop_index != -1 and stop_index < min_stop_index:
-                min_stop_index = stop_index
-        return decoded_string[:min_stop_index]
 
     def process_results(self, generations, references):
         """Takes the list of LM generations and evaluates them against ground truth references,

bigcode_eval/tasks/mbpp.py

Lines changed: 0 additions & 14 deletions

@@ -59,20 +59,6 @@ def get_reference(self, doc):
         """Builds the reference solution for the doc (sample from the test dataset)."""
         return "\n".join(doc["test_list"])
 
-    @staticmethod
-    def _stop_at_stop_token(decoded_string, stop_tokens):
-        """
-        Produces the prefix of decoded_string that ends at the first occurrence of
-        a stop_token.
-        WARNING: the decoded_string *must not* include the prompt, which may have stop tokens
-        itself.
-        """
-        min_stop_index = len(decoded_string)
-        for stop_token in stop_tokens:
-            stop_index = decoded_string.find(stop_token)
-            if stop_index != -1 and stop_index < min_stop_index:
-                min_stop_index = stop_index
-        return decoded_string[:min_stop_index]
 
     def postprocess_generation(self, generation, idx):
         """Defines the postprocessing for a LM generation.

bigcode_eval/tasks/multiple.py

Lines changed: 0 additions & 14 deletions

@@ -115,20 +115,6 @@ def remove_last_block(string, stop_words):
         # last string should be ""
         return "".join(string_list[:-2])
 
-    @staticmethod
-    def _stop_at_stop_token(decoded_string, stop_tokens):
-        """
-        Produces the prefix of decoded_string that ends at the first occurrence of
-        a stop_token.
-        WARNING: the decoded_string *must not* include the prompt, which may have stop tokens
-        itself.
-        """
-        min_stop_index = len(decoded_string)
-        for stop_token in stop_tokens:
-            stop_index = decoded_string.find(stop_token)
-            if stop_index != -1 and stop_index < min_stop_index:
-                min_stop_index = stop_index
-        return decoded_string[:min_stop_index]
 
     def postprocess_generation(self, generation, idx):
         """Defines the postprocessing for a LM generation.

bigcode_eval/tasks/recode.py

Lines changed: 0 additions & 14 deletions

@@ -96,20 +96,6 @@ def get_reference(self, doc):
             "test_code": test_code,
         }
 
-    @staticmethod
-    def _stop_at_stop_token(decoded_string, stop_tokens):
-        """
-        Produces the prefix of decoded_string that ends at the first occurrence of
-        a stop_token.
-        WARNING: the decoded_string *must not* include the prompt, which may have stop tokens
-        itself.
-        """
-        min_stop_index = len(decoded_string)
-        for stop_token in stop_tokens:
-            stop_index = decoded_string.find(stop_token)
-            if stop_index != -1 and stop_index < min_stop_index:
-                min_stop_index = stop_index
-        return decoded_string[:min_stop_index]
 
     def postprocess_generation(self, generation, idx):
         """
bigcode_eval/tasks/santacoder_fim.py

Lines changed: 134 additions & 0 deletions

@@ -0,0 +1,134 @@
+from typing import Dict, List
+
+from tqdm import tqdm
+
+from bigcode_eval.base import Task
+
+_CITATION = """
+@article{allal2023santacoder,
+  title={SantaCoder: don't reach for the stars!},
+  author={Allal, Loubna Ben and Li, Raymond and Kocetkov, Denis and Mou, Chenghao and Akiki, Christopher and Ferrandis, Carlos Munoz and Muennighoff, Niklas and Mishra, Mayank and Gu, Alex and Dey, Manan and others},
+  journal={arXiv preprint arXiv:2301.03988},
+  year={2023}
+}
+"""
+
+LANGUAGES = [
+    "py",
+    "js",
+    "java",
+]
+
+
+def create_all_tasks():
+    return {
+        "santacoder_fim": SantaCoderFIM,
+        "starcoder_fim": StarCoderFIM,
+    }
+
+
+def initialize_empty_metrics(languages: List[str]) -> Dict[str, float]:
+    metrics = {}
+    for lang in languages:
+        metrics[f"n_accurate_{lang}"] = 0.0
+        metrics[f"n_count_{lang}"] = 0.0
+    return metrics
+
+
+def aggregate_per_lang_accuracy(
+    metrics: Dict[str, float], languages: List[str]
+) -> Dict[str, float]:
+    em_metrics = {}
+    for lang in languages:
+        # avoid div by 0
+        acc = (
+            metrics[f"n_accurate_{lang}"] / metrics[f"n_count_{lang}"]
+            if metrics[f"n_count_{lang}"]
+            else 0
+        )
+        em_metrics[f"{lang} Exact Match"] = acc
+
+    return em_metrics
+
+
+class SantaCoderFIM(Task):
+    DATASET_PATH = "bigcode/santacoder-fim-task"
+
+    def __init__(
+        self,
+        fim_prefix: str = "<fim-prefix>",
+        fim_middle: str = "<fim-middle>",
+        fim_suffix: str = "<fim-suffix>",
+    ):
+        stop_words = ["<|endoftext|>", "<|filename|>"]
+        super().__init__(
+            stop_words=stop_words,
+            requires_execution=False,
+        )
+        self.fim_prefix = fim_prefix
+        self.fim_middle = fim_middle
+        self.fim_suffix = fim_suffix
+
+    def get_dataset(self):
+        """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
+        dataset = self.dataset["train"]
+        return dataset
+
+    def get_prompt(self, doc):
+        """Builds the prompt for the LM to generate from."""
+        return f"""{self.fim_prefix}{doc["prompt"]}{self.fim_suffix}{doc["suffix"]}{self.fim_middle}"""
+
+    def get_reference(self, doc):
+        """Builds the reference solution for the doc (sample from the test dataset)."""
+        return doc["canonical_solution"]
+
+    def postprocess_generation(self, generation, idx):
+        """Defines the postprocessing for a LM generation.
+        :param generation: str
+            code generation from LM
+        :param idx: int
+            index of doc in the dataset to which the generation belongs
+        """
+        doc = self.get_dataset()[idx]
+        prompt = self.get_prompt(doc)
+        output = generation[len(prompt) :]
+        return self._stop_at_stop_token(output, self.stop_words)
+
+    def process_results(self, generations, references):
+        """Takes the list of LM generations and evaluates them against ground truth references,
+        returning the metric for the generations as in {"metric_name": result}.
+        :param generations: list(list(str))
+            list of lists containing generations
+        :param references: list(str)
+            list of str containing references
+        :return: dict[str: float]
+        """
+        metrics = initialize_empty_metrics(LANGUAGES)
+        for idx, (gen, reference) in tqdm(enumerate(zip(generations, references))):
+            language = self.get_dataset()[idx]["language"]
+            for g in gen:
+                metrics[f"n_accurate_{language}"] += int(g.strip() == reference.strip())
+
+            metrics[f"n_count_{language}"] += len(gen)
+
+        em_metrics = aggregate_per_lang_accuracy(metrics, LANGUAGES)
+
+        return em_metrics
+
+
+class StarCoderFIM(SantaCoderFIM):
+    DATASET_PATH = "bigcode/santacoder-fim-task"
+
+    def __init__(self):
+        # stop words are set by the parent class, whose __init__ only
+        # accepts the FIM token arguments
+        super().__init__(
+            fim_prefix="<fim_prefix>",
+            fim_middle="<fim_middle>",
+            fim_suffix="<fim_suffix>",
+        )
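The exact-match bookkeeping in `process_results` and `aggregate_per_lang_accuracy` can be exercised without a model; a minimal sketch with made-up generations and references:

```python
# Sketch of the per-language exact-match accounting used above.
LANGUAGES = ["py", "js", "java"]

metrics = {}
for lang in LANGUAGES:
    metrics[f"n_accurate_{lang}"] = 0.0
    metrics[f"n_count_{lang}"] = 0.0

# Illustrative generations/references for two "py" problems;
# outer list is per problem, inner list is per sample (n_samples=1).
generations = [["a + b "], ["a - b"]]
references = ["a + b", "a * b"]
langs = ["py", "py"]

for idx, (gen, ref) in enumerate(zip(generations, references)):
    lang = langs[idx]
    for g in gen:
        # exact match after stripping surrounding whitespace
        metrics[f"n_accurate_{lang}"] += int(g.strip() == ref.strip())
    metrics[f"n_count_{lang}"] += len(gen)

em = {}
for lang in LANGUAGES:
    count = metrics[f"n_count_{lang}"]
    em[f"{lang} Exact Match"] = metrics[f"n_accurate_{lang}"] / count if count else 0

print(em["py Exact Match"])  # → 0.5
```

Languages with no problems in the batch keep a zero count, which the `if count else 0` guard turns into an accuracy of 0 rather than a division error.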

docs/README.md

Lines changed: 23 additions & 0 deletions

@@ -357,6 +357,29 @@ accelerate launch main.py \
 ```
 If you ever get index out-of-range errors try using a number of problems `limit` that is proportional to the number of devices you are using.
 
+### SantaCoder-FIM
+[SantaCoder-FIM](https://huggingface.co/datasets/bigcode/santacoder-fim-task): 4,792 tasks for FIM insertion described in [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988). The tasks are similar to other tasks without unit tests, with two key differences:
+1. Instead of BLEU score, Exact Match is used to score the generations.
+2. A zero-shot setting is used instead of 2-shot.
+
+SantaCoder-FIM includes 2 tasks:
+- `StarCoderFIM`: which uses the default FIM tokens `"<fim_prefix>", "<fim_middle>", "<fim_suffix>"`, and
+- `SantaCoderFIM`: which uses SantaCoder FIM tokens `"<fim-prefix>", "<fim-middle>", "<fim-suffix>"`
+Depending on the FIM tokens used to train the model, you will need to select the appropriate task for evaluation.
+
+We only do single generation `n_samples=1`, and use the same generation settings as before.
+Below is the command to run the evaluation:
+```bash
+accelerate launch main.py \
+  --model <MODEL_NAME> \
+  --max_length_generation <MAX_LENGTH> \
+  --tasks <TASK> \
+  --n_samples 1 \
+  --temperature 0.2 \
+  --batch_size 1
+```
+If you ever get index out-of-range errors try using a number of problems `limit` that is proportional to the number of devices you are using.
 
 ## Documentation generation task
 Code to text task from [CodeXGLUE](https://huggingface.co/datasets/code_x_glue_ct_code_to_text) is a benchmark for English documentation generation for 6 programming languages: Python, Go, Ruby, Java, JavaScript and PHP.