bigcode-project
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎bigcode_eval/tasks/apps.py‎
Lines changed: 7 additions & 4 deletions b/‎bigcode_eval/tasks/apps.py‎
Lines changed: 7 additions & 4 deletions
diff --git a/‎bigcode_eval/tasks/codexglue_code_to_text.py‎
Lines changed: 2 additions & 2 deletions b/‎bigcode_eval/tasks/codexglue_code_to_text.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎bigcode_eval/tasks/codexglue_text_to_text.py‎
Lines changed: 6 additions & 4 deletions b/‎bigcode_eval/tasks/codexglue_text_to_text.py‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎bigcode_eval/tasks/conala.py‎
Lines changed: 4 additions & 2 deletions b/‎bigcode_eval/tasks/conala.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎bigcode_eval/tasks/concode.py‎
Lines changed: 4 additions & 2 deletions b/‎bigcode_eval/tasks/concode.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎bigcode_eval/tasks/custom_metrics/code_eval.py‎
Lines changed: 189 additions & 0 deletions b/‎bigcode_eval/tasks/custom_metrics/code_eval.py‎
Lines changed: 189 additions & 0 deletions
@@ -166,3 +166,5 @@ cython_debug/
 # Script outputs
 evaluation*.json
 generations*.json
+
+playground/
@@ -36,8 +36,8 @@ def create_all_tasks():
 
 def create_task(level):
     class APPS(GeneralAPPS):
-        def __init__(self):
-            super().__init__(level)
+        def __init__(self, **kwargs):
+            super().__init__(level, **kwargs)
 
     return APPS
 
@@ -50,12 +50,13 @@ class GeneralAPPS(Task):
     DATASET_PATH = "codeparrot/apps"
     DATASET_NAME = None
 
-    def __init__(self, level):
+    def __init__(self, level, k_list=[1, 10, 100]):
         self.DATASET_NAME = level
         super().__init__(
             stop_words=["\nQUESTION", "\n---", "\nANSWER"],
             requires_execution=True,
         )
+        self.k_list = k_list
 
     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
@@ -115,7 +116,9 @@ def process_results(self, generations, references):
             list of str containing references (not needed for APPS Task)
         """
         code_metric = load("codeparrot/apps_metric")
+        if level is None:
+            level = self.DATASET_NAME
         results = code_metric.compute(
-            predictions=generations, k_list=[1, 10, 100], level=self.DATASET_NAME
+            predictions=generations, k_list=self.k_list, level=self.DATASET_NAME
         )
         return results
@@ -46,8 +46,8 @@ def create_all_tasks():
 
 def create_task(language):
     class CodeToText(GeneralCodeToText):
-        def __init__(self):
-            super().__init__(language)
+        def __init__(self, **kwargs):
+            super().__init__(language, **kwargs)
 
     return CodeToText
 
 
@@ -40,8 +40,8 @@ def create_all_tasks():
 
 def create_task(translation_task):
     class CodexglueTextToTextTask(CodexglueTextToText):
-        def __init__(self):
-            super().__init__(translation_task)
+        def __init__(self, **kwargs):
+            super().__init__(translation_task, **kwargs)
 
     return CodexglueTextToTextTask
 
@@ -51,11 +51,13 @@ class CodexglueTextToText(Task):
     DATASET_PATH = "code_x_glue_tt_text_to_text"
     DATASET_NAME = None
 
-    def __init__(self, translation_task):
+    def __init__(self, translation_task, max_order=4, smooth=True):
         self.DATASET_NAME = translation_task
         stop_words = ["\n"]
         requires_execution = False
         super().__init__(stop_words, requires_execution)
+        self.max_order = max_order
+        self.smooth = smooth
 
     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
@@ -117,6 +119,6 @@ def process_results(self, generations, references):
         bleu = load("bleu")
         gens = [gen[0] for gen in generations]
         results = bleu.compute(
-            references=references, predictions=gens, max_order=4, smooth=True
+            references=references, predictions=gens, max_order=self.max_order, smooth=self.smooth
         )
         return results
@@ -34,11 +34,13 @@ class Conala(Task):
 
     DATASET_PATH = "neulab/conala"
 
-    def __init__(self):
+    def __init__(self, max_order=4, smooth=True):
         super().__init__(
             stop_words=["\n"],
             requires_execution=False,
         )
+        self.max_order = max_order
+        self.smooth = smooth
 
     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
@@ -101,6 +103,6 @@ def process_results(self, generations, references):
         bleu = load("bleu")
         gens = [gen[0] for gen in generations]
         results = bleu.compute(
-            references=references, predictions=gens, max_order=4, smooth=True
+            references=references, predictions=gens, max_order=self.max_order, smooth=self.smooth
         )
         return results
@@ -33,11 +33,13 @@ class Concode(Task):
 
     DATASET_PATH = "code_x_glue_tc_text_to_code"
 
-    def __init__(self):
+    def __init__(self, max_order=4, smooth=True):
         super().__init__(
             stop_words=["\n"],
             requires_execution=False,
         )
+        self.max_order = max_order
+        self.smooth = smooth
 
     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
@@ -102,6 +104,6 @@ def process_results(self, generations, references):
         bleu = load("bleu")
         gens = [gen[0] for gen in generations]
         results = bleu.compute(
-            references=references, predictions=gens, max_order=4, smooth=True
+            references=references, predictions=gens, max_order=self.max_order, smooth=self.smooth
         )
         return results
@@ -0,0 +1,189 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The CodeEval metric estimates the pass@k metric for code synthesis.
+This is an evaluation harness for the HumanEval problem solving dataset
+described in the paper "Evaluating Large Language Models Trained on Code"
+(https://arxiv.org/abs/2107.03374)."""
+
+import itertools
+import os
+from collections import Counter, defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import numpy as np
+
+from .execute import check_correctness
+
+
+_CITATION = """\
+@misc{chen2021evaluating,
+      title={Evaluating Large Language Models Trained on Code},
+      author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan \
+and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards \
+and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray \
+and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf \
+and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray \
+and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser \
+and Mohammad Bavarian and Clemens Winter and Philippe Tillet \
+and Felipe Petroski Such and Dave Cummings and Matthias Plappert \
+and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss \
+and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak \
+and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain \
+and William Saunders and Christopher Hesse and Andrew N. Carr \
+and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa \
+and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati \
+and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei \
+and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
+      year={2021},
+      eprint={2107.03374},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+"""
+
+_DESCRIPTION = """\
+This metric implements the evaluation harness for the HumanEval problem solving dataset
+described in the paper "Evaluating Large Language Models Trained on Code"
+(https://arxiv.org/abs/2107.03374).
+"""
+
+
+_KWARGS_DESCRIPTION = """
+Calculates how good are predictions given some references, using certain scores
+Args:
+    predictions: list of candidates to evaluate. Each candidates should be a list
+        of strings with several code candidates to solve the problem.
+    references: a list with a test for each prediction. Each test should evaluate the
+        correctness of a code candidate.
+    k: number of code candidates to consider in the evaluation (Default: [1, 10, 100])
+    num_workers: number of workers used to evaluate the canidate programs (Default: 4).
+    timeout:
+Returns:
+    pass_at_k: dict with pass rates for each k
+    results: dict with granular results of each unittest
+Examples:
+    >>> test_cases = ["assert add(2,3)==5"]
+    >>> candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]]
+    >>> pass_at_k, results = compute_code_eval(references=test_cases, predictions=candidates, k=[1, 2])
+    >>> print(pass_at_k)
+    {'pass@1': 0.5, 'pass@2': 1.0}
+"""
+
+
+_WARNING = """
+################################################################################
+                                  !!!WARNING!!!
+################################################################################
+The "code_eval" metric executes untrusted model-generated code in Python.
+Although it is highly unlikely that model-generated code will do something
+overtly malicious in response to this test suite, model-generated code may act
+destructively due to a lack of model capability or alignment.
+Users are strongly encouraged to sandbox this evaluation suite so that it
+does not perform destructive actions on their host or network. For more
+information on how OpenAI sandboxes its code, see the paper "Evaluating Large
+Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
+
+Once you have read this disclaimer and taken appropriate precautions,
+set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this
+with:
+
+>>> import os
+>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+
+################################################################################\
+"""
+
+_LICENSE = """The MIT License
+
+Copyright (c) OpenAI (https://openai.com)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE."""
+
def compute_code_eval(predictions, references, k=(1, 10, 100), num_workers=4, timeout=3.0):
    """Run every candidate program against its test and estimate pass@k.

    Args:
        predictions: one entry per problem; each entry is an iterable of
            candidate source strings for that problem.
        references: one test-program string per problem. It is appended to
            each candidate (separated by a newline) to form the program that
            is handed to ``check_correctness``.
        k: a single value, or a list/tuple of values, of k to report.
        num_workers: size of the thread pool that dispatches
            ``check_correctness``.
        timeout: forwarded unchanged to ``check_correctness`` for every program.

    Returns:
        A ``(pass_at_k, results)`` tuple. ``pass_at_k`` maps ``"pass@{k}"`` to
        the mean estimate over all problems and only contains the values of k
        for which every problem has at least k candidates. ``results`` maps
        each task id (the problem's position in ``predictions``) to a list of
        ``(completion_id, result)`` tuples sorted by completion id, where
        ``result`` is whatever ``check_correctness`` returned.

    Raises:
        ValueError: if the environment variable ``HF_ALLOW_CODE_EVAL`` is not
            set to ``"1"``.
        NotImplementedError: when running on Windows (``os.name == "nt"``).
    """
    # Executing generated code is opt-in; refuse unless explicitly enabled.
    if os.getenv("HF_ALLOW_CODE_EVAL") != "1":
        raise ValueError(_WARNING)

    if os.name == "nt":
        raise NotImplementedError("This metric is currently not supported on Windows.")

    results = defaultdict(list)
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        # zip() pairs problems with tests positionally and stops at the shorter input.
        for task_id, (candidates, test_case) in enumerate(zip(predictions, references)):
            for completion_id, candidate in enumerate(candidates):
                test_program = candidate + "\n" + test_case
                futures.append(
                    executor.submit(check_correctness, test_program, timeout, task_id, completion_id)
                )

        # Futures finish in arbitrary order; regroup them per task here and
        # restore the submission order below.
        for future in as_completed(futures):
            result = future.result()
            results[result["task_id"]].append((result["completion_id"], result))

    total, correct = [], []
    for task_results in results.values():
        # Sort on the completion id only, so the result dicts are never compared.
        task_results.sort(key=lambda item: item[0])
        passed = [outcome["passed"] for _, outcome in task_results]
        total.append(len(passed))
        correct.append(sum(passed))
    total = np.array(total)
    correct = np.array(correct)

    ks = k if isinstance(k, (list, tuple)) else [k]
    # A k is reported only when every problem has at least k samples.
    pass_at_k = {
        f"pass@{k_value}": estimate_pass_at_k(total, correct, k_value).mean()
        for k_value in ks
        if (total >= k_value).all()
    }

    return pass_at_k, results
+
+
def estimate_pass_at_k(num_samples, num_correct, k):
    """Estimate pass@k for each problem and return the estimates as an array.

    Args:
        num_samples: number of samples generated per problem. Either a single
            integer (Python ``int`` or NumPy integer scalar) shared by all
            problems, or a sequence holding one count per problem.
        num_correct: sequence with the number of passing samples per problem.
        k: the k in pass@k.

    Returns:
        A NumPy array with ``1 - comb(n - c, k) / comb(n, k)`` for every
        problem, in the order of ``num_correct``.

    Note:
        The estimate is 1.0 whenever ``n - c < k``. That includes every
        problem with ``k > n``, even when no sample passed, so callers should
        only request values of k that do not exceed the per-problem sample
        count.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
        if n - c < k:
            # Fewer than k failing samples: any k-subset contains a passing one.
            return 1.0
        # comb(n - c, k) / comb(n, k) evaluated as a running product rather
        # than by computing the two binomial coefficients.
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    # NumPy integer scalars are not ``int`` instances and have no len(), so
    # they must be recognised explicitly as "one count for all problems".
    if isinstance(num_samples, (int, np.integer)):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])