Commit 56ec144

Merge pull request #156 from thomwolf/merge-code-eval
Move code_eval metric from Evaluate to BigCode-eval-harness
2 parents c326b51 + ed9075b commit 56ec144

17 files changed: +477 additions, -51 deletions

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -166,3 +166,5 @@ cython_debug/
 # Script outputs
 evaluation*.json
 generations*.json
+
+playground/

bigcode_eval/tasks/apps.py

Lines changed: 7 additions & 4 deletions

@@ -36,8 +36,8 @@ def create_all_tasks():

 def create_task(level):
     class APPS(GeneralAPPS):
-        def __init__(self):
-            super().__init__(level)
+        def __init__(self, **kwargs):
+            super().__init__(level, **kwargs)

     return APPS

@@ -50,12 +50,13 @@ class GeneralAPPS(Task):
     DATASET_PATH = "codeparrot/apps"
     DATASET_NAME = None

-    def __init__(self, level):
+    def __init__(self, level, k_list=[1, 10, 100]):
         self.DATASET_NAME = level
         super().__init__(
             stop_words=["\nQUESTION", "\n---", "\nANSWER"],
             requires_execution=True,
         )
+        self.k_list = k_list

     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""

@@ -115,7 +116,9 @@ def process_results(self, generations, references):
             list of str containing refrences (not needed for APPS Task)
         """
         code_metric = load("codeparrot/apps_metric")
+        if level is None:
+            level = self.DATASET_NAME
         results = code_metric.compute(
-            predictions=generations, k_list=[1, 10, 100], level=self.DATASET_NAME
+            predictions=generations, k_list=self.k_list, level=self.DATASET_NAME
         )
         return results
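With this change, `create_task` forwards constructor keyword arguments, so the number of samples used for pass@k on APPS becomes configurable per task instance. A minimal sketch of how that path can be exercised (the level name "introductory" is an assumption for illustration, not something shown in this diff):

from bigcode_eval.tasks.apps import create_task

APPSIntroductory = create_task("introductory")   # the factory bakes the level into the class
task = APPSIntroductory(k_list=[1, 10])          # **kwargs forwarded to GeneralAPPS.__init__
# task.process_results(generations, references) now passes k_list=[1, 10] to the
# codeparrot/apps_metric instead of the previously hard-coded [1, 10, 100].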

bigcode_eval/tasks/codexglue_code_to_text.py

Lines changed: 2 additions & 2 deletions

@@ -46,8 +46,8 @@ def create_all_tasks():

 def create_task(language):
     class CodeToText(GeneralCodeToText):
-        def __init__(self):
-            super().__init__(language)
+        def __init__(self, **kwargs):
+            super().__init__(language, **kwargs)

     return CodeToText
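The same factory pattern appears in each task module this commit touches: `create_task` closes over a fixed configuration value (the language here, the difficulty level in apps.py) and returns a subclass whose `__init__` now forwards any remaining keyword arguments instead of dropping them. A usage sketch, with the language name "python" assumed for illustration (whether GeneralCodeToText accepts extra arguments is not shown in this diff):

from bigcode_eval.tasks.codexglue_code_to_text import create_task

CodeToTextPython = create_task("python")   # language fixed by the closure
task = CodeToTextPython()                  # any extra kwargs would now reach GeneralCodeToText.__init__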

bigcode_eval/tasks/codexglue_text_to_text.py

Lines changed: 6 additions & 4 deletions

@@ -40,8 +40,8 @@ def create_all_tasks():

 def create_task(translation_task):
     class CodexglueTextToTextTask(CodexglueTextToText):
-        def __init__(self):
-            super().__init__(translation_task)
+        def __init__(self, **kwargs):
+            super().__init__(translation_task, **kwargs)

     return CodexglueTextToTextTask

@@ -51,11 +51,13 @@ class CodexglueTextToText(Task):
     DATASET_PATH = "code_x_glue_tt_text_to_text"
     DATASET_NAME = None

-    def __init__(self, translation_task):
+    def __init__(self, translation_task, max_order=4, smooth=True):
         self.DATASET_NAME = translation_task
         stop_words = ["\n"]
         requires_execution = False
         super().__init__(stop_words, requires_execution)
+        self.max_order = max_order
+        self.smooth = smooth

     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""

@@ -117,6 +119,6 @@ def process_results(self, generations, references):
         bleu = load("bleu")
         gens = [gen[0] for gen in generations]
         results = bleu.compute(
-            references=references, predictions=gens, max_order=4, smooth=True
+            references=references, predictions=gens, max_order=self.max_order, smooth=self.smooth
         )
         return results
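The `max_order` and `smooth` values land in the Hugging Face `evaluate` BLEU metric. A standalone sketch of the call that `process_results` now makes, using toy data rather than the task's real generations:

from evaluate import load

bleu = load("bleu")

# Generations in the harness are a list of candidate lists; the task keeps only the first candidate.
generations = [["def add ( a , b ) : return a + b"]]
references = ["def add ( a , b ) : return a + b"]

gens = [gen[0] for gen in generations]
results = bleu.compute(
    references=references, predictions=gens, max_order=4, smooth=True
)
print(results["bleu"])  # 1.0 for an exact match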

bigcode_eval/tasks/conala.py

Lines changed: 4 additions & 2 deletions

@@ -34,11 +34,13 @@ class Conala(Task):

     DATASET_PATH = "neulab/conala"

-    def __init__(self):
+    def __init__(self, max_order=4, smooth=True):
         super().__init__(
             stop_words=["\n"],
             requires_execution=False,
         )
+        self.max_order = max_order
+        self.smooth = smooth

     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""

@@ -101,6 +103,6 @@ def process_results(self, generations, references):
         bleu = load("bleu")
         gens = [gen[0] for gen in generations]
         results = bleu.compute(
-            references=references, predictions=gens, max_order=4, smooth=True
+            references=references, predictions=gens, max_order=self.max_order, smooth=self.smooth
         )
         return results

bigcode_eval/tasks/concode.py

Lines changed: 4 additions & 2 deletions

@@ -33,11 +33,13 @@ class Concode(Task):

     DATASET_PATH = "code_x_glue_tc_text_to_code"

-    def __init__(self):
+    def __init__(self, max_order=4, smooth=True):
         super().__init__(
             stop_words=["\n"],
             requires_execution=False,
         )
+        self.max_order = max_order
+        self.smooth = smooth

     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""

@@ -102,6 +104,6 @@ def process_results(self, generations, references):
         bleu = load("bleu")
         gens = [gen[0] for gen in generations]
         results = bleu.compute(
-            references=references, predictions=gens, max_order=4, smooth=True
+            references=references, predictions=gens, max_order=self.max_order, smooth=self.smooth
        )
         return results
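Since Conala and Concode now expose the BLEU settings as constructor arguments, they can be varied without editing the task code. A small sketch (direct instantiation assumed here; the harness may construct tasks through its own registry):

from bigcode_eval.tasks.conala import Conala
from bigcode_eval.tasks.concode import Concode

default_task = Concode()                        # keeps the old behaviour: max_order=4, smooth=True
custom_task = Conala(max_order=2, smooth=False) # forwarded into bleu.compute() by process_results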
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""The CodeEval metric estimates the pass@k metric for code synthesis.
15+
This is an evaluation harness for the HumanEval problem solving dataset
16+
described in the paper "Evaluating Large Language Models Trained on Code"
17+
(https://arxiv.org/abs/2107.03374)."""
18+
19+
import itertools
20+
import os
21+
from collections import Counter, defaultdict
22+
from concurrent.futures import ThreadPoolExecutor, as_completed
23+
24+
import numpy as np
25+
26+
from .execute import check_correctness
27+
28+
29+
_CITATION = """\
30+
@misc{chen2021evaluating,
31+
title={Evaluating Large Language Models Trained on Code},
32+
author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan \
33+
and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards \
34+
and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray \
35+
and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf \
36+
and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray \
37+
and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser \
38+
and Mohammad Bavarian and Clemens Winter and Philippe Tillet \
39+
and Felipe Petroski Such and Dave Cummings and Matthias Plappert \
40+
and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss \
41+
and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak \
42+
and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain \
43+
and William Saunders and Christopher Hesse and Andrew N. Carr \
44+
and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa \
45+
and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati \
46+
and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei \
47+
and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
48+
year={2021},
49+
eprint={2107.03374},
50+
archivePrefix={arXiv},
51+
primaryClass={cs.LG}
52+
}
53+
"""
54+
55+
_DESCRIPTION = """\
56+
This metric implements the evaluation harness for the HumanEval problem solving dataset
57+
described in the paper "Evaluating Large Language Models Trained on Code"
58+
(https://arxiv.org/abs/2107.03374).
59+
"""
60+
61+
62+
_KWARGS_DESCRIPTION = """
63+
Calculates how good are predictions given some references, using certain scores
64+
Args:
65+
predictions: list of candidates to evaluate. Each candidates should be a list
66+
of strings with several code candidates to solve the problem.
67+
references: a list with a test for each prediction. Each test should evaluate the
68+
correctness of a code candidate.
69+
k: number of code candidates to consider in the evaluation (Default: [1, 10, 100])
70+
num_workers: number of workers used to evaluate the canidate programs (Default: 4).
71+
timeout:
72+
Returns:
73+
pass_at_k: dict with pass rates for each k
74+
results: dict with granular results of each unittest
75+
Examples:
76+
>>> test_cases = ["assert add(2,3)==5"]
77+
>>> candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]]
78+
>>> pass_at_k, results = compute_code_eval(references=test_cases, predictions=candidates, k=[1, 2])
79+
>>> print(pass_at_k)
80+
{'pass@1': 0.5, 'pass@2': 1.0}
81+
"""
82+
83+
84+
_WARNING = """
85+
################################################################################
86+
!!!WARNING!!!
87+
################################################################################
88+
The "code_eval" metric executes untrusted model-generated code in Python.
89+
Although it is highly unlikely that model-generated code will do something
90+
overtly malicious in response to this test suite, model-generated code may act
91+
destructively due to a lack of model capability or alignment.
92+
Users are strongly encouraged to sandbox this evaluation suite so that it
93+
does not perform destructive actions on their host or network. For more
94+
information on how OpenAI sandboxes its code, see the paper "Evaluating Large
95+
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
96+
97+
Once you have read this disclaimer and taken appropriate precautions,
98+
set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this
99+
with:
100+
101+
>>> import os
102+
>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
103+
104+
################################################################################\
105+
"""
106+
107+
_LICENSE = """The MIT License
108+
109+
Copyright (c) OpenAI (https://openai.com)
110+
111+
Permission is hereby granted, free of charge, to any person obtaining a copy
112+
of this software and associated documentation files (the "Software"), to deal
113+
in the Software without restriction, including without limitation the rights
114+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
115+
copies of the Software, and to permit persons to whom the Software is
116+
furnished to do so, subject to the following conditions:
117+
118+
The above copyright notice and this permission notice shall be included in
119+
all copies or substantial portions of the Software.
120+
121+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
122+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
123+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
124+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
125+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
126+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
127+
THE SOFTWARE."""
128+
129+
def compute_code_eval(predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
130+
"""Returns the scores"""
131+
132+
if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
133+
raise ValueError(_WARNING)
134+
135+
if os.name == "nt":
136+
raise NotImplementedError("This metric is currently not supported on Windows.")
137+
138+
with ThreadPoolExecutor(max_workers=num_workers) as executor:
139+
futures = []
140+
completion_id = Counter()
141+
n_samples = 0
142+
results = defaultdict(list)
143+
144+
for task_id, (candidates, test_case) in enumerate(zip(predictions, references)):
145+
for candidate in candidates:
146+
test_program = candidate + "\n" + test_case
147+
args = (test_program, timeout, task_id, completion_id[task_id])
148+
future = executor.submit(check_correctness, *args)
149+
futures.append(future)
150+
completion_id[task_id] += 1
151+
n_samples += 1
152+
153+
for future in as_completed(futures):
154+
result = future.result()
155+
results[result["task_id"]].append((result["completion_id"], result))
156+
157+
total, correct = [], []
158+
for result in results.values():
159+
result.sort()
160+
passed = [r[1]["passed"] for r in result]
161+
total.append(len(passed))
162+
correct.append(sum(passed))
163+
total = np.array(total)
164+
correct = np.array(correct)
165+
166+
ks = k
167+
if not isinstance(ks, (list, tuple)):
168+
ks = [ks]
169+
pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in ks if (total >= k).all()}
170+
171+
return pass_at_k, results
172+
173+
174+
def estimate_pass_at_k(num_samples, num_correct, k):
175+
"""Estimates pass@k of each problem and returns them in an array."""
176+
177+
def estimator(n: int, c: int, k: int) -> float:
178+
"""Calculates 1 - comb(n - c, k) / comb(n, k)."""
179+
if n - c < k:
180+
return 1.0
181+
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
182+
183+
if isinstance(num_samples, int):
184+
num_samples_it = itertools.repeat(num_samples, len(num_correct))
185+
else:
186+
assert len(num_samples) == len(num_correct)
187+
num_samples_it = iter(num_samples)
188+
189+
return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
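For orientation, a usage sketch of the metric added above. The import is hypothetical since the new file's path is not shown in this view; running candidate programs requires the opt-in environment variable and should be sandboxed as the warning text says. The second half spells out the unbiased pass@k estimator, pass@k = 1 - C(n-c, k) / C(n, k), on the docstring's example:

import os

os.environ["HF_ALLOW_CODE_EVAL"] = "1"  # explicit opt-in: the metric executes generated code

# Hypothetical import; substitute the actual module path of the file added above.
from code_eval import compute_code_eval, estimate_pass_at_k

test_cases = ["assert add(2,3)==5"]
candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]]

pass_at_k, results = compute_code_eval(references=test_cases, predictions=candidates, k=[1, 2])
print(pass_at_k)  # {'pass@1': 0.5, 'pass@2': 1.0}

# The estimator behind those numbers: with n samples and c correct,
# pass@k = 1 - C(n - c, k) / C(n, k). For n=2, c=1: pass@1 = 1 - 1/2 = 0.5, pass@2 = 1.0.
print(estimate_pass_at_k(num_samples=[2], num_correct=[1], k=1))  # [0.5]
print(estimate_pass_at_k(num_samples=[2], num_correct=[1], k=2))  # [1.]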
