 # https://github.com/openai/human-eval/blob/master/human_eval/execution.py
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Optional, Dict, Any, List
 from multiprocessing import Manager, Process
+from typing import Optional, Dict
 from tqdm import tqdm
 import numpy as np
 import faulthandler
 import time
 import os
 import io
+
 CITATION = """
 @article{du2024mercury,
     title={Mercury: An Efficiency Benchmark for LLM Code Synthesis},
 }
 """
 
-
 # Timeout Exception
 class TimeoutException(Exception):
     """ Raise for TimeoutException """
@@ -56,7 +56,6 @@ def readable(self, *args, **kwargs):
5656 """ Returns True if the IO object can be read. """
5757 return False
5858
59-
6059class Sandbox (object ):
6160 @staticmethod
6261 @contextlib .contextmanager
@@ -392,8 +391,7 @@ def compute_beyond_eval(generations_list, reference_list, timeout=10):
             "timeout": timeout,
         }
 
-        results = [sandbox.run_sample(sample) for _ in range(3)]
-        print(results[0])
+        results = [sandbox.run_sample(sample) for _ in range(5)]
         t_c += 1
 
         # Calculate Beyond
@@ -418,11 +416,6 @@ def compute_beyond_eval(generations_list, reference_list, timeout=10):
         scores['Average']['correct_c'] += [p_c]
         scores['Average']['beyond_c'] += [b_l]
 
-        # print(f'total: {t_c}')
-        # print(f'correct: {p_c}')
-        # print(f'beyond: {b_l}')
-        # print("-" * 60)
-
     results = dict()
     for difficulty in ['Easy', "Medium", "Hard", "Average"]:
         total = np.array(scores[difficulty]['total_c'])
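
For reference, the `TimeoutException` defined above is raised by the signal-based time limit this file inherits from the upstream human-eval `execution.py` linked in the header. A rough sketch of that mechanism is below; it paraphrases the upstream file, not this commit, and the Sandbox's `@contextlib.contextmanager` helper follows the same pattern:

    import contextlib
    import signal

    class TimeoutException(Exception):
        """ Raise for TimeoutException """

    @contextlib.contextmanager
    def time_limit(seconds: float):
        # SIGALRM fires once the real-time interval timer expires.
        def signal_handler(signum, frame):
            raise TimeoutException("Timed out!")
        signal.setitimer(signal.ITIMER_REAL, seconds)
        signal.signal(signal.SIGALRM, signal_handler)
        try:
            yield
        finally:
            # Disarm the timer so a late alarm cannot leak out of the block.
            signal.setitimer(signal.ITIMER_REAL, 0)

Note that `signal.setitimer` is Unix-only and must run on the main thread, which is one reason harnesses like this execute untrusted samples in a separate `Process` (hence the `multiprocessing` import above) rather than only inside a `ThreadPoolExecutor`.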