 import math
 from datasets import Dataset, DatasetDict, load_dataset
 from transformers import AutoTokenizer
-from cuml.linear_model import LogisticRegression
-import cupy as cp
-
 
 def update_model_info(model_info):
     for model, info in model_info.items():
@@ -142,17 +139,17 @@ def split_gen():
142139 if "calibrated" in file :
143140 if info ["prompted" ]:
144141 if suffix .startswith ("complete" ):
145- with open (f"sanitized_calibrated_samples/complete/{ model } --bigcodebench* -{ suffix } " , "w" ) as f :
142+ with open (f"sanitized_calibrated_samples/complete/{ model } --bigcodebench-{ suffix } " , "w" ) as f :
146143 f .writelines (data )
147144 else :
148- with open (f"sanitized_calibrated_samples/instruct/{ model } --bigcodebench* -{ suffix } " , "w" ) as f :
145+ with open (f"sanitized_calibrated_samples/instruct/{ model } --bigcodebench-{ suffix } " , "w" ) as f :
149146 f .writelines (data )
150147 else :
151148 if suffix .startswith ("complete" ):
152- with open (f"sanitized_samples/complete/{ model } --bigcodebench* -{ suffix } " , "w" ) as f :
149+ with open (f"sanitized_samples/complete/{ model } --bigcodebench-{ suffix } " , "w" ) as f :
153150 f .writelines (data )
154151 else :
155- with open (f"sanitized_samples/instruct/{ model } --bigcodebench* -{ suffix } " , "w" ) as f :
152+ with open (f"sanitized_samples/instruct/{ model } --bigcodebench-{ suffix } " , "w" ) as f :
156153 f .writelines (data )
157154
158155
@@ -221,95 +218,6 @@ def read_task_perf(tids, task="complete"):
     return model_results, result_files
 
 
-def get_winner_df(data_dict, tids, task, task_level=True, no_tie=True):
-    winner_dict = {"task_id": [], "model_a": [], "model_b": [], "winner": []}
-    if not task_level:
-        file = f"{task}_winner_df.csv"
-    else:
-        file = f"{task}_winner_task_df.csv"
-
-    if task_level:
-        for task_id in tqdm(tids):
-            # pair without repetition (a, b) and (b, a) are the same
-            for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
-                solve_rate_a = data_dict[model_a][task_id]
-                solve_rate_b = data_dict[model_b][task_id]
-
-                if solve_rate_a > solve_rate_b:
-                    winner_dict["winner"].append("model_a")
-                elif solve_rate_a < solve_rate_b:
-                    winner_dict["winner"].append("model_b")
-                else:
-                    if no_tie:
-                        continue
-                    winner_dict["winner"].append("tie")
-
-                winner_dict["task_id"].append(task_id)
-                winner_dict["model_a"].append(model_a)
-                winner_dict["model_b"].append(model_b)
-    else:
-        data_dict = {model: np.mean(list(task_perf.values())) for model, task_perf in data_dict.items()}
-        for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
-            solve_rate_a = data_dict[model_a]
-            solve_rate_b = data_dict[model_b]
-
-            if solve_rate_a > solve_rate_b:
-                winner_dict["winner"].append("model_a")
-            elif solve_rate_a < solve_rate_b:
-                winner_dict["winner"].append("model_b")
-            else:
-                if no_tie:
-                    continue
-                winner_dict["winner"].append("tie")
-            winner_dict["task_id"].append(task)
-            winner_dict["model_a"].append(model_a)
-            winner_dict["model_b"].append(model_b)
-
-    df = pd.DataFrame(winner_dict)
-    df.to_csv(file, index=False)
-    return df
-
-
-def get_bootstrap_result(battles, func_compute_elo, num_round):
-    rows = []
-    for i in tqdm(range(num_round), desc="bootstrap"):
-        rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True)))
-    df = pd.DataFrame(rows)
-    return df[df.median().sort_values(ascending=False).index]
-
-
-def get_elo_mle(df, SCALE=400, BASE=10, INIT_RATING=1000):
-    models = pd.concat([df["model_a"], df["model_b"]]).unique()
-    models = pd.Series(np.arange(len(models)), index=models)
-    p = len(models.index)
-    n = df.shape[0]
-
-    X = cp.zeros([n, p])
-    X[cp.arange(n), models[df["model_a"]]] = +math.log(BASE)
-    X[cp.arange(n), models[df["model_b"]]] = -math.log(BASE)
-
-    Y = cp.zeros(n)
-    Y[df["winner"] == "model_a"] = 1.0
-
-    lr = LogisticRegression(fit_intercept=False)
-    lr.fit(X, Y)
-
-    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-
-    return pd.Series(cp.asnumpy(elo_scores), index=models.index).sort_values(ascending=False)
-
-
-def update_elo_rating(results, elo_dict):
-    for model, info in model_info.items():
-        if info["name"] not in elo_dict:
-            results[info["name"]]["elo_mle"] = None
-        else:
-            results[info["name"]]["elo_mle"] = elo_dict[info["name"]]
-    return results
-
-
313221def get_domain_perf (data_dict , task2domain ):
314222 domain_perfs = {
315223 "Model" : [],
@@ -347,7 +255,7 @@ def get_solve_rate(data_dict, task="complete"):
 
 def get_hf_ds(results):
     hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [],  # "lazy": [], # "direct_complete": [],
-                  "complete": [], "instruct": [], "elo_mle": []}
+                  "complete": [], "instruct": []}
 
     for model, result in results.items():
         hf_dataset["model"].append(model)
@@ -360,7 +268,6 @@ def get_hf_ds(results):
         hf_dataset["complete"].append(result["pass@1"]["complete"])
         hf_dataset["instruct"].append(result["pass@1"]["instruct"])
         # hf_dataset["direct_complete"].append(result["direct_complete"])
-        hf_dataset["elo_mle"].append(result["elo_mle"])
 
     return Dataset.from_dict(hf_dataset)
 
@@ -395,7 +302,7 @@ def get_perf_df(data_dict):
 
 
 if __name__ == "__main__":
-    split_gen()
+    # split_gen()
     bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1")
     bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1")
     bcb_config = {
@@ -429,28 +336,7 @@ def get_perf_df(data_dict):
         instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
         solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
         push_ds(solve_rate_ds, f"bigcode/bigcodebench{suffix}-solve-rate")
-
-        elo_config = {
-            "task_no_tie": (True, True),
-            "benchmark_tie": (False, False),
-        }
-        elo_ds = dict()
-        for config, (task_level, no_tie) in elo_config.items():
-            filter_complete_data = {model: task_perf for model, task_perf in complete_data.items() if model in instruct_data}
-            complete_battles = get_winner_df(filter_complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
-            instruct_battles = get_winner_df(instruct_data, bcb["task_id"], "instruct", task_level=task_level, no_tie=no_tie)
-            battles = pd.concat([complete_battles, instruct_battles])
-            elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
-            bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
-            bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
-            bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
-            if config == "task_no_tie":
-                task_elo = bootstrap_lu_median_dict
-            elo = get_bootstrap_scores(elo_mle_bootstrap)
-            elo_ds[config] = elo
-        push_ds(DatasetDict(elo_ds), f"bigcode/bigcodebench{suffix}-elo")
 
-        results = update_elo_rating(results, task_elo)
         with open(f"results{suffix}.json", "w") as f:
             json.dump(results, f, indent=4)
         ds = get_hf_ds(results)
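
For context on what this commit deletes: `get_elo_mle()` fit a Bradley-Terry model to the pairwise battles built by `get_winner_df()`, using GPU logistic regression (cuML/CuPy), and mapped the fitted log-odds onto the conventional Elo scale. Below is a minimal CPU-only sketch of the same computation with scikit-learn standing in for cuML; `elo_mle_cpu` is a hypothetical name, and the input frame is only assumed to carry the `model_a`/`model_b`/`winner` columns from the deleted code.

import math

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def elo_mle_cpu(battles, scale=400, base=10, init_rating=1000):
    # One column per model; each battle contributes one row.
    models = pd.concat([battles["model_a"], battles["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)
    n = battles.shape[0]

    # +log(base) marks model_a, -log(base) marks model_b, so a unit
    # difference in fitted coefficients is one factor of `base` in odds.
    X = np.zeros((n, len(models)))
    X[np.arange(n), models[battles["model_a"]]] = +math.log(base)
    X[np.arange(n), models[battles["model_b"]]] = -math.log(base)

    # Label a battle 1 when model_a won; anything else (including ties,
    # if present) counts as 0, matching the deleted GPU version.
    y = (battles["winner"] == "model_a").astype(float).to_numpy()

    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, y)

    # Rescale the fitted log-odds strengths onto the Elo scale.
    return pd.Series(scale * lr.coef_[0] + init_rating,
                     index=models.index).sort_values(ascending=False)

Passed as the `func_compute_elo` argument of the deleted `get_bootstrap_result(battles, elo_mle_cpu, 500)`, this would reproduce the bootstrap medians the script previously pushed to `bigcode/bigcodebench{suffix}-elo`.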