import json
import numpy as np
from tqdm import tqdm
from ast import literal_eval
from glob import glob
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, Features, Value, Sequence, DatasetDict

from utils import *

VERSION = "v0.1.0_hf"
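

# Mark each model for "direct completion": an HF-hosted model whose tokenizer
# has no chat template cannot be prompted in chat format, so its results are
# read from the plain (non-calibrated) files in read_task_perf below.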
def update_model_info(model_info):
    for model, info in model_info.items():
        if "https://huggingface.co/" in info["link"]:
            hf_model = info["link"].split("https://huggingface.co/")[-1]
            print(hf_model)
            tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
            model_info[model]["direct_complete"] = tokenizer.chat_template is None
        else:
            model_info[model]["direct_complete"] = False

    return model_info


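# Encode one text column with multi-process SentenceTransformer workers and
# store the (id, embedding) pairs as a datasets.Dataset, either pushed to the
# Hub under the bigcode org or saved to disk.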
def embed_sentences(data, col_name, id_name, model, save_path, push_to_hub=False):
    pool = model.start_multi_process_pool()
    embeddings = model.encode_multi_process(data[col_name], pool=pool)
    # Shut down the worker processes once encoding is done.
    model.stop_multi_process_pool(pool)
    qids = data[id_name]
    features = Features({id_name: Value(dtype="string"), "embeddings": Sequence(Value("float32"))})
    embed_dict = {
        id_name: qids,
        "embeddings": embeddings,
    }
    embed_ds = Dataset.from_dict(embed_dict, features=features)
    if push_to_hub:
        embed_ds.push_to_hub(f"bigcode/{save_path}")
    else:
        embed_ds.save_to_disk(save_path)
    return embed_ds


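# For each query embedding, find the single highest-scoring document by dot
# product; returns (query index, doc, score) triples.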
def get_top_docs(query_embs, doc_emb, docs):
    scores = np.dot(query_embs, doc_emb.T)
    top_doc_indices = np.argmax(scores, axis=1)
    top_scores = scores[np.arange(len(scores)), top_doc_indices]
    results = [(i, docs[doc_idx], score) for i, (doc_idx, score) in tqdm(enumerate(zip(top_doc_indices, top_scores)))]

    return results


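# Keep only the results whose score falls strictly above the top-k-percent
# score threshold.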
def filter_top_k_percent(results, k_percent):
    all_scores = [score for _, _, score in results]
    threshold = np.percentile(all_scores, 100 - k_percent)
    filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold]
    return filtered_results


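# Keep only the results whose score exceeds a fixed threshold.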
def filter_top_threshold(results, threshold):
    filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold]
    return filtered_results


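# Read each model's per-task pass/fail results and return its mean solve rate
# over the given task ids, sorted best-first. Prompted (chat) models prefer the
# calibrated result files and fall back to the plain sanitized files.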
def read_task_perf(tids, task="complete"):
    model_results = dict()
    for model, info in model_info.items():
        if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
            continue
        # All 1140 BigCodeBench tasks default to failing until a result is read.
        task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)}
        model = model.replace("/", "--")
        # if info["link"].startswith("https://huggingface.co/"):
        #     model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
        try:
            if info["prompted"] and not info["direct_complete"]:
                files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
                if files:
                    file = files[0]
                else:
                    file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
            else:
                file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
        except IndexError:
            # No result file for this model/task combination.
            continue
        with open(file, "r") as f:
            data = json.load(f)
        for task_id, perfs in data["eval"].items():
            status = 1 if perfs[0]["status"] == "pass" else 0
            task_perf[task_id] = status
        model_results[info["name"]] = np.mean([status for tid, status in task_perf.items() if tid in tids])
    return sorted(model_results.items(), key=lambda x: x[1], reverse=True)


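# Build the BigCodeBench-Hard subset: embed Stack Exchange questions and
# BigCodeBench prompts, retrieve the closest task for each question, keep
# confidently matched tasks that are also hard (multi-library, long solution,
# low solve rate), publish the subset, and rank models on it.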
if __name__ == "__main__":
    bcb = load_dataset("bigcode/bigcodebench", trust_remote_code=True, split=VERSION)
    se = load_dataset("bigcode/stack-exchange-preferences-20230914-clean-anonymization", trust_remote_code=True, split="train")
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

    model_info = update_model_info(model_info)
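
    # Embed the SE questions and the BCB complete prompts with the same
    # encoder so the two embedding spaces are directly comparable.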
    se_embed = embed_sentences(se, "question", "qid", model, "stack-exchange-embeddings-20230914", push_to_hub=True)
    bcb_embed = embed_sentences(bcb, "complete_prompt", "task_id", model, "bigcodebench-doc-embeddings", push_to_hub=True)

    solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", trust_remote_code=True, split="complete")
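
    # For every SE question, retrieve its closest BCB task and push the raw
    # (qid, tid, score) triples to the Hub.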
    query_embs = np.array(se_embed["embeddings"])
    doc_emb = np.array(bcb_embed["embeddings"])
    docs = bcb_embed["task_id"]
    retrieval_results = get_top_docs(query_embs, doc_emb, docs)

    Dataset.from_dict({
        "qid": [i for i, _, _ in retrieval_results],
        "tid": [doc for _, doc, _ in retrieval_results],
        "score": [score for _, _, score in retrieval_results],
    }).push_to_hub("bigcode/se_bcb_results")

    retrieval_ds = load_dataset("bigcode/se_bcb_results", trust_remote_code=True, split="train")
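
    # For each BCB task, keep only its single best-matching SE question, and
    # only if the similarity score exceeds 0.7.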
    top_results = dict()
    for sample in tqdm(retrieval_ds):
        i, doc, score = sample["qid"], sample["tid"], sample["score"]
        if score > 0.7 and (doc not in top_results or score > top_results[doc][2]):
            top_results[doc] = (i, doc, score)

    top_id = {task_id: (qid, score) for qid, task_id, score in top_results.values()}
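
    # Hardness filters: tasks using more than 2 libraries, canonical solutions
    # longer than 426 characters, and a complete-split solve rate below 50%.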
    hard_lib_filter = {sample["task_id"] for sample in bcb if len(literal_eval(sample["libs"])) > 2}
    hard_length_filter = {sample["task_id"] for sample in bcb if len(sample["canonical_solution"]) > 426}
    hard_rate_filter = {task["task_id"]: task["solve_rate"] for task in solve_rate if task["solve_rate"] < 50}

    hard_tid = top_id.keys() & hard_length_filter & hard_rate_filter.keys() & hard_lib_filter
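
    # Assemble the hard subset, pairing each task with its retrieved SE
    # question and similarity score, and publish it.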
    hard_bcb = bcb.filter(lambda x: x["task_id"] in hard_tid)
    hard_bcb_tid = hard_bcb["task_id"]
    hard_se_qid = [top_id[_id][0] for _id in hard_bcb_tid]
    hard_se_q = se.select(hard_se_qid)
    hard_se_scores = [top_id[_id][1] for _id in hard_bcb_tid]
    hard_bcb_dict = {
        "task_id": hard_bcb_tid,
        "complete_prompt": hard_bcb["complete_prompt"],
        "instruct_prompt": hard_bcb["instruct_prompt"],
        "canonical_solution": hard_bcb["canonical_solution"],
        "code_prompt": hard_bcb["code_prompt"],
        "test": hard_bcb["test"],
        "entry_point": hard_bcb["entry_point"],
        "doc_struct": hard_bcb["doc_struct"],
        "libs": hard_bcb["libs"],
        "q_idx": hard_se_qid,
        "question": hard_se_q["question"],
        "score": hard_se_scores,
        "_id": hard_bcb_tid,
    }
    hard_bcb = Dataset.from_dict(hard_bcb_dict)
    DatasetDict({VERSION: hard_bcb}).push_to_hub("bigcode/bigcodebench-hard")
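
    # Score every model on the hard subset for both the complete and instruct
    # variants, then rank by the average of the two.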
    hard_complete_results = read_task_perf(hard_tid)
    hard_instruct_results = read_task_perf(hard_tid, task="instruct")

    complete_res_dict = {model: score for model, score in hard_complete_results}
    instruct_res_dict = {model: score for model, score in hard_instruct_results}
    avg_res_dict = {model: (complete_res_dict[model] + instruct_res_dict[model]) / 2 for model in complete_res_dict if model in instruct_res_dict}

    for model, score in sorted(avg_res_dict.items(), key=lambda x: x[1], reverse=True):
        print(model, round(score * 100, 1))