Skip to content

Commit 4da429c

Browse files
committed
feat: collect eval results
1 parent bca1859 commit 4da429c

File tree

1 file changed

+13
-3
lines changed

1 file changed

+13
-3
lines changed

analysis/get_results.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -144,6 +144,7 @@ def split_gen():
144144

145145
def read_task_perf(task="complete"):
146146
model_results = dict()
147+
result_files = []
147148
for model, info in model_info.items():
148149
if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
149150
continue
@@ -164,13 +165,14 @@ def read_task_perf(task="complete"):
164165
except:
165166
continue
166167

168+
result_files.append(file)
167169
with open(file, "r") as f:
168170
data = json.load(f)
169171
for task_id, perfs in data["eval"].items():
170172
status = 1 if perfs[0]["status"] == "pass" else 0
171173
task_perf[task_id] = status
172174
model_results[info["name"]] = task_perf
173-
return model_results
175+
return model_results, result_files
174176

175177

176178
def get_winner_df(data_dict, task, task_level=True, no_tie=True):
@@ -313,8 +315,16 @@ def push_ds(ds, path, local=False):
313315

314316
model_info = update_model_info(model_info)
315317
results = get_results()
316-
complete_data = read_task_perf("complete")
317-
instruct_data = read_task_perf("instruct")
318+
files = []
319+
complete_data, complete_files = read_task_perf("complete")
320+
instruct_data, instruct_files = read_task_perf("instruct")
321+
files.extend(complete_files)
322+
files.extend(instruct_files)
323+
shutil.rmtree("eval_results", ignore_errors=True)
324+
os.makedirs("eval_results", exist_ok=True)
325+
for file in files:
326+
shutil.copy(file, "eval_results")
327+
318328
complete_solve_rate = get_solve_rate(complete_data, task="complete")
319329
instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
320330
solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})

0 commit comments

Comments (0)