Skip to content
This repository was archived by the owner on May 1, 2025. It is now read-only.

Commit e4faff9

Browse files
author
Nghi Bui
committed
update data util
1 parent d5db4bb commit e4faff9

File tree

3 files changed, with 11 additions and 3 deletions.

codetf/data_utility/codexglue_dataset.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def load_codexglue_text_to_code_dataset(self, *args, **kwargs):
2525
dataset = load_dataset(dataset)
2626

2727
train = dataset["train"]
28+
train = train[:50]
2829
train_nl_tensors, _ = self.process_data(train["nl"])
2930
train_code_tensors, _ = self.process_data(train["code"])
3031

codetf/data_utility/human_eval_dataset.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ class HumanEvalDataset(BaseDataset):
99
def __init__(self, tokenizer, max_length=512):
1010
super().__init__(tokenizer, max_length)
1111

12+
def get_reference(self, task):
13+
"""Builds the reference solution for the doc (sample from the test dataset)."""
14+
test_func = task["test"]
15+
entry_point = f"check({task['entry_point']})"
16+
return "\n" + test_func + "\n" + entry_point
17+
1218
def load(self):
1319
dataset = self.dataset_config["openai_humaneval"]
1420

@@ -22,9 +28,10 @@ def load(self):
2228
# without strip, the model generates commented codes ...
2329
prompts.append(self.tokenizer.eos_token + dataset[task_index]["prompt"].strip())
2430

25-
unit_test = dataset[task_index]["test"]
26-
unit_test = re.sub(r'METADATA = {[^}]*}', '', unit_test, flags=re.MULTILINE)
27-
references.append(unit_test)
31+
# unit_test = dataset[task_index]["test"]
32+
# unit_test = re.sub(r'METADATA = {[^}]*}', '', unit_test, flags=re.MULTILINE)
33+
reference = self.get_reference(dataset[task_index])
34+
references.append(reference)
2835

2936
prompt_token_ids, prompt_attention_masks = self.process_data(prompts, padding="max_length")
3037

codetf/data_utility/stackexchange_instruction_dataset.py

Whitespace-only changes.

0 commit comments

Comments (0)