update data util

Nghi Bui · Nghi Bui · commit e4faff971617 · 2023-07-24T15:28:14.000Z
diff --git a/codetf/data_utility/codexglue_dataset.py b/codetf/data_utility/codexglue_dataset.py
@@ -25,6 +25,7 @@ def load_codexglue_text_to_code_dataset(self, *args, **kwargs):
         dataset = load_dataset(dataset)
 
         train = dataset["train"]
+        train = train[:50]
         train_nl_tensors, _ = self.process_data(train["nl"])
         train_code_tensors, _ = self.process_data(train["code"])
         
diff --git a/codetf/data_utility/human_eval_dataset.py b/codetf/data_utility/human_eval_dataset.py
@@ -9,6 +9,12 @@ class HumanEvalDataset(BaseDataset):
     def __init__(self, tokenizer, max_length=512):
         super().__init__(tokenizer, max_length)
     
+    def get_reference(self, task):
+        """Return "\\n" + task["test"] + "\\n" + a `check(<task["entry_point"]>)` call line, as one string."""
+        test_func = task["test"]
+        entry_point = f"check({task['entry_point']})"
+        return "\n" + test_func + "\n" + entry_point
+
     def load(self):
         dataset = self.dataset_config["openai_humaneval"]
 
@@ -22,9 +28,10 @@ def load(self):
             # without strip, the model generates commented codes ...
             prompts.append(self.tokenizer.eos_token + dataset[task_index]["prompt"].strip())
 
-            unit_test = dataset[task_index]["test"]
-            unit_test = re.sub(r'METADATA = {[^}]*}', '', unit_test, flags=re.MULTILINE)
-            references.append(unit_test)
+            # unit_test = dataset[task_index]["test"]
+            # unit_test = re.sub(r'METADATA = {[^}]*}', '', unit_test, flags=re.MULTILINE)
+            reference = self.get_reference(dataset[task_index])
+            references.append(reference)
 
         prompt_token_ids, prompt_attention_masks = self.process_data(prompts, padding="max_length")
         
diff --git a/codetf/data_utility/stackexchange_instruction_dataset.py b/codetf/data_utility/stackexchange_instruction_dataset.py