 import sys
 from pathlib import Path
 sys.path.append(str(Path(".").absolute().parent))
-from transformers import RobertaTokenizer
+from transformers import AutoTokenizer
 from codetf.models.base_model import BaseModel
 from transformers import AutoModelForSeq2SeqLM, AutoConfig
 from codetf.common.registry import registry
 from accelerate import Accelerator
+import torch
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+from huggingface_hub import hf_hub_download
 
 @registry.register_model("codet5")
 class Seq2SeqModel(BaseModel):
@@ -22,7 +26,7 @@ def __init__(self, model, model_config, tokenizer):
 
     @classmethod
     def init_tokenizer(cls, model):
-        return RobertaTokenizer.from_pretrained(model)
+        return AutoTokenizer.from_pretrained(model)
 
 
     @classmethod
@@ -33,28 +37,62 @@ def load_model_from_config(model_class, model_config, load_in_8bit=False, load_i
         if load_in_8bit and load_in_4bit:
             raise ValueError("Only one of load_in_8bit or load_in_4bit can be True. Please choose one.")
 
+        # This "device" is only needed for the CodeT5+ case; it will be removed in the future
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
         if weight_sharding:
-            weights_location = hf_hub_download(checkpoint, "pytorch_model.bin")
+            try:
+                # Try the single-file pytorch_model.bin checkpoint first
+                weights_location = hf_hub_download(checkpoint, "pytorch_model.bin")
+            except Exception:
+                try:
+                    # If that fails, fall back to the index file of a sharded checkpoint
+                    weights_location = hf_hub_download(checkpoint, "pytorch_model.bin.index.json")
+                except Exception as e:
+                    # If both fail, raise an error
+                    raise Exception(f"Failed to download weights: {str(e)}")
             config = AutoConfig.from_pretrained(checkpoint)
             with init_empty_weights():
                 model = AutoModelForSeq2SeqLM.from_config(config)
 
             model.tie_weights()
             model = load_checkpoint_and_dispatch(
-                model, weights_location, device_map="auto", no_split_module_classes=["GPTJBlock"]
+                model, weights_location, device_map=model_config["device_map"],
+                no_split_module_classes=["GPTJBlock"]
             )
         else:
             if load_in_8bit:
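+                # low_cpu_mem_usage streams weight shards into place instead of first
+                # materialising a full copy of the model in CPU RAM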
-                model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,
-                                                              load_in_8bit=load_in_8bit,
-                                                              device_map="auto")
+                if model_config["device_map"]:
+                    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,
+                                                                  load_in_8bit=load_in_8bit,
+                                                                  low_cpu_mem_usage=True,
+                                                                  device_map="auto",
+                                                                  trust_remote_code=model_config["trust_remote_code"])
+                else:
+                    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,
+                                                                  load_in_8bit=load_in_8bit,
+                                                                  low_cpu_mem_usage=True,
+                                                                  trust_remote_code=model_config["trust_remote_code"])
             elif load_in_4bit:
-                model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,
-                                                              load_in_4bit=load_in_4bit,
-                                                              device_map="auto")
+                if model_config["device_map"]:
+                    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,
+                                                                  load_in_4bit=load_in_4bit,
+                                                                  low_cpu_mem_usage=True,
+                                                                  device_map="auto",
+                                                                  trust_remote_code=model_config["trust_remote_code"])
+                else:
+                    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,
+                                                                  load_in_4bit=load_in_4bit,
+                                                                  low_cpu_mem_usage=True,
+                                                                  trust_remote_code=model_config["trust_remote_code"])
             else:
-                model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,
-                                                              device_map="auto")
+                if model_config["device_map"]:
+                    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,
+                                                                  low_cpu_mem_usage=True,
+                                                                  device_map=model_config["device_map"],
+                                                                  trust_remote_code=model_config["trust_remote_code"])
+                else:
+                    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,
+                                                                  low_cpu_mem_usage=True,
+                                                                  trust_remote_code=model_config["trust_remote_code"]).to(device)
 
 
         tokenizer = model_class.init_tokenizer(model_config.get("tokenizer_url"))
@@ -66,22 +104,20 @@ def load_model_from_config(model_class, model_config, load_in_8bit=False, load_i
         )
 
 
-    def forward(self, sources):
-        encoding = self.tokenizer(sources, return_tensors='pt')
-        input_ids = encoding.input_ids.to(self.device)
-        attention_mask = encoding.attention_mask.to(self.device)
-        generated_ids = self.model.generate(input_ids, attention_mask=attention_mask,
-                                            max_length=self.max_prediction_length,
-                                            num_beams=self.beam_size)
+    def forward(self, sources, max_length=512, beam_size=5):
+        encoding = self.tokenizer(sources, return_tensors='pt').to(self.model.device)
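+        # Use the input ids as the decoder prompt, following the CodeT5+ generation recipe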
+        encoding['decoder_input_ids'] = encoding['input_ids'].clone()
+        generated_ids = self.model.generate(**encoding,
+                                            max_length=max_length,
+                                            num_beams=beam_size)
 
         predictions = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
         return predictions
 
 
-    def predict(self, sources):
+    def predict(self, sources, max_length=512, beam_size=5):
 
         input_for_net = [' '.join(source.strip().split()).replace('\n', ' ') for source in sources]
-        # if self.task in ["sum", "translate", "nl2code", "refine"]:
-        output = self.forward(input_for_net)
+        output = self.forward(input_for_net, max_length, beam_size)
 
         return output
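
For reference, a minimal usage sketch of the revised class. This is not part of the commit: the "huggingface_url" config key, the checkpoint id, and the import path are assumptions for illustration (only "tokenizer_url", "device_map", and "trust_remote_code" are visible in this diff), and it assumes quantization and weight sharding default to off.

# Hypothetical usage sketch -- not part of this commit.
from codetf.models.seq2seq_models import Seq2SeqModel  # assumed import path

model_config = {
    "huggingface_url": "Salesforce/codet5-base",  # assumed key name and checkpoint
    "tokenizer_url": "Salesforce/codet5-base",
    "device_map": None,   # falls through to the plain .to(device) branch
    "trust_remote_code": False,
}

model = Seq2SeqModel.load_model_from_config(model_config)
print(model.predict(["def add(a, b): return a + b"], max_length=64, beam_size=3))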