diff --git a/natural_language_processing/text_generation/alpaca/run.py b/natural_language_processing/text_generation/alpaca/run.py
index afa3cbfe..b06ddc69 100644
--- a/natural_language_processing/text_generation/alpaca/run.py
+++ b/natural_language_processing/text_generation/alpaca/run.py
@@ -3,7 +3,7 @@
 from utils.benchmark import run_model
 
 
-def run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=False):
+def run_pytorch(model_path, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False, revision=None):
     from transformers import AutoModelForCausalLM, AutoTokenizer
 
     def run_single_pass(pytorch_runner, _dataset):
@@ -13,7 +13,7 @@ def run_single_pass(pytorch_runner, _dataset):
         response = decode(outputs[:, inputs.input_ids.shape[1]:])
         _dataset.submit_prediction(response)
 
-    model = AutoModelForCausalLM.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path, revision=revision)
     if use_torch_fp16:
         model = model.half()
     model.eval()
@@ -30,11 +30,14 @@ def run_single_pass(pytorch_runner, _dataset):
 
 def run_pytorch_fp32(model_path, num_runs, timeout, dataset_path, **kwargs):
-    return run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=False)
+    return run_pytorch(model_path, num_runs, timeout, dataset_path)
 
 
 def run_pytorch_fp16(model_path, num_runs, timeout, dataset_path, **kwargs):
     return run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=True)
 
 
+def run_pytorch_int8(model_name, batch_size, num_runs, timeout, dataset_path, revision, **kwargs):
+    return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, revision=revision)
+
 def run_pytorch_cuda(model_path, num_runs, timeout, dataset_path, **kwargs):
     from transformers import AutoModelForCausalLM, AutoTokenizer
diff --git a/natural_language_processing/text_generation/llama2/run.py b/natural_language_processing/text_generation/llama2/run.py
index 9a65a64b..514f922a 100644
--- a/natural_language_processing/text_generation/llama2/run.py
+++ b/natural_language_processing/text_generation/llama2/run.py
@@ -4,7 +4,7 @@
 from transformers import LlamaForCausalLM, AutoTokenizer
 
 
-def run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False):
+def run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False, revision=None):
     def run_single_pass(pytorch_runner, _dataset):
         input_tensor = tokenizer.encode(_dataset.get_input_string(), return_tensors="pt")
         input_tensor = torch.cat([input_tensor for _ in range(batch_size)], 0)
@@ -18,7 +18,7 @@ def run_single_pass(pytorch_runner, _dataset):
     np.random.seed(44)
     torch.manual_seed(44)
 
-    model = LlamaForCausalLM.from_pretrained(model_name, torchscript=True)
+    model = LlamaForCausalLM.from_pretrained(model_name, torchscript=True, revision=revision)
     model.eval()
     if use_torch_fp16:
         model = model.half()
@@ -42,6 +42,10 @@ def run_pytorch_fp32(model_name, batch_size, num_runs, timeout, dataset_path, **
 def run_pytorch_fp16(model_name, batch_size, num_runs, timeout, dataset_path, **kwargs):
     return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=True)
 
+def run_pytorch_int8(model_name, batch_size, num_runs, timeout, dataset_path, revision, **kwargs):
+    return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, revision=revision)
+
+
 def main():
     from utils.helpers import DefaultArgParser
     llama_variants = ["meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-13b-chat-hf"]
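
Note: a minimal sketch of what the new revision argument controls once it reaches from_pretrained(). The model id and revision strings below are illustrative assumptions, not values taken from this repository; revision pins which branch, tag, or commit of the Hugging Face Hub repository gets downloaded, and it is the argument the new run_pytorch_int8 entry points thread through to model loading.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative values only (assumptions, not taken from this repo): any Hub
# model id and any branch/tag/commit hash accepted by from_pretrained() works.
model_path = "gpt2"
revision = "main"

# `revision` selects the exact snapshot of the model repository to fetch;
# passing None (the default added in this diff) resolves to the default branch,
# preserving the previous loading behaviour for the fp32/fp16 paths.
model = AutoModelForCausalLM.from_pretrained(model_path, revision=revision)
tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
model.eval()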