From 8f6435fa495cbc5ad76428a2a59f6d8a629d6049 Mon Sep 17 00:00:00 2001
From: Karol Kontny
Date: Fri, 1 Mar 2024 10:40:27 +0100
Subject: [PATCH 1/2] Support for int8 gptq models

---
 natural_language_processing/text_generation/llama2/run.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/natural_language_processing/text_generation/llama2/run.py b/natural_language_processing/text_generation/llama2/run.py
index 9a65a64b..df4814c1 100644
--- a/natural_language_processing/text_generation/llama2/run.py
+++ b/natural_language_processing/text_generation/llama2/run.py
@@ -4,7 +4,7 @@
 from transformers import LlamaForCausalLM, AutoTokenizer
 
 
-def run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False):
+def run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False, revision=None):
     def run_single_pass(pytorch_runner, _dataset):
         input_tensor = tokenizer.encode(_dataset.get_input_string(), return_tensors="pt")
         input_tensor = torch.cat([input_tensor for _ in range(batch_size)], 0)
@@ -18,7 +18,7 @@ def run_single_pass(pytorch_runner, _dataset):
     np.random.seed(44)
     torch.manual_seed(44)
 
-    model = LlamaForCausalLM.from_pretrained(model_name, torchscript=True)
+    model = LlamaForCausalLM.from_pretrained(model_name, torchscript=True, revision=revision)
     model.eval()
     if use_torch_fp16:
         model = model.half()
@@ -42,6 +42,10 @@ def run_pytorch_fp32(model_name, batch_size, num_runs, timeout, dataset_path, **
 def run_pytorch_fp16(model_name, batch_size, num_runs, timeout, dataset_path, **kwargs):
     return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=True)
 
+def run_pytorch_int8(model_name, batch_size, num_runs, timeout, dataset_path, revision, **kwargs):
+    return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=True, revision=revision)
+
+
 def main():
     from utils.helpers import DefaultArgParser
     llama_variants = ["meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-13b-chat-hf"]

From 9db9c3f993ce8533cecaa1168ff84b8ca83fe91e Mon Sep 17 00:00:00 2001
From: Karol Kontny
Date: Fri, 1 Mar 2024 11:50:17 +0100
Subject: [PATCH 2/2] Add alpaca int8

---
 .../text_generation/alpaca/run.py | 9 ++++++---
 .../text_generation/llama2/run.py | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/natural_language_processing/text_generation/alpaca/run.py b/natural_language_processing/text_generation/alpaca/run.py
index afa3cbfe..b06ddc69 100644
--- a/natural_language_processing/text_generation/alpaca/run.py
+++ b/natural_language_processing/text_generation/alpaca/run.py
@@ -3,7 +3,7 @@
 from utils.benchmark import run_model
 
 
-def run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=False):
+def run_pytorch(model_path, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False, revision=None):
     from transformers import AutoModelForCausalLM, AutoTokenizer
 
     def run_single_pass(pytorch_runner, _dataset):
@@ -13,7 +13,7 @@ def run_single_pass(pytorch_runner, _dataset):
         response = decode(outputs[:, inputs.input_ids.shape[1]:])
         _dataset.submit_prediction(response)
 
-    model = AutoModelForCausalLM.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path, revision=revision)
     if use_torch_fp16:
         model = model.half()
     model.eval()
@@ -30,11 +30,14 @@ def run_single_pass(pytorch_runner, _dataset):
 
 
 def run_pytorch_fp32(model_path, num_runs, timeout, dataset_path, **kwargs):
-    return run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=False)
+    return run_pytorch(model_path, num_runs, timeout, dataset_path)
 
 def run_pytorch_fp16(model_path, num_runs, timeout, dataset_path, **kwargs):
     return run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=True)
 
+def run_pytorch_int8(model_name, batch_size, num_runs, timeout, dataset_path, revision, **kwargs):
+    return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, revision=revision)
+
 
 def run_pytorch_cuda(model_path, num_runs, timeout, dataset_path, **kwargs):
     from transformers import AutoModelForCausalLM, AutoTokenizer
diff --git a/natural_language_processing/text_generation/llama2/run.py b/natural_language_processing/text_generation/llama2/run.py
index df4814c1..514f922a 100644
--- a/natural_language_processing/text_generation/llama2/run.py
+++ b/natural_language_processing/text_generation/llama2/run.py
@@ -43,7 +43,7 @@ def run_pytorch_fp16(model_name, batch_size, num_runs, timeout, dataset_path, **
     return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=True)
 
 def run_pytorch_int8(model_name, batch_size, num_runs, timeout, dataset_path, revision, **kwargs):
-    return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=True, revision=revision)
+    return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, revision=revision)
 
 
 def main():
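
Usage note (not part of the patches above): a minimal sketch of how the new run_pytorch_int8 entry point could be exercised once both patches apply. The revision name, dataset path, and run parameters below are hypothetical placeholders chosen for illustration, not values taken from the patches; the actual values depend on which Hub branch hosts the GPTQ int8 weights and on how main() wires them through DefaultArgParser.

    # Minimal sketch, assuming the repo root is on PYTHONPATH and a GPTQ int8
    # checkpoint is published under some Hub revision. All concrete values
    # below are hypothetical placeholders.
    from natural_language_processing.text_generation.llama2.run import run_pytorch_int8

    run_pytorch_int8(
        model_name="meta-llama/Llama-2-7b-chat-hf",  # one of the llama_variants in main()
        batch_size=1,
        num_runs=10,                            # hypothetical number of benchmark passes
        timeout=600,                            # hypothetical timeout in seconds
        dataset_path="data/alpaca_data.json",   # hypothetical dataset location
        revision="gptq-int8",                   # hypothetical Hub branch with quantized weights
    )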