From 8f6435fa495cbc5ad76428a2a59f6d8a629d6049 Mon Sep 17 00:00:00 2001
From: Karol Kontny
Date: Fri, 1 Mar 2024 10:40:27 +0100
Subject: [PATCH 1/2] Support for int8 gptq models

---
 natural_language_processing/text_generation/llama2/run.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/natural_language_processing/text_generation/llama2/run.py b/natural_language_processing/text_generation/llama2/run.py
index 9a65a64b..df4814c1 100644
--- a/natural_language_processing/text_generation/llama2/run.py
+++ b/natural_language_processing/text_generation/llama2/run.py
@@ -4,7 +4,7 @@
 from transformers import LlamaForCausalLM, AutoTokenizer
 
 
-def run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False):
+def run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False, revision=None):
     def run_single_pass(pytorch_runner, _dataset):
         input_tensor = tokenizer.encode(_dataset.get_input_string(), return_tensors="pt")
         input_tensor = torch.cat([input_tensor for _ in range(batch_size)], 0)
@@ -18,7 +18,7 @@ def run_single_pass(pytorch_runner, _dataset):
     np.random.seed(44)
     torch.manual_seed(44)
 
-    model = LlamaForCausalLM.from_pretrained(model_name, torchscript=True)
+    model = LlamaForCausalLM.from_pretrained(model_name, torchscript=True, revision=revision)
     model.eval()
     if use_torch_fp16:
         model = model.half()
@@ -42,6 +42,10 @@ def run_pytorch_fp32(model_name, batch_size, num_runs, timeout, dataset_path, **
 def run_pytorch_fp16(model_name, batch_size, num_runs, timeout, dataset_path, **kwargs):
     return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=True)
 
+def run_pytorch_int8(model_name, batch_size, num_runs, timeout, dataset_path, revision, **kwargs):
+    return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=True, revision=revision)
+
+
 def main():
     from utils.helpers import DefaultArgParser
     llama_variants = ["meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-13b-chat-hf"]

From 9db9c3f993ce8533cecaa1168ff84b8ca83fe91e Mon Sep 17 00:00:00 2001
From: Karol Kontny
Date: Fri, 1 Mar 2024 11:50:17 +0100
Subject: [PATCH 2/2] Add alpaca int8

---
 .../text_generation/alpaca/run.py | 9 ++++++---
 .../text_generation/llama2/run.py | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/natural_language_processing/text_generation/alpaca/run.py b/natural_language_processing/text_generation/alpaca/run.py
index afa3cbfe..b06ddc69 100644
--- a/natural_language_processing/text_generation/alpaca/run.py
+++ b/natural_language_processing/text_generation/alpaca/run.py
@@ -3,7 +3,7 @@
 from utils.benchmark import run_model
 
 
-def run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=False):
+def run_pytorch(model_path, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False, revision=None):
     from transformers import AutoModelForCausalLM, AutoTokenizer
 
     def run_single_pass(pytorch_runner, _dataset):
@@ -13,7 +13,7 @@ def run_single_pass(pytorch_runner, _dataset):
         response = decode(outputs[:, inputs.input_ids.shape[1]:])
         _dataset.submit_prediction(response)
 
-    model = AutoModelForCausalLM.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path, revision=revision)
     if use_torch_fp16:
         model = model.half()
     model.eval()
@@ -30,11 +30,14 @@ def run_single_pass(pytorch_runner, _dataset):
 
 
 def run_pytorch_fp32(model_path, num_runs, timeout, dataset_path, **kwargs):
-    return run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=False)
+    return run_pytorch(model_path, num_runs, timeout, dataset_path)
 
 def run_pytorch_fp16(model_path, num_runs, timeout, dataset_path, **kwargs):
     return run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=True)
 
+def run_pytorch_int8(model_name, batch_size, num_runs, timeout, dataset_path, revision, **kwargs):
+    return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, revision=revision)
+
 
 def run_pytorch_cuda(model_path, num_runs, timeout, dataset_path, **kwargs):
     from transformers import AutoModelForCausalLM, AutoTokenizer
diff --git a/natural_language_processing/text_generation/llama2/run.py b/natural_language_processing/text_generation/llama2/run.py
index df4814c1..514f922a 100644
--- a/natural_language_processing/text_generation/llama2/run.py
+++ b/natural_language_processing/text_generation/llama2/run.py
@@ -43,7 +43,7 @@ def run_pytorch_fp16(model_name, batch_size, num_runs, timeout, dataset_path, **
     return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=True)
 
 def run_pytorch_int8(model_name, batch_size, num_runs, timeout, dataset_path, revision, **kwargs):
-    return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=True, revision=revision)
+    return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, revision=revision)
 
 
 def main():
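
Usage note (not part of the patches above): a minimal sketch of how the new run_pytorch_int8 entry point could be exercised once both patches apply. The revision name, dataset path, and run parameters below are hypothetical placeholders chosen for illustration, not values taken from the patches; the actual values depend on which Hub branch hosts the GPTQ int8 weights and on how main() wires them through DefaultArgParser.

    # Minimal sketch, assuming the repo root is on PYTHONPATH and a GPTQ int8
    # checkpoint is published under some Hub revision. All concrete values
    # below are hypothetical placeholders.
    from natural_language_processing.text_generation.llama2.run import run_pytorch_int8

    run_pytorch_int8(
        model_name="meta-llama/Llama-2-7b-chat-hf",  # one of the llama_variants in main()
        batch_size=1,
        num_runs=10,                            # hypothetical number of benchmark passes
        timeout=600,                            # hypothetical timeout in seconds
        dataset_path="data/alpaca_data.json",   # hypothetical dataset location
        revision="gptq-int8",                   # hypothetical Hub branch with quantized weights
    )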