
Commit aa1b470

yanboliang authored and Artyom17 committed
Merge pull request #118 from yanboliang/cleanup
Clean up mixtral-moe
2 parents b262949 + a3e825f commit aa1b470


12 files changed: +119, -804 lines


README.md

Lines changed: 15 additions & 7 deletions
@@ -22,10 +22,10 @@ Please check the rest of this page about benchmark of LLaMA family models.
 ### Mixtral 8x7B
 We also supported [Mixtral 8x7B](https://mistral.ai/news/mixtral-of-experts/) which is a high-quality sparse mixture of experts (MoE) model, the average token generation rates are:
 
-|                  |   1 GPU |    2 GPU  | 4 GPU  |    8 GPU   |
+|                  |   1 GPU |    2 GPU  | 4 GPU  |    8 GPU   |
 |------------------|---------|-----------|--------|------------|
-|baseline(bfloat16)|    OOM  |    78.75  | 118.23 |   203.69   |
-|        int8      |   56.04 |    99.91  | 149.53 |   218.48   |
+|baseline(bfloat16)|    OOM  |    96.67  | 155.35 |   227.82   |
+|        int8      |   97.92 |   155.03  | 216.87 |   279.35   |
 
 Note that the benchmarks run on an 8xA100-80GB, power limited to 330W with a hybrid cube mesh topology. Note that all benchmarks are run at *batch size=1*, making the reported tokens/s numbers equivalent to "tokens/s/user". In addition, they are run with a very small prompt length (just 5 tokens).
 
@@ -59,6 +59,9 @@ meta-llama/Llama-2-13b-chat-hf
 meta-llama/Llama-2-70b-chat-hf
 codellama/CodeLlama-7b-Python-hf
 codellama/CodeLlama-34b-Python-hf
+mistralai/Mistral-7B-v0.1
+mistralai/Mistral-7B-Instruct-v0.1
+mistralai/Mistral-7B-Instruct-v0.2
 ```
 
 For example, to convert Llama-2-7b-chat-hf
@@ -120,6 +123,11 @@ python generate.py --compile --checkpoint_path checkpoints/$MODEL_REPO/model.pth
 To squeeze out a little bit more performance, you can also compile the prefill with `--compile_prefill`. This will increase compilation times though.
 
 ## Quantization
+Choose device to use by
+```bash
+# The current support devices: cuda, cpu
+export DEVICE=cuda
+```
 ### Int8 Weight-Only Quantization
 To generate this version of the model
 ```bash
@@ -128,19 +136,19 @@ python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode in
 ```
 To run with int8, just pass the int8 checkpoint to generate.py.
 ```bash
-python generate.py --compile --checkpoint_path checkpoints/$MODEL_REPO/model_int8.pth
+python generate.py --compile --checkpoint_path checkpoints/$MODEL_REPO/model_int8.pth --device $DEVICE
 ```
 
 ### Int4 Weight-Only Quantization
 To generate int4 version of model
 ```bash
-# Spits out model at checkpoints/$MODEL_REPO/model_int4.g32.pth
-python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4 --groupsize 32
+# Spits out model at checkpoints/$MODEL_REPO/model_int4.g32.$DEVICE.pth
+python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4 --groupsize 32 --device $DEVICE
 ```
 
 To run with int4, just pass the int4 checkpoint to generate.py.
 ```bash
-python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.pth --compile
+python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.$DEVICE.pth --compile --device $DEVICE
 ```
 
 ## Speculative Sampling
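
Taken together, these README changes make the quantization workflow device-aware: `quantize.py` and `generate.py` both take a `--device` flag, and the packed int4 checkpoint now carries the device name in its filename. A minimal end-to-end sketch of the resulting flow, assuming the checkpoint has already been converted; the model choice and the `--mode int8` spelling (inferred from the truncated hunk header) are illustrative, not copied from the diff:

```bash
# Sketch of the device-aware quantization flow described above.
export DEVICE=cuda                            # the README lists cuda and cpu as supported
export MODEL_REPO=mistralai/Mistral-7B-v0.1   # illustrative choice from the newly added model list

# int8: the README only adds --device to the generate step
python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int8
python generate.py --compile --checkpoint_path checkpoints/$MODEL_REPO/model_int8.pth --device $DEVICE

# int4: the packed checkpoint is written as model_int4.g32.$DEVICE.pth,
# so quantization and generation must agree on the device
python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4 --groupsize 32 --device $DEVICE
python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.$DEVICE.pth --compile --device $DEVICE
```

Because the int4 weight packing differs per device, generate.py checks the device name embedded in the filename against `--device` and aborts on a mismatch (see the generate.py diff below).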

generate.py

Lines changed: 14 additions & 9 deletions
@@ -16,7 +16,7 @@
 def device_sync(device):
     if "cuda" in device:
         torch.cuda.synchronize(device)
-    elif "cpu" in device:
+    elif ("cpu" in device) or ("mps" in device):
         pass
     else:
         print(f"device={device} is not yet suppported")
@@ -26,6 +26,7 @@ def device_sync(device):
 torch._inductor.config.triton.unique_kernel_names = True
 torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future
 
+default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 # support running without installing as a package
 wd = Path(__file__).parent.parent.resolve()
@@ -206,13 +207,14 @@ def generate(
     }
     return seq, generate_stats
 
-def encode_tokens(tokenizer, string, bos=True, device='cuda'):
+def encode_tokens(tokenizer, string, bos=True, device=default_device):
     tokens = tokenizer.encode(string)
     if bos:
         tokens = [tokenizer.bos_id()] + tokens
     return torch.tensor(tokens, dtype=torch.int, device=device)
 
 def _load_model(checkpoint_path, device, precision, use_tp):
+    use_cuda = 'cuda' in device
     with torch.device('meta'):
         model = Transformer.from_name(checkpoint_path.parent.name)
 
@@ -223,15 +225,18 @@ def _load_model(checkpoint_path, device, precision, use_tp):
         model = simple_quantizer.convert_for_runtime()
 
     if "int4" in str(checkpoint_path):
-        print("Using int4 quantization!")
+        print("Using int4 weight-only quantization!")
         path_comps = checkpoint_path.name.split(".")
-        assert path_comps[-2].startswith("g")
-        groupsize = int(path_comps[-2][1:])
+        assert path_comps[-3].startswith("g")
+        assert path_comps[-2] in device, "weight packed format mismatch, please rerun quantize.py!"
+        groupsize = int(path_comps[-3][1:])
         from quantize import WeightOnlyInt4QuantHandler
         simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
-        model = simple_quantizer.convert_for_runtime()
+        model = simple_quantizer.convert_for_runtime(use_cuda)
 
     checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
+    if "model" in checkpoint and "stories" in str(checkpoint_path):
+        checkpoint = checkpoint["model"]
     model.load_state_dict(checkpoint, assign=True)
 
     if use_tp:
@@ -257,7 +262,7 @@ def main(
     profile: Optional[Path] = None,
     draft_checkpoint_path: Optional[Path] = None,
     speculate_k: int = 5,
-    device='cuda',
+    device=default_device,
 ) -> None:
     """Generates text samples based on a pre-trained Transformer model and tokenizer.
     """
@@ -310,7 +315,7 @@ def main(
         decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)
 
     # Uncomment to squeeze more perf out of prefill
-    if args.compile_prefill:
+    if compile_prefill:
         prefill = torch.compile(prefill, fullgraph=True, dynamic=True)
 
 
@@ -412,7 +417,7 @@ def callback(x):
     parser.add_argument('--profile', type=Path, default=None, help='Profile path.')
     parser.add_argument('--speculate_k', type=int, default=5, help='Speculative execution depth.')
     parser.add_argument('--draft_checkpoint_path', type=Path, default=None, help='Draft checkpoint path.')
-    parser.add_argument('--device', type=str, default="cuda", help='device to use')
+    parser.add_argument('--device', type=str, default=default_device, help='Device to use')
 
     args = parser.parse_args()
     main(
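
The int4 branch of `_load_model` now derives both the groupsize and the expected device from the checkpoint filename (the `model_int4.g32.cuda.pth` pattern produced by the updated quantize.py) instead of just the groupsize. A small standalone sketch of that check; `parse_int4_name` is a hypothetical helper written here only to mirror the `path_comps` logic above:

```python
from pathlib import Path

def parse_int4_name(checkpoint_path: Path, device: str) -> int:
    """Hypothetical mirror of the path_comps checks in _load_model:
    expects names of the form model_int4.g{groupsize}.{device}.pth."""
    path_comps = checkpoint_path.name.split(".")
    # e.g. ["model_int4", "g32", "cuda", "pth"]
    assert path_comps[-3].startswith("g"), "missing groupsize component"
    assert path_comps[-2] in device, "weight packed format mismatch, please rerun quantize.py!"
    return int(path_comps[-3][1:])  # groupsize

# A checkpoint packed for cuda parses cleanly on a cuda run...
print(parse_int4_name(Path("checkpoints/model_int4.g32.cuda.pth"), device="cuda"))  # -> 32
# ...while the same file on a cpu run would trip the mismatch assertion.
```

This is why the README examples pass the same `$DEVICE` to both quantize.py and generate.py: a checkpoint packed for one device fails fast on another instead of loading an incompatible packed-weight layout.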

mixtral-moe/README.md

Lines changed: 3 additions & 4 deletions
@@ -12,11 +12,10 @@ python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$MODEL_REPO
 ## Benchmarks
 Benchmarks run on an 8xA100-80GB, power limited to 330W with a hybrid cube mesh topology. Note that all benchmarks are run at *batch size=1*, making the reported tokens/s numbers equivalent to "tokens/s/user". In addition, they are run with a very small prompt length (just 5 tokens).
 
-|                  |   1 GPU |    2 GPU  | 4 GPU  |    8 GPU   |
+|                  |   1 GPU |    2 GPU  | 4 GPU  |    8 GPU   |
 |------------------|---------|-----------|--------|------------|
-|baseline(bfloat16)|    OOM  |    78.75  | 118.23 |   203.69   |
-|        int8      |   56.04 |    99.91  | 149.53 |   218.48   |
-
+|baseline(bfloat16)|    OOM  |    96.67  | 155.35 |   227.82   |
+|        int8      |   97.92 |   155.03  | 216.87 |   279.35   |
 
 
 ## Generate Text
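
For a sense of how the refreshed numbers compare, a quick ratio check using only the figures in the table above (illustrative arithmetic, not part of the repo):

```python
# Tokens/s figures copied from the updated table above.
bf16 = {2: 96.67, 4: 155.35, 8: 227.82}             # baseline(bfloat16); 1 GPU is OOM
int8 = {1: 97.92, 2: 155.03, 4: 216.87, 8: 279.35}  # int8 weight-only

for gpus, baseline in bf16.items():
    print(f"{gpus} GPU: int8 runs at {int8[gpus] / baseline:.2f}x the bfloat16 rate")
# Roughly 1.60x, 1.40x, and 1.23x, and int8 additionally fits on a single GPU.
```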
