
Commit 137eb47: Support llama3 (#64)

* Support llama3
* Sync with main branch
* Fix CI
* Fix linting
* Fix pyink issues
* Fix run_offline script
* Fix pyink
* Fix after merging main
* Update jetstream version in install_everything.sh
* Fix unit tests
* Fix test

1 parent: 9606a1f

18 files changed: +101 additions, -67 deletions

benchmarks/run_offline.py

Lines changed: 3 additions & 4 deletions

@@ -21,7 +21,6 @@
 import jax
 import jax.numpy as jnp

-from jetstream.engine import token_utils
 from jetstream_pt import engine as je
 # pylint: disable-next=all
 from benchmarks import analyze_sharegpt

@@ -97,11 +96,11 @@ def create_engine():
 def run_prefill_time(engine, params, decode_state, seqlen):
   """Run prefill and measure time."""
   metadata = engine.get_tokenizer()
-  vocab = token_utils.load_vocab(metadata.path, metadata.extra_ids)
+  tokenizer = engine.build_tokenizer(metadata)

   text = "This is a beautiful day"
-  tokens, true_length = token_utils.tokenize_and_pad(
-      text, vocab, is_bos=True, prefill_lengths=[seqlen]
+  tokens, true_length = tokenizer.encode(
+      text, is_bos=True, prefill_lengths=[seqlen]
   )

   for _ in range(3):
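The replacement API folds tokenizer loading into the engine itself: get_tokenizer() returns tokenizer metadata and build_tokenizer() turns it into a concrete tokenizer object. A minimal sketch of the new flow, assuming `engine` is an already-constructed PyTorchEngine and the prefill bucket of 1024 is illustrative:

# Sketch of the new tokenizer flow; `engine` is assumed to be a
# PyTorchEngine built elsewhere (see create_pytorch_engine below).
metadata = engine.get_tokenizer()             # tokenizer_pb2.TokenizerParameters
tokenizer = engine.build_tokenizer(metadata)  # replaces token_utils.load_vocab

# encode() pads the prompt to a bucket from prefill_lengths and also
# returns the unpadded ("true") length needed by the prefill call.
tokens, true_length = tokenizer.encode(
    "This is a beautiful day", is_bos=True, prefill_lengths=[1024]
)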

install_everything.sh

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@
 # limitations under the License.

 TORCHXLA_TAG=jetstream-pytorch
-JETSTREAM_TAG=v0.2.0
+JETSTREAM_TAG=v0.2.1

 # Uninstall existing jax
 pip3 show jax && pip3 uninstall -y jax

jetstream_pt/engine.py

Lines changed: 17 additions & 8 deletions

@@ -26,14 +26,14 @@
 import torch
 import numpy as np

-from jetstream.engine import engine_api, tokenizer_pb2, token_utils
+from jetstream.engine import engine_api, tokenizer_api, tokenizer_pb2, token_utils
 import torch_xla2
 from torch.utils import _pytree as pytree

 from jetstream_pt import cache_manager
 from jetstream_pt import quantize
 from jetstream_pt.environment import JetEngineEnvironment, JetEngineEnvironmentData
-from jetstream_pt.third_party.llama2 import model_exportable, model_args
+from jetstream_pt.third_party.llama import model_exportable, model_args


 Mesh = jax.sharding.Mesh

@@ -526,6 +526,14 @@ def get_tokenizer(self) -> tokenizer_pb2.TokenizerParameters:
     # pylint: disable-next=all
     return tokenizer_pb2.TokenizerParameters(path=self.env.tokenizer_path)

+  def build_tokenizer(
+      self, metadata: tokenizer_pb2.TokenizerParameters  # pylint: disable=all
+  ) -> tokenizer_api.Tokenizer:
+    if "llama-3" in self.env.model_type:
+      return token_utils.TikToken(metadata)
+
+    return token_utils.SentencePieceTokenizer(metadata)
+
   def join_prefixes(
       self,
       prefix1: engine_api.Prefix,

@@ -652,13 +660,18 @@ def create_pytorch_engine(
     context_length: int = 1024,
     batch_size: int = 1,
     max_decode_length: int = 4096,
-    model_name="llama",
+    model_name="llama-2",
     quantize_weights=False,
     quantize_kv=False,
     max_cache_length=1024,
 ) -> PyTorchEngine:
   """Returns: The pytorch engine."""

+  supported_models = ["llama-2", "llama-3"]
+  if model_name not in supported_models:
+    raise NotImplementedError(
+        f"Model name should be one of{','.join(supported_models)}"
+    )
   # See issue b/309529778 if it's turned on.
   jax.config.update("jax_dynamic_shapes", False)
   # Pytorch exports has int64 constants.

@@ -696,11 +709,7 @@ def create_pytorch_engine(
   if model_name.startswith("llama"):

     args = model_args.get_model_args(
-        param_size,
-        context_length,
-        batch_size,
-        tokenizer.vocab_size,
-        bf16_enable,
+        model_name + "-" + param_size, context_length, batch_size, bf16_enable
     )
     args.device = "meta"
     args.quantize = quantize_weights
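The new build_tokenizer dispatches on the engine's model type: llama-3 checkpoints ship a tiktoken BPE vocabulary, while llama-2 keeps its SentencePiece model. A standalone sketch of the same dispatch, with pick_tokenizer as a hypothetical helper outside the engine class and the path as a placeholder:

from jetstream.engine import token_utils, tokenizer_pb2

def pick_tokenizer(model_type: str, tokenizer_path: str):
  # Hypothetical helper mirroring PyTorchEngine.build_tokenizer:
  # llama-3 uses a tiktoken BPE vocabulary, everything else SentencePiece.
  metadata = tokenizer_pb2.TokenizerParameters(path=tokenizer_path)
  if "llama-3" in model_type:
    return token_utils.TikToken(metadata)
  return token_utils.SentencePieceTokenizer(metadata)

tok = pick_tokenizer("llama-3", "/path/to/tokenizer.model")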

jetstream_pt/ray_worker.py

Lines changed: 5 additions & 7 deletions

@@ -32,9 +32,9 @@
 from torch.utils import _pytree as pytree
 import torch_xla2

-from jetstream.engine import engine_api, tokenizer_pb2, token_utils
+from jetstream.engine import engine_api, tokenizer_pb2

-from jetstream_pt.third_party.llama2 import model_exportable, model_args
+from jetstream_pt.third_party.llama import model_exportable, model_args

 from jetstream_pt import cache_manager
 from jetstream_pt import quantize

@@ -99,7 +99,7 @@ def __init__(
     context_length: int = 1024,
     batch_size: int = 1,
     max_decode_length: int = 4096,
-    model_name="llama",
+    model_name="llama-2",
     quantize_weights=False,
     quantize_kv=False,
     max_cache_length=1024,

@@ -159,14 +159,12 @@ def __init__(
     )
     env = JetEngineEnvironment(env_data)

-    tokenizer = token_utils.load_vocab(tokenizer_path)
     pt_model = None
-    if model_name == "llama":
+    if "llama" in model_name:
       args = model_args.get_model_args(
-          param_size,
+          model_name + "-" + param_size,
           context_length,
           batch_size,
-          tokenizer.vocab_size,
           bf16_enable,
       )
       args.device = "meta"
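Both create_pytorch_engine and the Ray worker now derive the model-args key by joining model_name and param_size, so a single string selects the configuration block in model_args.py. A small illustration, with the concrete values as assumptions:

model_name = "llama-3"  # validated against ["llama-2", "llama-3"]
param_size = "8b"       # e.g. "tiny", "7b", "13b", "70b", "8b"

# This is the key get_model_args matches against, e.g. "llama-3-8b".
model_key = model_name + "-" + param_size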
File renamed without changes.

jetstream_pt/third_party/llama2/generation_original.py renamed to jetstream_pt/third_party/llama/generation_original.py

Lines changed: 2 additions & 2 deletions

@@ -5,9 +5,9 @@
 from typing import List, Literal, Optional, Tuple, TypedDict

 import torch
-from jetstream_pt.third_party.llama2 import model_original
+from jetstream_pt.third_party.llama import model_original
 from flax import struct
-from jetstream_pt.third_party.llama2.tokenizer import Tokenizer
+from jetstream_pt.third_party.llama.tokenizer import Tokenizer

 Role = Literal["system", "user", "assistant"]

jetstream_pt/third_party/llama2/model_args.py renamed to jetstream_pt/third_party/llama/model_args.py

Lines changed: 25 additions & 12 deletions

@@ -34,68 +34,81 @@ class ModelArgs:
   device = "cpu"
   quantize = False

+  rope_theta: float = 10000.0
+

 def get_arg(
-    param_size: str,
+    model_name: str,
     seqlen,
     batch_size,
-    vocab_size: int,
     bf16_enable: bool = False,
 ) -> ModelArgs:
   """Gets model args."""

   data = {}
-  if param_size == "tiny":
+  if model_name == "llama-2-tiny":
     data = {
         "dim": 128,
+        "vocab_size": 32000,
         "multiple_of": 32,
         "n_heads": 8,
         "n_layers": 3,
         "norm_eps": 1e-05,
     }
-  elif param_size == "7b":
+  elif model_name == "llama-2-7b":
     data = {
         "dim": 4096,
+        "vocab_size": 32000,
         "multiple_of": 256,
         "n_heads": 32,
         "n_layers": 32,
         "norm_eps": 1e-05,
     }
-  elif param_size == "13b":
+  elif model_name == "llama-2-13b":
     data = {
         "dim": 5120,
+        "vocab_size": 32000,
         "multiple_of": 256,
         "n_heads": 40,
         "n_layers": 40,
         "norm_eps": 1e-05,
     }
-  elif param_size == "70b":
+  elif model_name == "llama-2-70b":
     data = {
         "dim": 8192,
+        "vocab_size": 32000,
         "multiple_of": 4096,
         "ffn_dim_multiplier": 1.3,
         "n_heads": 64,
         "n_kv_heads": 8,
         "n_layers": 80,
         "norm_eps": 1e-05,
     }
+  elif model_name == "llama-3-8b":
+    data = {
+        "dim": 4096,
+        "vocab_size": 128256,
+        "multiple_of": 1024,
+        "ffn_dim_multiplier": 1.3,
+        "n_layers": 32,
+        "n_heads": 32,
+        "n_kv_heads": 8,
+        "norm_eps": 1e-05,
+        "rope_theta": 500000.0,
+    }
   return ModelArgs(
       max_seq_len=seqlen,
       max_batch_size=batch_size,
-      vocab_size=vocab_size,
       bf16_enable=bf16_enable,
       **data,
   )


-def get_model_args(
-    param_size, context_length, batch_size, vocab_size, bf16_enable
-):
+def get_model_args(model_name, context_length, batch_size, bf16_enable):
   model_args = get_arg(
-      param_size=param_size,
+      model_name=model_name,
       seqlen=context_length,
       batch_size=batch_size,
-      vocab_size=vocab_size,
       bf16_enable=bf16_enable,
   )
   model_args.n_kv_heads = (
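Each per-model dict is splatted into the ModelArgs constructor, so keys like vocab_size and rope_theta override the dataclass defaults: rope_theta stays 10000.0 for the llama-2 configs and becomes 500000.0 for llama-3-8b. A minimal sketch of the pattern, assuming a trimmed-down stand-in for the real ModelArgs:

from dataclasses import dataclass

@dataclass
class ModelArgs:  # trimmed-down stand-in for the real dataclass
  dim: int = 4096
  vocab_size: int = -1
  rope_theta: float = 10000.0

data = {"dim": 4096, "vocab_size": 128256, "rope_theta": 500000.0}
args = ModelArgs(**data)  # dict entries override the dataclass defaults
print(args.rope_theta)    # 500000.0 for llama-3-8b

args2 = ModelArgs(**{"dim": 128, "vocab_size": 32000})
print(args2.rope_theta)   # 10000.0: llama-2 configs keep the default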

jetstream_pt/third_party/llama2/model_exportable.py renamed to jetstream_pt/third_party/llama/model_exportable.py

Lines changed: 3 additions & 1 deletion

@@ -157,7 +157,9 @@ def __init__(
     )
     # TODO what to do with this
     freqs_cis = precompute_freqs_cis(
-        self.params.dim // self.params.n_heads, self.params.max_seq_len * 2
+        self.params.dim // self.params.n_heads,
+        self.params.max_seq_len * 2,
+        theta=self.params.rope_theta,
     )

     self.register_buffer("freqs_cis", freqs_cis)
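The theta argument is the RoPE base frequency, which llama-3 raises from 10000 to 500000 to accommodate longer contexts. In the reference llama code, precompute_freqs_cis looks roughly like the following; this is a sketch for orientation, not necessarily the exact body in this repo:

import torch

def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
  """Precompute rotary embedding frequencies as complex numbers.

  A larger theta slows the rotation of the high-index dimensions, which
  is how llama-3 (theta=500000.0) stretches the usable context length.
  """
  # Per-dimension base frequencies: theta^(-2i/dim) for even i.
  freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: dim // 2].float() / dim))
  t = torch.arange(end, device=freqs.device)          # positions 0..end-1
  freqs = torch.outer(t, freqs).float()               # (end, dim // 2) angles
  return torch.polar(torch.ones_like(freqs), freqs)   # e^{i * angle}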
