@@ -140,7 +140,7 @@ def main(argv):
   ]
   for prompt in prompts:
     slot = random.randint(0, _BATCH_SIZE.value - 1)
-    tokens, true_length = tokenizer.encode(prompt, is_bos=True)
+    tokens, true_length = tokenizer.encode(prompt)

     print(f"---- Input prompts are: {prompt}")
     print(f"---- Encoded tokens are: {tokens}")
@@ -157,12 +157,15 @@ def main(argv):
     while True:
       decode_state, result_tokens = engine.generate(params, decode_state)
       result_tokens = result_tokens.convert_to_numpy()
-      output, complete = tokenizer.decode(
-          slot, max_output_length, result_tokens, complete
-      )
-      if complete[0]:
+      res = result_tokens.get_result_at_slot(slot)
+      stop_tokens = set(tokenizer.tokenizer.stop_tokens)
+      stop_tokens.add(tokenizer.pad_id)
+      if (
+          res.tokens[0][0] in stop_tokens
+          or len(sampled_tokens_list) > max_output_length
+      ):
         break
-      token_id = output[0][0]
+      token_id = res.tokens[0][0]
       sampled_tokens_list.append(token_id)
       # output_str = tokenizer.decode_str([token_id])
       # print(Fore.GREEN + output_str, end="", flush=True)
@@ -173,7 +176,7 @@ def main(argv):
     print("---- All output tokens.")
     print(sampled_tokens_list)
     print("---- All output text.")
-    print(tokenizer.decode_str(sampled_tokens_list))
+    print(tokenizer.decode(sampled_tokens_list))

   if _PROFILING_OUTPUT.value:
     jax.profiler.stop_trace()
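
For context, a minimal standalone sketch of the new stopping logic introduced in the second hunk: the decode loop now ends when the latest sampled token is one of the tokenizer's stop tokens (or the pad token), or when the output length cap is exceeded, instead of relying on the tokenizer's `complete` flags. The concrete token ids and the fake token stream below are illustrative assumptions, not part of this commit.

```python
# Sketch of the stop-token-based early-stopping check (assumed values).
stop_tokens = {128001, 128009}  # stand-ins for tokenizer.tokenizer.stop_tokens
pad_id = 0                      # stand-in for tokenizer.pad_id
stop_tokens.add(pad_id)

max_output_length = 5
sampled_tokens_list = []
token_stream = iter([11, 22, 33, 128009, 44])  # fake per-step decode output

while True:
    token_id = next(token_stream)  # in the script: res.tokens[0][0]
    if token_id in stop_tokens or len(sampled_tokens_list) > max_output_length:
        break
    sampled_tokens_list.append(token_id)

print(sampled_tokens_list)  # -> [11, 22, 33]
```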