Skip to content

Commit 2880904

Browse files
authored
Fix ray conflict changes (#100)
* Fixed multiple host bugs * Removed shard_on_batch and ragged_mha * Lint fixes
1 parent 517d847 commit 2880904

File tree

5 files changed

+9
-13
lines changed

5 files changed

+9
-13
lines changed

jetstream_pt/engine.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -179,7 +179,7 @@ def _call_model_generate(
179179
)
180180
paramst, argst = torchjax.to_torch((weights, args))
181181
with self._lock:
182-
with torchjax.jax_mode:
182+
with torch_xla2.default_env():
183183
# The mode is needed so that tensors created inside of
184184
# the model (such as via torch.ones etc) also have the right type
185185
res = torch.func.functional_call(self.pt_model, paramst, argst)
@@ -210,7 +210,7 @@ def _call_model_prefill(self, weights, tokens, input_indexes):
210210

211211
paramst, argst = torchjax.to_torch((weights, args))
212212
with self._lock:
213-
with torchjax.jax_mode:
213+
with torch_xla2.default_env():
214214
res = torch.func.functional_call(self.pt_model, paramst, argst)[0]
215215
caches_res = [c.state() for c in caches]
216216
return torchjax.from_torch((res, caches_res))

jetstream_pt/ray_worker.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,6 @@ def __init__(
166166
bf16_enable=bf16_enable,
167167
sharding_config_path=sharding_config,
168168
)
169-
env = JetEngineEnvironment(env_data)
170169

171170
if model_name.startswith("llama"):
172171

@@ -353,7 +352,7 @@ def _call_model_generate(
353352
args = (tokens, input_pos, caches_obj, mask)
354353
paramst, argst = torchjax.to_torch((weights, args))
355354
with self._lock:
356-
with torchjax.jax_mode():
355+
with torch_xla2.default_env():
357356
res = torch.func.functional_call(self.pt_model, paramst, argst)
358357
updated_caches = [c.state() for c in caches_obj]
359358
scales = []
@@ -396,7 +395,7 @@ def _call_model_prefill(self, weights, tokens, input_indexes):
396395

397396
paramst, argst = torchjax.to_torch((weights, args))
398397
with self._lock:
399-
with torchjax.jax_mode:
398+
with torch_xla2.default_env():
400399
res = torch.func.functional_call(self.pt_model, paramst, argst)[0]
401400
caches_res = [c.state() for c in caches]
402401
return torchjax.from_torch((res, caches_res))

jetstream_pt/torchjax.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -14,17 +14,15 @@
1414
import torch_xla2
1515
import torch_xla2.interop
1616

17-
jax_mode = torch_xla2.default_env()
18-
1917
call_jax = torch_xla2.interop.call_jax
2018
call_torch = torch_xla2.interop.call_torch
2119

2220

2321
def to_torch(tensors):
2422
"""Wrap a jax Array into XLATensor."""
25-
return jax_mode.j2t_iso(tensors)
23+
return torch_xla2.default_env().j2t_iso(tensors)
2624

2725

2826
def from_torch(tensors):
2927
"""Unwrap a XLATensor into jax Array."""
30-
return jax_mode.t2j_iso(tensors)
28+
return torch_xla2.default_env().t2j_iso(tensors)

run_interactive_multiple_host.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -43,8 +43,6 @@ def create_engine():
4343
quantize_kv=FLAGS.quantize_kv_cache,
4444
max_cache_length=FLAGS.max_cache_length,
4545
sharding_config=FLAGS.sharding_config,
46-
shard_on_batch=FLAGS.shard_on_batch,
47-
ragged_mha=FLAGS.ragged_mha,
4846
)
4947

5048
print("Initialize engine", time.perf_counter() - start)
@@ -54,7 +52,7 @@ def create_engine():
5452
# pylint: disable-next=all
5553
def main(argv):
5654

57-
engine = create_engine_from_config_flags()
55+
engine = create_engine()
5856

5957
start = time.perf_counter()
6058
engine.load_params()
@@ -99,6 +97,7 @@ def main(argv):
9997
while True:
10098
# pylint: disable-next=all
10199
decode_state, result_tokens = engine.generate(None, decode_state)
100+
result_tokens = result_tokens.convert_to_numpy()
102101

103102
slot_data = result_tokens.get_result_at_slot(slot)
104103
slot_tokens = slot_data.tokens

tests/test_quantization.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -237,7 +237,7 @@ def test_blockwise_quantized_linear_sharding(self):
237237
)
238238
def f(layer, weights, args):
239239
paramst, argst = torchjax.to_torch((weights, args))
240-
with torchjax.jax_mode:
240+
with torch_xla2.default_env():
241241
res = torch.func.functional_call(layer, paramst, argst)
242242
return torchjax.from_torch(res)
243243

0 commit comments

Comments (0)