Merge pull request #2670 from AI-Hypercomputer:mattdavidow-separate-gpu-context

Google-ML-Automation · Google-ML-Automation · commit ef64c73f22e7 · 2025-11-13T06:23:23.000-08:00
PiperOrigin-RevId: 831832203
diff --git a/src/MaxText/max_utils.py b/src/MaxText/max_utils.py
@@ -28,6 +28,7 @@
 from etils import epath
 import flax
 import jax
+from contextlib import contextmanager
 from jax.experimental import mesh_utils
 from jax.sharding import PartitionSpec as P
 import jax.numpy as jnp
@@ -989,3 +990,38 @@ def get_batch_seq_len_for_mode(config, model_mode):
     raise ValueError(f"Unknown model_mode: {model_mode}")
 
   return batch_size, seq_len
+
+@contextmanager
+def maybe_get_transformer_engine_context(config):
+  """Enters the TransformerEngine context on GPU hardware; otherwise a no-op context."""
+  if config.hardware in ['gpu', 'gpu_multiprocess']:
+    with transformer_engine_context():
+      yield
+  else:
+    with dummy_context_manager():
+      yield
+
+@contextmanager
+def dummy_context_manager():
+  """A context manager that does nothing."""
+  yield
+
+@contextmanager
+def transformer_engine_context():
+  """If TransformerEngine is available, this context manager will provide
+  the library with MaxText-specific details needed for correct operation."""
+  try:
+    from transformer_engine.jax.sharding import global_shard_guard, MeshResource  # pylint: disable=import-outside-toplevel
+    # Inform TransformerEngine of MaxText's physical mesh resources.
+    mesh_resource = MeshResource(  # pytype: disable=wrong-arg-types
+        dp_resource="data",
+        tp_resource="tensor",
+        # tpsp_resource = "tensor_sequence", #TODO(Phuong): add this back when upstreaming CGEMM
+        fsdp_resource="fsdp",
+        pp_resource=None,
+        cp_resource="context",
+    )
+    with global_shard_guard(mesh_resource):
+      yield
+  except (ImportError, AttributeError):
+    yield
diff --git a/src/MaxText/train.py b/src/MaxText/train.py
@@ -521,36 +521,18 @@ def initialize(argv: Sequence[str]) -> tuple[pyconfig.HyperParameters, Any, Any]
 
 def run(config, recorder, diagnostic_config):
   """Run the job given hyperparameters and utilities"""
-  with diagnostic.diagnose(diagnostic_config):
-    with maybe_record_goodput(recorder, GoodputEvent.JOB):
-      train_loop(config, recorder)
+  with (
+    diagnostic.diagnose(diagnostic_config),
+    maybe_record_goodput(recorder, GoodputEvent.JOB),
+    max_utils.maybe_get_transformer_engine_context(config)
+  ):
+    train_loop(config, recorder)
 
 
-@contextmanager
-def transformer_engine_context():
-  """If TransformerEngine is available, this context manager will provide
-  the library with MaxText-specific details needed for correcct operation."""
-  try:
-    from transformer_engine.jax.sharding import global_shard_guard, MeshResource  # pylint: disable=import-outside-toplevel
-    # Inform TransformerEngine of MaxText's physical mesh resources.
-    mesh_resource = MeshResource(  # pytype: disable=wrong-arg-types
-        dp_resource="data",
-        tp_resource="tensor",
-        # tpsp_resource = "tensor_sequence", #TODO(Phuong): add this back when upstreaming CGEMM
-        fsdp_resource="fsdp",
-        pp_resource=None,
-        cp_resource="context",
-    )
-    with global_shard_guard(mesh_resource):
-      yield
-  except (ImportError, AttributeError):
-    yield
-
 
 def main(argv: Sequence[str]) -> None:
-  with transformer_engine_context():
-    config, recorder, diagnostic_config = initialize(argv)
-    run(config, recorder, diagnostic_config)
+  config, recorder, diagnostic_config = initialize(argv)
+  run(config, recorder, diagnostic_config)
 
 
 if __name__ == "__main__":