
Commit 592a499

[Bug Fix] Add metadata as input for sharded_state_dict to follow latest Megatron code change (NVIDIA#606)
## What does this PR do?

**Type of change:** Bug fix

**Overview:** Add `metadata` as an input to `sharded_state_dict` to follow the latest Megatron Core [code change](NVIDIA/Megatron-LM@a2a1c89).

## Usage

```python
# Add a code snippet demonstrating how to use this
```

## Testing

Under the Megatron-Bridge repo, the command below runs smoothly:

```
pytest tests/functional_tests/quantization/test_quantization_workflow.py -v -s
```

Before the fix, the error looked like [this](https://github.com/NVIDIA-NeMo/Megatron-Bridge/actions/runs/19511909221/job/55906907649#step:3:7038).

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

## Additional Information

---------

Signed-off-by: James Shen <yueshen@nvidia.com>
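A minimal sketch of the changed call pattern: the Megatron plugins now forward `metadata` to the parent `sharded_state_dict` and fill in a `dp_cp_group` entry when the caller passes none. It assumes torch.distributed and Megatron-Core parallel state are already initialized (e.g. inside a Megatron-Bridge job); `model` is a placeholder for a converted Megatron module, not a name from this commit.

```python
# Minimal sketch, assuming Megatron-Core parallel state is already initialized
# inside a distributed job; `model` is a placeholder for a quantized or
# sparsified Megatron module handled by the plugins in this commit.
from megatron.core.parallel_state import get_data_parallel_group

# Callers may supply the data-parallel/context-parallel group explicitly ...
metadata = {"dp_cp_group": get_data_parallel_group(with_context_parallel=True)}
sharded_sd = model.sharded_state_dict(prefix="", sharded_offsets=(), metadata=metadata)

# ... or pass no metadata at all: the plugin calls ensure_metadata_has_dp_cp_group()
# internally before delegating to the parent class, so this also works.
sharded_sd = model.sharded_state_dict(prefix="")
```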
1 parent f06c3f9 commit 592a499

File tree

- modelopt/torch/quantization/plugins/megatron.py
- modelopt/torch/sparsity/weight_sparsity/plugins/megatron.py

2 files changed: +58 -3 lines changed


modelopt/torch/quantization/plugins/megatron.py

Lines changed: 29 additions & 2 deletions
```diff
@@ -230,6 +230,30 @@ def _register_extra_state_callbacks(model: torch.nn.Module):
 CUSTOM_MODEL_PLUGINS.add(megatron_replace_quant_module_hook)
 
 
+def ensure_metadata_has_dp_cp_group(metadata):
+    """Ensure `metadata` is a dict containing `dp_cp_group` entry.
+
+    If `metadata` is None, a new dict is returned with `dp_cp_group` set.
+    If `metadata` is a dict and missing `dp_cp_group`, it is updated in-place.
+
+    This function is adapted from megatron-lm's megatron.core.transformer.utils to avoid
+    dependency on megatron-lm's specific version.
+
+    Note:
+        This is a temporary method and will be removed once this function is merged to
+        megatron.core.transformer.utils in the main branch of megatron-lm.
+    """
+    if metadata is None:
+        metadata = {}
+    if "dp_cp_group" not in metadata:
+        try:
+            metadata["dp_cp_group"] = get_data_parallel_group(with_context_parallel=True)
+        except (AssertionError, RuntimeError):
+            # Fallback if context parallel is not initialized
+            metadata["dp_cp_group"] = get_data_parallel_group()
+    return metadata
+
+
 class _MegatronParallelLinear(_ParallelLinear):
     _functionals_to_replace = [
         (megatron_parallel, "linear_with_grad_accumulation_and_async_allreduce"),
@@ -285,6 +309,9 @@ def _parameter_to_keep_in_quantizer_state_dict(self, key):
         return False
 
     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
+        # Ensure metadata has dp_cp_group to avoid None subscript errors
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
+
         # [WAR]: although we disable output_layer quantization by default but it will
         # still be picked up by mtq.quantize since it is a ColumnParallelLinear. We need
         # to further ensure that its sharded state_dict has no scalars or amax since
@@ -294,7 +321,7 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         # state_dict mismatch.
         if prefix.endswith("output_layer."):
             # assert not any("_quantizer" in k for k in self.state_dict()), "quantized output_layer"
-            return super().sharded_state_dict(prefix, sharded_offsets)
+            return super().sharded_state_dict(prefix, sharded_offsets, metadata)
 
         quantizer_state_dict = {}
         for k, v in self.state_dict(prefix="", keep_vars=True).items():
@@ -310,7 +337,7 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
                 "Please use regular state_dict."
             )
         sharded_axis_dict = self._get_shard_axis_dict(quantizer_state_dict)
-        sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets)
+        sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
         sharded_state_dict.update(
             **make_sharded_tensors_for_checkpoint(
                 quantizer_state_dict, prefix, sharded_axis_dict, sharded_offsets
```
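A small self-contained sketch of the contract the new helper is meant to satisfy; `_FakeDpCpGroup` and the injected `group_factory` parameter are hypothetical stand-ins so the snippet runs without initializing torch.distributed, whereas the real helper obtains the group via `get_data_parallel_group` as in the diff above.

```python
# Self-contained sketch of the helper's contract. `_FakeDpCpGroup` is a
# hypothetical stand-in for a torch.distributed ProcessGroup so the snippet
# runs without Megatron-Core parallel state.
class _FakeDpCpGroup:
    pass


def ensure_metadata_has_dp_cp_group(metadata, group_factory=_FakeDpCpGroup):
    """Same contract as the diff's helper, with group creation injected."""
    if metadata is None:
        metadata = {}
    if "dp_cp_group" not in metadata:
        metadata["dp_cp_group"] = group_factory()
    return metadata


# None -> a new dict is returned with dp_cp_group set.
assert "dp_cp_group" in ensure_metadata_has_dp_cp_group(None)

# An existing dict is updated in-place; other entries are preserved.
existing = {"other_key": True}  # arbitrary pre-existing entry
ensure_metadata_has_dp_cp_group(existing)
assert "dp_cp_group" in existing and existing["other_key"] is True

# A caller-provided dp_cp_group is never overwritten.
provided = {"dp_cp_group": "caller-group"}
assert ensure_metadata_has_dp_cp_group(provided)["dp_cp_group"] == "caller-group"
```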

modelopt/torch/sparsity/weight_sparsity/plugins/megatron.py

Lines changed: 29 additions & 1 deletion
```diff
@@ -16,6 +16,7 @@
 """Support sparsify and save/resore for Megatron."""
 
 import megatron.core.transformer.mlp as megatron_mlp
+from megatron.core.parallel_state import get_data_parallel_group
 from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
 from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
 
@@ -25,12 +26,39 @@
 from ..module import SparseModule, SpDMRegistry
 
 
+def ensure_metadata_has_dp_cp_group(metadata):
+    """Ensure `metadata` is a dict containing `dp_cp_group` entry.
+
+    If `metadata` is None, a new dict is returned with `dp_cp_group` set.
+    If `metadata` is a dict and missing `dp_cp_group`, it is updated in-place.
+
+    This function is adapted from megatron-lm's megatron.core.transformer.utils to avoid
+    dependency on megatron-lm's specific version.
+
+    Note:
+        This is a temporary method and will be removed once this function is merged to
+        megatron.core.transformer.utils in the main branch of megatron-lm.
+    """
+    if metadata is None:
+        metadata = {}
+    if "dp_cp_group" not in metadata:
+        try:
+            metadata["dp_cp_group"] = get_data_parallel_group(with_context_parallel=True)
+        except (AssertionError, RuntimeError):
+            # Fallback if context parallel is not initialized
+            metadata["dp_cp_group"] = get_data_parallel_group()
+    return metadata
+
+
 class _MegatronParallelLinear(SparseModule):
     def _get_shard_axis_dict(self, state_dict):
         raise NotImplementedError
 
     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
-        sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets)
+        # Ensure metadata has dp_cp_group to avoid None subscript errors
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
+
+        sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
 
         sparse_state_dict = {
             k: v
```
