add evaluate_guards option to DynamicShapesConfig

laithsakka · laithsakka · commit d670ef324c66 · 2025-10-23T11:32:53.000-07:00
Signed-off-by: Laith Sakka &lt;lsakka@meta.com&gt;
diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py
@@ -8,6 +8,7 @@
 from torch.torch_version import TorchVersion
 
 from vllm import LLM, SamplingParams
+from vllm.config import set_current_vllm_config
 from vllm.config.compilation import DynamicShapesType
 
 
@@ -35,9 +36,10 @@ def get_test_models():
 
 
 @pytest.mark.parametrize("model_name", get_test_models())
-def test_dynamic_shapes_compilation(monkeypatch, model_name):
+@pytest.mark.parametrize("evaluate_guards", [False, True])
+def test_dynamic_shapes_compilation(monkeypatch, model_name, evaluate_guards):
     """Test that all dynamic shapes types produce compiles"""
-    print(f"\nTesting model: {model_name}")
+    print(f"\nTesting model: {model_name} with evaluate_guards={evaluate_guards}")
 
     monkeypatch.setenv("TOKENIZERS_PARALLELISM", "true")
     # Note USE_AOT_COMPILE fails https://github.com/vllm-project/vllm/issues/27040.
@@ -76,7 +78,10 @@ def test_dynamic_shapes_compilation(monkeypatch, model_name):
         DynamicShapesType.UNBACKED,
         DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
     ]:
-        print(f"Testing {shapes_type.name} dynamic shapes...")
+        print(
+            f"Testing {shapes_type.name} dynamic shapes with "
+            f"evaluate_guards={evaluate_guards}..."
+        )
 
         # Initialize the model with specific dynamic shapes configuration
         model = LLM(
@@ -85,7 +90,7 @@ def test_dynamic_shapes_compilation(monkeypatch, model_name):
                 "level": 3,  # PIECEWISE compilation
                 "dynamic_shapes_config": {
                     "dynamic_shapes_type": shapes_type.value,
-                    "eval_dynamo_ds_guards": False,
+                    "evaluate_guards": evaluate_guards,
                 },
             },
             # gpu_memory_utilization=0.2,
@@ -110,36 +115,136 @@ def test_dynamic_shapes_compilation(monkeypatch, model_name):
         print(f"{shape_type}: '{result}'")
 
 
-if __name__ == "__main__":
-    """Run the test directly as a Python script"""
-    import os
-
-    print("Running dynamic shapes compilation test...")
-
-    # Get test models based on PyTorch version
-    test_models = get_test_models()
-    print(f"Testing {len(test_models)} models: {test_models}")
-
-    # Create a mock monkeypatch object for environment variables
-    class MockMonkeypatch:
-        def setenv(self, key, value):
-            os.environ[key] = value
-
-    monkeypatch = MockMonkeypatch()
+@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
+@pytest.mark.parametrize(
+    "dynamic_shapes_type",
+    [
+        DynamicShapesType.BACKED,
+        DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
+    ],
+)
+@pytest.mark.parametrize("evaluate_guards", [False, True])
+def test_model_specialization_with_evaluate_guards(
+    monkeypatch, use_aot_compile, dynamic_shapes_type, evaluate_guards
+):
+    """Test that evaluate_guards correctly detects shape specialization violations."""
+    from contextlib import contextmanager
+
+    from vllm.compilation.decorators import support_torch_compile
+    from vllm.config import CompilationConfig, VllmConfig
+    from vllm.config.compilation import DynamicShapesConfig
+    from vllm.forward_context import set_forward_context
+
+    @support_torch_compile
+    class ModelWithSizeCheck(torch.nn.Module):
+        def __init__(self, **kwargs):
+            super().__init__()
+            self.linear = torch.nn.Linear(10, 10)
+
+        def forward(self, x: torch.Tensor):
+            x = self.linear(x)
+            # This will cause specialization - torch.compile will guard on x.shape[0]
+            if x.shape[0] >= 10:
+                return x
+            else:
+                return x
+
+    @contextmanager
+    def use_vllm_config(vllm_config: VllmConfig):
+        with set_forward_context({}, vllm_config), set_current_vllm_config(vllm_config):
+            yield
 
-    # Run test for each model
-    for model_name in test_models:
-        try:
-            print(f"\n{'=' * 60}")
-            print(f"Testing model: {model_name}")
-            print(f"{'=' * 60}")
+    monkeypatch.setenv("TOKENIZERS_PARALLELISM", "true")
+    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
 
-            test_dynamic_shapes_compilation(monkeypatch, model_name)
+    # Reset torch dynamo to clear any cached compilation state
+    torch._dynamo.reset()
 
-            print(f"✅ Test passed for {model_name}")
+    config_desc = (
+        f"AOT={use_aot_compile}, shapes={dynamic_shapes_type.name}, "
+        f"eval_guards={evaluate_guards}"
+    )
+    print(f"\n{'=' * 60}")
+    print(f"Testing: {config_desc}")
+    print(f"{'=' * 60}")
+
+    # Create vllm config with the desired settings
+    from vllm.config import CompilationMode
+
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            dynamic_shapes_config=DynamicShapesConfig(
+                dynamic_shapes_type=dynamic_shapes_type,
+                evaluate_guards=evaluate_guards,
+            ),
+        )
+    )
 
-        except Exception as e:
-            print(f"❌ Test failed for {model_name}: {e}")
-            raise
+    assert (
+        vllm_config.compilation_config.dynamic_shapes_config.evaluate_guards
+        == evaluate_guards
+    )
+    with torch.no_grad(), use_vllm_config(vllm_config):
+        model = ModelWithSizeCheck(vllm_config=vllm_config).cuda()
+
+        # First call with size 20 - should always work
+        input_20 = torch.randn(20, 10).cuda()
+        model(input_20)
+
+        # Second call with different size (5) - behavior depends on evaluate_guards
+        input_5 = torch.randn(5, 10).cuda()
+
+        # Allow recompiles for evaluate_guards=False case
+        # Only when evaluate_guards=True do we want to detect guard violations
+        if evaluate_guards:
+            # With evaluate_guards=True, this should fail because guards
+            # were added. The first call (size 20) guarded on
+            # x.shape[0] >= 10, so size 5 violates the guard
+            try:
+                model(input_5)
+                # If we get here, no guard violation occurred
+                # This is a TEST FAILURE - evaluate_guards should have caused a failure
+                pytest.fail(
+                    f"{config_desc}: Expected guard violation did "
+                    f"not occur! evaluate_guards=True should fail "
+                    f"when shape changes from 20 to 5, but the "
+                    f"model ran successfully without error."
+                )
+            except Exception as e:
+                # Expected failure - guard was violated
+                error_msg = str(e)
+                if "guard" in error_msg.lower() or "recompile" in error_msg.lower():
+                    print(f"✅ {config_desc}: Expected failure due to guard violation")
+                    print(f"   Error (truncated): {error_msg[:150]}")
+                else:
+                    # Unexpected error type
+                    print(f"❌ {config_desc}: Unexpected error type")
+                    print(f"   Error: {e}")
+                    raise
+        else:
+            # With evaluate_guards=False, guards are dropped, so this should work
+            # However, recompilation may still occur, which is expected
+            try:
+                output_5 = model(input_5)
+                assert output_5.shape == (
+                    5,
+                    10,
+                ), "Output shape should match input"
+                print(f"✅ {config_desc}: Passed without guard violations")
+                print("   Second call (size 5): Success")
+            except RuntimeError as e:
+                # If it's a recompile error, that's expected when evaluate_guards=False
+                # The model is allowed to recompile with different shapes
+                if (
+                    "recompile" in str(e).lower()
+                    and "fail_on_recompile" in str(e).lower()
+                ):
+                    print(f"✅ {config_desc}: Recompile occurred (expected behavior)")
+                    print("   Recompiles are allowed when evaluate_guards=False")
+                else:
+                    print(f"❌ {config_desc}: Unexpected failure")
+                    print(f"   Error: {e}")
+                    raise
 
-    print("\n🎉 All tests completed successfully!")
+    cleanup_gpu_memory()
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
@@ -14,6 +14,7 @@
 import torch
 import torch.fx as fx
 from torch._dispatch.python import enable_python_dispatcher
+from torch.utils._sympy.value_ranges import ValueRanges
 
 import vllm.envs as envs
 from vllm.compilation.inductor_pass import pass_context
@@ -22,6 +23,7 @@
     resolve_defined_ops,
 )
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
+from vllm.config.compilation import DynamicShapesType
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import resolve_obj_by_qualname
@@ -659,6 +661,27 @@ def __call__(
             self.split_gm, submod_names_to_compile, self.vllm_config, self
         ).run(*example_inputs)
 
+        from torch._guards import detect_fake_mode
+
+        fake_mode = detect_fake_mode()
+
+        if (
+            self.compilation_config.dynamic_shapes_config.evaluate_guards
+            and self.compilation_config.dynamic_shapes_config.dynamic_shapes_type
+            == DynamicShapesType.BACKED
+        ):
+            # Relax the implicit 0/1-specialization guards. For backed dynamic
+            # shapes, torch.compile either specializes on 0/1 inputs or guards
+            # that the shape is >= 2, because it is really hard not to hit a
+            # check against 0/1. When shape guards are evaluated, those guards
+            # must be excluded (we would always fail otherwise).
+
+            # This is done by widening the range of every backed size whose
+            # lower bound is 2 so that it starts at 0 instead.
+            for s, r in fake_mode.shape_env.var_to_range.items():
+                if r.lower == 2:
+                    fake_mode.shape_env.var_to_range[s] = ValueRanges(0, r.upper)
+
         graph_path = os.path.join(local_cache_dir, "computation_graph.py")
         if not os.path.exists(graph_path):
             # code adapted from
@@ -685,9 +708,6 @@ def __call__(
             )
 
         # if we need to copy input buffers for cudagraph
-        from torch._guards import detect_fake_mode
-
-        fake_mode = detect_fake_mode()
         fake_args = [
             fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
             for t in example_inputs
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
+from contextlib import nullcontext
 from types import CodeType
 
 import torch
@@ -33,6 +34,7 @@ def __init__(self):
 
         vllm_config = get_current_vllm_config()
         mode = vllm_config.compilation_config.mode
+
         if mode is None:
             raise RuntimeError("Compilation mode cannot be NO_COMPILATION")
 
@@ -41,10 +43,15 @@ def __init__(self):
 
         if isinstance(backend, str) and backend == "inductor":
             options = vllm_config.compilation_config.inductor_compile_config
-
+        self.first_compile = True
         if mode != CompilationMode.STOCK_TORCH_COMPILE:
             # Drop all the guards.
-            options["guard_filter_fn"] = lambda x: [False for _ in x]
+            if vllm_config.compilation_config.dynamic_shapes_config.evaluate_guards:
+                options["guard_filter_fn"] = lambda x: [
+                    entry.guard_type == "SHAPE_ENV" for entry in x
+                ]
+            else:
+                options["guard_filter_fn"] = lambda x: [False for _ in x]
 
         if envs.VLLM_USE_AOT_COMPILE:
             if hasattr(torch._dynamo.config, "enable_aot_compile"):
@@ -69,10 +76,18 @@ def aot_compile(self, *args, **kwargs):
                 + "Please make sure torch.compile is enabled with the latest "
                 + f"version of PyTorch (current using torch: {torch.__version__})"
             )
-        return self._compiled_callable.aot_compile((args, kwargs))
+        prev = self.first_compile
+        self.first_compile = False
+        ctx = nullcontext() if prev else torch.compiler.set_stance("fail_on_recompile")
+        with ctx:
+            return self._compiled_callable.aot_compile((args, kwargs))
 
     def __call__(self, *args, **kwargs):
-        return self._compiled_callable(*args, **kwargs)
+        prev = self.first_compile
+        self.first_compile = False
+        ctx = nullcontext() if prev else torch.compiler.set_stance("fail_on_recompile")
+        with ctx:
+            return self._compiled_callable(*args, **kwargs)
 
     @abstractmethod
     def forward(self, *args, **kwargs): ...
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
@@ -208,6 +208,14 @@ class DynamicShapesConfig:
       backed/unbacked.
     """
 
+    evaluate_guards: bool = False
+    """
+    A debug mode to detect and fail if Dynamo ever specializes a dynamic shape by
+    guarding on it. When True, dynamic shape guards are not dropped from Dynamo,
+    and a failure is triggered if a recompilation ever happens because of them.
+    Enabling this allows observing the dynamic shape guards in the tlparse artifact.
+    """
+
     # TODO add a debug mode to fail
 
     def compute_hash(self) -> str:
@@ -224,6 +232,7 @@ def compute_hash(self) -> str:
         """
         factors: list[Any] = []
         factors.append(self.dynamic_shapes_type.value)
+        factors.append(self.evaluate_guards)
         hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str