
Commit 3500329

remove the bytecode hook and replace TorchCompileWrapperWithCustomDispatcher with TorchCompileGuardsStripWrapper
Signed-off-by: Laith Sakka <lsakka@meta.com>
1 parent c4768dc commit 3500329

8 files changed: 343 additions & 296 deletions


tests/compile/piecewise/test_multiple_graphs.py

Lines changed: 3 additions & 0 deletions
@@ -22,6 +22,8 @@
 from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
+from ...utils import create_new_process_for_each_test
+
 # This import automatically registers `torch.ops.silly.attention`
 from .. import silly_attention  # noqa: F401
 
@@ -193,6 +195,7 @@ def run_model(
 
 
 @pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
+@create_new_process_for_each_test("spawn")
 def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
     if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
         pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

tests/compile/piecewise/test_simple.py

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,8 @@
 from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
+from ...utils import create_new_process_for_each_test
+
 # This import automatically registers `torch.ops.silly.attention`
 from ..silly_attention import get_global_counter, reset_global_counter
 
@@ -125,6 +127,7 @@ def _run_simple_model(
 
 @pytest.mark.parametrize("use_inductor", [True, False])
 @torch.inference_mode()
+@create_new_process_for_each_test("spawn")
 def test_simple_piecewise_compile(use_inductor):
     _run_simple_model(
         splitting_ops=["silly::attention"],

tests/compile/piecewise/test_toy_llama.py

Lines changed: 8 additions & 1 deletion
@@ -29,6 +29,8 @@
 from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
+from ...utils import create_new_process_for_each_test
+
 # This import automatically registers `torch.ops.silly.attention`
 from .. import silly_attention  # noqa: F401
 
@@ -334,6 +336,7 @@ def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
         ("inductor", True),  # Inductor, Inductor partition
     ],
 )
+@create_new_process_for_each_test("spawn")
 def test_toy_llama(
     backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
 ):
@@ -514,4 +517,8 @@ def benchmark():
 
 
 if __name__ == "__main__":
-    benchmark()
+    # Protect against subprocess reimport when using spawn_new_process_for_each_test
+    import os
+
+    if os.environ.get("RUNNING_IN_SUBPROCESS") != "1":
+        benchmark()
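
A note on the new `__main__` guard above: create_new_process_for_each_test("spawn") runs the decorated test in a freshly spawned interpreter, and the "spawn" start method re-imports the test module in the child, so unguarded module-level work such as benchmark() would run again there (the RUNNING_IN_SUBPROCESS variable checked in the diff is assumed to be set by the test harness). A minimal, self-contained sketch of the same hazard using plain multiprocessing rather than the vLLM helper:

import multiprocessing as mp
import os

# With the "spawn" start method the child process re-imports this module,
# so this line prints once in the parent and once in each spawned child.
print(f"module imported in pid {os.getpid()}")


def child_work():
    pass


if __name__ == "__main__":
    # Without this guard, the process-spawning code itself would also
    # re-execute when the module is re-imported in the child.
    ctx = mp.get_context("spawn")
    p = ctx.Process(target=child_work)
    p.start()
    p.join()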

tests/compile/test_wrapper.py

Lines changed: 94 additions & 39 deletions
@@ -4,57 +4,112 @@
 
 import torch
 
-from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import CompilationMode
+from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    VllmConfig,
+    set_current_vllm_config,
+)
 
 
 class MyMod(torch.nn.Module):
     def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
-        if cache is not None:
-            return x + cache
-        return x * 2
+        if x.size()[0] >= 4:
+            return x * 2
+        else:
+            return x * 100
 
 
-class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
+class MyWrapper(TorchCompileWithNoGuardsWrapper):
     def __init__(self, model):
         self.model = model
-        compiled_callable = torch.compile(self.forward, backend="eager")
-        super().__init__(
-            compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
-        )
+        super().__init__()
 
-    def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
+    def forward(self, x: torch.Tensor):  # type: ignore[override]
         # this is the function to be compiled
-        return self.model(x, cache)
-
-    def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
-        # let torch.compile compile twice
-        if len(self.compiled_codes) == 2:
-            dispatch_id = 0 if cache is None else 1
-            with self.dispatch_to_code(dispatch_id):
-                return self.forward(x, cache)
-        else:
-            return self.compiled_callable(x, cache)
+        return self.model(x)
 
 
 def test_torch_compile_wrapper():
-    mod = MyMod()
-    wrappers = []
-    for i in range(3):
+    """Test basic functionality of TorchCompileWithNoGuardsWrapper."""
+    # Create a proper vLLM config instead of mocking
+    vllm_config = VllmConfig()
+    vllm_config.compilation_config = CompilationConfig()
+    vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
+    vllm_config.compilation_config.backend = "inductor"
+
+    # Test DYNAMO_TRACE_ONCE
+    with set_current_vllm_config(vllm_config):
+        torch._dynamo.reset()
+        mod = MyMod()
+        wrapper = MyWrapper(mod)
+
+        # First call should trigger compilation
+        x = torch.tensor([1, 2, 3, 4])
+        torch._dynamo.mark_dynamic(x, 0)
+
+        result1 = wrapper(x)
+        expected1 = torch.tensor([2, 4, 6, 8])
+        assert torch.allclose(result1, expected1), (
+            f"Expected {expected1}, got {result1}"
+        )
+
+        # Second call should use compiled code
+        x2 = torch.tensor([1, 2, 3])
+        result2 = wrapper(x2)
+        expected2 = torch.tensor([2, 4, 6])
+        assert torch.allclose(result2, expected2), (
+            f"Expected {expected2}, got {result2}"
+        )
+
+        # without the wrapper result would be different.
+        result3 = mod(x2)
+        expected3 = torch.tensor([100, 200, 300])
+
+        assert torch.allclose(result3, expected3), (
+            f"Expected {result3}, got {expected3}"
+        )
+
+    # with STOCK_TORCH_COMPILE we do not remove guards.
+    vllm_config.compilation_config.mode = CompilationMode.STOCK_TORCH_COMPILE
+    torch._dynamo.reset()
+    with set_current_vllm_config(vllm_config):
         torch._dynamo.reset()
+        mod = MyMod()
         wrapper = MyWrapper(mod)
-        wrappers.append(wrapper)
-        x = torch.tensor([1])
-        wrapper(x, None)  # profile run, compile
-        # create a cache tensor
-        cache = torch.tensor([2])
-        wrapper(x, cache)  # warm up with cache, recompile
-
-        # for new input, dispatch to the compiled code directly
-        new_x = torch.tensor([3])
-        assert wrapper(new_x, None).item() == 6  # dispatch to the first compiled code
-        assert wrapper(new_x, cache).item() == 5  # dispatch to the second compiled code
-
-    for wrapper in wrappers:
-        # make sure they have independent compiled codes
-        assert len(wrapper.compiled_codes) == 2
+
+        # First call should trigger compilation
+        x = torch.tensor([1, 2, 3, 4])
+        torch._dynamo.mark_dynamic(x, 0)
+
+        result1 = wrapper(x)
+        expected1 = torch.tensor([2, 4, 6, 8])
+        assert torch.allclose(result1, expected1), (
+            f"Expected {expected1}, got {result1}"
+        )
+
+        # Second call should triger another compilation
+        x2 = torch.tensor([1, 2, 3])
+        result2 = wrapper(x2)
+        expected2 = torch.tensor([100, 200, 300])
+        assert torch.allclose(result2, expected2), (
+            f"Expected {expected2}, got {result2}"
+        )
+
+    # NO_COMPILATION level not supported.
+    vllm_config.compilation_config.mode = None
+    torch._dynamo.reset()
+    with set_current_vllm_config(vllm_config):
+        torch._dynamo.reset()
+        mod = MyMod()
+
+        try:
+            wrapper = MyWrapper(mod)
+        except Exception:
+            return
+        raise AssertionError("expected an exception to be raised")
+
+
+if __name__ == "__main__":
+    test_torch_compile_wrapper()
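
For context on what this test exercises: the branch on x.size()[0] makes Dynamo install a size guard, so stock torch.compile recompiles for a length-3 input and takes the x * 100 path, while TorchCompileWithNoGuardsWrapper in DYNAMO_TRACE_ONCE mode drops the guards and keeps reusing the first trace (x * 2). A minimal sketch of the guarded (stock) behavior using plain PyTorch only, no vLLM:

import torch


class SizeBranchMod(torch.nn.Module):
    def forward(self, x: torch.Tensor):
        # This branch becomes a Dynamo guard on x.size(0).
        if x.size()[0] >= 4:
            return x * 2
        return x * 100


compiled = torch.compile(SizeBranchMod(), backend="eager")

x = torch.tensor([1, 2, 3, 4])
torch._dynamo.mark_dynamic(x, 0)
print(compiled(x))  # tensor([2, 4, 6, 8])

# The size guard fails for a length-3 input, so Dynamo recompiles and the
# x * 100 branch runs -- the behavior the STOCK_TORCH_COMPILE half of the
# test above asserts.
print(compiled(torch.tensor([1, 2, 3])))  # tensor([100, 200, 300])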

tests/v1/e2e/test_spec_decode.py

Lines changed: 8 additions & 0 deletions
@@ -75,6 +75,14 @@ def model_name():
     return "meta-llama/Llama-3.1-8B-Instruct"
 
 
+@pytest.fixture(autouse=True)
+def reset_torch_dynamo():
+    """Reset torch dynamo cache before each test"""
+    yield
+    # Cleanup after test
+    torch._dynamo.reset()
+
+
 @pytest.mark.parametrize(
     "speculative_config",
     [