 from torch._dynamo.symbolic_convert import InliningInstructionTranslator
 
 from vllm.compilation.counter import compilation_counter
-from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
+from vllm.compilation.wrapper import TorchCompileGuardsStripWrapper
 from vllm.config import CompilationLevel, VllmConfig
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
@@ -32,11 +32,11 @@ def ignore_torch_compile(cls: _T) -> _T:
     a support_torch_compile decorator, but we don't want to
     compile the class `cls` that inherits the parent class.
     This only ignores compiling the forward of the class the
-    decorator is applied to. 
+    decorator is applied to.
 
     If the parent has ignore_torch_compile but the child has
     support_torch_compile, the child will still be compiled.
-    
+
     If the class has one or more submodules
     that have support_torch_compile decorator applied, compile will
     not be ignored for those submodules.
@@ -182,14 +182,14 @@ def _support_torch_compile(
182182 """
183183 A decorator to add support for compiling the forward method of a class.
184184 """
185- if TorchCompileWrapperWithCustomDispatcher in cls .__bases__ :
185+ if TorchCompileGuardsStripWrapper in cls .__bases__ :
186186 # support decorating multiple times
187187 return cls
188188
189189 # take care of method resolution order
190190 # make sure super().__init__ is called on the base class
191- # other than TorchCompileWrapperWithCustomDispatcher
192- cls .__bases__ = cls .__bases__ + (TorchCompileWrapperWithCustomDispatcher , )
191+ # other than TorchCompileGuardsStripWrapper
192+ cls .__bases__ = cls .__bases__ + (TorchCompileGuardsStripWrapper , )
193193
194194 old_init = cls .__init__
195195
@@ -210,107 +210,83 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
             return
 
         compilation_counter.num_models_seen += 1
-        TorchCompileWrapperWithCustomDispatcher.__init__(
-            self, compilation_level=vllm_config.compilation_config.level)
+        TorchCompileGuardsStripWrapper.__init__(self)
 
     cls.__init__ = __init__
 
+    def _mark_dynamic_inputs(mod, *args, **kwargs):
+        sig = inspect.signature(mod.__class__.forward)
+        bound_args = sig.bind(mod, *args, **kwargs)
+        bound_args.apply_defaults()
+        for k, dims in dynamic_arg_dims.items():
+            arg = bound_args.arguments.get(k)
+            if arg is not None:
+                dims = [dims] if isinstance(dims, int) else dims
+                if isinstance(arg, torch.Tensor):
+                    # In case dims is specified with negative indexing
+                    dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
+                    torch._dynamo.mark_dynamic(arg, dims)
+                elif isinstance(arg, IntermediateTensors):
+                    for tensor in arg.tensors.values():
+                        # In case dims is specified with negative indexing
+                        dims = [
+                            tensor.ndim + dim if dim < 0 else dim
+                            for dim in dims
+                        ]
+                        torch._dynamo.mark_dynamic(tensor, dims)
+                else:
+                    raise ValueError(
+                        "Unsupported dynamic dimensions"
+                        f" {dims} for argument {k} with type {type(arg)}.")
+
     def __call__(self, *args, **kwargs):
         # torch.compiler.is_compiling() means we are inside the compilation
         # e.g. TPU has the compilation logic in model runner, so we don't
         # need to compile the model inside.
         if self.do_not_compile or torch.compiler.is_compiling():
             return self.forward(*args, **kwargs)
 
+        # This attribute is added by TorchCompileGuardsStripWrapper
+        if self.compiled:
+            return TorchCompileGuardsStripWrapper.__call__(
+                self, *args, **kwargs)
+
+        # This is the path for the first compilation.
+        _mark_dynamic_inputs(self, *args, **kwargs)
+
         # the first compilation needs to have dynamic shapes marked
-        if len(self.compiled_codes) < 1:
-            sig = inspect.signature(self.__class__.forward)
-            bound_args = sig.bind(self, *args, **kwargs)
-            bound_args.apply_defaults()
-            for k, dims in dynamic_arg_dims.items():
-                arg = bound_args.arguments.get(k)
-                if arg is not None:
-                    dims = [dims] if isinstance(dims, int) else dims
-                    if isinstance(arg, torch.Tensor):
-                        # In case dims is specified with negative indexing
-                        dims = [
-                            arg.ndim + dim if dim < 0 else dim for dim in dims
-                        ]
-                        torch._dynamo.mark_dynamic(arg, dims)
-                    elif isinstance(arg, IntermediateTensors):
-                        for tensor in arg.tensors.values():
-                            # In case dims is specified with negative indexing
-                            dims = [
-                                tensor.ndim + dim if dim < 0 else dim
-                                for dim in dims
-                            ]
-                            torch._dynamo.mark_dynamic(tensor, dims)
-                    else:
-                        raise ValueError(
-                            "Unsupported dynamic dimensions"
-                            f" {dims} for argument {k} with type {type(arg)}.")
-            # here, it is the starting point of the `torch.compile` process
-            start_monitoring_torch_compile(self.vllm_config)
-            logger.debug("Start compiling function %s",
-                         self.original_code_object)
+        start_monitoring_torch_compile(self.vllm_config)
+        logger.debug("Start compiling function %s",
+                     self.original_code_object())
 
         # if we don't use custom dispatcher, we can directly call the
         # compiled function and let torch.compile handle the dispatching,
         # with the overhead of guard evaluation and recompilation.
-        if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher:
-            # it seems Dynamo reuse the compilation across instances,
-            # while we need to make sure the compiled code is not reused.
-            # we need to control all the compilation of the model.
-            torch._dynamo.eval_frame.remove_from_cache(
-                self.original_code_object)
-
-            # collect all relevant files traced by Dynamo,
-            # so that the compilation cache can trigger re-compilation
-            # properly when any of these files change.
-
-            # 1. the file containing the top-level forward function
+
+        # collect all relevant files traced by Dynamo,
+        # so that the compilation cache can trigger re-compilation
+        # properly when any of these files change.
+
+        # 1. the file containing the top-level forward function
+        self.vllm_config.compilation_config.traced_files.add(
+            self.original_code_object().co_filename)
+
+        # 2. every time Dynamo sees a function call, it will inline
+        # the function by calling InliningInstructionTranslator.inline_call
+        # we hijack this function to know all the functions called
+        # during Dynamo tracing, and their corresponding files
+        inline_call = InliningInstructionTranslator.inline_call
+
+        def patched_inline_call(parent, func, args, kwargs):
+            code = func.get_code()
             self.vllm_config.compilation_config.traced_files.add(
-                self.original_code_object.co_filename)
-
-            # 2. every time Dynamo sees a function call, it will inline
-            # the function by calling InliningInstructionTranslator.inline_call
-            # we hijack this function to know all the functions called
-            # during Dynamo tracing, and their corresponding files
-            inline_call = InliningInstructionTranslator.inline_call
-
-            def patched_inline_call(parent, func, args, kwargs):
-                code = func.get_code()
-                self.vllm_config.compilation_config.traced_files.add(
-                    code.co_filename)
-                return inline_call(parent, func, args, kwargs)
-
-            # Disable the C++ compilation of symbolic shape guards. C++-fication
-            # of symbolic shape guards can improve guard overhead. But, since
-            # vllm skip guards anyways, setting this flag to False can improve
-            # compile time.
-            dynamo_config_patches = {}
-            try:
-                _ = torch._dynamo.config.enable_cpp_symbolic_shape_guards
-                dynamo_config_patches[
-                    "enable_cpp_symbolic_shape_guards"] = False
-            except AttributeError:
-                # Note: this config is not available in torch 2.6, we can skip
-                # if the config doesn't exist
-                logger.debug(
-                    "enable_cpp_symbolic_shape_guards config not available")
-
-            with patch.object(InliningInstructionTranslator, 'inline_call',
-                              patched_inline_call), torch._dynamo.config.patch(
-                                  **dynamo_config_patches):
-                output = self.compiled_callable(*args, **kwargs)
-            return output
-
-        # usually, capturing the model once is enough, and then we can
-        # dispatch to the compiled code directly, without going through
-        # the Dynamo guard mechanism.
-        with self.dispatch_to_code(0):
-            model_output = self.forward(*args, **kwargs)
-        return model_output
+                code.co_filename)
+            return inline_call(parent, func, args, kwargs)
+
+        with patch.object(InliningInstructionTranslator, "inline_call",
+                          patched_inline_call):
+            return TorchCompileGuardsStripWrapper.__call__(
+                self, *args, **kwargs)
 
     cls.__call__ = __call__
     return cls
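
For context, a minimal usage sketch (not part of this diff) of how a model class opts into compilation through the decorator. The class name, layer sizes, and the dynamic dimensions chosen here are illustrative assumptions, not taken from the vLLM sources; only `support_torch_compile`, `dynamic_arg_dims`, and the `vllm_config`/`prefix` keyword arguments come from the code above.

# Hypothetical usage sketch: the decorator marks dim 0 of both tensor
# arguments dynamic before the first compile, so varying token counts
# do not force recompilation.
import torch
from torch import nn

from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig


@support_torch_compile(dynamic_arg_dims={"input_ids": 0, "positions": 0})
class ToyModel(nn.Module):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.embed = nn.Embedding(32_000, 64)

    def forward(self, input_ids: torch.Tensor,
                positions: torch.Tensor) -> torch.Tensor:
        # [num_tokens, 64] + [num_tokens, 1] broadcasts over the hidden dim
        return self.embed(input_ids) + positions.unsqueeze(-1).float()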