
Commit e925619

zhxchen17 authored and dolpm committed
AOT compilation workflow [1/n]
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
1 parent 6b0fcbb commit e925619

File tree

6 files changed: +132 additions, −5 deletions

.buildkite/test-pipeline.yaml
tests/compile/test_aot_compile.py
vllm/compilation/backends.py
vllm/compilation/decorators.py
vllm/compilation/wrapper.py
vllm/envs.py


.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -387,6 +387,7 @@ steps:
   - pytest -v -s compile/test_fusion_all_reduce.py
   - pytest -v -s compile/test_decorator.py
   - pytest -v -s compile/test_noop_elimination.py
+  - pytest -v -s compile/test_aot_compile.py

 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30

tests/compile/test_aot_compile.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from contextlib import contextmanager
+
+import pytest
+import torch
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
+                         set_current_vllm_config)
+from vllm.forward_context import set_forward_context
+
+
+class MyMod(torch.nn.Module):
+
+    def __init__(self, **kwargs):
+        super().__init__()
+
+    def forward(self, x: torch.Tensor):
+        for _ in range(3000):
+            x = x + x.shape[0]
+        return x
+
+
+def make_vllm_config() -> VllmConfig:
+    return VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE, ))
+
+
+@contextmanager
+def use_vllm_config(vllm_config: VllmConfig):
+    with set_forward_context(
+            {}, vllm_config), set_current_vllm_config(vllm_config):
+        yield
+
+
+def test_no_eval_frame(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        mod = MyMod()
+        args = (torch.randn(10, 10), )
+        expected = mod(*args)
+        CompiledMod = support_torch_compile(MyMod)
+
+        vllm_config = make_vllm_config()
+        m.setenv("VLLM_USE_AOT_COMPILE", "0")
+        try:
+            with use_vllm_config(vllm_config), torch.compiler.set_stance(
+                    "fail_on_recompile"):
+                CompiledMod(vllm_config=vllm_config)(*args)
+        except RuntimeError as e:
+            assert "Detected recompile" in str(e)
+        else:
+            raise AssertionError("Expected exception to be raised")
+
+        m.setenv("VLLM_USE_AOT_COMPILE", "1")
+        torch._dynamo.reset()
+        with use_vllm_config(vllm_config), torch.compiler.set_stance(
+                "fail_on_recompile"):
+            ret = CompiledMod(vllm_config=vllm_config)(*args)
+        assert torch.allclose(ret, expected)
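For context on what this test checks: `torch.compiler.set_stance("fail_on_recompile")` (available on recent PyTorch, roughly 2.6+) turns any would-be recompilation into a RuntimeError instead of a silent re-trace. Under the eager path every call re-enters Dynamo's guard checks, so the stance trips; under the AOT path the cached artifact is reused directly. A minimal standalone sketch of the stance itself, independent of vLLM:

```python
import torch


@torch.compile
def f(x: torch.Tensor) -> torch.Tensor:
    return x * x.shape[0]


f(torch.randn(4))  # first call: Dynamo traces, compiles, installs guards

with torch.compiler.set_stance("fail_on_recompile"):
    f(torch.randn(4))  # same shape: cached code is reused, no error
    # f(torch.randn(8)) here would raise "Detected recompile..." because the
    # shape guard fails and a fresh compilation is forbidden by the stance.
```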

vllm/compilation/backends.py

Lines changed: 33 additions & 2 deletions
@@ -398,6 +398,35 @@ def set_model_tag(tag: str):
         model_tag = old_tag


+try:
+    from torch._dynamo.aot_compile import SerializableCallable
+except ImportError:
+    SerializableCallable = object
+
+assert isinstance(SerializableCallable, type)
+
+
+class VllmCompiledFunction(SerializableCallable):
+
+    def __init__(self, graph_module, example_inputs, vllm_config,
+                 optimized_call):
+        self.graph_module = graph_module
+        self.example_inputs = example_inputs
+        self.vllm_config = vllm_config
+        self.optimized_call = optimized_call
+
+    def __call__(self, *args, **kwargs):
+        return self.optimized_call(*args, **kwargs)
+
+    @classmethod
+    def serialize_compile_artifacts(cls, compiled_fn):
+        raise NotImplementedError("serialization not implemented")
+
+    @classmethod
+    def deserialize_compile_artifacts(cls, data):
+        raise NotImplementedError("deserialization not implemented")
+
+
 class VllmBackend:
     """The compilation backend for `torch.compile` with vLLM.
     It is used for compilation level of `CompilationLevel.PIECEWISE`,

@@ -605,7 +634,8 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:

         if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE or \
             not self.compilation_config.cudagraph_copy_inputs:
-            return self.split_gm
+            return VllmCompiledFunction(graph, example_inputs, vllm_config,
+                                        self.split_gm)

         # if we need to copy input buffers for cudagraph
         from torch._guards import detect_fake_mode

@@ -647,4 +677,5 @@ def copy_and_call(*args):
                 list_args[index] = static_tensor
             return self.split_gm(*list_args)

-        return copy_and_call
+        return VllmCompiledFunction(graph, example_inputs, vllm_config,
+                                    copy_and_call)
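Worth noting in this hunk is the guarded base class: if the running PyTorch lacks `torch._dynamo.aot_compile`, `VllmCompiledFunction` silently degrades to a plain callable wrapper (serialization is stubbed out for later PRs in this "[1/n]" series). A standalone sketch of that optional-base-class pattern, with a hypothetical `some_optional_dep` standing in for the torch module:

```python
# Hypothetical optional dependency standing in for
# torch._dynamo.aot_compile.SerializableCallable.
try:
    from some_optional_dep import FancyBase  # not installed in this sketch
except ImportError:
    FancyBase = object  # fall back to a no-op base on older installs

# Must be a class, or the subclass definition below would fail.
assert isinstance(FancyBase, type)


class Wrapper(FancyBase):
    """Behaves identically either way; extra hooks exist only if the base does."""

    def __init__(self, fn):
        self.fn = fn

    def __call__(self, *args, **kwargs):
        return self.fn(*args, **kwargs)


w = Wrapper(lambda x: x + 1)
print(w(41))  # -> 42
```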

vllm/compilation/decorators.py

Lines changed: 11 additions & 3 deletions
@@ -11,6 +11,7 @@
 from packaging import version
 from torch._dynamo.symbolic_convert import InliningInstructionTranslator

+import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
 from vllm.config import CompilationLevel, VllmConfig

@@ -34,11 +35,11 @@ def ignore_torch_compile(cls: _T) -> _T:
     a support_torch_compile decorator, but we don't want to
     compile the class `cls` that inherits the parent class.
     This only ignores compiling the forward of the class the
-    decorator is applied to.
+    decorator is applied to.

     If the parent has ignore_torch_compile but the child has
     support_torch_compile, the child will still be compiled.
-
+
     If the class has one or more submodules
     that have support_torch_compile decorator applied, compile will
     not be ignored for those submodules.

@@ -224,6 +225,9 @@ def __call__(self, *args, **kwargs):
         if self.do_not_compile or torch.compiler.is_compiling():
             return self.forward(*args, **kwargs)

+        if getattr(self, "aot_compiled_fn", None) is not None:
+            return self.aot_compiled_fn(self, *args, **kwargs)
+
         # the first compilation needs to have dynamic shapes marked
         if len(self.compiled_codes) < 1:
             sig = inspect.signature(self.__class__.forward)

@@ -307,7 +311,11 @@ def patched_inline_call(parent, func, args, kwargs):
                 **dynamo_config_patches
         ), maybe_use_cudagraph_partition_wrapper(
                 self.vllm_config), _torch27_patch_tensor_subclasses():
-            output = self.compiled_callable(*args, **kwargs)
+            if envs.VLLM_USE_AOT_COMPILE:
+                self.aot_compiled_fn = self.aot_compile(*args, **kwargs)
+                output = self.aot_compiled_fn(self, *args, **kwargs)
+            else:
+                output = self.compiled_callable(*args, **kwargs)
         return output

     # usually, capturing the model once is enough, and then we can
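The dispatch change amounts to compile-once-then-memoize: the first call under the compilation context produces `aot_compiled_fn`, and every later `__call__` short-circuits to it before any Dynamo machinery runs. A generic sketch of the pattern (hypothetical names; plain `torch.compile` stands in for the AOT entry point used in the diff):

```python
import torch


class LazyCompiledModule:
    """Hypothetical stand-in; not vLLM's actual wrapper class."""

    def __init__(self, fn):
        self._fn = fn
        self._compiled_fn = None  # filled in on the first call

    def __call__(self, *args, **kwargs):
        if self._compiled_fn is not None:
            # Steady state: reuse the stored artifact and never re-enter
            # the compilation path (mirrors the aot_compiled_fn check).
            return self._compiled_fn(*args, **kwargs)
        # First call: compile once and cache the result.
        self._compiled_fn = torch.compile(self._fn)
        return self._compiled_fn(*args, **kwargs)


mod = LazyCompiledModule(lambda x: x * 2)
mod(torch.randn(3))  # compiles
mod(torch.randn(3))  # reuses the cached callable
```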

vllm/compilation/wrapper.py

Lines changed: 20 additions & 0 deletions
@@ -45,6 +45,18 @@ def __init__(self,
         if isinstance(backend, str) and backend == "inductor":
             options = get_current_vllm_config(
             ).compilation_config.inductor_compile_config
+            if envs.VLLM_USE_AOT_COMPILE:
+                options = options or {}
+                options["guard_filter_fn"] = lambda guards: [
+                    False for _ in guards
+                ]
+                if hasattr(torch._dynamo.config, "enable_aot_compile"):
+                    torch._dynamo.config.enable_aot_compile = True
+                else:
+                    msg = "torch._dynamo.config.enable_aot_compile is not "
+                    msg += "available. AOT compile is disabled and please "
+                    msg += "upgrade PyTorch version to use AOT compile."
+                    logger.warning(msg)

         compiled_callable = torch.compile(self.forward,
                                           fullgraph=True,

@@ -62,6 +74,14 @@ def __init__(self,
         self.use_custom_dispatcher: bool = \
             compilation_level >= CompilationLevel.DYNAMO_ONCE

+    def aot_compile(self, *args, **kwargs):
+        if not hasattr(self.compiled_callable, "aot_compile"):
+            raise RuntimeError(
+                "aot_compile is not supported by the current configuration. " +
+                "Please make sure torch.compile is enabled with the latest " +
+                "version of PyTorch")
+        return self.compiled_callable.aot_compile((args, kwargs))
+
     def __call__(self, *args, **kwargs):
         """Implement the dispatch logic here, beyond the torch.compile level.
         NOTE: this function can have additional arguments beyond the forward
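Two details here are easy to miss. First, the guard filter returns False for every guard, so the compiled artifact is installed with no guards and is reused unconditionally; this is what lets the new test pass under "fail_on_recompile". Second, `aot_compile` takes a single packed `(args, kwargs)` tuple rather than splatted arguments. A sketch of the guard-dropping option on a plain `torch.compile` call (hedged: `guard_filter_fn` is only honored on recent PyTorch builds, which is why the diff also probes `enable_aot_compile` before flipping it on):

```python
import torch


def drop_all_guards(guards):
    # One boolean per guard; False means "do not install this guard", so
    # the compiled code is reused for any later input that reaches it.
    return [False for _ in guards]


# Assumption: this PyTorch build accepts guard_filter_fn in the options
# dict, as the diff above relies on; older builds may reject it.
fn = torch.compile(lambda x: x + 1,
                   options={"guard_filter_fn": drop_all_guards})
```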

vllm/envs.py

Lines changed: 6 additions & 0 deletions
@@ -511,6 +511,12 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_PATTERN_MATCH_DEBUG":
     lambda: os.environ.get("VLLM_PATTERN_MATCH_DEBUG", None),

+    # Feature flag to enable/disable AOT compilation. This will ensure
+    # compilation is done in warmup phase and the compilation will be
+    # reused in subsequent calls.
+    "VLLM_USE_AOT_COMPILE":
+    lambda: os.environ.get("VLLM_USE_AOT_COMPILE", "0") == "1",
+
     # local rank of the process in the distributed setting, used to determine
     # the GPU device id
     "LOCAL_RANK":
