@@ -441,6 +441,35 @@ def set_model_tag(tag: str):
441441 model_tag = old_tag
442442
443443
try:
    # Newer torch builds ship torch._dynamo.aot_compile; older ones do not.
    # Fall back to a plain `object` base so this module still imports there.
    from torch._dynamo.aot_compile import SerializableCallable
except ImportError:
    SerializableCallable = object

# Either branch must leave us with something usable as a base class.
assert isinstance(SerializableCallable, type)


class VllmCompiledFunction(SerializableCallable):
    """Callable wrapper bundling a compiled function with its compile context.

    Stores the FX graph module, the example inputs seen at compile time, and
    the vLLM config next to the optimized callable, so the compiled artifact
    can later be introspected or (once implemented) serialized.
    """

    def __init__(self, graph_module, example_inputs, vllm_config,
                 optimized_call):
        # Retain everything needed to reconstruct or inspect the artifact.
        self.graph_module = graph_module
        self.example_inputs = example_inputs
        self.vllm_config = vllm_config
        self.optimized_call = optimized_call

    def __call__(self, *args, **kwargs):
        # Transparent delegation: calling the wrapper is calling the
        # optimized function.
        return self.optimized_call(*args, **kwargs)

    @classmethod
    def serialize_compile_artifacts(cls, compiled_fn):
        # Interface placeholder required by SerializableCallable;
        # serialization support has not been written yet.
        raise NotImplementedError("serialization not implemented")

    @classmethod
    def deserialize_compile_artifacts(cls, data):
        # Interface placeholder required by SerializableCallable;
        # deserialization support has not been written yet.
        raise NotImplementedError("deserialization not implemented")
471+
472+
444473class VllmBackend :
445474 """The compilation backend for `torch.compile` with vLLM.
446475 It is used for compilation level of `CompilationLevel.PIECEWISE`,
@@ -659,7 +688,8 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
659688 self .compilation_config .cudagraph_mode == CUDAGraphMode .NONE
660689 or not self .compilation_config .cudagraph_copy_inputs
661690 ):
662- return self .split_gm
691+ return VllmCompiledFunction (graph , example_inputs , vllm_config ,
692+ self .split_gm )
663693
664694 # if we need to copy input buffers for cudagraph
665695 from torch ._guards import detect_fake_mode
@@ -704,4 +734,5 @@ def copy_and_call(*args):
704734 list_args [index ] = static_tensor
705735 return self .split_gm (* list_args )
706736
707- return copy_and_call
737+ return VllmCompiledFunction (graph , example_inputs , vllm_config ,
738+ copy_and_call )
0 commit comments