AOT compilation workflow [2/n]

zhxchen17 · dolpm · commit 8d189413e5ef · 2025-09-23T12:42:42.000-07:00
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import tempfile
 from contextlib import contextmanager
 
 import pytest
@@ -59,3 +60,38 @@ def test_no_eval_frame(monkeypatch: pytest.MonkeyPatch):
                 "fail_on_recompile"):
             ret = CompiledMod(vllm_config=vllm_config)(*args)
             assert torch.allclose(ret, expected)
+
+
+def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
+    with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context(
+    ) as m:
+        args = (torch.randn(10, 10), )
+        m.setenv("VLLM_USE_AOT_COMPILE", "1")
+        m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+        m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+        vllm_config = make_vllm_config()
+        with use_vllm_config(vllm_config):
+            CompiledMod = support_torch_compile(MyMod)
+            try:
+                CompiledMod(vllm_config=vllm_config)(*args)
+            except Exception as e:
+                assert isinstance(e, FileNotFoundError)
+            else:
+                raise AssertionError(
+                    "Expected failed aot compilation with clean state.")
+
+
+def test_basic(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        args = (torch.randn(10, 10), )
+        CompiledMod = support_torch_compile(MyMod)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                expected = CompiledMod(vllm_config=vllm_config)(*args)
+                m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+                ret = CompiledMod(vllm_config=vllm_config)(*args)
+                assert torch.allclose(ret, expected)
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
@@ -3,16 +3,20 @@
 
 import ast
 import dataclasses
+import inspect
 import os
+import pickle
 import pprint
 import time
 from collections.abc import Sequence
 from contextlib import contextmanager
 from typing import Any, Callable, Optional
+from unittest.mock import patch
 
 import torch
 import torch.fx as fx
 from torch._dispatch.python import enable_python_dispatcher
+from torch.utils import _pytree as pytree
 
 import vllm.envs as envs
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
@@ -408,23 +412,94 @@ def set_model_tag(tag: str):
 
 class VllmCompiledFunction(SerializableCallable):
 
-    def __init__(self, graph_module, example_inputs, vllm_config,
+    def __init__(self, graph_module, example_inputs, vllm_config, prefix,
                  optimized_call):
+        assert isinstance(graph_module, torch.fx.GraphModule)
         self.graph_module = graph_module
         self.example_inputs = example_inputs
         self.vllm_config = vllm_config
+        self.prefix = prefix
         self.optimized_call = optimized_call
 
     def __call__(self, *args, **kwargs):
         return self.optimized_call(*args, **kwargs)
 
     @classmethod
-    def serialize_compile_artifacts(cls, compiled_fn):
-        raise NotImplementedError("serialization not implemented")
+    def serialize_compile_artifacts(
+            cls, compiled_fn: "VllmCompiledFunction") -> bytes:
+        import sympy
+        from torch._subclasses import FakeTensorMode
+        from torch.fx._graph_pickler import GraphPickler, Options
+        state = compiled_fn.__dict__.copy()
+        state.pop("optimized_call")
+        for node in state["graph_module"].graph.nodes:
+            node.meta.pop("source_fn_stack", None)
+            node.meta.pop("nn_module_stack", None)
+
+        graph_reducer_override = GraphPickler.reducer_override
+
+        def _graph_reducer_override(self, obj):
+            if (inspect.isclass(obj) and issubclass(obj, sympy.Function)
+                    and hasattr(obj, "_torch_unpickler")):
+                return obj._torch_unpickler, (obj._torch_handler_name, )
+            if isinstance(obj, FakeTensorMode):
+                return type(None), ()
+            return graph_reducer_override(self, obj)
+
+        # Mask off tensor inputs since they are large and not needed.
+        state["example_inputs"] = pytree.tree_map_only(torch.Tensor,
+                                                       lambda _: None,
+                                                       state["example_inputs"])
+        with patch.object(GraphPickler, 'reducer_override',
+                          _graph_reducer_override):
+            state["graph_module"] = GraphPickler.dumps(
+                state["graph_module"], Options(ops_filter=None))
+            state["example_inputs"] = GraphPickler.dumps(
+                state["example_inputs"])
+        return pickle.dumps(state)
 
     @classmethod
-    def deserialize_compile_artifacts(cls, data):
-        raise NotImplementedError("deserialization not implemented")
+    def deserialize_compile_artifacts(cls,
+                                      data: bytes) -> "VllmCompiledFunction":
+        from torch._guards import TracingContext, tracing
+        from torch._subclasses import FakeTensorMode
+        from torch.fx._graph_pickler import GraphPickler
+        from torch.fx.experimental.symbolic_shapes import ShapeEnv
+
+        state = pickle.loads(data)
+        fake_mode = FakeTensorMode(shape_env=ShapeEnv())
+        state["graph_module"] = GraphPickler.loads(state["graph_module"],
+                                                   fake_mode)
+        state["example_inputs"] = GraphPickler.loads(state["example_inputs"],
+                                                     fake_mode)
+        vllm_backend = VllmBackend(state["vllm_config"], state["prefix"])
+
+        def optimized_call(*example_inputs):
+            compile_inputs = [
+                inp or example_inputs[i]
+                for i, inp in enumerate(fn.example_inputs)
+            ]
+            with tracing(TracingContext(fake_mode)):
+                fn.optimized_call = vllm_backend(state["graph_module"],
+                                                 compile_inputs).optimized_call
+            return fn.optimized_call(*example_inputs)
+
+        fn = cls(**state, optimized_call=optimized_call)
+        return fn
+
+
+def compilation_config_hash_factors(vllm_config: VllmConfig) -> list[str]:
+    factors = []
+    # 0. factors come from the env; for example, the value of
+    # VLLM_PP_LAYER_PARTITION affects the computation graph.
+    env_hash = envs.compute_hash()
+    factors.append(env_hash)
+
+    # 1. factors come from the vllm_config (it mainly summarizes how the
+    #    model is created)
+    config_hash = vllm_config.compute_hash()
+    factors.append(config_hash)
+    return factors
 
 
 class VllmBackend:
@@ -502,7 +577,8 @@ def configure_post_pass(self):
                 self.post_grad_pass_manager.add(inductor_config[PASS_KEY])
         inductor_config[PASS_KEY] = self.post_grad_pass_manager
 
-    def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
+    def __call__(self, graph: fx.GraphModule,
+                 example_inputs) -> VllmCompiledFunction:
 
         vllm_config = self.vllm_config
         if not self.compilation_config.cache_dir:
@@ -511,17 +587,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             # the cache dir will be the same so that we can reuse the compiled
             # graph.
 
-            factors = []
-            # 0. factors come from the env, for example, The values of
-            # VLLM_PP_LAYER_PARTITION will affect the computation graph.
-            env_hash = envs.compute_hash()
-            factors.append(env_hash)
-
-            # 1. factors come from the vllm_config (it mainly summarizes how the
-            #    model is created)
-            config_hash = vllm_config.compute_hash()
-            factors.append(config_hash)
-
+            factors = compilation_config_hash_factors(vllm_config)
             # 2. factors come from the code files that are traced by Dynamo (
             #    it mainly summarizes how the model is used in forward pass)
             forward_code_files = list(
@@ -635,7 +701,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
         if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE or \
             not self.compilation_config.cudagraph_copy_inputs:
             return VllmCompiledFunction(graph, example_inputs, vllm_config,
-                                        self.split_gm)
+                                        self.prefix, self.split_gm)
 
         # if we need to copy input buffers for cudagraph
         from torch._guards import detect_fake_mode
@@ -678,4 +744,4 @@ def copy_and_call(*args):
             return self.split_gm(*list_args)
 
         return VllmCompiledFunction(graph, example_inputs, vllm_config,
-                                    copy_and_call)
+                                    self.prefix, copy_and_call)
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
@@ -2,7 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
+import hashlib
 import inspect
+import os
 from typing import Callable, Optional, TypeVar, Union, overload
 from unittest.mock import patch
 
@@ -176,6 +178,13 @@ def cls_decorator_helper(cls: _T) -> _T:
     return cls_decorator_helper
 
 
+def _model_hash_key(fn) -> str:
+    sha256_hash = hashlib.sha256()
+    sha256_hash.update(fn.__qualname__.encode())
+    sha256_hash.update(str(fn.__code__.co_firstlineno).encode())
+    return sha256_hash.hexdigest()
+
+
 def _support_torch_compile(
     cls: _T,
     dynamic_arg_dims: dict[str, Union[int, list[int]]],
@@ -227,6 +236,39 @@ def __call__(self, *args, **kwargs):
         if getattr(self, "aot_compiled_fn", None) is not None:
             return self.aot_compiled_fn(self, *args, **kwargs)
 
+        cache_dir = None
+        aot_compilation_path = None
+        if envs.VLLM_USE_AOT_COMPILE:
+            from .backends import compilation_config_hash_factors
+            factors: list[str] = compilation_config_hash_factors(
+                self.vllm_config)
+
+            factors.append(_model_hash_key(self.forward))
+            hash_key = hashlib.sha256(str(factors).encode()).hexdigest()
+
+            cache_dir = os.path.join(
+                envs.VLLM_CACHE_ROOT,
+                "aot_compilation",
+                hash_key,
+            )
+
+            rank = self.vllm_config.parallel_config.rank
+            dp_rank = self.vllm_config.parallel_config.data_parallel_rank
+            cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}")
+            aot_compilation_path = os.path.join(cache_dir, "model")
+            try:
+                with open(aot_compilation_path, "rb") as f:
+                    loaded_fn = torch.compiler.load_compiled_function(f)
+                self.aot_compiled_fn = loaded_fn
+                return self.aot_compiled_fn(self, *args, **kwargs)
+            except Exception as e:
+                if os.path.exists(aot_compilation_path):
+                    logger.warning(
+                        "Cannot load aot compilation from path %s, error: %s",
+                        aot_compilation_path, str(e))
+                if envs.VLLM_FORCE_AOT_LOAD:
+                    raise e
+
         # the first compilation needs to have dynamic shapes marked
         if len(self.compiled_codes) < 1:
             sig = inspect.signature(self.__class__.forward)
@@ -312,6 +354,11 @@ def patched_inline_call(parent, func, args, kwargs):
                 if envs.VLLM_USE_AOT_COMPILE:
                     self.aot_compiled_fn = self.aot_compile(*args, **kwargs)
                     output = self.aot_compiled_fn(self, *args, **kwargs)
+                    assert aot_compilation_path is not None
+                    assert cache_dir is not None
+                    os.makedirs(cache_dir, exist_ok=True)
+                    self.aot_compiled_fn.save_compiled_function(
+                        aot_compilation_path)
                 else:
                     output = self.compiled_callable(*args, **kwargs)
 
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -510,6 +510,12 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_USE_AOT_COMPILE":
     lambda: os.environ.get("VLLM_USE_AOT_COMPILE", "0") == "1",
 
+    # Force vllm to always load AOT compiled models from disk. Failure
+    # to load will result in a hard error when this is enabled.
+    # Will be ignored when VLLM_USE_AOT_COMPILE is disabled.
+    "VLLM_FORCE_AOT_LOAD":
+    lambda: os.environ.get("VLLM_FORCE_AOT_LOAD", "0") == "1",
+
     # local rank of the process in the distributed setting, used to determine
     # the GPU device id
     "LOCAL_RANK":