
Commit af5528f

Merge branch 'main' into user/fanrongl/mtp3_support_for_ds32

2 parents 7611b09 + 7aeac97

File tree

15 files changed (+196, -31 lines)


ATTRIBUTIONS-CPP-aarch64.md

Lines changed: 18 additions & 0 deletions
@@ -14889,6 +14889,24 @@ Chen, Tianqi
 
 ```
 
+## Mooncake
+
+- **Repository URL**: https://github.com/kvcache-ai/Mooncake
+- **License URL**: https://github.com/kvcache-ai/Mooncake/blob/main/LICENSE-APACHE
+- **License name**: Apache 2.0
+
+### Authors
+
+© Copyright 2025, Mooncake Team.
+Copyright (c) Meta Platforms, Inc. and affiliates.
+Copyright 2024 KVCache.AI
+Ruoyu Qin
+Zheming Li
+Weiran He
+Mingxing Zhang
+Yongwei Wu
+Weimin Zheng
+Xinran Xu
 ## flashinfer
 
 ### License Text

ATTRIBUTIONS-CPP-x86_64.md

Lines changed: 18 additions & 0 deletions
@@ -14697,6 +14697,24 @@ Chen, Tianqi
 
 ```
 
+## Mooncake
+
+- **Repository URL**: https://github.com/kvcache-ai/Mooncake
+- **License URL**: https://github.com/kvcache-ai/Mooncake/blob/main/LICENSE-APACHE
+- **License name**: Apache 2.0
+
+### Authors
+
+© Copyright 2025, Mooncake Team.
+Copyright (c) Meta Platforms, Inc. and affiliates.
+Copyright 2024 KVCache.AI
+Ruoyu Qin
+Zheming Li
+Weiran He
+Mingxing Zhang
+Yongwei Wu
+Weimin Zheng
+Xinran Xu
 ## flashinfer
 
 ### License Text

docker/Dockerfile.multi

Lines changed: 8 additions & 0 deletions
@@ -44,6 +44,7 @@ COPY docker/common/install.sh \
     docker/common/install_ucx.sh \
     docker/common/install_nixl.sh \
     docker/common/install_etcd.sh \
+    docker/common/install_mooncake.sh \
     ./
 
 RUN GITHUB_MIRROR=${GITHUB_MIRROR} \
@@ -103,6 +104,13 @@ COPY docker/common/install_triton.sh \
 
 RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_triton.sh && rm install_triton.sh
 
+# Install Mooncake, after triton handles boost requirement
+RUN if [ -f /etc/redhat-release ]; then \
+        echo "Rocky8 detected, skipping mooncake installation"; \
+    else \
+        bash ./install_mooncake.sh; \
+    fi && rm install_mooncake.sh
+
 FROM ${DEVEL_IMAGE} AS wheel
 WORKDIR /src/tensorrt_llm
 COPY benchmarks benchmarks

docker/common/install_mooncake.sh

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+#!/bin/bash
+set -ex
+
+MOONCAKE_VERSION="v0.3.6.post1"
+MOONCAKE_REPO="https://github.com/kvcache-ai/Mooncake.git"
+MOONCAKE_INSTALL_PATH="/usr/local/Mooncake"
+
+apt-get update
+
+# https://kvcache-ai.github.io/Mooncake/getting_started/build.html
+# libboost-all-dev is removed because it will install a duplicated MPI library
+# triton also installed boost so the requirement is already met
+apt-get install -y --no-install-recommends \
+    build-essential \
+    libibverbs-dev \
+    libgoogle-glog-dev \
+    libgtest-dev \
+    libjsoncpp-dev \
+    libnuma-dev \
+    libunwind-dev \
+    libssl-dev \
+    libyaml-cpp-dev \
+    libcurl4-openssl-dev \
+    libhiredis-dev \
+    pkg-config \
+    patchelf
+
+mkdir -p /third-party-source
+
+git clone --depth 1 https://github.com/alibaba/yalantinglibs.git
+tar -czf /third-party-source/yalantinglibs.tar.gz yalantinglibs
+cd yalantinglibs
+mkdir build && cd build
+cmake .. -DBUILD_EXAMPLES=OFF -DBUILD_BENCHMARK=OFF -DBUILD_UNIT_TESTS=OFF
+make -j
+make install
+cd ../..
+rm -rf yalantinglibs
+
+git clone --depth 1 -b ${MOONCAKE_VERSION} ${MOONCAKE_REPO}
+tar -czf /third-party-source/Mooncake-${MOONCAKE_VERSION}.tar.gz Mooncake
+cd Mooncake
+git submodule update --init --recursive --depth 1
+mkdir build && cd build
+cmake .. -DUSE_CUDA=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=${MOONCAKE_INSTALL_PATH}
+make -j
+make install
+cd ../..
+rm -rf Mooncake
+
+echo "export LD_LIBRARY_PATH=${MOONCAKE_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"

examples/auto_deploy/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@ benchmark_results.json
 *.png
 # ignore config files that users might put here for debugging
 *.yaml
+!nano_v3.yaml

examples/auto_deploy/nano_v3.yaml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+runtime: trtllm
+compile_backend: torch-cudagraph
+max_batch_size: 384
+max_seq_len: 65536 # tunable
+enable_chunked_prefill: true
+attn_backend: flashinfer
+model_factory: AutoModelForCausalLM
+skip_loading_weights: false
+free_mem_ratio: 0.9
+cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
+kv_cache_config:
+  # disable kv_cache reuse since not supported for hybrid/ssm models
+  enable_block_reuse: false
+transforms:
+  detect_sharding:
+    sharding_source: ['factory', 'heuristic']
+    sharding_dims: ['ep', 'bmm']
+  # tunable mamba cache dtype
+  # --> use float32 for accuracy and default (null) for speed
+  insert_cached_ssm_attention:
+    cache_config:
+      # mamba_dtype: float32
+      mamba_dtype: null

examples/models/core/llama/README.md

Lines changed: 4 additions & 3 deletions
@@ -1540,14 +1540,15 @@ bash -c 'python ./examples/mmlu.py --test_trt_llm \
 ## Run LLaMa-3.3 70B Model on PyTorch Backend
 This section provides the steps to run LLaMa-3.3 70B model FP8 precision on PyTorch backend by launching TensorRT LLM server and run performance benchmarks.
 
-
 ### Prepare TensorRT LLM extra configs
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
-stream_interval: 2
+stream_interval: 10
 cuda_graph_config:
   max_batch_size: 1024
   enable_padding: true
+kv_cache_config:
+  dtype: fp8
 EOF
 ```
 Explanation:
@@ -1581,5 +1582,5 @@ python -m tensorrt_llm.serve.scripts.benchmark_serving \
 --random-input-len 1024 \
 --random-output-len 2048 \
 --random-ids \
---max-concurrency 1024 \
+--max-concurrency 1024
 ```

examples/models/core/llama4/README.md

Lines changed: 3 additions & 3 deletions
@@ -27,7 +27,7 @@ This section provides the steps to launch TensorRT LLM server and run performanc
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 enable_attention_dp: true
-stream_interval: 2
+stream_interval: 10
 cuda_graph_config:
   max_batch_size: 512
   enable_padding: true
@@ -78,7 +78,7 @@ python -m tensorrt_llm.serve.scripts.benchmark_serving \
 cat >./extra-llm-api-config.yml <<EOF
 enable_attention_dp: false
 enable_min_latency: true
-stream_interval: 2
+stream_interval: 10
 cuda_graph_config:
   max_batch_size: 8
   enable_padding: true
@@ -126,7 +126,7 @@
 #### 1. Prepare TensorRT LLM extra configs
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
-stream_interval: 2
+stream_interval: 10
 cuda_graph_config:
   max_batch_size: 1024
   enable_padding: true

jenkins/current_image_tags.properties

Lines changed: 4 additions & 4 deletions
@@ -13,7 +13,7 @@
 # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
 IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
 
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511021230-8838
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511021230-8838
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511021230-8838
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511021230-8838
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511110140-8447
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511110140-8447
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511110140-8447
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511110140-8447

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 33 additions & 5 deletions
@@ -10,10 +10,10 @@
 """
 
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
 from typing import Dict, List, Literal, Optional, Protocol, Sequence, Set, Tuple, Type, Union
 
 import torch
+from pydantic import BaseModel, ConfigDict, Field, field_validator
 from torch._ops import OpOverloadPacket
 from torch.fx import Node
 from torch.types import Number
@@ -24,11 +24,39 @@
 Constant = Union[int, float, str, None]
 
 
-@dataclass
-class CacheConfig:
-    """A dataclass to hold information how to configure the cache."""
+class CacheConfig(BaseModel):
+    """Cache configuration for attention-related dtypes."""
 
-    dtype: Optional[torch.dtype] = None
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+        extra="forbid",
+    )
+
+    dtype: Optional[torch.dtype] = Field(default=None, description="KV cache dtype.")
+    mamba_dtype: Optional[torch.dtype] = Field(default=None, description="Mamba cache dtype.")
+
+    @field_validator("dtype", "mamba_dtype", mode="before")
+    @classmethod
+    def _coerce_dtype(cls, value):
+        if value is None or isinstance(value, torch.dtype):
+            return value
+        if isinstance(value, str):
+            dtype = getattr(torch, value, None)
+            assert isinstance(dtype, torch.dtype), f"Invalid {dtype=}"
+            return dtype
+        return value
+
+    def __or__(self, other: "CacheConfig") -> "CacheConfig":
+        """Combine two CacheConfig objects field-wise using Python's `or` semantics.
+
+        For each field, selects the first non-None value between `self` and `other`.
+        """
+        if not isinstance(other, CacheConfig):
+            raise NotImplementedError(f"Cannot combine CacheConfig with {type(other)}")
+        merged_kwargs = {}
+        for field_name in type(self).model_fields.keys():
+            merged_kwargs[field_name] = getattr(self, field_name) or getattr(other, field_name)
+        return CacheConfig(**merged_kwargs)
 
 
 class SequenceInfo:
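For reference, the refactored CacheConfig can be exercised as in the sketch below. It is a minimal illustration only: the import path is assumed to mirror the file modified above, it relies solely on behavior visible in this diff (the mode="before" string-to-dtype coercion and the field-wise `|` merge), and it requires torch plus pydantic v2.

```python
# Minimal usage sketch of the pydantic-based CacheConfig introduced in this diff.
# Assumption: the module path matches the file above; requires torch and pydantic v2.
import torch

from tensorrt_llm._torch.auto_deploy.custom_ops.attention_interface import CacheConfig

# Strings (e.g. the mamba_dtype value from nano_v3.yaml) are coerced to torch
# dtypes by the mode="before" field validator.
cfg_from_yaml = CacheConfig(mamba_dtype="float32")
assert cfg_from_yaml.mamba_dtype is torch.float32

# `|` merges field-wise: the first non-None value per field wins, so explicit
# settings can be layered on top of defaults.
defaults = CacheConfig(dtype=torch.float16)
merged = cfg_from_yaml | defaults
assert merged.dtype is torch.float16
assert merged.mamba_dtype is torch.float32
```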
