
Commit af5528f

Merge branch 'main' into user/fanrongl/mtp3_support_for_ds32

2 parents 7611b09 + 7aeac97

File tree

15 files changed (+196, -31 lines)


ATTRIBUTIONS-CPP-aarch64.md

Lines changed: 18 additions & 0 deletions
@@ -14889,6 +14889,24 @@ Chen, Tianqi
 
 ```
 
+## Mooncake
+
+- **Repository URL**: https://github.com/kvcache-ai/Mooncake
+- **License URL**: https://github.com/kvcache-ai/Mooncake/blob/main/LICENSE-APACHE
+- **License name**: Apache 2.0
+
+### Authors
+
+© Copyright 2025, Mooncake Team.
+Copyright (c) Meta Platforms, Inc. and affiliates.
+Copyright 2024 KVCache.AI
+Ruoyu Qin
+Zheming Li
+Weiran He
+Mingxing Zhang
+Yongwei Wu
+Weimin Zheng
+Xinran Xu
 ## flashinfer
 
 ### License Text

ATTRIBUTIONS-CPP-x86_64.md

Lines changed: 18 additions & 0 deletions
@@ -14697,6 +14697,24 @@ Chen, Tianqi
 
 ```
 
+## Mooncake
+
+- **Repository URL**: https://github.com/kvcache-ai/Mooncake
+- **License URL**: https://github.com/kvcache-ai/Mooncake/blob/main/LICENSE-APACHE
+- **License name**: Apache 2.0
+
+### Authors
+
+© Copyright 2025, Mooncake Team.
+Copyright (c) Meta Platforms, Inc. and affiliates.
+Copyright 2024 KVCache.AI
+Ruoyu Qin
+Zheming Li
+Weiran He
+Mingxing Zhang
+Yongwei Wu
+Weimin Zheng
+Xinran Xu
 ## flashinfer
 
 ### License Text

docker/Dockerfile.multi

Lines changed: 8 additions & 0 deletions
@@ -44,6 +44,7 @@ COPY docker/common/install.sh \
     docker/common/install_ucx.sh \
     docker/common/install_nixl.sh \
     docker/common/install_etcd.sh \
+    docker/common/install_mooncake.sh \
     ./
 
 RUN GITHUB_MIRROR=${GITHUB_MIRROR} \
@@ -103,6 +104,13 @@ COPY docker/common/install_triton.sh \
 
 RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_triton.sh && rm install_triton.sh
 
+# Install Mooncake, after triton handles boost requirement
+RUN if [ -f /etc/redhat-release ]; then \
+        echo "Rocky8 detected, skipping mooncake installation"; \
+    else \
+        bash ./install_mooncake.sh; \
+    fi && rm install_mooncake.sh
+
 FROM ${DEVEL_IMAGE} AS wheel
 WORKDIR /src/tensorrt_llm
 COPY benchmarks benchmarks

docker/common/install_mooncake.sh

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+#!/bin/bash
+set -ex
+
+MOONCAKE_VERSION="v0.3.6.post1"
+MOONCAKE_REPO="https://github.com/kvcache-ai/Mooncake.git"
+MOONCAKE_INSTALL_PATH="/usr/local/Mooncake"
+
+apt-get update
+
+# https://kvcache-ai.github.io/Mooncake/getting_started/build.html
+# libboost-all-dev is removed because it will install a duplicated MPI library
+# triton also installed boost so the requirement is already met
+apt-get install -y --no-install-recommends \
+    build-essential \
+    libibverbs-dev \
+    libgoogle-glog-dev \
+    libgtest-dev \
+    libjsoncpp-dev \
+    libnuma-dev \
+    libunwind-dev \
+    libssl-dev \
+    libyaml-cpp-dev \
+    libcurl4-openssl-dev \
+    libhiredis-dev \
+    pkg-config \
+    patchelf
+
+mkdir -p /third-party-source
+
+git clone --depth 1 https://github.com/alibaba/yalantinglibs.git
+tar -czf /third-party-source/yalantinglibs.tar.gz yalantinglibs
+cd yalantinglibs
+mkdir build && cd build
+cmake .. -DBUILD_EXAMPLES=OFF -DBUILD_BENCHMARK=OFF -DBUILD_UNIT_TESTS=OFF
+make -j
+make install
+cd ../..
+rm -rf yalantinglibs
+
+git clone --depth 1 -b ${MOONCAKE_VERSION} ${MOONCAKE_REPO}
+tar -czf /third-party-source/Mooncake-${MOONCAKE_VERSION}.tar.gz Mooncake
+cd Mooncake
+git submodule update --init --recursive --depth 1
+mkdir build && cd build
+cmake .. -DUSE_CUDA=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=${MOONCAKE_INSTALL_PATH}
+make -j
+make install
+cd ../..
+rm -rf Mooncake
+
+echo "export LD_LIBRARY_PATH=${MOONCAKE_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"

examples/auto_deploy/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@ benchmark_results.json
 *.png
 # ignore config files that users might put here for debugging
 *.yaml
+!nano_v3.yaml

examples/auto_deploy/nano_v3.yaml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+runtime: trtllm
+compile_backend: torch-cudagraph
+max_batch_size: 384
+max_seq_len: 65536 # tunable
+enable_chunked_prefill: true
+attn_backend: flashinfer
+model_factory: AutoModelForCausalLM
+skip_loading_weights: false
+free_mem_ratio: 0.9
+cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
+kv_cache_config:
+  # disable kv_cache reuse since not supported for hybrid/ssm models
+  enable_block_reuse: false
+transforms:
+  detect_sharding:
+    sharding_source: ['factory', 'heuristic']
+    sharding_dims: ['ep', 'bmm']
+  # tunable mamba cache dtype
+  # --> use float32 for accuracy and default (null) for speed
+  insert_cached_ssm_attention:
+    cache_config:
+      # mamba_dtype: float32
+      mamba_dtype: null

examples/models/core/llama/README.md

Lines changed: 4 additions & 3 deletions
@@ -1540,14 +1540,15 @@ bash -c 'python ./examples/mmlu.py --test_trt_llm \
 ## Run LLaMa-3.3 70B Model on PyTorch Backend
 This section provides the steps to run LLaMa-3.3 70B model FP8 precision on PyTorch backend by launching TensorRT LLM server and run performance benchmarks.
 
-
 ### Prepare TensorRT LLM extra configs
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
-stream_interval: 2
+stream_interval: 10
 cuda_graph_config:
   max_batch_size: 1024
   enable_padding: true
+kv_cache_config:
+  dtype: fp8
 EOF
 ```
 Explanation:
@@ -1581,5 +1582,5 @@ python -m tensorrt_llm.serve.scripts.benchmark_serving \
 --random-input-len 1024 \
 --random-output-len 2048 \
 --random-ids \
---max-concurrency 1024 \
+--max-concurrency 1024
 ```

examples/models/core/llama4/README.md

Lines changed: 3 additions & 3 deletions
@@ -27,7 +27,7 @@ This section provides the steps to launch TensorRT LLM server and run performanc
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 enable_attention_dp: true
-stream_interval: 2
+stream_interval: 10
 cuda_graph_config:
   max_batch_size: 512
   enable_padding: true
@@ -78,7 +78,7 @@ python -m tensorrt_llm.serve.scripts.benchmark_serving \
 cat >./extra-llm-api-config.yml <<EOF
 enable_attention_dp: false
 enable_min_latency: true
-stream_interval: 2
+stream_interval: 10
 cuda_graph_config:
   max_batch_size: 8
   enable_padding: true
@@ -126,7 +126,7 @@
 #### 1. Prepare TensorRT LLM extra configs
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
-stream_interval: 2
+stream_interval: 10
 cuda_graph_config:
   max_batch_size: 1024
   enable_padding: true

jenkins/current_image_tags.properties

Lines changed: 4 additions & 4 deletions
@@ -13,7 +13,7 @@
 # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
 IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
 
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511021230-8838
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511021230-8838
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511021230-8838
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511021230-8838
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511110140-8447
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511110140-8447
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511110140-8447
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511110140-8447

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 33 additions & 5 deletions
@@ -10,10 +10,10 @@
 """
 
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
 from typing import Dict, List, Literal, Optional, Protocol, Sequence, Set, Tuple, Type, Union
 
 import torch
+from pydantic import BaseModel, ConfigDict, Field, field_validator
 from torch._ops import OpOverloadPacket
 from torch.fx import Node
 from torch.types import Number
@@ -24,11 +24,39 @@
 Constant = Union[int, float, str, None]
 
 
-@dataclass
-class CacheConfig:
-    """A dataclass to hold information how to configure the cache."""
+class CacheConfig(BaseModel):
+    """Cache configuration for attention-related dtypes."""
 
-    dtype: Optional[torch.dtype] = None
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+        extra="forbid",
+    )
+
+    dtype: Optional[torch.dtype] = Field(default=None, description="KV cache dtype.")
+    mamba_dtype: Optional[torch.dtype] = Field(default=None, description="Mamba cache dtype.")
+
+    @field_validator("dtype", "mamba_dtype", mode="before")
+    @classmethod
+    def _coerce_dtype(cls, value):
+        if value is None or isinstance(value, torch.dtype):
+            return value
+        if isinstance(value, str):
+            dtype = getattr(torch, value, None)
+            assert isinstance(dtype, torch.dtype), f"Invalid {dtype=}"
+            return dtype
+        return value
+
+    def __or__(self, other: "CacheConfig") -> "CacheConfig":
+        """Combine two CacheConfig objects field-wise using Python's `or` semantics.
+
+        For each field, selects the first non-None value between `self` and `other`.
+        """
+        if not isinstance(other, CacheConfig):
+            raise NotImplementedError(f"Cannot combine CacheConfig with {type(other)}")
+        merged_kwargs = {}
+        for field_name in type(self).model_fields.keys():
+            merged_kwargs[field_name] = getattr(self, field_name) or getattr(other, field_name)
+        return CacheConfig(**merged_kwargs)
 
 
 class SequenceInfo:
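For reference, the refactored CacheConfig can be exercised as in the sketch below. It is a minimal illustration only: the import path is assumed to mirror the file modified above, it relies solely on behavior visible in this diff (the mode="before" string-to-dtype coercion and the field-wise `|` merge), and it requires torch plus pydantic v2.

```python
# Minimal usage sketch of the pydantic-based CacheConfig introduced in this diff.
# Assumption: the module path matches the file above; requires torch and pydantic v2.
import torch

from tensorrt_llm._torch.auto_deploy.custom_ops.attention_interface import CacheConfig

# Strings (e.g. the mamba_dtype value from nano_v3.yaml) are coerced to torch
# dtypes by the mode="before" field validator.
cfg_from_yaml = CacheConfig(mamba_dtype="float32")
assert cfg_from_yaml.mamba_dtype is torch.float32

# `|` merges field-wise: the first non-None value per field wins, so explicit
# settings can be layered on top of defaults.
defaults = CacheConfig(dtype=torch.float16)
merged = cfg_from_yaml | defaults
assert merged.dtype is torch.float16
assert merged.mamba_dtype is torch.float32
```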
