Skip to content

Commit 70ffb31

Browse files
committed
Merge branch 'develop'
2 parents 91856ac + b7ce463 commit 70ffb31

File tree

311 files changed

+12710
-4647
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

311 files changed

+12710
-4647
lines changed

ucm/csrc/ucmnfsstore/.clang-format renamed to .clang-format

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
BasedOnStyle: LLVM
22
IndentWidth: 4
3-
ColumnLimit: 120
3+
ColumnLimit: 100
44
AccessModifierOffset: -4
55
AlwaysBreakTemplateDeclarations: true
66
PointerAlignment: Left

.github/actionlint.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
self-hosted-runner:
2+
# Labels of self-hosted runner in array of strings.
3+
labels:
4+
- default
5+
- arc-runner-ucm

.github/workflows/e2e_test.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
name: offline_inference_test
2+
on:
3+
workflow_dispatch:
4+
5+
jobs:
6+
offline-inference:
7+
runs-on: arc-runner-ucm
8+
steps:
9+
- uses: actions/checkout@v4
10+
- run: nvidia-smi
11+
- name: Run offline_inference in container
12+
run: |
13+
docker run --rm \
14+
--gpus all \
15+
-v ${{ github.workspace }}:/workspace/unified-cache-management \
16+
-v /home_116/models/Qwen2.5-1.5B-Instruct:/home/models/Qwen2.5-1.5B-Instruct \
17+
-w /workspace/unified-cache-management \
18+
--entrypoint /bin/bash \
19+
vllm/vllm-openai:v0.9.2 \
20+
-c "
21+
set -euo pipefail
22+
export PLATFORM=cuda
23+
export MODEL_PATH=/home/models/Qwen2.5-1.5B-Instruct
24+
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
25+
pip install -v -e . --no-build-isolation
26+
cd \$(pip show vllm | grep Location | awk '{print \$2}') &&
27+
git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
28+
cd /workspace/unified-cache-management
29+
python3 examples/offline_inference.py
30+
"

.github/workflows/ucmnfsstore-ut.yml renamed to .github/workflows/ucmstore.yml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# This starter workflow is for a CMake project running on a single platform. There is a different starter workflow if you need cross-platform coverage.
22
# See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-multi-platform.yml
3-
name: ucmnfsstore-ut
3+
name: ucmstore
44

55
on:
66
push:
7-
branches: [ "dev*", "main", "*release" ]
7+
branches: [ "*" ]
88
pull_request:
99
branches: [ "dev*", "main", "*release" ]
1010

@@ -24,7 +24,7 @@ jobs:
2424

2525
- name: Install googletest
2626
run: |
27-
git clone https://github.com/google/googletest.git --depth=1 --branch=release-1.11.0
27+
git clone https://github.com/google/googletest.git --depth=1 --branch=v1.12.0
2828
cd googletest
2929
mkdir build && cd build
3030
cmake -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_C_FLAGS="-fPIC" -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=True ..
@@ -41,8 +41,7 @@ jobs:
4141
- name: Configure CMake
4242
# Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
4343
# See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
44-
working-directory: ${{github.workspace}}/ucm/csrc/ucmnfsstore
45-
run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBUILD_TESTS=ON -DCOVERAGE_ENABLED=ON -DDOWNLOAD_DEPENDENCE=ON -DRUNTIME_ENVIRONMENT=simu
44+
run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBUILD_UCM_SPARSE=OFF -DBUILD_UNIT_TESTS=ON -DRUNTIME_ENVIRONMENT=simu
4645

4746
- name: Build
4847
# Build your program with the given configuration

.github/workflows/unifiedcache_test.yml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ on:
1313
- '*release'
1414

1515
jobs:
16+
# gpu-test:
17+
# uses: ./.github/workflows/e2e_test.yml
18+
1619
call-lint:
1720
uses: ./.github/workflows/pre-commit.yml
1821

@@ -43,10 +46,10 @@ jobs:
4346
--entrypoint /bin/bash \
4447
vllm/vllm-openai:v0.9.2 \
4548
-c "
49+
set -euo pipefail
4650
pip install -v -e . --no-build-isolation
4751
cd \$(pip show vllm | grep Location | awk '{print \$2}') &&
48-
git apply /workspace/unified-cache-management/ucm/patch/0.9.2/vllm-adapt.patch &&
49-
git apply /workspace/unified-cache-management/ucm/patch/0.9.2/vllm-adapt-sparse.patch
52+
git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
5053
cd /workspace/unified-cache-management
5154
python3 -m unittest discover -s test
52-
"
55+
"

.readthedocs.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Read the Docs configuration file
2+
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3+
4+
# Required
5+
version: 2
6+
7+
# Set the OS, Python version, and other tools you might need
8+
build:
9+
os: ubuntu-22.04
10+
tools:
11+
python: "3.12"
12+
13+
# Build documentation in the "docs/" directory with Sphinx
14+
sphinx:
15+
configuration: docs/source/conf.py
16+
17+
# Optional but recommended:
18+
# declare the Python requirements required to build your documentation
19+
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
20+
python:
21+
install:
22+
- requirements: docs/requirements-docs.txt
23+

CMakeLists.txt

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
cmake_minimum_required(VERSION 3.18)
2+
project(unified-cache-management VERSION 1.0.0 LANGUAGES CXX)
3+
4+
set(CMAKE_CXX_STANDARD 17)
5+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
6+
set(CMAKE_CXX_EXTENSIONS OFF)
7+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
8+
9+
option(BUILD_UCM_STORE "build ucm store module." ON)
10+
option(BUILD_UCM_SPARSE "build ucm sparse module." ON)
11+
option(BUILD_UNIT_TESTS "build all unit test suits." OFF)
12+
set(RUNTIME_ENVIRONMENT "simu" CACHE STRING "runtime: simu, ascend or cuda.")
13+
14+
execute_process(COMMAND git rev-parse HEAD OUTPUT_VARIABLE UCM_COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE)
15+
add_compile_definitions(UCM_PROJECT_NAME="${PROJECT_NAME}")
16+
add_compile_definitions(UCM_PROJECT_VERSION="${PROJECT_VERSION}")
17+
add_compile_definitions(UCM_COMMIT_ID="${UCM_COMMIT_ID}")
18+
add_compile_definitions(UCM_BUILD_TYPE="${CMAKE_BUILD_TYPE}")
19+
20+
set(CMAKE_SKIP_RPATH TRUE)
21+
set(FLAGS_PUBLIC "-Wall -Werror -fPIC -Wl,-z,relro,-z,now")
22+
set(FLAGS_DEBUG "-O0 -g")
23+
set(FLAGS_RELEASE "-O3 -D_FORTIFY_SOURCE=2")
24+
string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWER)
25+
if(CMAKE_BUILD_TYPE_LOWER STREQUAL "debug")
26+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS_PUBLIC} ${FLAGS_DEBUG}")
27+
else()
28+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS_PUBLIC} ${FLAGS_RELEASE}")
29+
endif()
30+
if(BUILD_UNIT_TESTS)
31+
enable_testing()
32+
endif()
33+
34+
add_subdirectory(ucm)
35+
if(BUILD_UNIT_TESTS)
36+
add_subdirectory(test)
37+
endif()

MANIFEST.in

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
recursive-include unifiedcache/csrc *
1+
recursive-include ucm/csrc *
2+
exclude CMakeLists.txt

README.md

Lines changed: 52 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,75 +1,91 @@
11
<p align="center">
22
<picture>
3-
<source media="(prefers-color-scheme: dark)" srcset="docs/source/logos/UCM.png">
3+
<source media="(prefers-color-scheme: dark)" srcset="docs/source/logos/UCM-dark.png">
44
<img alt="UCM" src="docs/source/logos/UCM-light.png" width=50%>
55
</picture>
66
</p>
77

88
<p align="center">
9-
| <a href="docs/source/index.md"><b>Documentation</b></a> | <a href="https://github.com/ModelEngine-Group/unified-cache-management/issues/16"><b>Roadmap</b></a> |
9+
| <a href="docs/source/index.md"><b>Documentation</b></a> | <a href="https://modelengine-ai.net/#/ucm"><b>Website</b></a> | <a href="https://github.com/ModelEngine-Group/unified-cache-management/issues/78"><b>RoadMap</b></a> | <a href="https://github.com/ModelEngine-Group/unified-cache-management/blob/main/README_zh.md"><b>中文</b></a> |
1010
</p>
1111

1212
---
1313

14-
*Latest News* 🔥
15-
- [2025/08/01] We are excited to announce the alpha release of Unified Cache Manager.
16-
17-
---
14+
## Overview
1815

19-
## Performance
20-
nfs connector has reached about 4x TTFT accelerate.
16+
The core principle of Unified Cache Manager (UCM) is to persist the LLM KVCache and replace redundant computations
17+
through multiple retrieval mechanisms. UCM not only supports prefix caching but also offers a variety of training-free
18+
sparse attention retrieval methods, delivering higher performance when handling extremely long sequence inference tasks.
19+
Additionally, UCM provides a PD disaggregation solution based on a storage-compute separation architecture, which
20+
enables more straightforward and flexible management of heterogeneous computing resources. When integrated with vLLM,
21+
UCM achieves a 3-10x reduction in inference latency across various scenarios, including multi-turn dialogue and
22+
long-context reasoning tasks.
2123

22-
![perf](docs/source/images/nfs_performance.png)
24+
### Motivation
2325

24-
## Overview
26+
With the increase of model size, the KV cache became larger and sparser, especially for long sequence requests. To
27+
reduce the GPU memory used, offload full KV to external storage and only keep partial or compressed KV in GPU memory
28+
became the popular direction. This can also reduce the GPU calculation, increase the sequence length and batch size of
29+
decoding.
2530

26-
### Motivation
27-
With the increase of model size, the KV cache became larger and sparser, especially for long sequence requests. To reduce the GPU memory used, offload full KV to external storage and only keep partial or compressed KV in GPU memory became the popular direction. This can also reduce the GPU calculation, increase the sequence length and batch size of decoding.
31+
Sparse KV caches admit many different designs. Recent papers point out that no single approach fits all scenarios
32+
and all models, so it is better to build a common framework into which different sparse algorithms can be plugged,
33+
just like the KV connector for PC.
2834

29-
Sparse KV cache have many different choices. Recently paper point out that there is no common way can fit all scenarios and all models. So better to build a common framework then different sparse algorithms can be plugin to it like KV connector for PC.
35+
![architecture.png](./docs/source/_static/images/idea.png)
3036

31-
### Proposed Change
32-
![idea](docs/source/images/idea.png)
37+
All gray boxes in the diagram represent existing classes in vLLM version 0.9.2, while the green boxes indicate newly added components by UCM.
38+
The light green boxes demonstrate potential future subclass extensions based on this framework.
3339

34-
All gray boxes are current classes in 0.9.2. Green boxes are proposed to add. Light green ones show out the future sub classes base on this framework.
40+
UcmSparseBase is the base class for the different sparse algorithms. Just like the KV connector design, it hooks a few places in
41+
scheduler and layer.py to do additional loading, dumping, and computation of sparse KV blocks.
3542

36-
SpareKVBase is the base class of different algorithms. Just like KV connector design, it will hook few places of scheduler and layer.py to allow sparse algorithms do additional load, dump and calculate sparse KV blocks.
43+
SparseKVManager allows users to define custom KV block allocations for different algorithms.
44+
To keep all implementations unified under the SparseKVBase framework, the system calls the SparseKVBase base class,
45+
while the actual implementation occurs in subclasses of sparse algorithms.
3746

38-
SparseKVManager provide different KV block allocation methods for different algorithms. To keep all implementation under SpareKVBase, it will call SparseKVBase and real implementation will happen in sub class of sparse algorithms.
47+
KVStoreBase helps decouple sparse algorithms from external storage. It defines methods for communicating with external storage,
48+
enabling any sparse algorithm to work seamlessly with any external storage system.
49+
The core concept here involves identifying blocks through IDs and offsets.
50+
This approach is not only suitable for sparse scenarios but also naturally accommodates prefix caching.
51+
The KVStoreConnector links it with the current KVConnectorBase_V1 to provide PC (Prefix Caching) functionality.
52+
For example, NFSStore serves as a reference implementation that provides the capability to store KVCache
53+
in either a local filesystem for single-machine scenarios or through NFS mount points in multi-server environments.
3954

40-
KVStoreBase helps decoupling sparse algorithms and external storage. It defined the methods how to talk to external storage, so any sparse algorithms can work with any external storage. Concepts here is blocks identify by ID with offset. This is not only for sparse but also naturally for prefix cache also. KVStoreConnector connect it with current KVConnectorBase_V1 to provide PC function.
55+
---
4156

42-
NFSStore is sample implementation here provide ability to store blocks in local file system or NFS mount point in multi-server case.
57+
## Support Features
4358

44-
LocalCachedStore can reference any store to provide local DRAM read cache layer.
59+
- Prefix Cache
60+
- Cache Blend
61+
- Model Window Extrapolation
62+
- Prefill Offload
63+
- Sparse Attention
64+
- Sparse Attention Offload
65+
- Heterogeneous PD Disaggregation
4566

4667
---
4768

4869
## Quick Start
49-
please refer to [installation](docs/source/getting-started/installation.md) and [example](docs/source/getting-started/example/dram_conn.md)
50-
51-
---
5270

53-
## Support Features
54-
please refer to [features matrix](docs/source/feature/support.md).
71+
Please refer to the [Quick Start](./docs/source/getting-started/quick_start.md) guide.
5572

5673
---
5774

58-
## Branch Policy
59-
Unified Cache has main branch, develop branch and release branch.
60-
- **main**: main is the most stable branch. Only the release branch can be integrated. The tag is attached to the main branch.
61-
- **develop**: develop is a daily development branch, new features will be merged in this branch.
62-
- **x.x.x-release**: each time we decide to release a new version, we checkout a release branch and test on this branch, this branch only accepted [bugfix]. When the branch passed test, we merge the branch into develop and main, tag the corresponding x.x.x tag based on the main branch, and finish the release.
75+
## Branch
6376

64-
Usually, a commit should be ONLY first merged in the develop branch.
77+
| **Branch** | Status | vLLM version |
78+
|-----------:|-----------:|-------------:|
79+
| main | Maintained | v0.9.2 |
80+
| develop | Maintained | v0.9.2 |
6581

6682
---
6783

68-
## Contributing
69-
When you want to contribute some features to the Unified Cache Community, first fork a branch (usually develop) to your own repository, then commit in your own repository, and finally submit a pull request to the community.
84+
## Contact Us
7085

71-
---
86+
For technical questions and feature requests, please use
87+
GitHub [Issues](https://github.com/ModelEngine-Group/unified-cache-management/issues).
7288

7389
## License
7490

75-
UCM is licensed under the MIT with additional conditions. Please read the [LICENSE](./LICENSE) file for details.
91+
UCM is licensed under the MIT with additional conditions. Please read the [LICENSE](./LICENSE) file for details.

README_zh.md

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
<p align="center">
2+
<picture>
3+
<source media="(prefers-color-scheme: dark)" srcset="docs/source/logos/UCM-dark.png">
4+
<img alt="UCM" src="docs/source/logos/UCM-light.png" width=50%>
5+
</picture>
6+
</p>
7+
8+
<p align="center">
9+
| <a href="docs/source/index.md"><b>文档</b></a> | <a href="https://modelengine-ai.net/#/ucm"><b>网站</b></a> | <a href="https://github.com/ModelEngine-Group/unified-cache-management/issues/78"><b>发展路线图</b></a> | <a href="https://github.com/ModelEngine-Group/unified-cache-management"><b>EN</b></a> |
10+
</p>
11+
12+
---
13+
14+
## 概述
15+
16+
统一缓存管理器(Unified Cache Management, UCM)的核心原理是持久化 LLM 的 KVCache,并通过多种检索机制替代冗余计算。UCM 不仅支持前缀缓存(prefix cache, PC),还提供了多种无需训练的稀疏注意力检索方法,在处理极长序列推理任务时达到更高性能。此外,UCM 基于存算分离架构提供了 PD 分离方案,使得异构计算资源的管理更加简单灵活。与 vLLM 集成后,UCM 在多轮对话和长上下文推理等多种场景下可将推理延迟降低 3–10 倍。
17+
18+
---
19+
20+
## 动机
21+
22+
随着模型尺寸的不断增长,KV 缓存也变得越来越大,且越来越稀疏,对于长序列请求来说尤为明显。为了减小 GPU 显存的使用,主流的方向是将全量的 KV 数据卸载到外部存储中,而在 GPU 显存中只保留部分或者被压缩的 KV 数据。这同时可以减小 GPU 的运算量,在解码时增加最大生成序列长度和批大小。
23+
24+
有许多种不同的稀疏 KV 缓存的实现。最新的论文指出,能够最好地适配所有场景和所有模型的方法是不存在的。因此,更好的做法是搭建一套公共的框架,并在此之上接入不同的稀疏化算法,就像 KV 连接器和 PC 一样。
25+
26+
![architecture.png](./docs/source/_static/images/idea.png)
27+
28+
图中所有灰色框代表vLLM 0.9.2版本中的现有类,绿色框则代表UCM新增组件。浅绿色框展示了基于此框架未来规划扩展的子类。
29+
30+
UcmSparseBase是不同稀疏算法的基类。类似于KV连接器的设计,它将在scheduler和layer.py中的关键位置植入hook点,用于执行稀疏KVCache block的加载、转储和计算操作。
31+
32+
SparseKVManager允许用户针对不同算法自定义KVCache block的分配策略。为了将所有实现统一在SparseKVBase框架下,系统会调用SparseKVBase基类,而具体实现则由稀疏算法的子类完成。
33+
34+
KVStoreBase有助于实现稀疏算法与外部存储的解耦。它定义了与外部存储的通信方法,使得任何稀疏算法都能与任意外部存储系统无缝协作。其核心机制是通过ID和偏移量来标识数据块。这种方法不仅适用于稀疏场景,还能天然支持前缀缓存。KVStoreConnector将其与vLLM的KVConnectorBase_V1连接以提供前缀缓存功能。例如,NFSStore作为参考实现,提供了在单机本地文件系统或多服务器环境下通过NFS挂载点存储KVCache的能力。
35+
36+
---
37+
38+
## 支持特性
39+
- 前缀匹配
40+
- 缓存融合
41+
- 模型窗口外推
42+
- 预填充卸载
43+
- 稀疏注意力
44+
- 稀疏注意力卸载
45+
- 异构PD分离
46+
47+
---
48+
49+
## 快速开始
50+
51+
请参考 [快速开始](./docs/source/getting-started/quick_start.md).
52+
53+
---
54+
55+
## 分支
56+
57+
| **分支** | 状态 | vLLM 版本 |
58+
|-----------:|-----------:|-------------:|
59+
| main | 维护中 | v0.9.2 |
60+
| develop | 维护中 | v0.9.2 |
61+
62+
---
63+
64+
## 联系我们
65+
如需技术咨询或功能请求,请提交 GitHub [Issues](https://github.com/ModelEngine-Group/unified-cache-management/issues).
66+
67+
## 许可协议
68+
69+
UCM 采用 MIT 许可证(附加额外条件),详情请参阅 [LICENSE](./LICENSE) 文件。

0 commit comments

Comments
 (0)