<p align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="docs/source/logos/UCM-dark.png">
    <img alt="UCM" src="docs/source/logos/UCM-light.png" width=50%>
  </picture>
</p>

<p align="center">
| <a href="docs/source/index.md"><b>Documentation</b></a> | <a href="https://modelengine-ai.net/#/ucm"><b>Website</b></a> | <a href="https://github.com/ModelEngine-Group/unified-cache-management/issues/78"><b>Roadmap</b></a> | <a href="https://github.com/ModelEngine-Group/unified-cache-management/blob/main/README_zh.md"><b>中文</b></a> |
</p>

---

## Overview

The core principle of Unified Cache Manager (UCM) is to persist the LLM KVCache and replace redundant computation
through multiple retrieval mechanisms. UCM not only supports prefix caching but also offers a variety of training-free
sparse attention retrieval methods, delivering higher performance when handling extremely long sequence inference
tasks. Additionally, UCM provides a PD disaggregation solution based on a storage-compute separation architecture,
which enables more straightforward and flexible management of heterogeneous computing resources. When integrated with
vLLM, UCM achieves a 3-10x reduction in inference latency across various scenarios, including multi-turn dialogue and
long-context reasoning tasks.

### Motivation

As model sizes grow, the KV cache becomes larger and sparser, especially for long-sequence requests. To reduce GPU
memory usage, offloading the full KV cache to external storage and keeping only partial or compressed KV in GPU memory
has become a popular direction. This also reduces GPU computation and allows longer sequences and larger batch sizes
during decoding.

There are many possible choices for a sparse KV cache. Recent papers point out that no single method fits all
scenarios and all models, so it is better to build a common framework into which different sparse algorithms can be
plugged, much like the KV connector design used for prefix caching (PC).

*(UCM architecture diagram)*

All gray boxes in the diagram represent existing classes in vLLM version 0.9.2, while the green boxes indicate
components newly added by UCM. The light green boxes show potential future subclass extensions of this framework.

UcmSparseBase is the base class for the different sparse algorithms. Just like the KV connector design, it hooks into
a few places in the scheduler and layer.py so that sparse algorithms can additionally load, dump, and compute sparse
KV blocks.

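To make the hook pattern concrete, here is a minimal sketch of what a sparse-algorithm plugin built on this base class
could look like. Every class, method, and parameter name below is an illustrative assumption rather than the real UCM
API; see the source tree for the actual interfaces.

```python
# Illustrative sketch only -- names are assumptions, NOT the actual UCM API.
from abc import ABC, abstractmethod


class UcmSparseBase(ABC):
    """Simplified stand-in for the base class hooked into the scheduler and layer.py."""

    @abstractmethod
    def select_blocks(self, request_id: str, num_blocks: int) -> list[int]:
        """Scheduler hook: choose which KV block indices stay on the GPU."""

    @abstractmethod
    def load(self, layer_name: str, block_ids: list[int]) -> None:
        """Layer hook: fetch the selected blocks before attention runs."""

    @abstractmethod
    def dump(self, layer_name: str, block_ids: list[int]) -> None:
        """Layer hook: persist newly written blocks after attention runs."""


class SlidingWindowSparse(UcmSparseBase):
    """Toy algorithm: keep only the most recent `window` blocks resident."""

    def __init__(self, window: int = 64):
        self.window = window

    def select_blocks(self, request_id: str, num_blocks: int) -> list[int]:
        start = max(0, num_blocks - self.window)
        return list(range(start, num_blocks))

    def load(self, layer_name: str, block_ids: list[int]) -> None:
        pass  # a real plugin would read these blocks from the KV store

    def dump(self, layer_name: str, block_ids: list[int]) -> None:
        pass  # a real plugin would write these blocks to the KV store
```
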
SparseKVManager allows users to define custom KV block allocations for different algorithms. To keep all
implementations unified under the UcmSparseBase framework, the system calls the UcmSparseBase interface, while the
actual allocation logic lives in the sparse-algorithm subclasses.

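As a sketch of that dispatch pattern (reusing the hypothetical classes from the previous example, so the names are
again assumptions): the manager holds a reference typed against the base class and delegates to whichever concrete
algorithm was configured.

```python
# Illustrative only: allocation is delegated through the base-class interface.
class SparseKVManager:
    def __init__(self, algorithm: UcmSparseBase):
        self.algorithm = algorithm  # any UcmSparseBase subclass

    def blocks_to_allocate(self, request_id: str, num_blocks: int) -> list[int]:
        # The manager never inspects the concrete type; the policy
        # (sliding window, top-k, ...) lives entirely in the subclass.
        return self.algorithm.select_blocks(request_id, num_blocks)


manager = SparseKVManager(SlidingWindowSparse(window=16))
print(manager.blocks_to_allocate("req-0", num_blocks=100))  # -> [84, ..., 99]
```
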
KVStoreBase helps decouple sparse algorithms from external storage. It defines the methods for communicating with
external storage, enabling any sparse algorithm to work seamlessly with any external storage system. The core concept
here is identifying blocks through IDs and offsets. This approach is not only suitable for sparse scenarios but also
naturally accommodates prefix caching. The KVStoreConnector links it with the current KVConnectorBase_V1 to provide
PC (Prefix Caching) functionality. For example, NFSStore serves as a reference implementation that can store KVCache
either in a local filesystem for single-machine scenarios or through NFS mount points in multi-server environments.

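The following toy store illustrates the ID-plus-offset addressing idea. It is a simplified stand-in, not the real
KVStoreBase or NFSStore interface, and every name in it is hypothetical.

```python
# Toy illustration of ID + offset block addressing; not the real KVStoreBase API.
import os


class FileKVStore:
    """Stores each KV block as one file; offsets address tensors inside it."""

    def __init__(self, root: str):
        self.root = root
        os.makedirs(root, exist_ok=True)

    def _path(self, block_id: str) -> str:
        return os.path.join(self.root, block_id)

    def dump(self, block_id: str, offset: int, data: bytes) -> None:
        path = self._path(block_id)
        mode = "r+b" if os.path.exists(path) else "w+b"  # update in place or create
        with open(path, mode) as f:
            f.seek(offset)
            f.write(data)

    def load(self, block_id: str, offset: int, length: int) -> bytes:
        with open(self._path(block_id), "rb") as f:
            f.seek(offset)
            return f.read(length)

    def lookup(self, block_id: str) -> bool:
        """Prefix caching reduces to asking whether a block already exists."""
        return os.path.exists(self._path(block_id))
```

Because a block is addressed purely by its ID (for example, a hash of the token prefix) plus an offset, the same store
can back both prefix caching and sparse-attention retrieval.
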
---

## Supported Features

- Prefix Cache
- Cache Blend
- Model Window Extrapolation
- Prefill Offload
- Sparse Attention
- Sparse Attention Offload
- Heterogeneous PD Disaggregation

---

## Quick Start

Please refer to the [Quick Start](./docs/source/getting-started/quick_start.md) guide.

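For a flavor of what an integration looks like, here is a hedged sketch of offline inference in vLLM with a KV
connector configured. The connector name and the extra-config key/value are placeholders, not guaranteed UCM values;
take the exact configuration from the Quick Start guide.

```python
# Hedged sketch: the connector name and extra-config keys are placeholders.
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

ktc = KVTransferConfig(
    kv_connector="UnifiedCacheConnectorV1",  # placeholder name
    kv_role="kv_both",  # connector both saves and loads KV
    kv_connector_extra_config={"storage_backend": "nfs"},  # placeholder key/value
)

llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", kv_transfer_config=ktc)
outputs = llm.generate(["Hello, UCM!"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```
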
---

## Branches

| **Branch** | Status | vLLM version |
|-----------:|-----------:|-------------:|
| main | Maintained | v0.9.2 |
| develop | Maintained | v0.9.2 |

---

## Contact Us

For technical questions and feature requests, please use
GitHub [Issues](https://github.com/ModelEngine-Group/unified-cache-management/issues).

## License

UCM is licensed under the MIT License with additional conditions. Please read the [LICENSE](./LICENSE) file for details.