# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Unit tests for CUDA kernels in cache_kernels.cu."""

import pytest
import torch

# cache_ops has moved between vllm versions; try both known locations
# before skipping the whole module.
try:
    from vllm import cache_ops
except ImportError:
    try:
        from vllm.ops import cache_ops
    except ImportError:
        pytest.skip("Could not import vllm cache_ops. Skipping test.",
                    allow_module_level=True)


@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Need CUDA device")
def test_gather_cache_oob_issue_27909():
    """Regression test for an OOB read in gather_and_maybe_dequant_cache
    (Issue #27909).

    Constructs the boundary case identified in the issue: a ``seq_starts``
    value whose block offset (``seq_starts // block_size``) indexes one past
    the end of the block-table row. Run under compute-sanitizer to observe
    the out-of-bounds read; completing the call and the synchronize without
    a CUDA error is the pass condition.
    """
    batch_size = 1
    block_size = 64
    entry_size = 128

    # The block-table row has only 2 entries (valid column indices 0 and 1).
    block_table = torch.tensor([[1, 2]], dtype=torch.int32, device="cuda")

    # Offset = 128 // block_size = 128 // 64 = 2, so the kernel reads
    # block_table[0, 2] -- one past the end of the 2-entry row.
    seq_starts = torch.tensor([128], dtype=torch.int32, device="cuda")

    seq_len = 1
    # Cumulative sequence lengths, length batch_size + 1: [0, 1].
    cu_seq_lens = torch.tensor([0, seq_len], dtype=torch.int32, device="cuda")

    # src_cache layout: [num_blocks, block_size, entry_size].
    num_blocks = 5
    src_cache = torch.randn(
        (num_blocks, block_size, entry_size),
        dtype=torch.float16,
        device="cuda",
    )

    # Destination buffer for the gathered tokens: [total_tokens, entry_size].
    dst = torch.empty(
        (seq_len, entry_size),
        dtype=torch.float16,
        device="cuda",
    )

    scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")

    # Invoke the C++/CUDA kernel under test.
    cache_ops.gather_and_maybe_dequant_cache(
        src_cache,
        dst,
        block_table,
        cu_seq_lens,
        batch_size,
        "auto",  # kv_cache_dtype
        scale,
        seq_starts,
    )

    # Force any asynchronous CUDA error (e.g. from the OOB access) to
    # surface here rather than in a later, unrelated call. Reaching this
    # point without an exception is the pass condition; the former
    # `assert True` was a no-op and has been removed.
    torch.cuda.synchronize()

if __name__ == "__main__":
    # Allow running this test file directly: `python <this_file>`.
    pytest.main([__file__])