Commit cdde15b

[TRTLLM-8540][feat] Add support for disagg in DSv3.2 (#8735)
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
1 parent 264d38e commit cdde15b

24 files changed: +1131 -588 lines changed

cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h

Lines changed: 2 additions & 1 deletion
@@ -269,7 +269,8 @@ class CacheTransceiver : public BaseCacheTransceiver
     std::unique_ptr<executor::kv_cache::CacheState> mCacheState;
     std::unique_ptr<executor::kv_cache::ConnectionManager> mManager;
     std::optional<executor::CacheTransceiverConfig> mCacheTransceiverConfig;
-    std::unique_ptr<kv_cache_manager::CacheTransBufferManager> mCacheTransBufferManager;
+    std::vector<std::unique_ptr<kv_cache_manager::CacheTransBufferManager>> mCacheTransBufferManagers;
+    std::vector<kv_cache_manager::CacheTransBufferManager*> mCacheTransBufferManagerPtrs;
     // library handle to the communicator related features,
     // this is used to defer dependency resolution until needed.
     static std::mutex mDllMutex;
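
The transceiver now owns one transfer-buffer manager per cache (the regular KV pools plus, for DSv3.2, the indexer K cache) while downstream formatters keep receiving raw pointers. A minimal standalone sketch of that owning-vector-plus-view pattern, using a placeholder type rather than the real CacheTransBufferManager:

#include <memory>
#include <vector>

struct CacheTransBufferManager { /* placeholder for the real class */ };

struct CacheTransceiverSketch
{
    // Owning storage: one buffer manager per cache pool (e.g. KV pool + indexer K cache pool).
    std::vector<std::unique_ptr<CacheTransBufferManager>> mCacheTransBufferManagers;
    // Non-owning view handed to formatters that take raw pointers.
    std::vector<CacheTransBufferManager*> mCacheTransBufferManagerPtrs;

    void addManager()
    {
        mCacheTransBufferManagers.push_back(std::make_unique<CacheTransBufferManager>());
        mCacheTransBufferManagerPtrs.push_back(mCacheTransBufferManagers.back().get());
    }
};

Keeping the raw-pointer vector alongside the owning one avoids rebuilding it every time a formatter is created.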

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 53 additions & 0 deletions
@@ -595,6 +595,21 @@ class WindowBlockManager
 
     ~WindowBlockManager();
 
+    [[nodiscard]] bool isEnableIndexerKCache() const
+    {
+        return mEnableIndexerKCache;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
+    {
+        return mIndexerKCacheQuantBlockSize;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const
+    {
+        return mIndexerKCacheIndexHeadDim;
+    }
+
     void allocatePools(bool useUvm);
 
     void releasePools();
@@ -1021,6 +1036,21 @@ class BlockManager
         std::optional<kvc::BaseAgentConfig> agentConfig = std::nullopt, bool enableIndexerKCache = false,
         SizeType32 indexerKCacheQuantBlockSize = 128, SizeType32 indexerKCacheIndexHeadDim = 0);
 
+    [[nodiscard]] bool isEnableIndexerKCache() const
+    {
+        return mIsEnableIndexerKCache;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
+    {
+        return mIndexerKCacheQuantBlockSize;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const
+    {
+        return mIndexerKCacheIndexHeadDim;
+    }
+
     BlockManager(BlockManager const&) = delete;
     BlockManager& operator=(BlockManager const&) = delete;
 
@@ -1398,6 +1428,10 @@ class BlockManager
     std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex;
     // Record what sequences are currently managed by the block manager
     std::set<LlmRequest::RequestIdType> mManagedSequences;
+
+    bool mIsEnableIndexerKCache{false};
+    SizeType32 mIndexerKCacheQuantBlockSize{0};
+    SizeType32 mIndexerKCacheIndexHeadDim{0};
 };
 
 struct OffsetTableDimensions
@@ -1500,6 +1534,10 @@ class BaseKVCacheManager
 
     [[nodiscard]] virtual bool isEnableBlockReuse() const = 0;
 
+    [[nodiscard]] virtual bool isEnableIndexerKCache() const = 0;
+    [[nodiscard]] virtual SizeType32 getIndexerKCacheIndexHeadDim() const = 0;
+    [[nodiscard]] virtual SizeType32 getIndexerKCacheQuantBlockSize() const = 0;
+
     // void removeToken(SizeType32 seqSlotIdx);
     virtual void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) = 0;
 
@@ -1834,6 +1872,21 @@ class KVCacheManager : public BaseKVCacheManager
         return mEnableBlockReuse;
    }
 
+    [[nodiscard]] bool isEnableIndexerKCache() const override
+    {
+        return mBlockManager.isEnableIndexerKCache();
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const override
+    {
+        return mBlockManager.getIndexerKCacheIndexHeadDim();
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const override
+    {
+        return mBlockManager.getIndexerKCacheQuantBlockSize();
+    }
+
     void removeToken(LlmRequest::RequestIdType requestId);
     void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override;
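
These accessors let transfer code ask the cache manager whether an indexer K cache exists and how it is laid out before sizing buffers. A hypothetical consumer sketch; the reduced interface below is an assumption, only the three accessor names come from the diff:

#include <cstdint>
#include <cstdio>

using SizeType32 = std::int32_t;

// Reduced stand-in exposing only the new BaseKVCacheManager accessors.
struct KVCacheManagerView
{
    bool enableIndexerKCache{true};
    SizeType32 indexerKCacheIndexHeadDim{128};
    SizeType32 indexerKCacheQuantBlockSize{128};

    bool isEnableIndexerKCache() const { return enableIndexerKCache; }
    SizeType32 getIndexerKCacheIndexHeadDim() const { return indexerKCacheIndexHeadDim; }
    SizeType32 getIndexerKCacheQuantBlockSize() const { return indexerKCacheQuantBlockSize; }
};

// Example consumer: decide whether an extra indexer K cache buffer is needed
// and report its per-token layout.
void planTransferBuffers(KVCacheManagerView const& kvCacheManager)
{
    if (kvCacheManager.isEnableIndexerKCache())
    {
        std::printf("indexer K cache: headDim=%d quantBlockSize=%d\n",
            kvCacheManager.getIndexerKCacheIndexHeadDim(),
            kvCacheManager.getIndexerKCacheQuantBlockSize());
    }
}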

cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h

Lines changed: 21 additions & 7 deletions
@@ -73,7 +73,8 @@ class BlockRange
         BaseKVCacheManager& cacheManager, BlockKey const& lastBlockKey, int32_t indexFromEnd)
     {
 
-        auto poolNum = cacheManager.getNumPools();
+        auto poolNum = cacheManager.getBlockManager().getNumPools(
+            /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
         TLLM_CHECK_WITH_INFO(poolNum == 1, "Reuse tree is not supported for multiple pools or variable window size");
 
         auto windowSize = cacheManager.getBlockManager().getWindowSizesMetadata().begin()->first;
@@ -136,13 +137,21 @@ class BlockRange
         return blockHashesPerWindow;
     }
 
-    BlockRangeForWindow getBlockRangeForWindow(SizeType32 windowSize) const
+    BlockRangeForWindow getBlockRangeForWindow(SizeType32 windowSize, bool useIndexerKCache = false) const
     {
         TLLM_CHECK_WITH_INFO(
             mPoolsPerWindow.find(windowSize) != mPoolsPerWindow.end(), "Window size %d not found", windowSize);
         auto pool = mPoolsPerWindow.at(windowSize).front();
         auto blockIds = mBlockIdsPerWindow.at(windowSize);
-        return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), std::move(pool));
+        if (useIndexerKCache)
+        {
+            TLLM_CHECK(mIndexerKCachePool);
+            return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), mIndexerKCachePool);
+        }
+        else
+        {
+            return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), std::move(pool));
+        }
     }
 
     std::vector<SizeType32> getWindowSizes() const
@@ -167,9 +176,8 @@ class BlockRange
         , mRequestId(requestId)
         , mBlockIdsPerWindow(std::move(blockIdsPerWindow))
     {
-
-        // cacheManager.getBlockManager.getPrimaryPool(0);
-        auto poolNum = mManager->getNumPools();
+        auto poolNum = mManager->getBlockManager().getNumPools(
+            /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
         for (SizeType32 poolIdx = 0; poolIdx < poolNum; ++poolIdx)
         {
             auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
@@ -181,21 +189,27 @@ class BlockRange
         : mManager(&cacheManager)
         , mRequestId(requestId)
     {
-        auto poolNum = mManager->getNumPools();
+        auto poolNum = mManager->getBlockManager().getNumPools(
+            /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
         for (SizeType32 poolIdx = 0; poolIdx < poolNum; ++poolIdx)
        {
             auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
             mPoolsPerWindow[windowSize].push_back(cacheManager.getBlockManager().getPrimaryPool(poolIdx));
             mBlockIdsPerWindow[windowSize]
                 = cacheManager.getSequence(mRequestId).getCacheBlockIds(windowSize).at(kFIRST_AND_ONLY_BEAM);
        }
+        if (cacheManager.isEnableIndexerKCache())
+        {
+            mIndexerKCachePool = cacheManager.getIndexerKCachePool();
+        }
    }
 
private:
    BaseKVCacheManager const* mManager;
    LlmRequest::RequestIdType const mRequestId;
    std::unordered_map<SizeType32, std::vector<SizeType32>> mBlockIdsPerWindow;
    std::unordered_map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> mPoolsPerWindow;
+    runtime::ITensor::SharedPtr mIndexerKCachePool;
 
    static constexpr SizeType32 kFIRST_AND_ONLY_BEAM = 0;
    static constexpr SizeType32 kFIRST_POOL_INDEX = 0;
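
With the indexer K cache enabled, the same block ids for a window address two pools: the regular KV pool and the indexer K cache pool. The new useIndexerKCache flag picks which pool a BlockRangeForWindow walks. A simplified, self-contained sketch of that selection (types are stand-ins, not the TensorRT-LLM classes):

#include <cassert>
#include <memory>
#include <vector>

// Simplified stand-ins for the runtime tensor and per-window range types.
using TensorPtr = std::shared_ptr<std::vector<float>>;

struct BlockRangeForWindowSketch
{
    std::vector<int> blockIds;
    TensorPtr pool;
};

struct BlockRangeSketch
{
    TensorPtr kvPool;            // regular KV cache pool for this window
    TensorPtr indexerKCachePool; // extra pool present only when the indexer K cache is enabled
    std::vector<int> blockIds;

    // Mirrors the new flag: the same block ids can address either the KV pool
    // or the indexer K cache pool, so both caches move with one block layout.
    BlockRangeForWindowSketch getBlockRangeForWindow(bool useIndexerKCache = false) const
    {
        if (useIndexerKCache)
        {
            assert(indexerKCachePool); // TLLM_CHECK(mIndexerKCachePool) in the real code
            return {blockIds, indexerKCachePool};
        }
        return {blockIds, kvPool};
    }
};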

cpp/include/tensorrt_llm/executor/dataTransceiverState.h

Lines changed: 36 additions & 3 deletions
@@ -50,7 +50,8 @@ class CacheState final
 
     CacheState(ModelConfig modelConfig, runtime::WorldConfig const& worldConfig,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
-        AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableBlockReuse = false)
+        AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableBlockReuse = false,
+        bool hasIndexerKCache = false, SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig(std::move(modelConfig))
         , mParallelConfig{worldConfig.getTensorParallelism(), worldConfig.getPipelineParallelism(),
             worldConfig.getContextParallelism(), worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(),
@@ -59,34 +60,45 @@ class CacheState final
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mHasIndexerKCache = hasIndexerKCache;
+        mIndexerDimPerHead = indexerDimPerHead;
+        mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
     }
 
     CacheState(std::vector<SizeType32> nbKvHeadPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false)
+        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool hasIndexerKCache = false,
+        SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig{std::move(nbKvHeadPerLayer), sizePerHead, tokensPerBlock}
         , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize,
             attentionLayerNumPerPP}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mHasIndexerKCache = hasIndexerKCache;
+        mIndexerDimPerHead = indexerDimPerHead;
+        mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
     }
 
     CacheState(SizeType32 nbAttentionLayers, SizeType32 nbKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false)
+        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool hasIndexerKCache = false,
+        SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig{std::vector(nbAttentionLayers, nbKvHeads), sizePerHead, tokensPerBlock}
         , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize,
             attentionLayerNumPerPP}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mHasIndexerKCache = hasIndexerKCache;
+        mIndexerDimPerHead = indexerDimPerHead;
+        mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
     }
 
     [[nodiscard]] bool operator==(kv_cache::CacheState const& other) const noexcept
@@ -174,6 +186,21 @@ class CacheState final
         return mEnableBlockReuse;
     }
 
+    [[nodiscard]] bool getHasIndexerKCache() const
+    {
+        return mHasIndexerKCache;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerDimPerHead() const
+    {
+        return mIndexerDimPerHead;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
+    {
+        return mIndexerKCacheQuantBlockSize;
+    }
+
     [[nodiscard]] std::string toString() const
     {
         std::stringstream sstring;
@@ -194,6 +221,9 @@ class CacheState final
         sstring << "dpRank:" << mParallelConfig.mDPrank << "\n";
         sstring << "dpSize:" << mParallelConfig.mDPsize << "\n";
         sstring << "enableBlockReuse:" << mEnableBlockReuse << "\n";
+        sstring << "hasIndexerKCache:" << mHasIndexerKCache << "\n";
+        sstring << "indexerDimPerHead:" << mIndexerDimPerHead << "\n";
+        sstring << "indexerKCacheQuantBlockSize:" << mIndexerKCacheQuantBlockSize << "\n";
         return sstring.str();
     }
 
@@ -204,6 +234,9 @@ class CacheState final
     nvinfer1::DataType mDataType;
     AttentionConfig mAttentionConfig;
     bool mEnableBlockReuse{false};
+    bool mHasIndexerKCache{false};
+    SizeType32 mIndexerDimPerHead{0};
+    SizeType32 mIndexerKCacheQuantBlockSize{128};
 };
 
 struct MpiState
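
Both sides of a disaggregated transfer must agree on whether the indexer K cache exists and how it is laid out, so the new fields travel with the serialized cache state. A minimal stand-in (not the real executor::kv_cache::CacheState) showing only the handshake data added here:

#include <cstdint>
#include <sstream>
#include <string>

using SizeType32 = std::int32_t;

// Simplified stand-in: only the fields that context and generation ranks
// need to agree on for the DSv3.2 indexer K cache transfer.
struct CacheStateSketch
{
    bool enableBlockReuse{false};
    bool hasIndexerKCache{false};          // sparse-attention indexer K cache present?
    SizeType32 indexerDimPerHead{0};       // head dim of the indexer keys
    SizeType32 indexerKCacheQuantBlockSize{128};

    std::string toString() const
    {
        std::stringstream ss;
        ss << "enableBlockReuse:" << enableBlockReuse << "\n";
        ss << "hasIndexerKCache:" << hasIndexerKCache << "\n";
        ss << "indexerDimPerHead:" << indexerDimPerHead << "\n";
        ss << "indexerKCacheQuantBlockSize:" << indexerKCacheQuantBlockSize << "\n";
        return ss.str();
    }
};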

cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp

Lines changed: 15 additions & 8 deletions
@@ -45,7 +45,9 @@ namespace tensorrt_llm::batch_manager::kv_cache_manager
 BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
     BlockKey const& lastBlockKey, int32_t indexFromEnd, bool recvSideHasCP)
 {
-    auto poolNum = cacheManager->getBlockManager().getNumPools();
+    auto poolNum = cacheManager->getBlockManager().getNumPools(
+        /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
+
     // Note: When recv side has CP, the requested seqLen is lesser than seqLen on the sender side as seqLen is
     // distributed among CP ranks. So, we transfer all blocks from send side.
     if (poolNum > 1 || !cacheManager->isEnableBlockReuse() || lastBlockKey.uniqueTokens.size() == 0 || recvSideHasCP)
@@ -88,8 +90,9 @@ BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest
 BlockRange getBlockRangeForReceiving(
     BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, bool srcEnableBlockReuse, bool recvSideHasCP)
 {
-    auto poolNum = cacheManager->getBlockManager().getNumPools();
     // Note: When recv side has CP, we request all blocks from send side right now.
+    auto poolNum = cacheManager->getBlockManager().getNumPools(
+        /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     if (poolNum == 1 && srcEnableBlockReuse && !recvSideHasCP)
     {
         // Build from all block ids, then slice off the reused blocks so we only transfer newly allocated ones.
@@ -171,7 +174,8 @@ void checkAlternateWindow(BaseKVCacheManager* cacheManager, BaseCacheFormatter::
     // if gen PP and context PP are different, cache formatter only support alternative window like gpt-oss.
     // which is one layer is WSA, and another layer is Full attention.
 
-    auto numPools = cacheManager->getBlockManager().getNumPools();
+    auto numPools = cacheManager->getBlockManager().getNumPools(
+        /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     auto layerNum = cacheManager->getBlockManager().getNumLayers();
 
     auto selfPPNum = selfConfig.getParallelConfig().mPipelineParallelism;
@@ -248,7 +252,8 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
     auto& blockManager = mCacheManager->getBlockManager();
     auto const& lastBlockKey = session.getLastBlockKey();
     auto blockRange = getBlockRangeForSending(mCacheManager, llmRequest, lastBlockKey, indexFromEnd);
-    auto const numPools = blockManager.getNumPools();
+    auto const numPools
+        = blockManager.getNumPools(/*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     // TODO(oargov): are we sure the other side has the same number of pools? this might not hold for pp_size>1...
 
     bool layerWise = common::getEnvDisaggLayerwise() && numPools == 1;
@@ -556,7 +561,8 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
     TLLM_LOG_DEBUG("pickUpConnections size: %d connections size: %d", pickUpConnections.size(), connections.size());
     std::vector<runtime::ITensor::SharedPtr> recvBufferTmps;
     std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> outputBuffersPerWindow;
-    auto const numPools = mCacheManager->getBlockManager().getNumPools();
+    auto const numPools = mCacheManager->getBlockManager().getNumPools(
+        /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     // TODO(oargov): are we sure the other side has the same number of pools? this might not hold for pp_size>1...
     size_t blockNum = 0;
     size_t cacheBlockSizeSum = 0;
@@ -966,13 +972,14 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
 }
 
 std::unique_ptr<BaseCacheFormatter> createCacheFormatter(
-    BaseKVCacheManager* cacheManager, CacheTransBufferManager* cacheTransBufferManager, bool isMLA)
+    BaseKVCacheManager* cacheManager, std::vector<CacheTransBufferManager*> const& cacheTransBufferManagers, bool isMLA)
 {
+    TLLM_CHECK(!cacheTransBufferManagers.empty());
     if (isMLA)
     {
-        return std::make_unique<MLACacheFormatter>(cacheManager, cacheTransBufferManager);
+        return std::make_unique<MLACacheFormatter>(cacheManager, cacheTransBufferManagers);
     }
-    return std::make_unique<CacheFormatter>(cacheManager, cacheTransBufferManager);
+    return std::make_unique<CacheFormatter>(cacheManager, cacheTransBufferManagers[0]);
 }
 
 } // namespace tensorrt_llm::batch_manager::kv_cache_manager
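
The factory now accepts every transfer-buffer manager and routes them by formatter type: the MLA formatter (the path used for DeepSeek-style attention, including DSv3.2) sees all buffers, while the default formatter keeps using only the first. A reduced sketch of that dispatch, with placeholder classes and without the cacheManager argument and other details of the real implementation:

#include <cassert>
#include <memory>
#include <vector>

// Simplified stand-ins; the real classes live in tensorrt_llm::batch_manager::kv_cache_manager.
struct CacheTransBufferManager {};
struct BaseCacheFormatter { virtual ~BaseCacheFormatter() = default; };

struct CacheFormatterSketch : BaseCacheFormatter
{
    explicit CacheFormatterSketch(CacheTransBufferManager* mgr) : mManager(mgr) {}
    CacheTransBufferManager* mManager; // only the regular KV cache buffers
};

struct MLACacheFormatterSketch : BaseCacheFormatter
{
    explicit MLACacheFormatterSketch(std::vector<CacheTransBufferManager*> mgrs) : mManagers(std::move(mgrs)) {}
    std::vector<CacheTransBufferManager*> mManagers; // KV cache and indexer K cache buffers
};

// Mirrors the new dispatch: MLA gets all buffer managers, the default formatter gets the first.
std::unique_ptr<BaseCacheFormatter> makeFormatter(
    std::vector<CacheTransBufferManager*> const& managers, bool isMLA)
{
    assert(!managers.empty()); // TLLM_CHECK in the real code
    if (isMLA)
    {
        return std::make_unique<MLACacheFormatterSketch>(managers);
    }
    return std::make_unique<CacheFormatterSketch>(managers[0]);
}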

cpp/tensorrt_llm/batch_manager/cacheFormatter.h

Lines changed: 2 additions & 2 deletions
@@ -133,7 +133,7 @@ class CacheFormatter final : public BaseCacheFormatter
     CacheTransBufferManager* mCacheTransBufferManager;
 };
 
-std::unique_ptr<BaseCacheFormatter> createCacheFormatter(
-    BaseKVCacheManager* cacheManager, CacheTransBufferManager* cacheTransBufferManager, bool isMLA = false);
+std::unique_ptr<BaseCacheFormatter> createCacheFormatter(BaseKVCacheManager* cacheManager,
+    std::vector<CacheTransBufferManager*> const& cacheTransBufferManagers, bool isMLA = false);
 
 } // namespace tensorrt_llm::batch_manager::kv_cache_manager
