12 changes: 0 additions & 12 deletions fastdeploy/cache_manager/cache_data.py
@@ -21,18 +21,6 @@
logger = get_logger("prefix_cache_manager", "cache_manager.log")


DISABLE_PREFIX_CACHE_MM_MODEL: set[str] = {
"Ernie5ForCausalLM",
}


def is_mm_model_disable_prefix_cache(model_config):
"""
check if the model architecture is in DISABLE_PREFIX_CACHE_MM_MODEL
"""
return model_config._architecture in DISABLE_PREFIX_CACHE_MM_MODEL


class CacheStatus(Enum):
"""
cache status enum class
3 changes: 0 additions & 3 deletions fastdeploy/cache_manager/multimodal_cache_manager.py
@@ -53,9 +53,6 @@ def apply_cache(self, mm_hashes: list[str], mm_items: list[Any]) -> list[str]:
else:
item_size = self.get_item_size(mm_items[idx])
if self.current_cache_size + item_size >= self.max_cache_size:
if item_size > self.max_cache_size:
# cannot be inserted even if we clear all cached data, skip it directly
continue
needed = item_size - (self.max_cache_size - self.current_cache_size)
evicted_hashes.extend(self.evict_cache(needed))
self.cache[mm_hashes[idx]] = mm_items[idx]
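For context, a minimal sketch of the size-bounded insert/evict flow that `apply_cache` follows once the oversized-item early exit above is removed. `max_cache_size`, `current_cache_size`, `get_item_size`, and `evict_cache` are names visible in the hunk; the FIFO eviction policy and the length-based sizing below are illustrative assumptions, not the FastDeploy implementation.

```python
from collections import OrderedDict
from typing import Any


class SizeBoundedCache:
    """Illustrative size-bounded multimodal item cache (not the FastDeploy code)."""

    def __init__(self, max_cache_size: int):
        self.max_cache_size = max_cache_size
        self.current_cache_size = 0
        self.cache: OrderedDict[str, Any] = OrderedDict()

    def get_item_size(self, item: Any) -> int:
        # Placeholder sizing; the real manager would measure feature/tensor bytes.
        return len(item)

    def evict_cache(self, needed: int) -> list[str]:
        # Evict oldest entries until at least `needed` units are freed (assumed FIFO policy).
        evicted: list[str] = []
        while needed > 0 and self.cache:
            key, item = self.cache.popitem(last=False)
            size = self.get_item_size(item)
            self.current_cache_size -= size
            needed -= size
            evicted.append(key)
        return evicted

    def apply_cache(self, mm_hashes: list[str], mm_items: list[Any]) -> list[str]:
        evicted_hashes: list[str] = []
        for idx, mm_hash in enumerate(mm_hashes):
            if mm_hash in self.cache:
                continue
            item_size = self.get_item_size(mm_items[idx])
            if self.current_cache_size + item_size >= self.max_cache_size:
                # Without the removed early `continue`, eviction runs even for oversized items.
                needed = item_size - (self.max_cache_size - self.current_cache_size)
                evicted_hashes.extend(self.evict_cache(needed))
            self.cache[mm_hash] = mm_items[idx]
            self.current_cache_size += item_size
        return evicted_hashes
```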
2 changes: 1 addition & 1 deletion fastdeploy/config.py
@@ -1591,7 +1591,7 @@ def __init__(
and self.model_config is not None
and self.model_config.enable_mm
):
self.max_prefill_batch = 1 # TODO: The multimodal prefill stage currently only supports a parallelism of 1; to be optimized
self.max_prefill_batch = 1 # TODO: The V0 multimodal prefill stage currently only supports a parallelism of 1; to be optimized
else:
self.max_prefill_batch = self.scheduler_config.max_num_seqs

2 changes: 0 additions & 2 deletions fastdeploy/engine/args_utils.py
@@ -552,8 +552,6 @@ def __post_init__(self):

if "PaddleOCR" in get_model_architecture(self.model, self.model_config_name):
envs.FD_ENABLE_MAX_PREFILL = 1
self.enable_prefix_caching = False
self.max_encoder_cache = 0

@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
36 changes: 3 additions & 33 deletions fastdeploy/engine/sched/resource_manager_v1.py
@@ -333,26 +333,6 @@ def _update_mm_hashes(self, request):
inputs["mm_positions"] = []
inputs["mm_hashes"] = []

def _is_mm_request(self, request):
inputs = request.multimodal_inputs
if inputs is None or len(inputs) == 0:
return False

if (
(inputs.get("video_feature_urls") is not None and len(inputs["video_feature_urls"]) > 0)
or (inputs.get("image_feature_urls") is not None and len(inputs["image_feature_urls"]) > 0)
or (inputs.get("audio_feature_urls") is not None and len(inputs["audio_feature_urls"]) > 0)
):
return True
elif (
inputs.get("images", None) is not None
and inputs.get("image_patch_id", None) is not None
and inputs.get("grid_thw", None) is not None
):
return True

return False

def _get_num_new_tokens(self, request, token_budget):
# TODO: set condition to new _get_num_new_tokens
num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens
@@ -484,20 +464,14 @@ def _get_num_new_tokens(self, request, token_budget):
request.image_start = np.sum(np.prod(grid_thw[: request.num_image_start], axis=1))
request.image_end = np.sum(np.prod(grid_thw[: request.num_image_end], axis=1))

cur_mm_hashes = inputs["mm_hashes"][request.num_image_start : request.num_image_end]
cur_mm_positions = inputs["mm_positions"][request.num_image_start : request.num_image_end]
if self.encoder_cache:
cur_mm_hashes = inputs["mm_hashes"][request.num_image_start : request.num_image_end]
cur_mm_positions = inputs["mm_positions"][request.num_image_start : request.num_image_end]
request.evict_mm_hashes = self.encoder_cache.apply_cache(cur_mm_hashes, cur_mm_positions)

# Compatible with scenarios without images and videos.
return num_new_tokens

def exist_mm_prefill(self, scheduled_reqs):
for request in scheduled_reqs:
if request.task_type == RequestType.PREFILL and self._is_mm_request(request):
return True
return False

def exist_prefill(self, scheduled_reqs):
for request in scheduled_reqs:
if request.task_type == RequestType.PREFILL:
@@ -654,11 +628,7 @@ def _allocate_decode_and_extend():
break

request = self.waiting[0]
if (
not envs.FD_ENABLE_MAX_PREFILL
and self._is_mm_request(request)
and self.exist_mm_prefill(scheduled_reqs)
) or (paddle.is_compiled_with_xpu() and self.exist_prefill(scheduled_reqs)):
if paddle.is_compiled_with_xpu() and self.exist_prefill(scheduled_reqs):
break
if request.status == RequestStatus.WAITING:
result = self._waiting_async_process(request)
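For reference, a small worked example of the `grid_thw` arithmetic that `_get_num_new_tokens` uses above to derive `image_start` and `image_end`; the grid values and index window below are made up purely for illustration.

```python
import numpy as np

# One (t, h, w) patch grid per image, as in the hunk above.
grid_thw = np.array([
    [1, 4, 6],  # image 0 -> 1 * 4 * 6 = 24 patch tokens
    [1, 8, 8],  # image 1 -> 64 patch tokens
    [2, 4, 4],  # image 2 (e.g. a short video) -> 32 patch tokens
])

num_image_start, num_image_end = 1, 3  # hypothetical window for this step

# Patch tokens contributed before the window, and up to its end.
image_start = np.sum(np.prod(grid_thw[:num_image_start], axis=1))  # 24
image_end = np.sum(np.prod(grid_thw[:num_image_end], axis=1))      # 24 + 64 + 32 = 120
print(image_start, image_end)
```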
27 changes: 0 additions & 27 deletions fastdeploy/entrypoints/engine_client.py
@@ -82,13 +82,6 @@ def __init__(self, pid: int | str, port: int | str, fd_config: FDConfig, workers
self.enable_splitwise = self.fd_config.scheduler_config.splitwise_role != "mixed"
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8

if self.enable_mm and self.enable_prefix_caching:
from fastdeploy.cache_manager.cache_data import (
is_mm_model_disable_prefix_cache,
)

self.disable_prefix_mm = is_mm_model_disable_prefix_cache(self.fd_config.model_config)

if self.tensor_parallel_size <= self.max_chips_per_node:
self.is_master = True
else:
@@ -265,16 +258,6 @@ async def format_and_add_data(self, prompts: dict):
await self.add_requests(prompts)
return prompts["prompt_token_ids"]

def _check_mm_disable_prefix_cache(self, task):
is_multimodal_data = False
if self.disable_prefix_mm:
multimodal_inputs = task.get("multimodal_inputs", [])
if multimodal_inputs:
token_type_ids = multimodal_inputs.get("token_type_ids", [])
if token_type_ids:
is_multimodal_data = np.sum(token_type_ids) > 0
return is_multimodal_data

async def add_requests(self, task):
"""
Add a new request to the queue.
@@ -298,16 +281,6 @@ async def add_requests(self, task):
else:
self.data_processor.process_request_dict(task, self.max_model_len)

if self.enable_mm and self.enable_prefix_caching:
if self._check_mm_disable_prefix_cache(task):
api_server_logger.error(
"The current service does not support processing requests containing multimodal data when prefix cache is enabled. Please send only text-based requests or disable prefix cache"
)
raise EngineError(
"The current service does not support processing requests containing multimodal data when prefix cache is enabled. Please send only text-based requests or disable prefix cache",
error_code=400,
)

task["prompt_token_ids_len"] = len(task["prompt_token_ids"])
input_ids_len = task["prompt_token_ids_len"]

@@ -218,15 +218,11 @@ def spatial_conv_reshape(self, x, spatial_conv_size):
x = x.reshape([-1, C * (spatial_conv_size**2)])
return x

def forward(self, x, image_mask, token_type_ids, image_type_ids, grid_thw):
def forward(self, x, grid_thw):
"""
x: image_features
image_mask: [B]
token_types_ids: [B]
image_type_ids: [B_image]
grid_thw: [B_image, 3]
"""
assert image_type_ids is not None

def fwd_spatial(x):
"""