12 changes: 0 additions & 12 deletions fastdeploy/cache_manager/cache_data.py
@@ -21,18 +21,6 @@
logger = get_logger("prefix_cache_manager", "cache_manager.log")


DISABLE_PREFIX_CACHE_MM_MODEL: set[str] = {
"Ernie5ForCausalLM",
}


def is_mm_model_disable_prefix_cache(model_config):
"""
check if the model architecture is in DISABLE_PREFIX_CACHE_MM_MODEL
"""
return model_config._architecture in DISABLE_PREFIX_CACHE_MM_MODEL


class CacheStatus(Enum):
"""
cache status enum class
3 changes: 0 additions & 3 deletions fastdeploy/cache_manager/multimodal_cache_manager.py
@@ -53,9 +53,6 @@ def apply_cache(self, mm_hashes: list[str], mm_items: list[Any]) -> list[str]:
else:
item_size = self.get_item_size(mm_items[idx])
if self.current_cache_size + item_size >= self.max_cache_size:
if item_size > self.max_cache_size:
# cannot be inserted even if we clear all cached data, skip it directly
continue
needed = item_size - (self.max_cache_size - self.current_cache_size)
evicted_hashes.extend(self.evict_cache(needed))
self.cache[mm_hashes[idx]] = mm_items[idx]
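For context, a minimal sketch of the size-bounded insert/evict flow that `apply_cache` follows once the oversized-item early exit above is removed. `max_cache_size`, `current_cache_size`, `get_item_size`, and `evict_cache` are names visible in the hunk; the FIFO eviction policy and the length-based sizing below are illustrative assumptions, not the FastDeploy implementation.

```python
from collections import OrderedDict
from typing import Any


class SizeBoundedCache:
    """Illustrative size-bounded multimodal item cache (not the FastDeploy code)."""

    def __init__(self, max_cache_size: int):
        self.max_cache_size = max_cache_size
        self.current_cache_size = 0
        self.cache: OrderedDict[str, Any] = OrderedDict()

    def get_item_size(self, item: Any) -> int:
        # Placeholder sizing; the real manager would measure feature/tensor bytes.
        return len(item)

    def evict_cache(self, needed: int) -> list[str]:
        # Evict oldest entries until at least `needed` units are freed (assumed FIFO policy).
        evicted: list[str] = []
        while needed > 0 and self.cache:
            key, item = self.cache.popitem(last=False)
            size = self.get_item_size(item)
            self.current_cache_size -= size
            needed -= size
            evicted.append(key)
        return evicted

    def apply_cache(self, mm_hashes: list[str], mm_items: list[Any]) -> list[str]:
        evicted_hashes: list[str] = []
        for idx, mm_hash in enumerate(mm_hashes):
            if mm_hash in self.cache:
                continue
            item_size = self.get_item_size(mm_items[idx])
            if self.current_cache_size + item_size >= self.max_cache_size:
                # Without the removed early `continue`, eviction runs even for oversized items.
                needed = item_size - (self.max_cache_size - self.current_cache_size)
                evicted_hashes.extend(self.evict_cache(needed))
            self.cache[mm_hash] = mm_items[idx]
            self.current_cache_size += item_size
        return evicted_hashes
```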
2 changes: 1 addition & 1 deletion fastdeploy/config.py
@@ -1591,7 +1591,7 @@ def __init__(
and self.model_config is not None
and self.model_config.enable_mm
):
self.max_prefill_batch = 1 # TODO: The multimodal prefill stage currently only supports a parallelism of 1; to be optimized
self.max_prefill_batch = 1 # TODO: The V0 multimodal prefill stage currently only supports a parallelism of 1; to be optimized
else:
self.max_prefill_batch = self.scheduler_config.max_num_seqs

2 changes: 0 additions & 2 deletions fastdeploy/engine/args_utils.py
@@ -552,8 +552,6 @@ def __post_init__(self):

if "PaddleOCR" in get_model_architecture(self.model, self.model_config_name):
envs.FD_ENABLE_MAX_PREFILL = 1
self.enable_prefix_caching = False
self.max_encoder_cache = 0

@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
36 changes: 3 additions & 33 deletions fastdeploy/engine/sched/resource_manager_v1.py
@@ -333,26 +333,6 @@ def _update_mm_hashes(self, request):
inputs["mm_positions"] = []
inputs["mm_hashes"] = []

def _is_mm_request(self, request):
inputs = request.multimodal_inputs
if inputs is None or len(inputs) == 0:
return False

if (
(inputs.get("video_feature_urls") is not None and len(inputs["video_feature_urls"]) > 0)
or (inputs.get("image_feature_urls") is not None and len(inputs["image_feature_urls"]) > 0)
or (inputs.get("audio_feature_urls") is not None and len(inputs["audio_feature_urls"]) > 0)
):
return True
elif (
inputs.get("images", None) is not None
and inputs.get("image_patch_id", None) is not None
and inputs.get("grid_thw", None) is not None
):
return True

return False

def _get_num_new_tokens(self, request, token_budget):
# TODO: set condition to new _get_num_new_tokens
num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens
@@ -484,20 +464,14 @@ def _get_num_new_tokens(self, request, token_budget):
request.image_start = np.sum(np.prod(grid_thw[: request.num_image_start], axis=1))
request.image_end = np.sum(np.prod(grid_thw[: request.num_image_end], axis=1))

cur_mm_hashes = inputs["mm_hashes"][request.num_image_start : request.num_image_end]
cur_mm_positions = inputs["mm_positions"][request.num_image_start : request.num_image_end]
if self.encoder_cache:
cur_mm_hashes = inputs["mm_hashes"][request.num_image_start : request.num_image_end]
cur_mm_positions = inputs["mm_positions"][request.num_image_start : request.num_image_end]
request.evict_mm_hashes = self.encoder_cache.apply_cache(cur_mm_hashes, cur_mm_positions)

# Compatible with scenarios without images and videos.
return num_new_tokens

def exist_mm_prefill(self, scheduled_reqs):
for request in scheduled_reqs:
if request.task_type == RequestType.PREFILL and self._is_mm_request(request):
return True
return False

def exist_prefill(self, scheduled_reqs):
for request in scheduled_reqs:
if request.task_type == RequestType.PREFILL:
@@ -654,11 +628,7 @@ def _allocate_decode_and_extend():
break

request = self.waiting[0]
if (
not envs.FD_ENABLE_MAX_PREFILL
and self._is_mm_request(request)
and self.exist_mm_prefill(scheduled_reqs)
) or (paddle.is_compiled_with_xpu() and self.exist_prefill(scheduled_reqs)):
if paddle.is_compiled_with_xpu() and self.exist_prefill(scheduled_reqs):
break
if request.status == RequestStatus.WAITING:
result = self._waiting_async_process(request)
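For reference, a small worked example of the `grid_thw` arithmetic that `_get_num_new_tokens` uses above to derive `image_start` and `image_end`; the grid values and index window below are made up purely for illustration.

```python
import numpy as np

# One (t, h, w) patch grid per image, as in the hunk above.
grid_thw = np.array([
    [1, 4, 6],  # image 0 -> 1 * 4 * 6 = 24 patch tokens
    [1, 8, 8],  # image 1 -> 64 patch tokens
    [2, 4, 4],  # image 2 (e.g. a short video) -> 32 patch tokens
])

num_image_start, num_image_end = 1, 3  # hypothetical window for this step

# Patch tokens contributed before the window, and up to its end.
image_start = np.sum(np.prod(grid_thw[:num_image_start], axis=1))  # 24
image_end = np.sum(np.prod(grid_thw[:num_image_end], axis=1))      # 24 + 64 + 32 = 120
print(image_start, image_end)
```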
27 changes: 0 additions & 27 deletions fastdeploy/entrypoints/engine_client.py
@@ -82,13 +82,6 @@ def __init__(self, pid: int | str, port: int | str, fd_config: FDConfig, workers
self.enable_splitwise = self.fd_config.scheduler_config.splitwise_role != "mixed"
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8

if self.enable_mm and self.enable_prefix_caching:
from fastdeploy.cache_manager.cache_data import (
is_mm_model_disable_prefix_cache,
)

self.disable_prefix_mm = is_mm_model_disable_prefix_cache(self.fd_config.model_config)

if self.tensor_parallel_size <= self.max_chips_per_node:
self.is_master = True
else:
@@ -265,16 +258,6 @@ async def format_and_add_data(self, prompts: dict):
await self.add_requests(prompts)
return prompts["prompt_token_ids"]

def _check_mm_disable_prefix_cache(self, task):
is_multimodal_data = False
if self.disable_prefix_mm:
multimodal_inputs = task.get("multimodal_inputs", [])
if multimodal_inputs:
token_type_ids = multimodal_inputs.get("token_type_ids", [])
if token_type_ids:
is_multimodal_data = np.sum(token_type_ids) > 0
return is_multimodal_data

async def add_requests(self, task):
"""
Add a new request to the queue.
@@ -298,16 +281,6 @@ async def add_requests(self, task):
else:
self.data_processor.process_request_dict(task, self.max_model_len)

if self.enable_mm and self.enable_prefix_caching:
if self._check_mm_disable_prefix_cache(task):
api_server_logger.error(
"The current service does not support processing requests containing multimodal data when prefix cache is enabled. Please send only text-based requests or disable prefix cache"
)
raise EngineError(
"The current service does not support processing requests containing multimodal data when prefix cache is enabled. Please send only text-based requests or disable prefix cache",
error_code=400,
)

task["prompt_token_ids_len"] = len(task["prompt_token_ids"])
input_ids_len = task["prompt_token_ids_len"]

@@ -218,15 +218,11 @@ def spatial_conv_reshape(self, x, spatial_conv_size):
x = x.reshape([-1, C * (spatial_conv_size**2)])
return x

def forward(self, x, image_mask, token_type_ids, image_type_ids, grid_thw):
def forward(self, x, grid_thw):
"""
x: image_features
image_mask: [B]
token_types_ids: [B]
image_type_ids: [B_image]
grid_thw: [B_image, 3]
"""
assert image_type_ids is not None

def fwd_spatial(x):
"""