diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py
index 25e663f3af0e..539ff89abe9c 100644
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -23,6 +23,16 @@
         "max_num_seqs": 64,
         "tensor_parallel_size": 1,
     },
+    {
+        "model": "Qwen/Qwen3-0.6B",
+        "enforce_eager": True,
+        "gpu_memory_utilization": 0.50,
+        "max_model_len": 64,
+        "max_num_batched_tokens": 64,
+        "max_num_seqs": 64,
+        "tensor_parallel_size": 1,
+        "tokenizer": "Qwen/Qwen3-4B",
+    },
     {
         "model": "mistralai/Mistral-7B-Instruct-v0.1",
         "enforce_eager": True,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 31825980f3a1..186a2a414187 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -581,15 +581,26 @@ def __post_init__(self):
         from vllm.plugins import load_general_plugins
 
         load_general_plugins()
-        # when use hf offline,replace model id to local model path
+        # When HF offline mode is on, replace model and tokenizer IDs with local paths
         if huggingface_hub.constants.HF_HUB_OFFLINE:
             model_id = self.model
             self.model = get_model_path(self.model, self.revision)
-            logger.info(
-                "HF_HUB_OFFLINE is True, replace model_id [%s] to model_path [%s]",
-                model_id,
-                self.model,
-            )
+            if model_id != self.model:
+                logger.info(
+                    "HF_HUB_OFFLINE is True, replace model_id [%s] to model_path [%s]",
+                    model_id,
+                    self.model,
+                )
+            if self.tokenizer is not None:
+                tokenizer_id = self.tokenizer
+                self.tokenizer = get_model_path(self.tokenizer, self.tokenizer_revision)
+                if tokenizer_id != self.tokenizer:
+                    logger.info(
+                        "HF_HUB_OFFLINE is True, replace tokenizer_id [%s] "
+                        "to tokenizer_path [%s]",
+                        tokenizer_id,
+                        self.tokenizer,
+                    )
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: