4747from vllm .attention .layer import check_upstream_fa_availability
4848from vllm .compilation .decorators import support_torch_compile
4949from vllm .config import VllmConfig
50- from vllm .config .multimodal import BaseDummyOptions
50+ from vllm .config .multimodal import BaseDummyOptions , VideoDummyOptions
5151from vllm .distributed import get_pp_group
5252from vllm .logger import init_logger
5353from vllm .model_executor .layers .activation import _ACTIVATION_REGISTRY
@@ -741,20 +741,57 @@ def get_dummy_mm_data(
741741 ) -> MultiModalDataDict :
742742 num_images = mm_counts .get ("image" , 0 )
743743 num_videos = mm_counts .get ("video" , 0 )
744+ image_overrides = mm_options .get ("image" ) if mm_options else None
745+ video_overrides = mm_options .get ("video" ) if mm_options else None
744746
745747 target_width , target_height = (
746748 self .info .get_image_size_with_most_features ())
747749 target_num_frames = self .info .get_num_frames_with_most_features (
748750 seq_len , mm_counts )
751+
752+ if video_overrides :
753+ assert isinstance (video_overrides , VideoDummyOptions )
754+ num_frames_override = video_overrides .num_frames
755+ if num_frames_override :
756+ if num_frames_override > target_num_frames :
757+ logger .warning (
758+ "video.num_frames override (%d) exceeds model's "
759+ "maximum number of frames (%d), will be ignored" ,
760+ num_frames_override , target_num_frames )
761+ if num_frames_override < 2 :
762+ logger .warning (
763+ "video.num_frames override (%d) cannot be less "
764+ "than 2, will be ignored" , num_frames_override )
765+ target_num_frames = min (target_num_frames , num_frames_override )
766+ target_num_frames = max (target_num_frames , 2 )
767+
749768 target_video_size , _ = self .info ._get_vision_info (
750769 image_width = target_width ,
751770 image_height = target_height ,
752771 num_frames = target_num_frames ,
753772 image_processor = self .info .get_video_processor (),
754773 )
755-
756- image_overrides = mm_options .get ("image" ) if mm_options else None
757- video_overrides = mm_options .get ("video" ) if mm_options else None
774+ # NOTE: we need to do this check here since Qwen3-VL resizes video
775+ # frames depending on how many frames there are.
776+ width , height = target_video_size .width , target_video_size .height
777+ if video_overrides :
778+ assert isinstance (video_overrides , VideoDummyOptions )
779+ width_override = video_overrides .width
780+ if width_override :
781+ if width_override > width :
782+ logger .warning (
783+ "video.width override (%d) exceeds model's "
784+ "maximum width (%d), will be ignored" , width_override ,
785+ width )
786+ width = min (width , width_override )
787+ height_override = video_overrides .height
788+ if height_override :
789+ if height_override > height :
790+ logger .warning (
791+ "video.height override (%d) exceeds model's "
792+ "maximum height (%d), will be ignored" ,
793+ height_override , height )
794+ height = min (height , height_override )
758795
759796 return {
760797 "image" :
@@ -764,11 +801,10 @@ def get_dummy_mm_data(
764801 overrides = image_overrides ),
765802 "video" :
766803 self ._get_dummy_videos (
767- width = target_video_size . width ,
768- height = target_video_size . height ,
804+ width = width ,
805+ height = height ,
769806 num_frames = target_num_frames ,
770807 num_videos = num_videos ,
771- overrides = video_overrides ,
772808 ),
773809 }
774810
@@ -780,7 +816,6 @@ def _get_dummy_videos(
780816 num_frames : int ,
781817 num_videos : int ,
782818 ) -> list [VideoItem ]:
783- num_frames = max (num_frames , 2 )
784819 video = np .full ((num_frames , width , height , 3 ), 255 , dtype = np .uint8 )
785820 video_items = []
786821 for i in range (num_videos ):
@@ -796,18 +831,6 @@ def _get_dummy_videos(
796831 video_items .append (video_item )
797832 return video_items
798833
799- def get_dummy_processor_inputs (self , seq_len , mm_counts ):
800- processor_inputs = super ().get_dummy_processor_inputs (
801- seq_len , mm_counts )
802- # HACK(Isotr0py): We set do_resize to False here to reuse Qwen2-VL's
803- # profiling logic, which will be problematic for configurable mm
804- # profiling.
805- # TODO(Isotr0py): Switch to the implementation in
806- # https://github.com/vllm-project/vllm/pull/25557
807- # after supporting configurable mm profiling.
808- processor_inputs .hf_processor_mm_kwargs = {"do_resize" : False }
809- return processor_inputs
810-
811834
812835class Qwen3VLMultiModalProcessor (BaseMultiModalProcessor [Qwen3VLProcessingInfo ]
813836 ):
0 commit comments