From 357e602f0ac3b38fb236a8bc3047d32412af38d2 Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Fri, 7 Nov 2025 16:17:23 +0800
Subject: [PATCH 1/8] qwen3vl video process padding video frames

---
 .../models/qwen3_vl/video_processing_qwen3_vl.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
index 8a70a1a68584..217efbfe0ce6 100644
--- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -195,9 +195,13 @@ def _preprocess(
 
         resized_videos_grouped = {}
         for shape, stacked_videos in grouped_videos.items():
-            B, T, C, H, W = stacked_videos.shape
-            num_frames, height, width = T, H, W
             if do_resize:
+                T = stacked_videos.shape[1]
+                if pad := -T % temporal_patch_size:
+                    repeats = stacked_videos[:, -1:].expand(-1, pad, -1, -1, -1)
+                    stacked_videos = torch.cat((stacked_videos, repeats), dim=1)
+                B, T, C, H, W = stacked_videos.shape
+                num_frames, height, width = T, H, W
                 resized_height, resized_width = smart_resize(
                     num_frames=num_frames,
                     height=height,
@@ -232,9 +236,10 @@ def _preprocess(
             patches = stacked_videos
 
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
+            T = patches.shape[1]
+            if pad := -T % temporal_patch_size:
+                repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+                patches = torch.cat((patches, repeats), dim=1)
             batch_size, grid_t, channel = patches.shape[:3]
             grid_t = grid_t // temporal_patch_size
             grid_h, grid_w = resized_height // patch_size, resized_width // patch_size

From ec89ddf624fd740b1cd5bcea55937848f1221f43 Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Fri, 7 Nov 2025 17:20:55 +0800
Subject: [PATCH 2/8] add two video processor test cases

---
 .../test_video_processing_qwen3_vl.py | 38 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index 60f4023938bb..047a917416e1 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -41,7 +41,7 @@ def __init__(
         num_channels=3,
         min_resolution=32,
         max_resolution=80,
-        temporal_patch_size=2,
+        temporal_patch_size=3,
         patch_size=16,
         merge_size=2,
         do_resize=True,
@@ -328,3 +328,39 @@ def test_call_sample_frames(self):
             self.video_processor_tester.min_resolution = prev_min_resolution
         if prev_max_resolution is not None:
             self.video_processor_tester.max_resolution = prev_max_resolution
+
+    def test_image_input(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 40960, "shortest_edge": 4096}
+            video_processor_dict["do_sample_frames"] = False
+            video_processor_dict["temporal_patch_size"] = 3
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 1, 64, 64
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [16, 2304])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[1, 4, 4]])
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 40960, "shortest_edge": 4096}
+            video_processor_dict["do_sample_frames"] = False
+            video_processor_dict["temporal_patch_size"] = 3
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 64, 64
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [32, 2304])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 4, 4]])

From de72cd2b5f9c2a592a6ac95ed3041acfa2de729c Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Fri, 7 Nov 2025 17:21:44 +0800
Subject: [PATCH 3/8] fix typo

---
 tests/models/qwen3_vl/test_video_processing_qwen3_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index 047a917416e1..c9fbaf8a6264 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -41,7 +41,7 @@ def __init__(
         num_channels=3,
         min_resolution=32,
         max_resolution=80,
-        temporal_patch_size=3,
+        temporal_patch_size=2,
         patch_size=16,
         merge_size=2,
         do_resize=True,

From f91ae0f048a923fe110026f540952268ebbcd7bd Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Sat, 8 Nov 2025 07:01:52 +0800
Subject: [PATCH 4/8] down test image size

---
 .../qwen3_vl/test_video_processing_qwen3_vl.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index c9fbaf8a6264..47a8de0e2def 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -329,38 +329,38 @@ def test_call_sample_frames(self):
         if prev_max_resolution is not None:
             self.video_processor_tester.max_resolution = prev_max_resolution
 
-    def test_image_input(self):
+    def test_only_one_image_input(self):
         for video_processing_class in self.video_processor_list:
             video_processor_dict = self.video_processor_dict.copy()
-            video_processor_dict["size"] = {"longest_edge": 40960, "shortest_edge": 4096}
+            video_processor_dict["size"] = {"longest_edge": 1 * 32 * 32, "shortest_edge": 32 * 32}
             video_processor_dict["do_sample_frames"] = False
             video_processor_dict["temporal_patch_size"] = 3
             video_processing = video_processing_class(**video_processor_dict)
 
-            n, w, h = 1, 64, 64
+            n, w, h = 1, 32, 32
             video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
 
             video_processed = video_processing(video_inputs, return_tensors="pt")
             encoded_videos = video_processed[self.input_name]
-            self.assertEqual(list(encoded_videos.shape), [16, 2304])
+            self.assertEqual(list(encoded_videos.shape), [4, 2304])
 
             video_grid_thw = video_processed["video_grid_thw"]
-            self.assertEqual(video_grid_thw.tolist(), [[1, 4, 4]])
+            self.assertEqual(video_grid_thw.tolist(), [[1, 2, 2]])
 
     def test_num_frames_equal_temporal_patch_size_plus_two(self):
         for video_processing_class in self.video_processor_list:
             video_processor_dict = self.video_processor_dict.copy()
-            video_processor_dict["size"] = {"longest_edge": 40960, "shortest_edge": 4096}
+            video_processor_dict["size"] = {"longest_edge": 5 * 32 * 32, "shortest_edge": 32 * 32}
             video_processor_dict["do_sample_frames"] = False
             video_processor_dict["temporal_patch_size"] = 3
             video_processing = video_processing_class(**video_processor_dict)
 
-            n, w, h = 5, 64, 64
+            n, w, h = 5, 32, 32
             video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
 
             video_processed = video_processing(video_inputs, return_tensors="pt")
             encoded_videos = video_processed[self.input_name]
-            self.assertEqual(list(encoded_videos.shape), [32, 2304])
+            self.assertEqual(list(encoded_videos.shape), [8, 2304])
 
             video_grid_thw = video_processed["video_grid_thw"]
-            self.assertEqual(video_grid_thw.tolist(), [[2, 4, 4]])
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])

From f257f08652b336d910d041537ebd4c3ae15f1a01 Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Sat, 8 Nov 2025 07:27:04 +0800
Subject: [PATCH 5/8] fix qwen2vl video processor t padding

---
 .../qwen2_vl/video_processing_qwen2_vl.py |  7 ++++---
 .../test_video_processing_qwen2_vl.py     | 19 +++++++++++++++++++
 .../test_video_processing_qwen3_vl.py     |  5 +++--
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
index 7153154048b6..c0ae21ecd84e 100644
--- a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
@@ -232,9 +232,10 @@ def _preprocess(
             patches = stacked_videos
 
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, self.temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
+            T = patches.shape[1]
+            if pad := -T % temporal_patch_size:
+                repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+                patches = torch.cat((patches, repeats), dim=1)
 
             batch_size, grid_t, channel = patches.shape[:3]
             grid_t = grid_t // temporal_patch_size
diff --git a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
index b80adebbd9ab..0ccffca73fa7 100644
--- a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
@@ -343,3 +343,22 @@ def test_call_sample_frames(self):
 
         # Assign back the actual num frames in tester
         self.video_processor_tester.num_frames = prev_num_frames
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 5 * 28 * 28, "shortest_edge": 28 * 28}
+            video_processor_dict["do_sample_frames"] = False
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 28, 28
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 14 * 14])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])
diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index 47a8de0e2def..0f62aeab66f6 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -352,7 +352,8 @@ def test_num_frames_equal_temporal_patch_size_plus_two(self):
             video_processor_dict = self.video_processor_dict.copy()
             video_processor_dict["size"] = {"longest_edge": 5 * 32 * 32, "shortest_edge": 32 * 32}
             video_processor_dict["do_sample_frames"] = False
-            video_processor_dict["temporal_patch_size"] = 3
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
             video_processing = video_processing_class(**video_processor_dict)
 
             n, w, h = 5, 32, 32
@@ -360,7 +361,7 @@ def test_num_frames_equal_temporal_patch_size_plus_two(self):
             video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
 
             video_processed = video_processing(video_inputs, return_tensors="pt")
             encoded_videos = video_processed[self.input_name]
-            self.assertEqual(list(encoded_videos.shape), [8, 2304])
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 16 * 16])
 
             video_grid_thw = video_processed["video_grid_thw"]
             self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])

From d059f0931abb0b75a95ab2529140747b81c593ef Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Mon, 10 Nov 2025 18:57:41 +0800
Subject: [PATCH 6/8] delete padding when num_frames < temporal_patch_size

---
 .../qwen3_vl/video_processing_qwen3_vl.py      |  4 ----
 .../qwen3_vl/test_video_processing_qwen3_vl.py | 18 ------------------
 2 files changed, 22 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
index 217efbfe0ce6..e16c74730eec 100644
--- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -196,10 +196,6 @@ def _preprocess(
         resized_videos_grouped = {}
         for shape, stacked_videos in grouped_videos.items():
             if do_resize:
-                T = stacked_videos.shape[1]
-                if pad := -T % temporal_patch_size:
-                    repeats = stacked_videos[:, -1:].expand(-1, pad, -1, -1, -1)
-                    stacked_videos = torch.cat((stacked_videos, repeats), dim=1)
                 B, T, C, H, W = stacked_videos.shape
                 num_frames, height, width = T, H, W
                 resized_height, resized_width = smart_resize(
diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index 0f62aeab66f6..d3b9423030c2 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -329,24 +329,6 @@ def test_call_sample_frames(self):
         if prev_max_resolution is not None:
             self.video_processor_tester.max_resolution = prev_max_resolution
 
-    def test_only_one_image_input(self):
-        for video_processing_class in self.video_processor_list:
-            video_processor_dict = self.video_processor_dict.copy()
-            video_processor_dict["size"] = {"longest_edge": 1 * 32 * 32, "shortest_edge": 32 * 32}
-            video_processor_dict["do_sample_frames"] = False
-            video_processor_dict["temporal_patch_size"] = 3
-            video_processing = video_processing_class(**video_processor_dict)
-
-            n, w, h = 1, 32, 32
-            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
-
-            video_processed = video_processing(video_inputs, return_tensors="pt")
-            encoded_videos = video_processed[self.input_name]
-            self.assertEqual(list(encoded_videos.shape), [4, 2304])
-
-            video_grid_thw = video_processed["video_grid_thw"]
-            self.assertEqual(video_grid_thw.tolist(), [[1, 2, 2]])
-
     def test_num_frames_equal_temporal_patch_size_plus_two(self):
         for video_processing_class in self.video_processor_list:
             video_processor_dict = self.video_processor_dict.copy()

From 59994f0841f1a3641ced696ebfb477572aabbfcf Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Mon, 10 Nov 2025 18:59:20 +0800
Subject: [PATCH 7/8] to default format

---
 src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
index e16c74730eec..d3292c9e39e9 100644
--- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -195,9 +195,9 @@ def _preprocess(
 
         resized_videos_grouped = {}
         for shape, stacked_videos in grouped_videos.items():
+            B, T, C, H, W = stacked_videos.shape
+            num_frames, height, width = T, H, W
             if do_resize:
-                B, T, C, H, W = stacked_videos.shape
-                num_frames, height, width = T, H, W
                 resized_height, resized_width = smart_resize(
                     num_frames=num_frames,
                     height=height,

From 040d88816358a4f3dc654e157f0926dba9f7be00 Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Mon, 10 Nov 2025 21:52:06 +0800
Subject: [PATCH 8/8] fix smart_resize in qwen3vl

---
 src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
index d3292c9e39e9..90d7dd0abfb9 100644
--- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -40,8 +40,6 @@ def smart_resize(
     min_pixels: int = 128 * 128,
     max_pixels: int = 16 * 16 * 2 * 2 * 2 * 6144,
 ):
-    if num_frames < temporal_factor:
-        raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}")
     if height < factor or width < factor:
         raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
     elif max(height, width) / min(height, width) > 200:
         raise ValueError(
         )
     h_bar = round(height / factor) * factor
     w_bar = round(width / factor) * factor
-    t_bar = round(num_frames / temporal_factor) * temporal_factor
+    t_bar = math.ceil(num_frames / temporal_factor) * temporal_factor
 
     if t_bar * h_bar * w_bar > max_pixels:
         beta = math.sqrt((num_frames * height * width) / max_pixels)