
Commit f257f08: fix qwen2vl video processor t padding
1 parent f91ae0f

3 files changed (+26, -5 lines)

3 files changed

+26
-5
lines changed

src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
Lines changed: 4 additions & 3 deletions

@@ -232,9 +232,10 @@ def _preprocess(
         patches = stacked_videos

         # Check that videos have `num_frames` divisible by `temporal_patch_size`
-        if patches.shape[1] % temporal_patch_size != 0:
-            repeats = patches[:, -1:].repeat(1, self.temporal_patch_size - 1, 1, 1, 1)
-            patches = torch.cat([patches, repeats], dim=1)
+        T = patches.shape[1]
+        if pad := -T % temporal_patch_size:
+            repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+            patches = torch.cat((patches, repeats), dim=1)

         batch_size, grid_t, channel = patches.shape[:3]
         grid_t = grid_t // temporal_patch_size
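The old branch always appended `self.temporal_patch_size - 1` copies of the last frame, which is only correct when the remainder is exactly 1: with 5 frames and a temporal patch size of 3 it produced 7 frames, still not divisible (it also mixed the local `temporal_patch_size` in the condition with `self.temporal_patch_size` in the repeat). The new `-T % temporal_patch_size` expression computes exactly the shortfall, and `expand` returns a view instead of materializing copies. A minimal standalone sketch of the new padding step (the function name and shapes are illustrative, not from the repo):

    import torch

    def pad_temporal(patches: torch.Tensor, temporal_patch_size: int) -> torch.Tensor:
        # Pad along the frame axis (dim 1) by repeating the last frame
        # until the frame count divides temporal_patch_size evenly.
        T = patches.shape[1]
        if pad := -T % temporal_patch_size:  # T=5, p=3 -> pad=1; T=6 -> pad=0
            repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
            patches = torch.cat((patches, repeats), dim=1)
        return patches

    # 5 frames with temporal_patch_size=3 pad to 6, not 7 as before the fix
    x = torch.randn(1, 5, 3, 28, 28)  # (batch, T, channels, height, width)
    assert pad_temporal(x, 3).shape[1] == 6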

tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
Lines changed: 19 additions & 0 deletions

@@ -343,3 +343,22 @@ def test_call_sample_frames(self):

         # Assign back the actual num frames in tester
         self.video_processor_tester.num_frames = prev_num_frames
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 5 * 28 * 28, "shortest_edge": 28 * 28}
+            video_processor_dict["do_sample_frames"] = False
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 28, 28
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 14 * 14])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])
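Where the expected values come from, assuming Qwen2-VL's 14-pixel spatial patches (the same 14 that appears in the asserted shape): 5 input frames pad to 6, so grid_t = 6 / 3 = 2, and a 28 x 28 frame gives grid_h = grid_w = 28 / 14 = 2, hence video_grid_thw == [[2, 2, 2]]. The 2 * 2 * 2 = 8 flattened patches each carry temporal_patch_size * 3 * 14 * 14 = 1764 values. Before the fix, the same 5 frames were over-padded to 7, which this test catches.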

tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
Lines changed: 3 additions & 2 deletions

@@ -352,15 +352,16 @@ def test_num_frames_equal_temporal_patch_size_plus_two(self):
             video_processor_dict = self.video_processor_dict.copy()
             video_processor_dict["size"] = {"longest_edge": 5 * 32 * 32, "shortest_edge": 32 * 32}
             video_processor_dict["do_sample_frames"] = False
-            video_processor_dict["temporal_patch_size"] = 3
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
             video_processing = video_processing_class(**video_processor_dict)

             n, w, h = 5, 32, 32
             video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]

             video_processed = video_processing(video_inputs, return_tensors="pt")
             encoded_videos = video_processed[self.input_name]
-            self.assertEqual(list(encoded_videos.shape), [8, 2304])
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 16 * 16])

             video_grid_thw = video_processed["video_grid_thw"]
             self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])
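The hardcoded 2304 in the Qwen3-VL test was exactly temporal_patch_size * 3 * 16 * 16 (Qwen3-VL uses 16-pixel spatial patches); spelling it out ties the assertion to the configured temporal_patch_size instead of a magic number, mirroring the new Qwen2-VL test above.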
