Skip to content

Commit d059f09

Browse files
committed
delete padding when num_frames < temporal_patch_size
1 parent f257f08 commit d059f09

File tree

2 files changed

+0
-22
lines changed

2 files changed: +0 additions, -22 deletions

src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -196,10 +196,6 @@ def _preprocess(
196196

197197
for shape, stacked_videos in grouped_videos.items():
198198
if do_resize:
199-
T = stacked_videos.shape[1]
200-
if pad := -T % temporal_patch_size:
201-
repeats = stacked_videos[:, -1:].expand(-1, pad, -1, -1, -1)
202-
stacked_videos = torch.cat((stacked_videos, repeats), dim=1)
203199
B, T, C, H, W = stacked_videos.shape
204200
num_frames, height, width = T, H, W
205201
resized_height, resized_width = smart_resize(

tests/models/qwen3_vl/test_video_processing_qwen3_vl.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -329,24 +329,6 @@ def test_call_sample_frames(self):
329329
if prev_max_resolution is not None:
330330
self.video_processor_tester.max_resolution = prev_max_resolution
331331

332-
def test_only_one_image_input(self):
333-
for video_processing_class in self.video_processor_list:
334-
video_processor_dict = self.video_processor_dict.copy()
335-
video_processor_dict["size"] = {"longest_edge": 1 * 32 * 32, "shortest_edge": 32 * 32}
336-
video_processor_dict["do_sample_frames"] = False
337-
video_processor_dict["temporal_patch_size"] = 3
338-
video_processing = video_processing_class(**video_processor_dict)
339-
340-
n, w, h = 1, 32, 32
341-
video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
342-
343-
video_processed = video_processing(video_inputs, return_tensors="pt")
344-
encoded_videos = video_processed[self.input_name]
345-
self.assertEqual(list(encoded_videos.shape), [4, 2304])
346-
347-
video_grid_thw = video_processed["video_grid_thw"]
348-
self.assertEqual(video_grid_thw.tolist(), [[1, 2, 2]])
349-
350332
def test_num_frames_equal_temporal_patch_size_plus_two(self):
351333
for video_processing_class in self.video_processor_list:
352334
video_processor_dict = self.video_processor_dict.copy()

0 commit comments

Comments (0)