
Commit 700c48a
fix qwen2vl/qwen3vl video processor temporal padding when num_frames % temporal_patch_size != 1 (#42083)
* qwen3vl video process padding video frames
* add two video processor test cases
* fix typo
* down test image size
* fix qwen2vl video processor t padding
* delete padding when num_frames < temporal_patch_size
* to default format
* fix smart_resize in qwen3vl
1 parent 18a19de commit 700c48a

4 files changed: +47 / -9 lines changed

src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py

Lines changed: 4 additions & 3 deletions

@@ -232,9 +232,10 @@ def _preprocess(
             patches = stacked_videos
 
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, self.temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
+            T = patches.shape[1]
+            if pad := -T % temporal_patch_size:
+                repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+                patches = torch.cat((patches, repeats), dim=1)
 
             batch_size, grid_t, channel = patches.shape[:3]
             grid_t = grid_t // temporal_patch_size

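Why the new remainder math fixes the title case: the old branch always appended temporal_patch_size - 1 copies of the last frame whenever the frame count was not divisible, which only lands on a multiple of temporal_patch_size when the remainder is exactly 1. Below is a minimal, self-contained sketch of the new padding logic on a (batch, T, C, H, W) tensor; the function name and shapes are illustrative, not the processor's own code path.

import torch

def pad_temporal(patches: torch.Tensor, temporal_patch_size: int) -> torch.Tensor:
    T = patches.shape[1]
    # -T % temporal_patch_size is exactly the number of frames still missing to
    # reach the next multiple (0 when T is already divisible), unlike the old
    # code, which always appended temporal_patch_size - 1 frames.
    if pad := -T % temporal_patch_size:
        repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)  # view of the last frame, no copy
        patches = torch.cat((patches, repeats), dim=1)
    return patches

x = torch.randn(1, 5, 3, 28, 28)
print(pad_temporal(x, 3).shape)  # torch.Size([1, 6, 3, 28, 28]); the old code produced T=7, not divisible by 3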
src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py

Lines changed: 5 additions & 6 deletions

@@ -40,8 +40,6 @@ def smart_resize(
     min_pixels: int = 128 * 128,
     max_pixels: int = 16 * 16 * 2 * 2 * 2 * 6144,
 ):
-    if num_frames < temporal_factor:
-        raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}")
     if height < factor or width < factor:
         raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
     elif max(height, width) / min(height, width) > 200:
@@ -50,7 +48,7 @@ def smart_resize(
     )
     h_bar = round(height / factor) * factor
     w_bar = round(width / factor) * factor
-    t_bar = round(num_frames / temporal_factor) * temporal_factor
+    t_bar = math.ceil(num_frames / temporal_factor) * temporal_factor
 
     if t_bar * h_bar * w_bar > max_pixels:
         beta = math.sqrt((num_frames * height * width) / max_pixels)
@@ -232,9 +230,10 @@ def _preprocess(
             patches = stacked_videos
 
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
+            T = patches.shape[1]
+            if pad := -T % temporal_patch_size:
+                repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+                patches = torch.cat((patches, repeats), dim=1)
             batch_size, grid_t, channel = patches.shape[:3]
             grid_t = grid_t // temporal_patch_size
             grid_h, grid_w = resized_height // patch_size, resized_width // patch_size

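The smart_resize change swaps round for math.ceil when computing t_bar, so the temporal grid never rounds down below num_frames; that also plausibly explains why the num_frames < temporal_factor guard could be removed, since small frame counts now round up instead of collapsing to 0. A quick illustrative comparison in plain Python (not the library function itself):

import math

temporal_factor = 2
for num_frames in (1, 3, 5):
    t_round = round(num_frames / temporal_factor) * temporal_factor
    t_ceil = math.ceil(num_frames / temporal_factor) * temporal_factor
    print(num_frames, t_round, t_ceil)
# 1 -> round: 0 (why the old guard raised), ceil: 2
# 3 -> round: 4, ceil: 4
# 5 -> round: 4 (Python rounds 2.5 to the even 2), ceil: 6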
tests/models/qwen2_vl/test_video_processing_qwen2_vl.py

Lines changed: 19 additions & 0 deletions

@@ -343,3 +343,22 @@ def test_call_sample_frames(self):
 
         # Assign back the actual num frames in tester
         self.video_processor_tester.num_frames = prev_num_frames
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 5 * 28 * 28, "shortest_edge": 28 * 28}
+            video_processor_dict["do_sample_frames"] = False
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 28, 28
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 14 * 14])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])

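The expected numbers in the assertions follow from the grid arithmetic. A back-of-the-envelope check, assuming the qwen2vl default patch_size of 14, 3 channels, and that the 28x28 inputs are not resized under the configured size bounds (these values are assumptions matching the test setup, not read from the processor config):

import math

num_frames, temporal_patch_size = 5, 3
height = width = 28
patch_size, channels = 14, 3

grid_t = math.ceil(num_frames / temporal_patch_size)        # 5 frames padded to 6 -> 2
grid_h, grid_w = height // patch_size, width // patch_size  # 28 // 14 -> 2, 2
num_patches = grid_t * grid_h * grid_w                       # 2 * 2 * 2 = 8
patch_dim = channels * temporal_patch_size * patch_size**2   # 3 * 3 * 14 * 14 = 1764

print(num_patches, patch_dim)    # 8 1764 -> encoded_videos.shape == [8, 1764]
print([grid_t, grid_h, grid_w])  # [2, 2, 2] -> video_grid_thw

The qwen3vl test below is the same arithmetic with 32x32 inputs and patch_size 16.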
tests/models/qwen3_vl/test_video_processing_qwen3_vl.py

Lines changed: 19 additions & 0 deletions

@@ -328,3 +328,22 @@ def test_call_sample_frames(self):
         self.video_processor_tester.min_resolution = prev_min_resolution
         if prev_max_resolution is not None:
             self.video_processor_tester.max_resolution = prev_max_resolution
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 5 * 32 * 32, "shortest_edge": 32 * 32}
+            video_processor_dict["do_sample_frames"] = False
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 32, 32
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 16 * 16])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])
