From 357e602f0ac3b38fb236a8bc3047d32412af38d2 Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Fri, 7 Nov 2025 16:17:23 +0800
Subject: [PATCH 1/8] qwen3vl video process padding video frames

---
 .../models/qwen3_vl/video_processing_qwen3_vl.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
index 8a70a1a68584..217efbfe0ce6 100644
--- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -195,9 +195,13 @@ def _preprocess(
 
         resized_videos_grouped = {}
         for shape, stacked_videos in grouped_videos.items():
-            B, T, C, H, W = stacked_videos.shape
-            num_frames, height, width = T, H, W
             if do_resize:
+                T = stacked_videos.shape[1]
+                if pad := -T % temporal_patch_size:
+                    repeats = stacked_videos[:, -1:].expand(-1, pad, -1, -1, -1)
+                    stacked_videos = torch.cat((stacked_videos, repeats), dim=1)
+                B, T, C, H, W = stacked_videos.shape
+                num_frames, height, width = T, H, W
                 resized_height, resized_width = smart_resize(
                     num_frames=num_frames,
                     height=height,
@@ -232,9 +236,10 @@ def _preprocess(
             patches = stacked_videos
 
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
+            T = patches.shape[1]
+            if pad := -T % temporal_patch_size:
+                repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+                patches = torch.cat((patches, repeats), dim=1)
             batch_size, grid_t, channel = patches.shape[:3]
             grid_t = grid_t // temporal_patch_size
             grid_h, grid_w = resized_height // patch_size, resized_width // patch_size

From ec89ddf624fd740b1cd5bcea55937848f1221f43 Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Fri, 7 Nov 2025 17:20:55 +0800
Subject: [PATCH 2/8] add two video processor test cases

---
 .../test_video_processing_qwen3_vl.py | 38 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index 60f4023938bb..047a917416e1 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -41,7 +41,7 @@ def __init__(
         num_channels=3,
         min_resolution=32,
         max_resolution=80,
-        temporal_patch_size=2,
+        temporal_patch_size=3,
         patch_size=16,
         merge_size=2,
         do_resize=True,
@@ -328,3 +328,39 @@ def test_call_sample_frames(self):
             self.video_processor_tester.min_resolution = prev_min_resolution
         if prev_max_resolution is not None:
             self.video_processor_tester.max_resolution = prev_max_resolution
+
+    def test_image_input(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 40960, "shortest_edge": 4096}
+            video_processor_dict["do_sample_frames"] = False
+            video_processor_dict["temporal_patch_size"] = 3
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 1, 64, 64
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [16, 2304])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[1, 4, 4]])
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 40960, "shortest_edge": 4096}
+            video_processor_dict["do_sample_frames"] = False
+            video_processor_dict["temporal_patch_size"] = 3
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 64, 64
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [32, 2304])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 4, 4]])

From de72cd2b5f9c2a592a6ac95ed3041acfa2de729c Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Fri, 7 Nov 2025 17:21:44 +0800
Subject: [PATCH 3/8] fix typo

---
 tests/models/qwen3_vl/test_video_processing_qwen3_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index 047a917416e1..c9fbaf8a6264 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -41,7 +41,7 @@ def __init__(
         num_channels=3,
         min_resolution=32,
         max_resolution=80,
-        temporal_patch_size=3,
+        temporal_patch_size=2,
         patch_size=16,
         merge_size=2,
         do_resize=True,

From f91ae0f048a923fe110026f540952268ebbcd7bd Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Sat, 8 Nov 2025 07:01:52 +0800
Subject: [PATCH 4/8] down test image size

---
 .../qwen3_vl/test_video_processing_qwen3_vl.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index c9fbaf8a6264..47a8de0e2def 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -329,38 +329,38 @@ def test_call_sample_frames(self):
         if prev_max_resolution is not None:
             self.video_processor_tester.max_resolution = prev_max_resolution
 
-    def test_image_input(self):
+    def test_only_one_image_input(self):
         for video_processing_class in self.video_processor_list:
             video_processor_dict = self.video_processor_dict.copy()
-            video_processor_dict["size"] = {"longest_edge": 40960, "shortest_edge": 4096}
+            video_processor_dict["size"] = {"longest_edge": 1 * 32 * 32, "shortest_edge": 32 * 32}
             video_processor_dict["do_sample_frames"] = False
             video_processor_dict["temporal_patch_size"] = 3
             video_processing = video_processing_class(**video_processor_dict)
 
-            n, w, h = 1, 64, 64
+            n, w, h = 1, 32, 32
             video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
 
             video_processed = video_processing(video_inputs, return_tensors="pt")
             encoded_videos = video_processed[self.input_name]
-            self.assertEqual(list(encoded_videos.shape), [16, 2304])
+            self.assertEqual(list(encoded_videos.shape), [4, 2304])
 
             video_grid_thw = video_processed["video_grid_thw"]
-            self.assertEqual(video_grid_thw.tolist(), [[1, 4, 4]])
+            self.assertEqual(video_grid_thw.tolist(), [[1, 2, 2]])
 
     def test_num_frames_equal_temporal_patch_size_plus_two(self):
         for video_processing_class in self.video_processor_list:
             video_processor_dict = self.video_processor_dict.copy()
-            video_processor_dict["size"] = {"longest_edge": 40960, "shortest_edge": 4096}
+            video_processor_dict["size"] = {"longest_edge": 5 * 32 * 32, "shortest_edge": 32 * 32}
             video_processor_dict["do_sample_frames"] = False
             video_processor_dict["temporal_patch_size"] = 3
             video_processing = video_processing_class(**video_processor_dict)
 
-            n, w, h = 5, 64, 64
+            n, w, h = 5, 32, 32
             video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
 
             video_processed = video_processing(video_inputs, return_tensors="pt")
             encoded_videos = video_processed[self.input_name]
-            self.assertEqual(list(encoded_videos.shape), [32, 2304])
+            self.assertEqual(list(encoded_videos.shape), [8, 2304])
 
             video_grid_thw = video_processed["video_grid_thw"]
-            self.assertEqual(video_grid_thw.tolist(), [[2, 4, 4]])
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])

From f257f08652b336d910d041537ebd4c3ae15f1a01 Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Sat, 8 Nov 2025 07:27:04 +0800
Subject: [PATCH 5/8] fix qwen2vl video processor t padding

---
 .../qwen2_vl/video_processing_qwen2_vl.py |  7 ++++---
 .../test_video_processing_qwen2_vl.py     | 19 +++++++++++++++++++
 .../test_video_processing_qwen3_vl.py     |  5 +++--
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
index 7153154048b6..c0ae21ecd84e 100644
--- a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
@@ -232,9 +232,10 @@ def _preprocess(
             patches = stacked_videos
 
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, self.temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
+            T = patches.shape[1]
+            if pad := -T % temporal_patch_size:
+                repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+                patches = torch.cat((patches, repeats), dim=1)
 
             batch_size, grid_t, channel = patches.shape[:3]
             grid_t = grid_t // temporal_patch_size
diff --git a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
index b80adebbd9ab..0ccffca73fa7 100644
--- a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
@@ -343,3 +343,22 @@ def test_call_sample_frames(self):
 
         # Assign back the actual num frames in tester
         self.video_processor_tester.num_frames = prev_num_frames
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 5 * 28 * 28, "shortest_edge": 28 * 28}
+            video_processor_dict["do_sample_frames"] = False
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 28, 28
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 14 * 14])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])
diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index 47a8de0e2def..0f62aeab66f6 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -352,7 +352,8 @@ def test_num_frames_equal_temporal_patch_size_plus_two(self):
             video_processor_dict = self.video_processor_dict.copy()
             video_processor_dict["size"] = {"longest_edge": 5 * 32 * 32, "shortest_edge": 32 * 32}
             video_processor_dict["do_sample_frames"] = False
-            video_processor_dict["temporal_patch_size"] = 3
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
             video_processing = video_processing_class(**video_processor_dict)
 
             n, w, h = 5, 32, 32
@@ -360,7 +361,7 @@ def test_num_frames_equal_temporal_patch_size_plus_two(self):
             video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
 
             video_processed = video_processing(video_inputs, return_tensors="pt")
             encoded_videos = video_processed[self.input_name]
-            self.assertEqual(list(encoded_videos.shape), [8, 2304])
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 16 * 16])
 
             video_grid_thw = video_processed["video_grid_thw"]
             self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])

From d059f0931abb0b75a95ab2529140747b81c593ef Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Mon, 10 Nov 2025 18:57:41 +0800
Subject: [PATCH 6/8] delete padding when num_frames < temporal_patch_size

---
 .../qwen3_vl/video_processing_qwen3_vl.py      |  4 ----
 .../qwen3_vl/test_video_processing_qwen3_vl.py | 18 ------------------
 2 files changed, 22 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
index 217efbfe0ce6..e16c74730eec 100644
--- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -196,10 +196,6 @@ def _preprocess(
         resized_videos_grouped = {}
         for shape, stacked_videos in grouped_videos.items():
             if do_resize:
-                T = stacked_videos.shape[1]
-                if pad := -T % temporal_patch_size:
-                    repeats = stacked_videos[:, -1:].expand(-1, pad, -1, -1, -1)
-                    stacked_videos = torch.cat((stacked_videos, repeats), dim=1)
                 B, T, C, H, W = stacked_videos.shape
                 num_frames, height, width = T, H, W
                 resized_height, resized_width = smart_resize(
diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index 0f62aeab66f6..d3b9423030c2 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -329,24 +329,6 @@ def test_call_sample_frames(self):
         if prev_max_resolution is not None:
             self.video_processor_tester.max_resolution = prev_max_resolution
 
-    def test_only_one_image_input(self):
-        for video_processing_class in self.video_processor_list:
-            video_processor_dict = self.video_processor_dict.copy()
-            video_processor_dict["size"] = {"longest_edge": 1 * 32 * 32, "shortest_edge": 32 * 32}
-            video_processor_dict["do_sample_frames"] = False
-            video_processor_dict["temporal_patch_size"] = 3
-            video_processing = video_processing_class(**video_processor_dict)
-
-            n, w, h = 1, 32, 32
-            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
-
-            video_processed = video_processing(video_inputs, return_tensors="pt")
-            encoded_videos = video_processed[self.input_name]
-            self.assertEqual(list(encoded_videos.shape), [4, 2304])
-
-            video_grid_thw = video_processed["video_grid_thw"]
-            self.assertEqual(video_grid_thw.tolist(), [[1, 2, 2]])
-
     def test_num_frames_equal_temporal_patch_size_plus_two(self):
         for video_processing_class in self.video_processor_list:
             video_processor_dict = self.video_processor_dict.copy()

From 59994f0841f1a3641ced696ebfb477572aabbfcf Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Mon, 10 Nov 2025 18:59:20 +0800
Subject: [PATCH 7/8] to default format

---
 src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
index e16c74730eec..d3292c9e39e9 100644
--- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -195,9 +195,9 @@ def _preprocess(
 
         resized_videos_grouped = {}
         for shape, stacked_videos in grouped_videos.items():
+            B, T, C, H, W = stacked_videos.shape
+            num_frames, height, width = T, H, W
             if do_resize:
-                B, T, C, H, W = stacked_videos.shape
-                num_frames, height, width = T, H, W
                 resized_height, resized_width = smart_resize(
                     num_frames=num_frames,
                     height=height,

From 040d88816358a4f3dc654e157f0926dba9f7be00 Mon Sep 17 00:00:00 2001
From: yaogang2060
Date: Mon, 10 Nov 2025 21:52:06 +0800
Subject: [PATCH 8/8] fix smart_resize in qwen3vl

---
 src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
index d3292c9e39e9..90d7dd0abfb9 100644
--- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -40,8 +40,6 @@ def smart_resize(
     min_pixels: int = 128 * 128,
     max_pixels: int = 16 * 16 * 2 * 2 * 2 * 6144,
 ):
-    if num_frames < temporal_factor:
-        raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}")
     if height < factor or width < factor:
         raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
     elif max(height, width) / min(height, width) > 200:
         raise ValueError(
         )
     h_bar = round(height / factor) * factor
     w_bar = round(width / factor) * factor
-    t_bar = round(num_frames / temporal_factor) * temporal_factor
+    t_bar = math.ceil(num_frames / temporal_factor) * temporal_factor
 
     if t_bar * h_bar * w_bar > max_pixels:
         beta = math.sqrt((num_frames * height * width) / max_pixels)