[Bugfix] Handle broken frames in video loading (#29001)

gcanlin · Jixin10 · web-flow · commit fe25772aa97b · 2025-11-20T04:38:12.000Z
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Signed-off-by: 凌葭 <lvjiang.lj@alibaba-inc.com>
Co-authored-by: 凌葭 <lvjiang.lj@alibaba-inc.com>
diff --git a/tests/multimodal/assets/corrupted.mp4 b/tests/multimodal/assets/corrupted.mp4
diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
@@ -18,6 +18,7 @@
 
 pytestmark = pytest.mark.cpu_test
 
+ASSETS_DIR = Path(__file__).parent / "assets"
 NUM_FRAMES = 10
 FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
 FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@@ -140,3 +141,39 @@ def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str):
             )
             assert np.sum(np.isnan(sim)) / sim.size < 0.001
             assert np.nanmean(sim) > 0.99
+
+
+def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
+    """
+    Regression test for handling videos with broken frames.
+    This test uses a pre-corrupted video file (assets/corrupted.mp4) that
+    contains broken/unreadable frames to verify the video loader handles
+    them gracefully without crashing and returns accurate metadata.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
+
+        # Load the pre-corrupted video file that contains broken frames
+        corrupted_video_path = ASSETS_DIR / "corrupted.mp4"
+
+        with open(corrupted_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv")
+        frames, metadata = loader.load_bytes(video_data, num_frames=-1)
+
+        # Verify metadata consistency:
+        # frames_indices must match actual loaded frames
+        assert frames.shape[0] == len(metadata["frames_indices"]), (
+            f"Frames array size must equal frames_indices length. "
+            f"Got {frames.shape[0]} frames but "
+            f"{len(metadata['frames_indices'])} indices"
+        )
+
+        # Verify that broken frames were skipped:
+        # loaded frames should be less than total
+        assert frames.shape[0] < metadata["total_num_frames"], (
+            f"Should load fewer frames than total due to broken frames. "
+            f"Expected fewer than {metadata['total_num_frames']} frames, "
+            f"but loaded {frames.shape[0]} frames"
+        )
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
@@ -63,6 +63,63 @@ def load_bytes(
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         raise NotImplementedError
 
+    @staticmethod
+    def _read_frames(
+        cap,
+        frame_indices: set[int],
+        num_expected_frames: int,
+        max_frame_idx: int,
+    ) -> tuple[npt.NDArray, int, list[int]]:
+        import cv2
+
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8)
+
+        i = 0
+        valid_frame_indices = []
+        for idx in range(max_frame_idx + 1):
+            ok = cap.grab()
+            if not ok:
+                # Frame is broken/unreadable, log warning
+                if idx in frame_indices:
+                    logger.warning(
+                        "Failed to grab frame %d during video loading. "
+                        "This frame will be skipped.",
+                        idx,
+                    )
+                continue
+            if idx in frame_indices:
+                ret, frame = cap.retrieve()
+                if ret:
+                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    valid_frame_indices.append(idx)
+                    i += 1
+                else:
+                    # retrieve() failed even though grab() succeeded
+                    logger.warning(
+                        "Failed to retrieve frame %d during video loading. "
+                        "This frame will be skipped.",
+                        idx,
+                    )
+
+        valid_num_frames = len(valid_frame_indices)
+        if valid_num_frames < num_expected_frames:
+            logger.warning(
+                "Video loading completed with %d broken/unreadable frames. "
+                "Expected %d frames but only loaded %d frames.",
+                num_expected_frames - valid_num_frames,
+                num_expected_frames,
+                valid_num_frames,
+            )
+
+        assert i == valid_num_frames, (
+            f"Expected reading {valid_num_frames} frames, "
+            f"but only loaded {i} frames from video."
+        )
+
+        return frames[:valid_num_frames], valid_num_frames, valid_frame_indices
+
 
 VIDEO_LOADER_REGISTRY = ExtensionManager()
 
@@ -120,24 +177,10 @@ def load_bytes(
             )
             frame_idx = uniform_sampled_frames.tolist()
 
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)
-
-        i = 0
-        for idx in range(max(frame_idx) + 1):
-            ok = cap.grab()
-            if not ok:
-                break
-            if idx in frame_idx:
-                ret, frame = cap.retrieve()
-                if ret:
-                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    i += 1
-
-        assert i == num_frames_to_sample, (
-            f"Expected reading {num_frames_to_sample} frames, "
-            f"but only loaded {i} frames from video."
+        # Convert to set for O(1) lookup performance
+        frame_idx_set = set(frame_idx)
+        frames, valid_num_frames, valid_frame_indices = cls._read_frames(
+            cap, frame_idx_set, num_frames_to_sample, max(frame_idx)
         )
 
         # Use transformers transformers.video_utils.VideoMetadata format
@@ -148,10 +191,10 @@ def load_bytes(
             "fps": original_fps,
             "duration": duration,
             "video_backend": "opencv",
-            "frames_indices": list(frame_idx),
+            "frames_indices": valid_frame_indices,
             # extra field used to control hf processor's video
             # sampling behavior
-            "do_sample_frames": num_frames_to_sample == total_frames_num,
+            "do_sample_frames": valid_num_frames == total_frames_num,
         }
 
         return frames, metadata
@@ -185,10 +228,10 @@ def load_bytes(
 
         # Refer to:
         # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
-        frame_indices: range | list[int]
+        frame_indices_list: list[int]
         if duration <= max_duration:
             n = int(math.floor(duration * fps))
-            frame_indices = sorted(
+            frame_indices_list = sorted(
                 {
                     min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
                     for i in range(n)
@@ -197,34 +240,23 @@ def load_bytes(
         else:
             num_samples = int(max_duration * fps)
             if num_samples >= total_frames_num:
-                frame_indices = range(total_frames_num)
+                frame_indices_list = list(range(total_frames_num))
             else:
                 target_seconds = np.linspace(0, duration, num_samples, endpoint=True)
-                frame_indices = sorted(
+                frame_indices_list = sorted(
                     {
                         min(max_frame_idx, int(math.ceil(t * original_fps)))
                         for t in target_seconds
                     }
                 )
 
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8)
-
-        i = 0
-        for idx in range(total_frames_num):
-            ok = cap.grab()
-            if not ok:
-                break
-            if idx in frame_indices:
-                ret, frame = cap.retrieve()
-                if ret:
-                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    i += 1
-
-        assert i == len(frame_indices), (
-            f"Expected reading {len(frame_indices)} frames, "
-            f"but only loaded {i} frames from video."
+        # Convert to set for O(1) lookup performance
+        frame_indices_set = set(frame_indices_list)
+        frames, valid_num_frames, valid_frame_indices = cls._read_frames(
+            cap,
+            frame_indices_set,
+            len(frame_indices_list),
+            total_frames_num - 1,
         )
 
         # Use transformers transformers.video_utils.VideoMetadata format
@@ -233,7 +265,7 @@ def load_bytes(
             "fps": original_fps,
             "duration": duration,
             "video_backend": "opencv_dynamic",
-            "frames_indices": list(frame_indices),
+            "frames_indices": valid_frame_indices,
             "do_sample_frames": False,
         }