Skip to content

Commit fe25772

Browse files
gcanlinJixin10
andauthored
[Bugfix] Handle broken frames in video loading (#29001)
Signed-off-by: gcanlin <canlinguosdu@gmail.com> Signed-off-by: 凌葭 <lvjiang.lj@alibaba-inc.com> Co-authored-by: 凌葭 <lvjiang.lj@alibaba-inc.com>
1 parent 0cca9b4 commit fe25772

File tree

3 files changed

+112
-43
lines changed

3 files changed

+112
-43
lines changed
89.5 KB
Binary file not shown.

tests/multimodal/test_video.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
pytestmark = pytest.mark.cpu_test
2020

21+
ASSETS_DIR = Path(__file__).parent / "assets"
2122
NUM_FRAMES = 10
2223
FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
2324
FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@@ -140,3 +141,39 @@ def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str):
140141
)
141142
assert np.sum(np.isnan(sim)) / sim.size < 0.001
142143
assert np.nanmean(sim) > 0.99
144+
145+
146+
def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
147+
"""
148+
Regression test for handling videos with broken frames.
149+
This test uses a pre-corrupted video file (assets/corrupted.mp4) that
150+
contains broken/unreadable frames to verify the video loader handles
151+
them gracefully without crashing and returns accurate metadata.
152+
"""
153+
with monkeypatch.context() as m:
154+
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
155+
156+
# Load the pre-corrupted video file that contains broken frames
157+
corrupted_video_path = ASSETS_DIR / "corrupted.mp4"
158+
159+
with open(corrupted_video_path, "rb") as f:
160+
video_data = f.read()
161+
162+
loader = VIDEO_LOADER_REGISTRY.load("opencv")
163+
frames, metadata = loader.load_bytes(video_data, num_frames=-1)
164+
165+
# Verify metadata consistency:
166+
# frames_indices must match actual loaded frames
167+
assert frames.shape[0] == len(metadata["frames_indices"]), (
168+
f"Frames array size must equal frames_indices length. "
169+
f"Got {frames.shape[0]} frames but "
170+
f"{len(metadata['frames_indices'])} indices"
171+
)
172+
173+
# Verify that broken frames were skipped:
174+
# loaded frames should be less than total
175+
assert frames.shape[0] < metadata["total_num_frames"], (
176+
f"Should load fewer frames than total due to broken frames. "
177+
f"Expected fewer than {metadata['total_num_frames']} frames, "
178+
f"but loaded {frames.shape[0]} frames"
179+
)

vllm/multimodal/video.py

Lines changed: 75 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,63 @@ def load_bytes(
6363
) -> tuple[npt.NDArray, dict[str, Any]]:
6464
raise NotImplementedError
6565

66+
@staticmethod
67+
def _read_frames(
68+
cap,
69+
frame_indices: set[int],
70+
num_expected_frames: int,
71+
max_frame_idx: int,
72+
) -> tuple[npt.NDArray, int, list[int]]:
73+
import cv2
74+
75+
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
76+
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
77+
frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8)
78+
79+
i = 0
80+
valid_frame_indices = []
81+
for idx in range(max_frame_idx + 1):
82+
ok = cap.grab()
83+
if not ok:
84+
# Frame is broken/unreadable, log warning
85+
if idx in frame_indices:
86+
logger.warning(
87+
"Failed to grab frame %d during video loading. "
88+
"This frame will be skipped.",
89+
idx,
90+
)
91+
continue
92+
if idx in frame_indices:
93+
ret, frame = cap.retrieve()
94+
if ret:
95+
frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
96+
valid_frame_indices.append(idx)
97+
i += 1
98+
else:
99+
# retrieve() failed even though grab() succeeded
100+
logger.warning(
101+
"Failed to retrieve frame %d during video loading. "
102+
"This frame will be skipped.",
103+
idx,
104+
)
105+
106+
valid_num_frames = len(valid_frame_indices)
107+
if valid_num_frames < num_expected_frames:
108+
logger.warning(
109+
"Video loading completed with %d broken/unreadable frames. "
110+
"Expected %d frames but only loaded %d frames.",
111+
num_expected_frames - valid_num_frames,
112+
num_expected_frames,
113+
valid_num_frames,
114+
)
115+
116+
assert i == valid_num_frames, (
117+
f"Expected reading {valid_num_frames} frames, "
118+
f"but only loaded {i} frames from video."
119+
)
120+
121+
return frames[:valid_num_frames], valid_num_frames, valid_frame_indices
122+
66123

67124
VIDEO_LOADER_REGISTRY = ExtensionManager()
68125

@@ -120,24 +177,10 @@ def load_bytes(
120177
)
121178
frame_idx = uniform_sampled_frames.tolist()
122179

123-
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
124-
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
125-
frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)
126-
127-
i = 0
128-
for idx in range(max(frame_idx) + 1):
129-
ok = cap.grab()
130-
if not ok:
131-
break
132-
if idx in frame_idx:
133-
ret, frame = cap.retrieve()
134-
if ret:
135-
frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
136-
i += 1
137-
138-
assert i == num_frames_to_sample, (
139-
f"Expected reading {num_frames_to_sample} frames, "
140-
f"but only loaded {i} frames from video."
180+
# Convert to set for O(1) lookup performance
181+
frame_idx_set = set(frame_idx)
182+
frames, valid_num_frames, valid_frame_indices = cls._read_frames(
183+
cap, frame_idx_set, num_frames_to_sample, max(frame_idx)
141184
)
142185

143186
# Use transformers transformers.video_utils.VideoMetadata format
@@ -148,10 +191,10 @@ def load_bytes(
148191
"fps": original_fps,
149192
"duration": duration,
150193
"video_backend": "opencv",
151-
"frames_indices": list(frame_idx),
194+
"frames_indices": valid_frame_indices,
152195
# extra field used to control hf processor's video
153196
# sampling behavior
154-
"do_sample_frames": num_frames_to_sample == total_frames_num,
197+
"do_sample_frames": valid_num_frames == total_frames_num,
155198
}
156199

157200
return frames, metadata
@@ -185,10 +228,10 @@ def load_bytes(
185228

186229
# Refer to:
187230
# https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
188-
frame_indices: range | list[int]
231+
frame_indices_list: list[int]
189232
if duration <= max_duration:
190233
n = int(math.floor(duration * fps))
191-
frame_indices = sorted(
234+
frame_indices_list = sorted(
192235
{
193236
min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
194237
for i in range(n)
@@ -197,34 +240,23 @@ def load_bytes(
197240
else:
198241
num_samples = int(max_duration * fps)
199242
if num_samples >= total_frames_num:
200-
frame_indices = range(total_frames_num)
243+
frame_indices_list = list(range(total_frames_num))
201244
else:
202245
target_seconds = np.linspace(0, duration, num_samples, endpoint=True)
203-
frame_indices = sorted(
246+
frame_indices_list = sorted(
204247
{
205248
min(max_frame_idx, int(math.ceil(t * original_fps)))
206249
for t in target_seconds
207250
}
208251
)
209252

210-
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
211-
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
212-
frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8)
213-
214-
i = 0
215-
for idx in range(total_frames_num):
216-
ok = cap.grab()
217-
if not ok:
218-
break
219-
if idx in frame_indices:
220-
ret, frame = cap.retrieve()
221-
if ret:
222-
frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
223-
i += 1
224-
225-
assert i == len(frame_indices), (
226-
f"Expected reading {len(frame_indices)} frames, "
227-
f"but only loaded {i} frames from video."
253+
# Convert to set for O(1) lookup performance
254+
frame_indices_set = set(frame_indices_list)
255+
frames, valid_num_frames, valid_frame_indices = cls._read_frames(
256+
cap,
257+
frame_indices_set,
258+
len(frame_indices_list),
259+
total_frames_num - 1,
228260
)
229261

230262
# Use transformers transformers.video_utils.VideoMetadata format
@@ -233,7 +265,7 @@ def load_bytes(
233265
"fps": original_fps,
234266
"duration": duration,
235267
"video_backend": "opencv_dynamic",
236-
"frames_indices": list(frame_indices),
268+
"frames_indices": valid_frame_indices,
237269
"do_sample_frames": False,
238270
}
239271

0 commit comments

Comments
 (0)