@@ -63,6 +63,63 @@ def load_bytes(
6363 ) -> tuple [npt .NDArray , dict [str , Any ]]:
6464 raise NotImplementedError
6565
66+ @staticmethod
67+ def _read_frames (
68+ cap ,
69+ frame_indices : set [int ],
70+ num_expected_frames : int ,
71+ max_frame_idx : int ,
72+ ) -> tuple [npt .NDArray , int , list [int ]]:
73+ import cv2
74+
75+ width = int (cap .get (cv2 .CAP_PROP_FRAME_WIDTH ))
76+ height = int (cap .get (cv2 .CAP_PROP_FRAME_HEIGHT ))
77+ frames = np .empty ((num_expected_frames , height , width , 3 ), dtype = np .uint8 )
78+
79+ i = 0
80+ valid_frame_indices = []
81+ for idx in range (max_frame_idx + 1 ):
82+ ok = cap .grab ()
83+ if not ok :
84+ # Frame is broken/unreadable, log warning
85+ if idx in frame_indices :
86+ logger .warning (
87+ "Failed to grab frame %d during video loading. "
88+ "This frame will be skipped." ,
89+ idx ,
90+ )
91+ continue
92+ if idx in frame_indices :
93+ ret , frame = cap .retrieve ()
94+ if ret :
95+ frames [i ] = cv2 .cvtColor (frame , cv2 .COLOR_BGR2RGB )
96+ valid_frame_indices .append (idx )
97+ i += 1
98+ else :
99+ # retrieve() failed even though grab() succeeded
100+ logger .warning (
101+ "Failed to retrieve frame %d during video loading. "
102+ "This frame will be skipped." ,
103+ idx ,
104+ )
105+
106+ valid_num_frames = len (valid_frame_indices )
107+ if valid_num_frames < num_expected_frames :
108+ logger .warning (
109+ "Video loading completed with %d broken/unreadable frames. "
110+ "Expected %d frames but only loaded %d frames." ,
111+ num_expected_frames - valid_num_frames ,
112+ num_expected_frames ,
113+ valid_num_frames ,
114+ )
115+
116+ assert i == valid_num_frames , (
117+ f"Expected reading { valid_num_frames } frames, "
118+ f"but only loaded { i } frames from video."
119+ )
120+
121+ return frames [:valid_num_frames ], valid_num_frames , valid_frame_indices
122+
66123
67124VIDEO_LOADER_REGISTRY = ExtensionManager ()
68125
@@ -120,24 +177,10 @@ def load_bytes(
120177 )
121178 frame_idx = uniform_sampled_frames .tolist ()
122179
123- width = int (cap .get (cv2 .CAP_PROP_FRAME_WIDTH ))
124- height = int (cap .get (cv2 .CAP_PROP_FRAME_HEIGHT ))
125- frames = np .empty ((len (frame_idx ), height , width , 3 ), dtype = np .uint8 )
126-
127- i = 0
128- for idx in range (max (frame_idx ) + 1 ):
129- ok = cap .grab ()
130- if not ok :
131- break
132- if idx in frame_idx :
133- ret , frame = cap .retrieve ()
134- if ret :
135- frames [i ] = cv2 .cvtColor (frame , cv2 .COLOR_BGR2RGB )
136- i += 1
137-
138- assert i == num_frames_to_sample , (
139- f"Expected reading { num_frames_to_sample } frames, "
140- f"but only loaded { i } frames from video."
180+ # Convert to set for O(1) lookup performance
181+ frame_idx_set = set (frame_idx )
182+ frames , valid_num_frames , valid_frame_indices = cls ._read_frames (
183+ cap , frame_idx_set , num_frames_to_sample , max (frame_idx )
141184 )
142185
143186 # Use transformers transformers.video_utils.VideoMetadata format
@@ -148,10 +191,10 @@ def load_bytes(
148191 "fps" : original_fps ,
149192 "duration" : duration ,
150193 "video_backend" : "opencv" ,
151- "frames_indices" : list ( frame_idx ) ,
194+ "frames_indices" : valid_frame_indices ,
152195 # extra field used to control hf processor's video
153196 # sampling behavior
154- "do_sample_frames" : num_frames_to_sample == total_frames_num ,
197+ "do_sample_frames" : valid_num_frames == total_frames_num ,
155198 }
156199
157200 return frames , metadata
@@ -185,10 +228,10 @@ def load_bytes(
185228
186229 # Refer to:
187230 # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
188- frame_indices : range | list [int ]
231+ frame_indices_list : list [int ]
189232 if duration <= max_duration :
190233 n = int (math .floor (duration * fps ))
191- frame_indices = sorted (
234+ frame_indices_list = sorted (
192235 {
193236 min (max_frame_idx , int (math .ceil (i * original_fps / fps )))
194237 for i in range (n )
@@ -197,34 +240,23 @@ def load_bytes(
197240 else :
198241 num_samples = int (max_duration * fps )
199242 if num_samples >= total_frames_num :
200- frame_indices = range (total_frames_num )
243+ frame_indices_list = list ( range (total_frames_num ) )
201244 else :
202245 target_seconds = np .linspace (0 , duration , num_samples , endpoint = True )
203- frame_indices = sorted (
246+ frame_indices_list = sorted (
204247 {
205248 min (max_frame_idx , int (math .ceil (t * original_fps )))
206249 for t in target_seconds
207250 }
208251 )
209252
210- width = int (cap .get (cv2 .CAP_PROP_FRAME_WIDTH ))
211- height = int (cap .get (cv2 .CAP_PROP_FRAME_HEIGHT ))
212- frames = np .empty ((len (frame_indices ), height , width , 3 ), dtype = np .uint8 )
213-
214- i = 0
215- for idx in range (total_frames_num ):
216- ok = cap .grab ()
217- if not ok :
218- break
219- if idx in frame_indices :
220- ret , frame = cap .retrieve ()
221- if ret :
222- frames [i ] = cv2 .cvtColor (frame , cv2 .COLOR_BGR2RGB )
223- i += 1
224-
225- assert i == len (frame_indices ), (
226- f"Expected reading { len (frame_indices )} frames, "
227- f"but only loaded { i } frames from video."
253+ # Convert to set for O(1) lookup performance
254+ frame_indices_set = set (frame_indices_list )
255+ frames , valid_num_frames , valid_frame_indices = cls ._read_frames (
256+ cap ,
257+ frame_indices_set ,
258+ len (frame_indices_list ),
259+ total_frames_num - 1 ,
228260 )
229261
230262 # Use transformers transformers.video_utils.VideoMetadata format
@@ -233,7 +265,7 @@ def load_bytes(
233265 "fps" : original_fps ,
234266 "duration" : duration ,
235267 "video_backend" : "opencv_dynamic" ,
236- "frames_indices" : list ( frame_indices ) ,
268+ "frames_indices" : valid_frame_indices ,
237269 "do_sample_frames" : False ,
238270 }
239271
0 commit comments