 import numpy as np
 from PIL import Image
 import logging as log
+from transformers.image_utils import load_image
 from .model_utils import get_param_from_file
+from .model_utils import resolve_media_file_path
 from .parse_json_data import parse_text_json_data
+from .parse_json_data import parse_vlm_json_data
+from pathlib import Path
+import openvino as ov


 def get_text_prompt(args):
@@ -26,19 +31,35 @@ def get_text_prompt(args):


 def print_video_frames_number_and_convert_to_tensor(func):
-    def inner(video_path, decym_frames):
+    def inner(video_path, decym_frames, genai_flag):
         log.info(f"Input video file: {video_path}")
         if decym_frames is not None:
             log.info(f"Requested to reduce into {decym_frames} frames")
         out_frames = func(video_path, decym_frames)
         log.info(f"Final frames number: {len(out_frames)}")
+        log.info(f"First frame shape: {out_frames[0].shape}")
+        log.info(f"First frame dtype: {out_frames[0].dtype}")
+        if genai_flag:
+            return [ov.Tensor(frame) for frame in out_frames]
         return np.array(out_frames)
     return inner

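As a hedged sketch of how this decorator behaves (the file name and frame count below are placeholders): the wrapper adds a genai_flag parameter that the wrapped function never receives, and it converts the returned frame list either into a single stacked np.ndarray or, when genai_flag is set, into a list of per-frame ov.Tensor objects. For the decorated make_video_tensor defined just below, a call could look like:

frames_np = make_video_tensor(Path("sample.mp4"), 16, False)  # np.ndarray, shape (N, H, W, 3)
frames_ov = make_video_tensor(Path("sample.mp4"), 16, True)   # list of ov.Tensor, one per frame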

 @print_video_frames_number_and_convert_to_tensor
 def make_video_tensor(video_path, decym_frames=None):
-    supported_files = set([".mp4"])
+    supported_files = {
+        '.mp4',   # MPEG-4 (most common)
+        '.avi',   # Audio Video Interleave
+        '.mov',   # QuickTime Movie
+        '.mkv',   # Matroska Video
+        '.wmv',   # Windows Media Video
+        '.flv',   # Flash Video
+        '.webm',  # WebM
+        '.m4v',   # iTunes Video
+        '.3gp',   # 3GPP
+        '.mpeg',  # MPEG
+        '.mpg'    # MPEG
+    }

     assert os.path.exists(video_path), f"no input video file: {video_path}"
     assert video_path.suffix.lower() in supported_files, "no supported video file"
@@ -49,38 +70,95 @@ def make_video_tensor(video_path, decym_frames=None):
         ret, frame = cap.read()
         if not ret:
             break
+
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
         pil_image = Image.fromarray(frame_rgb)

-        shape = np.array(pil_image).shape
-        dtype = np.array(pil_image).dtype
-        log.info(f"Video shape: {shape}")
-        log.info(f"Video dtype: {dtype}")
-        new_frame = np.zeros(shape, dtype)
-
-        width, height = pil_image.size
-        log.info(f"Video size: {width}x{height}")
-        for x in range(0, width):
-            for y in range(0, height):
-                new_frame[y, x] = frame_rgb[y, x]
-        output_frames.append(np.array(pil_image))
+        np_img_array = np.array(pil_image)
+        log.debug(f"Video shape: {np_img_array.shape}")
+        log.debug(f"Video dtype: {np_img_array.dtype}")
+        output_frames.append(np_img_array)

     if decym_frames is None:
+        log.info("Video decym: none: skip")
         return output_frames
     if int(decym_frames) == 0:
+        log.info("Video decym: zero: skip")
         return output_frames

-    # decimation procedure
-    # decim_fames is required frame number if positive
-    # or decimation factor if negative
+    # decimation procedure:
+    # decym_frames is the required maximum frame number if positive,
+    # or the decimation factor if negative

     decym_frames = int(decym_frames)
     if decym_frames > 0:
         if len(output_frames) <= decym_frames:
-            return output_frames
-        decym_factor = int(len(output_frames) / decym_frames)
+            log.info(f"Video decym: too short to decym: crop: {decym_frames}")
+            return list(output_frames[:decym_frames])
+        decym_factor = 1 + int(len(output_frames) / decym_frames)
     else:
         decym_factor = -decym_frames
+    log.info(f"Video decym factor: {decym_factor}")
     if decym_factor >= 2:
-        return output_frames[::decym_factor]
+        return list(output_frames[::decym_factor])
+    log.info("Video decym: decym factor below 2: skip")
     return output_frames
+
+
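A worked example of the decimation rule above, with illustrative numbers: for a 100-frame clip and decym_frames=16, the factor is 1 + 100 // 16 = 7, so output_frames[::7] keeps 15 frames (slightly below the requested maximum); for decym_frames=-4 the factor is simply 4 and every 4th frame is kept.

frames = list(range(100))            # stand-in for decoded frames
factor = 1 + len(frames) // 16       # -> 7
assert len(frames[::factor]) == 15   # can land below the requested 16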
+def load_image_genai(image_path):
+    pil_image = load_image(image_path)
+    image_data = np.array(pil_image)[None]
+    return ov.Tensor(image_data)
+
+
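A minimal sketch of what load_image_genai produces, assuming a local RGB image whose path is a placeholder: transformers' load_image yields a PIL image, np.array(...)[None] adds a batch axis, and the result is wrapped in an ov.Tensor holding batched NHWC uint8 data.

image_tensor = load_image_genai("cat.png")  # placeholder path
print(image_tensor.shape)                   # e.g. [1, H, W, 3]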
+def extract_prompt_issues(inputs, required_frames, genai_flag):
+    prompts, images, videos = [], [], []
+    if not isinstance(inputs, (list, tuple, set)):
+        inputs = [inputs]
+    for input_data in inputs:
+        if input_data.get("video") is not None:
+            entry = Path(input_data["video"])
+            if entry.is_dir():
+                for filename in sorted(entry.iterdir()):
+                    video_tensor = make_video_tensor(filename, required_frames, genai_flag)
+                    if genai_flag:
+                        videos.extend(video_tensor)
+                    else:
+                        videos.append(video_tensor)
+            else:
+                video_tensor = make_video_tensor(entry, required_frames, genai_flag)
+                if genai_flag:
+                    videos.extend(video_tensor)
+                else:
+                    videos.append(video_tensor)
+        if input_data.get("media") is not None:
+            func_load_image = load_image_genai if genai_flag else load_image
+            entry = Path(input_data["media"])
+            if entry.is_dir():
+                for file in sorted(entry.iterdir()):
+                    img = func_load_image(str(file))
+                    images.append(img)
+            else:
+                img = func_load_image(input_data["media"])
+                images.append(img)
+        prompts.append(input_data["prompt"])
+    return prompts, images, videos
+
+
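A sketch of how extract_prompt_issues might be fed (paths, prompt texts, and the frame budget are placeholders): each entry carries a "prompt" plus an optional "media" (image file or directory) or "video" (file or directory) field.

inputs = [
    {"prompt": "Describe the image", "media": "images/cat.png"},
    {"prompt": "Summarize the clip", "video": "clips/demo.mp4"},
]
prompts, images, videos = extract_prompt_issues(inputs, required_frames=16, genai_flag=True)
# with genai_flag=True the images are ov.Tensor objects and videos is a flat list of per-frame ov.Tensor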
+def get_image_text_prompt(args):
+    vlm_file_list = []
+    output_data_list, is_json_data = get_param_from_file(args, ["video", "media", "prompt"])
+    if is_json_data:
+        vlm_param_list = parse_vlm_json_data(output_data_list)
+        if len(vlm_param_list) > 0:
+            for vlm_file in vlm_param_list:
+                if args['prompt_file'] is not None and len(args['prompt_file']) > 0 and 'media' in vlm_file:
+                    if 'video' in vlm_file:
+                        raise ValueError('media and video cannot be specified in a single prompt file')
+                    vlm_file['media'] = resolve_media_file_path(vlm_file.get('media'), args['prompt_file'][0])
+                if args['prompt_file'] is not None and len(args['prompt_file']) > 0 and 'video' in vlm_file:
+                    vlm_file['video'] = resolve_media_file_path(vlm_file.get('video'), args['prompt_file'][0])
+                vlm_file_list.append(vlm_file)
+    else:
+        vlm_file_list.append(output_data_list)
+    return vlm_file_list
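The function requests the "video", "media" and "prompt" keys from get_param_from_file, so each entry parsed by parse_vlm_json_data is expected to look roughly like the dicts below; the exact on-disk prompt-file layout is an assumption here and the paths are placeholders. A single entry may reference "media" or "video", but not both, when a prompt file is supplied.

image_entry = {"media": "images/cat.png", "prompt": "What is on the picture?"}   # hypothetical entry
video_entry = {"video": "clips/demo.mp4", "prompt": "What happens in this video?"}  # hypothetical entry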