88import numpy as np
99from PIL import Image
1010import logging as log
11+ from transformers .image_utils import load_image
1112from .model_utils import get_param_from_file
13+ from .model_utils import resolve_media_file_path
1214from .parse_json_data import parse_text_json_data
15+ from .parse_json_data import parse_vlm_json_data
16+ from pathlib import Path
17+ import openvino as ov
1318
1419
1520def get_text_prompt (args ):
@@ -26,19 +31,36 @@ def get_text_prompt(args):
2631
2732
def print_video_frames_number_and_convert_to_tensor(func):
    """Decorate a frame extractor with logging and output conversion.

    The wrapped ``func`` is called as ``func(video_path, genai_flag,
    decym_frames)`` and must return a sequence of per-frame numpy arrays.

    Args:
        func: Frame extractor to wrap.

    Returns:
        A wrapper with the signature ``(video_path, genai_flag,
        decym_frames=None)``. When ``genai_flag`` is truthy the wrapper
        returns a list with one ``ov.Tensor`` per frame, each built from the
        frame with a leading axis of length 1 added. Otherwise it returns a
        single ``np.ndarray`` stacking all frames.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from functools import wraps

    @wraps(func)
    def inner(video_path, genai_flag, decym_frames=None):
        log.info(f"Input video file: {video_path}")
        if decym_frames is not None:
            log.info(f"Requested to reduce into {decym_frames} frames")
        out_frames = func(video_path, genai_flag, decym_frames)
        log.info(f"Final frames number: {len(out_frames)}")
        # A video that yields no frames must not crash the diagnostics:
        # indexing [0] on an empty sequence raises IndexError.
        if len(out_frames) > 0:
            log.info(f"First frame shape: {out_frames[0].shape}")
            log.info(f"First frame dtype: {out_frames[0].dtype}")
        if genai_flag:
            return [ov.Tensor(frame[None]) for frame in out_frames]
        return np.array(out_frames)

    return inner
3747
3848
3949@print_video_frames_number_and_convert_to_tensor
40- def make_video_tensor (video_path , decym_frames = None ):
41- supported_files = set ([".mp4" ])
50+ def make_video_tensor (video_path , genai_flag , decym_frames = None ):
51+ supported_files = {
52+ '.mp4' , # MPEG-4 (most common)
53+ '.avi' , # Audio Video Interleave
54+ '.mov' , # QuickTime Movie
55+ '.mkv' , # Matroska Video
56+ '.wmv' , # Windows Media Video
57+ '.flv' , # Flash Video
58+ '.webm' , # WebM
59+ '.m4v' , # iTunes Video
60+ '.3gp' , # 3GPP
61+ '.mpeg' , # MPEG
62+ '.mpg' # MPEG
63+ }
4264
4365 assert os .path .exists (video_path ), f"no input video file: { video_path } "
4466 assert video_path .suffix .lower () in supported_files , "no supported video file"
@@ -49,38 +71,89 @@ def make_video_tensor(video_path, decym_frames=None):
4971 ret , frame = cap .read ()
5072 if not ret :
5173 break
74+
5275 frame_rgb = cv2 .cvtColor (frame , cv2 .COLOR_BGR2RGB )
5376 pil_image = Image .fromarray (frame_rgb )
5477
55- shape = np .array (pil_image ).shape
56- dtype = np .array (pil_image ).dtype
57- log .info (f"Video shape: { shape } " )
58- log .info (f"Video dtype: { dtype } " )
59- new_frame = np .zeros (shape , dtype )
60-
61- width , height = pil_image .size
62- log .info (f"Video size: { width } x{ height } " )
63- for x in range (0 , width ):
64- for y in range (0 , height ):
65- new_frame [y , x ] = frame_rgb [y , x ]
66- output_frames .append (np .array (pil_image ))
78+ np_img_array = np .array (pil_image )
79+ log .debug (f"Video shape: { np_img_array .shape } " )
80+ log .debug (f"Video dtype: { np_img_array .dtype } " )
81+ output_frames .append (np_img_array )
6782
6883 if decym_frames is None :
84+ log .info ("Video decym: none: skip" )
6985 return output_frames
7086 if int (decym_frames ) == 0 :
87+ log .info ("Video decym: zero: skip" )
7188 return output_frames
7289
73- # decimation procedure
74- # decim_fames is required frame number if positive
75- # or decimation factor if negative
90+ # decymation procedure
91+ # decym_fames is required max frame number if positive
92+ # or decymation factor if negative
7693
7794 decym_frames = int (decym_frames )
7895 if decym_frames > 0 :
7996 if len (output_frames ) <= decym_frames :
80- return output_frames
81- decym_factor = int (len (output_frames ) / decym_frames )
97+ log .info (f"Video decym: too short to decym: crop: { decym_frames } " )
98+ return list (output_frames [:decym_frames ])
99+ decym_factor = 1 + int (len (output_frames ) / decym_frames )
82100 else :
83101 decym_factor = - decym_frames
102+ log .info (f"Video decym factor: { decym_factor } " )
84103 if decym_factor >= 2 :
85- return output_frames [::decym_factor ]
104+ return list (output_frames [::decym_factor ])
105+ log .info ("Video decym: too large decym factor: skip" )
86106 return output_frames
107+
108+
def load_image_genai(image_path):
    """Load an image and return it as an OpenVINO tensor.

    The image is read with ``load_image`` (imported from
    ``transformers.image_utils``), converted to a numpy array, and given a
    leading axis of length 1 before being wrapped in ``ov.Tensor``.

    Args:
        image_path: Image location, passed unchanged to ``load_image``.

    Returns:
        An ``ov.Tensor`` built from the image array with one extra leading
        axis.
    """
    pixels = np.array(load_image(image_path))
    batched_pixels = pixels[np.newaxis]
    return ov.Tensor(batched_pixels)
113+
114+
def extract_prompt_issues(inputs, genai_flag, required_frames):
    """Collect prompts, images and video frames from input records.

    Args:
        inputs: One record, or a list/tuple/set of records. Each record is
            accessed with ``.get("video")``, ``.get("media")`` and
            ``["prompt"]``.
        genai_flag: Forwarded to ``make_video_tensor``; also selects
            ``load_image_genai`` (truthy) or ``load_image`` (falsy) for
            images.
        required_frames: Forwarded to ``make_video_tensor`` as the frame
            reduction setting.

    Returns:
        A ``(prompts, images, videos)`` tuple of lists. ``videos`` is
        extended with the items of each ``make_video_tensor`` result.

    Raises:
        KeyError: If a record has no ``"prompt"`` key.
    """
    if not isinstance(inputs, (list, tuple, set)):
        inputs = [inputs]

    prompts = []
    images = []
    videos = []
    for record in inputs:
        video_entry = record.get("video")
        if video_entry is not None:
            video_path = Path(video_entry)
            # A directory contributes every entry it contains, in sorted order.
            video_files = sorted(video_path.iterdir()) if video_path.is_dir() else [video_path]
            for video_file in video_files:
                videos.extend(make_video_tensor(video_file, genai_flag, required_frames))

        media_entry = record.get("media")
        if media_entry is not None:
            loader = load_image_genai if genai_flag else load_image
            media_path = Path(media_entry)
            if media_path.is_dir():
                images.extend(loader(str(image_file)) for image_file in sorted(media_path.iterdir()))
            else:
                # The raw value (not the Path) is handed to the loader here.
                images.append(loader(media_entry))

        prompts.append(record["prompt"])
    return prompts, images, videos
141+
142+
def get_image_text_prompt(args):
    """Build the list of VLM prompt entries described by ``args``.

    Parameters are read with ``get_param_from_file`` for the keys
    ``"video"``, ``"media"`` and ``"prompt"``. When the result is not JSON
    data it is returned as the single element of the list. Otherwise it is
    parsed with ``parse_vlm_json_data`` and each entry is post-processed:
    if ``args['prompt_file']`` is non-empty, the entry's ``'media'`` or
    ``'video'`` value is replaced by the result of
    ``resolve_media_file_path`` called with that value and the first prompt
    file.

    Args:
        args: Mapping of benchmark arguments; ``args['prompt_file']`` is read
            for every parsed entry.

    Returns:
        A list of prompt entries.

    Raises:
        ValueError: If a prompt file is given and an entry contains both
            ``'media'`` and ``'video'``.
    """
    param_data, is_json_data = get_param_from_file(args, ["video", "media", "prompt"])
    if not is_json_data:
        return [param_data]

    vlm_file_list = []
    for vlm_entry in parse_vlm_json_data(param_data):
        prompt_files = args['prompt_file']
        has_prompt_file = prompt_files is not None and len(prompt_files) > 0
        if has_prompt_file and 'media' in vlm_entry:
            if 'video' in vlm_entry:
                raise ValueError('media and video cannot be specify in a single prompt file')
            vlm_entry['media'] = resolve_media_file_path(vlm_entry.get('media'), prompt_files[0])
        if has_prompt_file and 'video' in vlm_entry:
            vlm_entry['video'] = resolve_media_file_path(vlm_entry.get('video'), prompt_files[0])
        vlm_file_list.append(vlm_entry)
    return vlm_file_list
0 commit comments