1717import llm_bench_utils .output_file
1818import llm_bench_utils .gen_output_data as gen_output_data
1919import llm_bench_utils .parse_json_data as parse_json_data
20+ import llm_bench_utils .prompt_utils as pu
2021from pathlib import Path
2122
22-
# Dispatch table: framework id -> helper module.
#   'pt' -> llm_bench_utils.pt_utils (PyTorch / Optimum path)
#   'ov' -> llm_bench_utils.ov_utils (OpenVINO path)
# Looked up as FW_UTILS[framework] to pick the model-creation helpers.
FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils}

# Fallback number of tokens to generate when the caller supplies no limit.
DEFAULT_OUTPUT_TOKEN_SIZE = 512
2626
2727
2828def run_visual_language_generation_optimum (
29- inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index , bench_hook , model_precision , proc_id , mem_consumption
30- ):
29+ inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index ,
30+ bench_hook , model_precision , proc_id , mem_consumption , required_frames = None ):
3131 from optimum .intel .utils .import_utils import is_transformers_version
3232 set_seed (args ['seed' ])
3333 if args ['batch_size' ] != 1 :
3434 log .warning ("Only batch size 1 available for benchmarking" )
3535 args ["batch_size" ] = 1
3636 images = []
3737 prompts = []
38+ videos = []
3839 inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
3940 for input_data in inputs :
40- if input_data .get ("media" , None ):
41+ if input_data .get ("video" , None ):
42+ entry = Path (input_data ["video" ])
43+ video_tensor = pu .make_video_tensor (entry , required_frames )
44+ videos .append (video_tensor )
45+ elif input_data .get ("media" , None ):
4146 entry = Path (input_data ["media" ])
4247 if entry .is_dir ():
4348 for file in sorted (entry .iterdir ()):
@@ -52,6 +57,8 @@ def run_visual_language_generation_optimum(
5257 llm_bench_utils .output_file .output_input_text (in_text , args , model_precision , prompt_index , bs_index , proc_id )
5358 tok_encode_start = time .perf_counter ()
5459 input_data = model .preprocess_inputs (text = prompts [0 ], image = images [0 ] if images else None , ** processor )
60+ if videos :
61+ input_data ["videos" ] = videos
5562 tok_encode_end = time .perf_counter ()
5663 tok_encode_time = (tok_encode_end - tok_encode_start ) * 1000
5764 # Remove `token_type_ids` from inputs
@@ -189,16 +196,21 @@ def load_image_genai(image_path):
189196
190197
191198def run_visual_language_generation_genai (
192- inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index , streamer , model_precision , proc_id , mem_consumption
193- ):
199+ inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index ,
200+ streamer , model_precision , proc_id , mem_consumption , required_frames = None ):
194201 if args ['batch_size' ] != 1 :
195202 log .warning ("Only batch size 1 available for benchmarking" )
196203 args ["batch_size" ] = 1
197204 images = []
198205 prompts = []
206+ videos = []
199207 inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
200208 for input_data in inputs :
201- if input_data .get ("media" , None ):
209+ if input_data .get ("video" , None ):
210+ entry = Path (input_data ["video" ])
211+ video_tensor = pu .make_video_tensor (entry , required_frames )
212+ videos .append (video_tensor )
213+ elif input_data .get ("media" , None ):
202214 entry = Path (input_data ["media" ])
203215 if entry .is_dir ():
204216 for file in sorted (entry .iterdir ()):
@@ -222,8 +234,10 @@ def run_visual_language_generation_genai(
222234 gen_config .do_sample = False
223235 gen_config .ignore_eos = True
224236 kwargs = {}
225- if len ( images ) >= 1 :
237+ if images :
226238 kwargs ["images" ] = images
239+ if videos :
240+ kwargs ["videos" ] = videos
227241 prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
228242 log .info (f'{ prefix } [P{ prompt_index } ] Input image nums:{ len (images )} ' )
229243 start = time .perf_counter ()
@@ -304,8 +318,11 @@ def run_visual_language_generation_genai(
304318 metrics_print .print_generated (num , warm_up = (num == 0 ), generated = generated_text [0 ], prompt_idx = prompt_index )
305319
306320
307- def run_visual_language_generation_benchmark (model_path , framework , device , args , num_iters , mem_consumption ):
308- model , processor , pretrain_time , bench_hook , use_genai = FW_UTILS [framework ].create_image_text_gen_model (model_path , device , mem_consumption , ** args )
321+ def run_visual_language_generation_benchmark (
322+ model_path , framework , device , args , num_iters ,
323+ mem_consumption , required_frames = None ):
324+ outs = FW_UTILS [framework ].create_image_text_gen_model (model_path , device , mem_consumption , ** args )
325+ model , processor , pretrain_time , bench_hook , use_genai = outs
309326 model_precision = model_utils .get_model_precision (model_path .parts )
310327 iter_data_list = []
311328 md5_list = {num : {} for num in range (num_iters + 1 )}
@@ -325,10 +342,10 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
325342 log .info (f"Numbeams: { args ['num_beams' ]} , benchmarking iter nums(exclude warm-up): { num_iters } , "
326343 f'prompt nums: { len (image_text_list )} , prompt idx: { prompt_idx_list } ' )
327344
328- if not use_genai :
329- gen_fn = run_visual_language_generation_optimum
330- else :
345+ if use_genai :
331346 gen_fn = run_visual_language_generation_genai
347+ else :
348+ gen_fn = run_visual_language_generation_optimum
332349
333350 proc_id = os .getpid ()
334351 iter_timestamp = model_utils .init_timestamp (num_iters , image_text_list , prompt_idx_list )
@@ -337,41 +354,47 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
337354 for idx , input_text in enumerate (image_text_list ):
338355 p_idx = prompt_idx_list [idx ]
339356 if num == 0 :
340- metrics_print .print_unicode (f'[warm-up][P{ p_idx } ] Input text: { input_text } ' , max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
357+ metrics_print .print_unicode (f'[warm-up][P{ p_idx } ] Input text: { input_text } ' ,
358+ max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
341359 iter_timestamp [num ][p_idx ]['start' ] = datetime .datetime .now ().isoformat ()
342360 gen_fn (
343361 input_text , num , model , processor , args , iter_data_list , md5_list ,
344- p_idx , bench_hook , model_precision , proc_id , mem_consumption )
362+ p_idx , bench_hook , model_precision , proc_id , mem_consumption , required_frames )
345363 iter_timestamp [num ][p_idx ]['end' ] = datetime .datetime .now ().isoformat ()
346- prefix = ' [warm-up]' if num == 0 else '[{}]' . format ( num )
347- log .info (f"{ prefix } [P { p_idx } ] start: { iter_timestamp [num ][p_idx ]['start' ]} , end: { iter_timestamp [num ][p_idx ]['end' ]} " )
364+ prefix = f" [warm-up][P { p_idx } ]" if num == 0 else f"[ { num } ][P { p_idx } ]"
365+ log .info (f"{ prefix } start: { iter_timestamp [num ][p_idx ]['start' ]} , end: { iter_timestamp [num ][p_idx ]['end' ]} " )
348366 else :
349367 for idx , input_text in enumerate (image_text_list ):
350368 p_idx = prompt_idx_list [idx ]
351369 for num in range (num_iters + 1 ):
352370 if num == 0 :
353- metrics_print .print_unicode (f'[warm-up][P{ p_idx } ] Input text: { input_text } ' , max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
371+ metrics_print .print_unicode (f'[warm-up][P{ p_idx } ] Input text: { input_text } ' ,
372+ max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
354373 iter_timestamp [num ][p_idx ]['start' ] = datetime .datetime .now ().isoformat ()
355374 gen_fn (
356- input_text , num , model , processor , args , iter_data_list , md5_list ,
357- prompt_idx_list [ idx ], bench_hook , model_precision , proc_id , mem_consumption )
375+ input_text , num , model , processor , args , iter_data_list , md5_list , prompt_idx_list [ idx ],
376+ bench_hook , model_precision , proc_id , mem_consumption , required_frames )
358377 iter_timestamp [num ][p_idx ]['end' ] = datetime .datetime .now ().isoformat ()
359- prefix = ' [warm-up]' if num == 0 else '[{}]' . format ( num )
360- log .info (f"{ prefix } [P { p_idx } ] start: { iter_timestamp [num ][p_idx ]['start' ]} , end: { iter_timestamp [num ][p_idx ]['end' ]} " )
378+ prefix = f" [warm-up][P { p_idx } ]" if num == 0 else f"[ { num } ][P { p_idx } ]"
379+ log .info (f"{ prefix } start: { iter_timestamp [num ][p_idx ]['start' ]} , end: { iter_timestamp [num ][p_idx ]['end' ]} " )
361380
362381 metrics_print .print_average (iter_data_list , prompt_idx_list , args ['batch_size' ], True )
363382 return iter_data_list , pretrain_time , iter_timestamp
364383
365384
366385def get_image_text_prompt (args ):
367386 vlm_file_list = []
368- output_data_list , is_json_data = model_utils .get_param_from_file (args , [' media' , "prompt" ])
387+ output_data_list , is_json_data = model_utils .get_param_from_file (args , [" media" , "prompt" ])
369388 if is_json_data :
370389 vlm_param_list = parse_json_data .parse_vlm_json_data (output_data_list )
371390 if len (vlm_param_list ) > 0 :
372391 for vlm_file in vlm_param_list :
373- if args ['prompt_file' ] is not None and len (args ['prompt_file' ]) > 0 :
374- vlm_file ['media' ] = model_utils .resolve_media_file_path (vlm_file .get ("media" ), args ['prompt_file' ][0 ])
392+ if args ['prompt_file' ] is not None and len (args ['prompt_file' ]) > 0 and 'media' in vlm_file :
393+ if 'video' in vlm_file :
394+ raise ValueError ('media and video cannot be specify in a single prompt file' )
395+ vlm_file ['media' ] = model_utils .resolve_media_file_path (vlm_file .get ('media' ), args ['prompt_file' ][0 ])
396+ elif args ['prompt_file' ] is not None and len (args ['prompt_file' ]) > 0 and 'video' in vlm_file :
397+ vlm_file ['video' ] = model_utils .resolve_media_file_path (vlm_file .get ('video' ), args ['prompt_file' ][0 ])
375398 vlm_file_list .append (vlm_file )
376399 else :
377400 vlm_file_list .append (output_data_list )
0 commit comments