99import llm_bench_utils .pt_utils
1010import llm_bench_utils .model_utils as model_utils
1111import numpy as np
12- import openvino as ov
1312import hashlib
14- import llm_bench_utils .metrics_print as metrics_print
1513from transformers import set_seed
16- from transformers .image_utils import load_image
1714import llm_bench_utils .output_file
15+ import llm_bench_utils .metrics_print as metrics_print
1816import llm_bench_utils .gen_output_data as gen_output_data
19- import llm_bench_utils .parse_json_data as parse_json_data
20- import llm_bench_utils .prompt_utils as pu
21- from pathlib import Path
17+ from llm_bench_utils .prompt_utils import extract_prompt_issues
18+ from llm_bench_utils .prompt_utils import get_image_text_prompt
19+ import openvino as ov
2220
23- FW_UTILS = {'pt' : llm_bench_utils .pt_utils , 'ov' : llm_bench_utils .ov_utils }
2421
2522DEFAULT_OUTPUT_TOKEN_SIZE = 512
23+ FW_UTILS = {
24+ 'pt' : llm_bench_utils .pt_utils ,
25+ 'ov' : llm_bench_utils .ov_utils
26+ }
2627
2728
2829def run_visual_language_generation_optimum (
@@ -33,32 +34,22 @@ def run_visual_language_generation_optimum(
3334 if args ['batch_size' ] != 1 :
3435 log .warning ("Only batch size 1 available for benchmarking" )
3536 args ["batch_size" ] = 1
36- images = []
37- prompts = []
38- videos = []
39- inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
40- for input_data in inputs :
41- if input_data .get ("video" , None ):
42- entry = Path (input_data ["video" ])
43- video_tensor = pu .make_video_tensor (entry , required_frames )
44- videos .append (video_tensor )
45- elif input_data .get ("media" , None ):
46- entry = Path (input_data ["media" ])
47- if entry .is_dir ():
48- for file in sorted (entry .iterdir ()):
49- images .append (load_image (str (file )))
50- else :
51- images .append (load_image (input_data ["media" ]))
52- prompts .append (input_data ["prompt" ])
53- prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
54- log .info (f'{ prefix } [P{ prompt_index } ] Input image nums:{ len (images )} ' )
37+
38+ prompts , images , videos = extract_prompt_issues (inputs , False , required_frames )
5539 if args ["output_dir" ] is not None and num == 0 :
5640 for bs_index , in_text in enumerate (prompts ):
57- llm_bench_utils .output_file .output_input_text (in_text , args , model_precision , prompt_index , bs_index , proc_id )
41+ llm_bench_utils .output_file .output_input_text (
42+ in_text , args , model_precision ,
43+ prompt_index , bs_index , proc_id )
5844 tok_encode_start = time .perf_counter ()
45+
46+ prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
47+ log .info (f'{ prefix } [P{ prompt_index } ] Input image nums: { len (images )} ' )
48+ log .info (f'{ prefix } [P{ prompt_index } ] Input video nums: { len (videos )} ' )
5949 input_data = model .preprocess_inputs (text = prompts [0 ], image = images [0 ] if images else None , ** processor )
60- if videos :
61- input_data ["videos" ] = videos
50+ if videos : # to check
51+ input_data ["videos" ] = [videos ]
52+
6253 tok_encode_end = time .perf_counter ()
6354 tok_encode_time = (tok_encode_end - tok_encode_start ) * 1000
6455 # Remove `token_type_ids` from inputs
@@ -189,38 +180,20 @@ def run_visual_language_generation_optimum(
189180 bench_hook .clear_mm_embeddins_time_list ()
190181
191182
def load_image_genai(image_path):
    """Load the image at *image_path* and wrap it as a batched OpenVINO tensor.

    The PIL image returned by ``load_image`` is converted to a numpy array and
    given a leading batch axis before being handed to ``ov.Tensor``.
    """
    as_array = np.array(load_image(image_path))
    return ov.Tensor(as_array[None])
197-
198183def run_visual_language_generation_genai (
199184 inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index ,
200185 streamer , model_precision , proc_id , mem_consumption , required_frames = None ):
201186 if args ['batch_size' ] != 1 :
202187 log .warning ("Only batch size 1 available for benchmarking" )
203188 args ["batch_size" ] = 1
204- images = []
205- prompts = []
206- videos = []
207- inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
208- for input_data in inputs :
209- if input_data .get ("video" , None ):
210- entry = Path (input_data ["video" ])
211- video_tensor = pu .make_video_tensor (entry , required_frames )
212- videos .append (video_tensor )
213- elif input_data .get ("media" , None ):
214- entry = Path (input_data ["media" ])
215- if entry .is_dir ():
216- for file in sorted (entry .iterdir ()):
217- images .append (load_image_genai (str (file )))
218- else :
219- images .append (load_image_genai (input_data ["media" ]))
220- prompts .append (input_data ["prompt" ])
189+
190+ prompts , images , videos = extract_prompt_issues (inputs , True , required_frames )
221191 if args ["output_dir" ] is not None and num == 0 :
222192 for bs_index , in_text in enumerate (prompts ):
223- llm_bench_utils .output_file .output_input_text (in_text , args , model_precision , prompt_index , bs_index , proc_id )
193+ llm_bench_utils .output_file .output_input_text (
194+ in_text , args , model_precision ,
195+ prompt_index , bs_index , proc_id )
196+
224197 max_rss_mem_consumption = ''
225198 max_sys_mem_consumption = ''
226199 max_rss_mem_increase = ''
@@ -233,13 +206,17 @@ def run_visual_language_generation_genai(
233206 gen_config .num_beams = args ["num_beams" ]
234207 gen_config .do_sample = False
235208 gen_config .ignore_eos = True
209+
236210 kwargs = {}
211+ prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
212+ log .info (f'{ prefix } [P{ prompt_index } ] Input image nums: { len (images )} ' )
213+ log .info (f'{ prefix } [P{ prompt_index } ] Input video nums: { len (videos )} ' )
214+
237215 if images :
238216 kwargs ["images" ] = images
239217 if videos :
240218 kwargs ["videos" ] = videos
241- prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
242- log .info (f'{ prefix } [P{ prompt_index } ] Input image nums:{ len (images )} ' )
219+
243220 start = time .perf_counter ()
244221 generation_result = model .generate (prompts [0 ], generation_config = gen_config , ** kwargs )
245222 end = time .perf_counter ()
@@ -354,8 +331,8 @@ def run_visual_language_generation_benchmark(
354331 for idx , input_text in enumerate (image_text_list ):
355332 p_idx = prompt_idx_list [idx ]
356333 if num == 0 :
357- metrics_print . print_unicode ( f'[warm-up][P{ p_idx } ] Input text: { input_text } ' ,
358- max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
334+ prefix = f'[warm-up][P{ p_idx } ] Input text: { input_text } '
335+ metrics_print . print_unicode ( prefix , max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
359336 iter_timestamp [num ][p_idx ]['start' ] = datetime .datetime .now ().isoformat ()
360337 gen_fn (
361338 input_text , num , model , processor , args , iter_data_list , md5_list ,
@@ -368,8 +345,8 @@ def run_visual_language_generation_benchmark(
368345 p_idx = prompt_idx_list [idx ]
369346 for num in range (num_iters + 1 ):
370347 if num == 0 :
371- metrics_print . print_unicode ( f'[warm-up][P{ p_idx } ] Input text: { input_text } ' ,
372- max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
348+ prefix = f'[warm-up][P{ p_idx } ] Input text: { input_text } '
349+ metrics_print . print_unicode ( prefix , max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
373350 iter_timestamp [num ][p_idx ]['start' ] = datetime .datetime .now ().isoformat ()
374351 gen_fn (
375352 input_text , num , model , processor , args , iter_data_list , md5_list , prompt_idx_list [idx ],
@@ -382,20 +359,3 @@ def run_visual_language_generation_benchmark(
382359 return iter_data_list , pretrain_time , iter_timestamp
383360
384361
def get_image_text_prompt(args):
    """Build the list of VLM prompt entries (text plus media/video) for benchmarking.

    Reads the prompt source via ``model_utils.get_param_from_file``. When the
    source is JSON, each parsed entry may carry a ``media`` (image file or
    directory) or a ``video`` path; such paths are resolved relative to the
    prompt file. A single entry must not specify both. Non-JSON input is
    returned as a single pass-through entry.

    :param args: benchmark argument dict; ``prompt_file`` is consulted to
        resolve relative media paths.
    :return: list of prompt dicts (or a one-element list holding the raw
        data for non-JSON input).
    :raises ValueError: if a JSON entry specifies both ``media`` and ``video``.
    """
    vlm_file_list = []
    output_data_list, is_json_data = model_utils.get_param_from_file(args, ["media", "prompt"])
    if is_json_data:
        vlm_param_list = parse_json_data.parse_vlm_json_data(output_data_list)
        # Media paths inside a JSON prompt file are relative to that file;
        # compute the "do we have a prompt file" check once instead of per entry.
        has_prompt_file = args['prompt_file'] is not None and len(args['prompt_file']) > 0
        for vlm_file in vlm_param_list:
            if has_prompt_file and 'media' in vlm_file:
                if 'video' in vlm_file:
                    # Fixed grammar in the user-facing error ("cannot be specify").
                    raise ValueError('media and video cannot be specified in a single prompt file')
                vlm_file['media'] = model_utils.resolve_media_file_path(vlm_file.get('media'), args['prompt_file'][0])
            elif has_prompt_file and 'video' in vlm_file:
                vlm_file['video'] = model_utils.resolve_media_file_path(vlm_file.get('video'), args['prompt_file'][0])
            vlm_file_list.append(vlm_file)
    else:
        vlm_file_list.append(output_data_list)
    return vlm_file_list
0 commit comments