
Commit 8eb5383

committed
After review
1 parent 0233d1f commit 8eb5383

File tree

4 files changed: +132 additions, -103 deletions


tools/llm_bench/benchmark.py

Lines changed: 6 additions & 5 deletions
@@ -61,6 +61,7 @@ def get_argprser():
     parser.add_argument('-pi', '--prompt_index', nargs='+', type=num_iters_type, default=None,
                         help='Run the specified prompt index. You can specify multiple prompt indexes, separated by spaces.')
     parser.add_argument('--media', default=None, help='Media file path for speech or visual models.')
+    parser.add_argument('--video', default=None, help='Video file path for visual models.')
     parser.add_argument(
         '-ic',
         '--infer_count',
@@ -235,11 +236,11 @@ def get_argprser():
 
 
 CASE_TO_BENCH = {
-    'text_gen': bench_text.run_text_generation_benchmark,
-    'image_gen': bench_image.run_image_generation_benchmark,
-    'code_gen': bench_text.run_text_generation_benchmark,
-    'ldm_super_resolution': bench_ldm_sr.run_ldm_super_resolution_benchmark,
-    'speech_to_text': bench_speech.run_speech_2_txt_benchmark,
+    "text_gen": bench_text.run_text_generation_benchmark,
+    "image_gen": bench_image.run_image_generation_benchmark,
+    "code_gen": bench_text.run_text_generation_benchmark,
+    "ldm_super_resolution": bench_ldm_sr.run_ldm_super_resolution_benchmark,
+    "speech_to_text": bench_speech.run_speech_2_txt_benchmark,
     "visual_text_gen": bench_vlm.run_visual_language_generation_benchmark,
     "text_embed": bench_text_embed.run_text_embddings_benchmark,
     "text_to_speech": bench_text_to_speech.run_text_2_speech_benchmark,

tools/llm_bench/llm_bench_utils/parse_json_data.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def parse_vlm_json_data(json_data_list):
     for json_data in json_data_list:
         prompt_data = create_base_prompt(json_data)
         if ("media" in json_data) and ("video" in json_data):
-            raise ValueError("only one key is avaialble from media & video")
+            raise ValueError("only one key is available from media & video")
         if "media" in json_data:
             prompt_data["media"] = json_data["media"]
         if "video" in json_data:

tools/llm_bench/llm_bench_utils/prompt_utils.py

Lines changed: 89 additions & 21 deletions
@@ -8,8 +8,13 @@
 import numpy as np
 from PIL import Image
 import logging as log
+from transformers.image_utils import load_image
 from .model_utils import get_param_from_file
+from .model_utils import resolve_media_file_path
 from .parse_json_data import parse_text_json_data
+from .parse_json_data import parse_vlm_json_data
+from pathlib import Path
+import openvino as ov
 
 
 def get_text_prompt(args):
@@ -26,19 +31,23 @@ def get_text_prompt(args):
 
 
 def print_video_frames_number_and_convert_to_tensor(func):
-    def inner(video_path, decym_frames):
+    def inner(video_path, genai_flag, decym_frames):
         log.info(f"Input video file: {video_path}")
         if decym_frames is not None:
             log.info(f"Requested to reduce into {decym_frames} frames")
         out_frames = func(video_path, decym_frames)
         log.info(f"Final frames number: {len(out_frames)}")
-        return np.array(out_frames)
+        log.info(f"First frame shape: {out_frames[0].shape}")
+        log.info(f"First frame dtype: {out_frames[0].dtype}")
+        if genai_flag:
+            return [ov.Tensor(frame[None]) for frame in out_frames]
+        else: return np.array(out_frames)
     return inner
 
 
 @print_video_frames_number_and_convert_to_tensor
-def make_video_tensor(video_path, decym_frames=None):
-    supported_files = set([".mp4"])
+def make_video_tensor(video_path, genai_flag, decym_frames=None):
+    supported_files = {".mp4"}
 
     assert os.path.exists(video_path), f"no input video file: {video_path}"
     assert video_path.suffix.lower() in supported_files, "no supported video file"
@@ -49,38 +58,97 @@ def make_video_tensor(video_path, decym_frames=None):
         ret, frame = cap.read()
         if not ret:
             break
+
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
         pil_image = Image.fromarray(frame_rgb)
 
-        shape = np.array(pil_image).shape
-        dtype = np.array(pil_image).dtype
-        log.info(f"Video shape: {shape}")
-        log.info(f"Video dtype: {dtype}")
-        new_frame = np.zeros(shape, dtype)
+        np_img_array = np.array(pil_image)
+        log.debug(f"Video shape: {np_img_array.shape}")
+        log.debug(f"Video dtype: {np_img_array.dtype}")
+        output_frames.append(np_img_array)
 
-        width, height = pil_image.size
-        log.info(f"Video size: {width}x{height}")
-        for x in range(0, width):
-            for y in range(0, height):
-                new_frame[y, x] = frame_rgb[y, x]
-        output_frames.append(np.array(pil_image))
+        # new_frame = np.zeros(shape, dtype=int)
+        # width, height = pil_image.size
+        # log.debug(f"Video size: {width}x{height}")
+        # for x in range(0, width):
+        #     for y in range(0, height):
+        #         new_frame[y, x] = frame_rgb[y, x]
+        # output_frames.append(new_frame)
 
     if decym_frames is None:
+        log.info(f"Video decym: skip")
         return output_frames
     if int(decym_frames) == 0:
+        log.info(f"Video decym: skip")
         return output_frames
 
-    # decimation procedure
-    # decim_fames is required frame number if positive
-    # or decimation factor if negative
+    # decymation procedure
+    # decym_fames is required max frame number if positive
+    # or decymation factor if negative
 
     decym_frames = int(decym_frames)
     if decym_frames > 0:
         if len(output_frames) <= decym_frames:
-            return output_frames
-        decym_factor = int(len(output_frames) / decym_frames)
+            log.info(f"Video decym: too short to decym: crop: {decym_frames}")
+            return list(output_frames[:decym_frames])
+        decym_factor = 1 + int(len(output_frames) / decym_frames)
     else:
         decym_factor = -decym_frames
+    log.info(f"Video decym: {decym_factor}")
     if decym_factor >= 2:
-        return output_frames[::decym_factor]
+        return list(output_frames[::decym_factor])
+    log.info(f"Video decym: too large decym factor: skip")
     return output_frames
+
+
+def load_image_genai(image_path):
+    pil_image = load_image(image_path)
+    image_data = np.array(pil_image)[None]
+    return ov.Tensor(image_data)
+
+
+def extract_prompt_issues(inputs, genai_flag, required_frames):
+    prompts, images, videos = [], [], []
+    if not isinstance(inputs, (list, tuple, set)):
+        inputs = [inputs]
+    for input_data in inputs:
+        if input_data.get("video") is not None:
+            entry = Path(input_data["video"])
+            if entry.is_dir():
+                for filename in sorted(entry.iterdir()):
+                    video_tensor = make_video_tensor(filename, genai_flag, required_frames)
+                    videos.extend(video_tensor)
+            else:
+                video_tensor = make_video_tensor(entry, genai_flag, required_frames)
+                videos.extend(video_tensor)
+        if input_data.get("media") is not None:
+            func_load_image = load_image_genai if genai_flag else load_image
+            entry = Path(input_data["media"])
+            if entry.is_dir():
+                for file in sorted(entry.iterdir()):
+                    img = func_load_image(str(file))
+                    images.append(img)
+            else:
+                img = func_load_image(input_data["media"])
+                images.append(img)
+        prompts.append(input_data["prompt"])
+    return prompts, images, videos
+
+
+def get_image_text_prompt(args):
+    vlm_file_list = []
+    output_data_list, is_json_data = get_param_from_file(args, ["video", "media", "prompt"])
+    if is_json_data:
+        vlm_param_list = parse_vlm_json_data(output_data_list)
+        if len(vlm_param_list) > 0:
+            for vlm_file in vlm_param_list:
+                if args['prompt_file'] is not None and len(args['prompt_file']) > 0 and 'media' in vlm_file:
+                    if 'video' in vlm_file:
+                        raise ValueError('media and video cannot be specify in a single prompt file')
+                    vlm_file['media'] = resolve_media_file_path(vlm_file.get('media'), args['prompt_file'][0])
+                if args['prompt_file'] is not None and len(args['prompt_file']) > 0 and 'video' in vlm_file:
+                    vlm_file['video'] = resolve_media_file_path(vlm_file.get('video'), args['prompt_file'][0])
+                vlm_file_list.append(vlm_file)
+    else:
+        vlm_file_list.append(output_data_list)
+    return vlm_file_list
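
To make the decimation rule above concrete: a positive decym_frames caps the frame count (stride 1 + len(frames) // decym_frames), while a negative one is taken as an explicit stride. A self-contained sketch of just that selection logic, operating on a plain list with the video decoding stubbed out:

# Sketch of the decimation rule from make_video_tensor, on a plain list.
def decimate(frames, decym_frames=None):
    if decym_frames is None or int(decym_frames) == 0:
        return frames                      # no decimation requested
    decym_frames = int(decym_frames)
    if decym_frames > 0:                   # positive: max frame count
        if len(frames) <= decym_frames:
            return frames[:decym_frames]   # already short enough
        factor = 1 + len(frames) // decym_frames
    else:                                  # negative: explicit stride
        factor = -decym_frames
    return frames[::factor] if factor >= 2 else frames

assert len(decimate(list(range(300)), 32)) == 30   # stride 1 + 300 // 32 = 10
assert len(decimate(list(range(300)), -5)) == 60   # explicit stride 5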

tools/llm_bench/task/visual_language_generation.py

Lines changed: 36 additions & 76 deletions
@@ -9,20 +9,21 @@
 import llm_bench_utils.pt_utils
 import llm_bench_utils.model_utils as model_utils
 import numpy as np
-import openvino as ov
 import hashlib
-import llm_bench_utils.metrics_print as metrics_print
 from transformers import set_seed
-from transformers.image_utils import load_image
 import llm_bench_utils.output_file
+import llm_bench_utils.metrics_print as metrics_print
 import llm_bench_utils.gen_output_data as gen_output_data
-import llm_bench_utils.parse_json_data as parse_json_data
-import llm_bench_utils.prompt_utils as pu
-from pathlib import Path
+from llm_bench_utils.prompt_utils import extract_prompt_issues
+from llm_bench_utils.prompt_utils import get_image_text_prompt
+import openvino as ov
 
-FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils}
 
 DEFAULT_OUTPUT_TOKEN_SIZE = 512
+FW_UTILS = {
+    'pt': llm_bench_utils.pt_utils,
+    'ov': llm_bench_utils.ov_utils
+}
 
 
 def run_visual_language_generation_optimum(
@@ -33,32 +34,22 @@ def run_visual_language_generation_optimum(
     if args['batch_size'] != 1:
         log.warning("Only batch size 1 available for benchmarking")
         args["batch_size"] = 1
-    images = []
-    prompts = []
-    videos = []
-    inputs = [inputs] if not isinstance(inputs, (list, tuple)) else inputs
-    for input_data in inputs:
-        if input_data.get("video", None):
-            entry = Path(input_data["video"])
-            video_tensor = pu.make_video_tensor(entry, required_frames)
-            videos.append(video_tensor)
-        elif input_data.get("media", None):
-            entry = Path(input_data["media"])
-            if entry.is_dir():
-                for file in sorted(entry.iterdir()):
-                    images.append(load_image(str(file)))
-            else:
-                images.append(load_image(input_data["media"]))
-        prompts.append(input_data["prompt"])
-    prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
-    log.info(f'{prefix}[P{prompt_index}] Input image nums:{len(images)}')
+
+    prompts, images, videos = extract_prompt_issues(inputs, False, required_frames)
     if args["output_dir"] is not None and num == 0:
         for bs_index, in_text in enumerate(prompts):
-            llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
+            llm_bench_utils.output_file.output_input_text(
+                in_text, args, model_precision,
+                prompt_index, bs_index, proc_id)
     tok_encode_start = time.perf_counter()
+
+    prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
+    log.info(f'{prefix}[P{prompt_index}] Input image nums: {len(images)}')
+    log.info(f'{prefix}[P{prompt_index}] Input video nums: {len(videos)}')
     input_data = model.preprocess_inputs(text=prompts[0], image=images[0] if images else None, **processor)
-    if videos:
-        input_data["videos"] = videos
+    if videos: # to check
+        input_data["videos"] = [videos]
+
     tok_encode_end = time.perf_counter()
     tok_encode_time = (tok_encode_end - tok_encode_start) * 1000
     # Remove `token_type_ids` from inputs
@@ -189,38 +180,20 @@ def run_visual_language_generation_optimum(
             bench_hook.clear_mm_embeddins_time_list()
 
 
-def load_image_genai(image_path):
-    pil_image = load_image(image_path)
-    image_data = np.array(pil_image)[None]
-    return ov.Tensor(image_data)
-
-
 def run_visual_language_generation_genai(
         inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index,
         streamer, model_precision, proc_id, mem_consumption, required_frames=None):
     if args['batch_size'] != 1:
         log.warning("Only batch size 1 available for benchmarking")
         args["batch_size"] = 1
-    images = []
-    prompts = []
-    videos = []
-    inputs = [inputs] if not isinstance(inputs, (list, tuple)) else inputs
-    for input_data in inputs:
-        if input_data.get("video", None):
-            entry = Path(input_data["video"])
-            video_tensor = pu.make_video_tensor(entry, required_frames)
-            videos.append(video_tensor)
-        elif input_data.get("media", None):
-            entry = Path(input_data["media"])
-            if entry.is_dir():
-                for file in sorted(entry.iterdir()):
-                    images.append(load_image_genai(str(file)))
-            else:
-                images.append(load_image_genai(input_data["media"]))
-        prompts.append(input_data["prompt"])
+
+    prompts, images, videos = extract_prompt_issues(inputs, True, required_frames)
     if args["output_dir"] is not None and num == 0:
         for bs_index, in_text in enumerate(prompts):
-            llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
+            llm_bench_utils.output_file.output_input_text(
+                in_text, args, model_precision,
+                prompt_index, bs_index, proc_id)
+
     max_rss_mem_consumption = ''
     max_sys_mem_consumption = ''
     max_rss_mem_increase = ''
@@ -233,13 +206,17 @@ def run_visual_language_generation_genai(
         gen_config.num_beams = args["num_beams"]
         gen_config.do_sample = False
         gen_config.ignore_eos = True
+
     kwargs = {}
+    prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
+    log.info(f'{prefix}[P{prompt_index}] Input image nums: {len(images)}')
+    log.info(f'{prefix}[P{prompt_index}] Input video nums: {len(videos)}')
+
     if images:
         kwargs["images"] = images
     if videos:
         kwargs["videos"] = videos
-    prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
-    log.info(f'{prefix}[P{prompt_index}] Input image nums:{len(images)}')
+
     start = time.perf_counter()
     generation_result = model.generate(prompts[0], generation_config=gen_config, **kwargs)
     end = time.perf_counter()
@@ -354,8 +331,8 @@ def run_visual_language_generation_benchmark(
     for idx, input_text in enumerate(image_text_list):
         p_idx = prompt_idx_list[idx]
         if num == 0:
-            metrics_print.print_unicode(f'[warm-up][P{p_idx}] Input text: {input_text}',
-                                        max_output=metrics_print.MAX_INPUT_TXT_IN_LOG)
+            prefix = f'[warm-up][P{p_idx}] Input text: {input_text}'
+            metrics_print.print_unicode(prefix, max_output=metrics_print.MAX_INPUT_TXT_IN_LOG)
         iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat()
         gen_fn(
             input_text, num, model, processor, args, iter_data_list, md5_list,
@@ -368,8 +345,8 @@ def run_visual_language_generation_benchmark(
         p_idx = prompt_idx_list[idx]
         for num in range(num_iters + 1):
             if num == 0:
-                metrics_print.print_unicode(f'[warm-up][P{p_idx}] Input text: {input_text}',
-                                            max_output=metrics_print.MAX_INPUT_TXT_IN_LOG)
+                prefix = f'[warm-up][P{p_idx}] Input text: {input_text}'
+                metrics_print.print_unicode(prefix, max_output=metrics_print.MAX_INPUT_TXT_IN_LOG)
             iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat()
             gen_fn(
                 input_text, num, model, processor, args, iter_data_list, md5_list, prompt_idx_list[idx],
@@ -382,20 +359,3 @@ def run_visual_language_generation_benchmark(
     return iter_data_list, pretrain_time, iter_timestamp
 
 
-def get_image_text_prompt(args):
-    vlm_file_list = []
-    output_data_list, is_json_data = model_utils.get_param_from_file(args, ["media", "prompt"])
-    if is_json_data:
-        vlm_param_list = parse_json_data.parse_vlm_json_data(output_data_list)
-        if len(vlm_param_list) > 0:
-            for vlm_file in vlm_param_list:
-                if args['prompt_file'] is not None and len(args['prompt_file']) > 0 and 'media' in vlm_file:
-                    if 'video' in vlm_file:
-                        raise ValueError('media and video cannot be specify in a single prompt file')
-                    vlm_file['media'] = model_utils.resolve_media_file_path(vlm_file.get('media'), args['prompt_file'][0])
-                elif args['prompt_file'] is not None and len(args['prompt_file']) > 0 and 'video' in vlm_file:
-                    vlm_file['video'] = model_utils.resolve_media_file_path(vlm_file.get('video'), args['prompt_file'][0])
-                vlm_file_list.append(vlm_file)
-    else:
-        vlm_file_list.append(output_data_list)
-    return vlm_file_list
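
Both runners now delegate input handling to the shared helper moved into prompt_utils; only the genai_flag differs, controlling whether images and video frames come back as numpy arrays (optimum path) or ov.Tensor objects (genai path). A hypothetical call, with an illustrative input dict rather than one taken from this commit:

from llm_bench_utils.prompt_utils import extract_prompt_issues

inputs = [{"prompt": "Summarize the clip.", "video": "./videos/clip.mp4"}]
# genai_flag=True -> per-frame ov.Tensor objects, as consumed by model.generate(...)
prompts, images, videos = extract_prompt_issues(inputs, True, required_frames=32)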
