Skip to content

Commit 2893a5c

Browse files
committed
After review
1 parent 0233d1f commit 2893a5c

File tree

5 files changed

+152
-114
lines changed

5 files changed

+152
-114
lines changed

tools/llm_bench/benchmark.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def get_argprser():
7878
'if the value equals 0 (default), execute the warm-up iteration(0th iteration).',
7979
)
8080
parser.add_argument('-i', '--images', default=None, help='test images for vision tasks. Can be directory or path to single image')
81+
parser.add_argument('-vp', '--videos', default=None, help='test videos for vision tasks. Can be directory or path to single video')
8182
parser.add_argument('-s', '--seed', type=int, default=42, required=False, help='specific random seed to generate fix result. Default 42.')
8283
parser.add_argument(
8384
'-lc',
@@ -230,16 +231,16 @@ def get_argprser():
230231
parser.add_argument("--vocoder_path", type=str, default=None,
231232
help="Path to vocoder for text to speech scenarios")
232233
parser.add_argument("-vf", "--video_frames", type=int, default=None,
233-
help="controler of video frames to process")
234+
help="controller of video frames to process (required frame number or decymation factor if negative)")
234235
return parser.parse_args()
235236

236237

237238
CASE_TO_BENCH = {
238-
'text_gen': bench_text.run_text_generation_benchmark,
239-
'image_gen': bench_image.run_image_generation_benchmark,
240-
'code_gen': bench_text.run_text_generation_benchmark,
241-
'ldm_super_resolution': bench_ldm_sr.run_ldm_super_resolution_benchmark,
242-
'speech_to_text': bench_speech.run_speech_2_txt_benchmark,
239+
"text_gen": bench_text.run_text_generation_benchmark,
240+
"image_gen": bench_image.run_image_generation_benchmark,
241+
"code_gen": bench_text.run_text_generation_benchmark,
242+
"ldm_super_resolution": bench_ldm_sr.run_ldm_super_resolution_benchmark,
243+
"speech_to_text": bench_speech.run_speech_2_txt_benchmark,
243244
"visual_text_gen": bench_vlm.run_visual_language_generation_benchmark,
244245
"text_embed": bench_text_embed.run_text_embddings_benchmark,
245246
"text_to_speech": bench_text_to_speech.run_text_2_speech_benchmark,
@@ -318,7 +319,7 @@ def main():
318319
else:
319320
iter_data_list, pretrain_time, iter_timestamp = CASE_TO_BENCH[model_args['use_case'].task](
320321
model_path, framework, args.device, model_args, args.num_iters,
321-
memory_data_collector, args.video_frames)
322+
memory_data_collector, decym_frames=args.video_frames)
322323
if args.report is not None or args.report_json is not None:
323324
model_precision = ''
324325
if framework == 'ov':

tools/llm_bench/llm_bench_utils/model_utils.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def get_param_from_file(args, input_key):
4343

4444
elif args[input_key] is not None and args['prompt_file'] is not None:
4545
raise RuntimeError(f'== {input_key} and prompt file should not exist together ==')
46+
4647
else:
4748
if args[input_key] is not None:
4849
if args[input_key] != '':
@@ -56,11 +57,16 @@ def get_param_from_file(args, input_key):
5657
if "media" in input_key:
5758
if args["media"] is None and args["images"] is None:
5859
if args["use_case"].task == "visual_text_gen":
59-
log.warn("Input image is not provided. Only text generation part will be evaluated")
60+
if args["videos"] is None:
61+
log.warn("Input image/video is not provided. Only text generation part will be evaluated")
6062
elif args["use_case"].task != "image_gen":
6163
raise RuntimeError("No input image. ImageToImage/Inpainting Models cannot start generation without one. Please, provide an image.")
6264
else:
6365
data_dict["media"] = args["media"] if args["media"] is not None else args["images"]
66+
if "video" in input_key:
67+
if args["videos"] is not None:
68+
data_dict["video"] = args["videos"]
69+
6470
if args["prompt"] is None:
6571
if args["use_case"].task == "visual_text_gen":
6672
data_dict["prompt"] = "What is OpenVINO?" if data_dict.get("media") is None else "Describe image"
@@ -112,6 +118,7 @@ def analyze_args(args):
112118
model_args["height"] = args.height
113119
model_args["width"] = args.width
114120
model_args['images'] = args.images
121+
model_args['videos'] = args.videos
115122
model_args['seed'] = args.seed
116123
model_args['mem_consumption'] = args.memory_consumption
117124
model_args['batch_size'] = args.batch_size

tools/llm_bench/llm_bench_utils/parse_json_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def parse_vlm_json_data(json_data_list):
2727
for json_data in json_data_list:
2828
prompt_data = create_base_prompt(json_data)
2929
if ("media" in json_data) and ("video" in json_data):
30-
raise ValueError("only one key is avaialble from media & video")
30+
raise ValueError("only one key is available from media & video")
3131
if "media" in json_data:
3232
prompt_data["media"] = json_data["media"]
3333
if "video" in json_data:

tools/llm_bench/llm_bench_utils/prompt_utils.py

Lines changed: 96 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,13 @@
88
import numpy as np
99
from PIL import Image
1010
import logging as log
11+
from transformers.image_utils import load_image
1112
from .model_utils import get_param_from_file
13+
from .model_utils import resolve_media_file_path
1214
from .parse_json_data import parse_text_json_data
15+
from .parse_json_data import parse_vlm_json_data
16+
from pathlib import Path
17+
import openvino as ov
1318

1419

1520
def get_text_prompt(args):
@@ -26,19 +31,36 @@ def get_text_prompt(args):
2631

2732

2833
def print_video_frames_number_and_convert_to_tensor(func):
    """Decorate a frame extractor with logging and output conversion.

    The wrapped ``func(video_path, genai_flag, decym_frames)`` is expected to
    return a sequence of per-frame numpy arrays. The wrapper logs the input
    path, the requested reduction (if any), the resulting frame count and the
    shape/dtype of the first frame, then converts the result.

    Returns:
        A function ``inner(video_path, genai_flag, decym_frames=None)`` that
        returns a list of ``ov.Tensor`` objects (one per frame, each built
        from the frame with a leading axis of size 1 added) when
        ``genai_flag`` is truthy, otherwise a single ``np.array`` built from
        all frames.
    """
    import functools  # local import so the block does not depend on a file-level import

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def inner(video_path, genai_flag, decym_frames=None):
        # decym_frames defaults to None so the decorated function keeps the
        # optional third argument that its own signature advertises.
        log.info(f"Input video file: {video_path}")
        if decym_frames is not None:
            log.info(f"Requested to reduce into {decym_frames} frames")
        out_frames = func(video_path, genai_flag, decym_frames)
        log.info(f"Final frames number: {len(out_frames)}")
        if len(out_frames) > 0:
            log.info(f"First frame shape: {out_frames[0].shape}")
            log.info(f"First frame dtype: {out_frames[0].dtype}")
        else:
            # Nothing was decoded: indexing the first frame would raise IndexError.
            log.warning(f"No frames were read from video file: {video_path}")
        if genai_flag:
            return [ov.Tensor(frame[None]) for frame in out_frames]
        return np.array(out_frames)
    return inner
3747

3848

3949
@print_video_frames_number_and_convert_to_tensor
40-
def make_video_tensor(video_path, decym_frames=None):
41-
supported_files = set([".mp4"])
50+
def make_video_tensor(video_path, genai_flag, decym_frames=None):
51+
supported_files = {
52+
'.mp4', # MPEG-4 (most common)
53+
'.avi', # Audio Video Interleave
54+
'.mov', # QuickTime Movie
55+
'.mkv', # Matroska Video
56+
'.wmv', # Windows Media Video
57+
'.flv', # Flash Video
58+
'.webm', # WebM
59+
'.m4v', # iTunes Video
60+
'.3gp', # 3GPP
61+
'.mpeg', # MPEG
62+
'.mpg' # MPEG
63+
}
4264

4365
assert os.path.exists(video_path), f"no input video file: {video_path}"
4466
assert video_path.suffix.lower() in supported_files, "no supported video file"
@@ -49,38 +71,89 @@ def make_video_tensor(video_path, decym_frames=None):
4971
ret, frame = cap.read()
5072
if not ret:
5173
break
74+
5275
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
5376
pil_image = Image.fromarray(frame_rgb)
5477

55-
shape = np.array(pil_image).shape
56-
dtype = np.array(pil_image).dtype
57-
log.info(f"Video shape: {shape}")
58-
log.info(f"Video dtype: {dtype}")
59-
new_frame = np.zeros(shape, dtype)
60-
61-
width, height = pil_image.size
62-
log.info(f"Video size: {width}x{height}")
63-
for x in range(0, width):
64-
for y in range(0, height):
65-
new_frame[y, x] = frame_rgb[y, x]
66-
output_frames.append(np.array(pil_image))
78+
np_img_array = np.array(pil_image)
79+
log.debug(f"Video shape: {np_img_array.shape}")
80+
log.debug(f"Video dtype: {np_img_array.dtype}")
81+
output_frames.append(np_img_array)
6782

6883
if decym_frames is None:
84+
log.info("Video decym: none: skip")
6985
return output_frames
7086
if int(decym_frames) == 0:
87+
log.info("Video decym: zero: skip")
7188
return output_frames
7289

73-
# decimation procedure
74-
# decim_fames is required frame number if positive
75-
# or decimation factor if negative
90+
# decimation procedure
91+
# decym_frames is the maximum number of frames to keep if positive,
92+
# or the decimation factor (keep every N-th frame) if negative
7693

7794
decym_frames = int(decym_frames)
7895
if decym_frames > 0:
7996
if len(output_frames) <= decym_frames:
80-
return output_frames
81-
decym_factor = int(len(output_frames) / decym_frames)
97+
log.info(f"Video decym: too short to decym: crop: {decym_frames}")
98+
return list(output_frames[:decym_frames])
99+
decym_factor = 1 + int(len(output_frames) / decym_frames)
82100
else:
83101
decym_factor = -decym_frames
102+
log.info(f"Video decym factor: {decym_factor}")
84103
if decym_factor >= 2:
85-
return output_frames[::decym_factor]
104+
return list(output_frames[::decym_factor])
105+
log.info("Video decym: too large decym factor: skip")
86106
return output_frames
107+
108+
109+
def load_image_genai(image_path):
    """Load an image and return it as an ``ov.Tensor`` with a leading axis added.

    The image is read with ``load_image``, converted to a numpy array and
    given an extra first axis of size 1 before being wrapped in the tensor
    (presumably the batch dimension expected by the GenAI pipeline - confirm
    against the caller).
    """
    pixels = np.array(load_image(image_path))
    return ov.Tensor(pixels[np.newaxis, ...])
113+
114+
115+
def extract_prompt_issues(inputs, genai_flag, required_frames):
    """Collect prompts, images and video frames from one or more input entries.

    Args:
        inputs: a single mapping or a list/tuple/set of mappings. Each mapping
            must have a "prompt" key and may have "video" and/or "media" keys
            holding a path to a file or to a directory.
        genai_flag: forwarded to ``make_video_tensor``; also selects
            ``load_image_genai`` (truthy) or ``load_image`` (falsy) for images.
        required_frames: forwarded to ``make_video_tensor`` as its
            frame-reduction argument.

    Returns:
        A ``(prompts, images, videos)`` tuple of lists accumulated over all
        entries, in input order. Directory contents are visited in sorted order.
    """
    if not isinstance(inputs, (list, tuple, set)):
        inputs = [inputs]

    prompts = []
    images = []
    videos = []
    for item in inputs:
        video_source = item.get("video")
        if video_source is not None:
            video_root = Path(video_source)
            # A directory contributes every entry it contains; a file only itself.
            video_files = sorted(video_root.iterdir()) if video_root.is_dir() else [video_root]
            for video_file in video_files:
                videos.extend(make_video_tensor(video_file, genai_flag, required_frames))

        media_source = item.get("media")
        if media_source is not None:
            loader = load_image_genai if genai_flag else load_image
            media_root = Path(media_source)
            if media_root.is_dir():
                images.extend(loader(str(child)) for child in sorted(media_root.iterdir()))
            else:
                # The raw value is passed through unchanged, not the Path object.
                images.append(loader(media_source))

        prompts.append(item["prompt"])
    return prompts, images, videos
141+
142+
143+
def get_image_text_prompt(args):
    """Build the list of prompt entries (prompt plus optional media/video).

    The parameters are read with ``get_param_from_file`` for the keys
    "video", "media" and "prompt". When the result is not JSON data it is
    returned as a single-element list. Otherwise each entry produced by
    ``parse_vlm_json_data`` is returned; if ``args['prompt_file']`` is a
    non-empty sequence, the entry's "media" / "video" value is first passed
    through ``resolve_media_file_path`` together with the first prompt file.

    Raises:
        ValueError: if an entry coming from a prompt file has both "media"
            and "video" keys.
    """
    output_data_list, is_json_data = get_param_from_file(args, ["video", "media", "prompt"])
    if not is_json_data:
        return [output_data_list]

    resolved_entries = []
    for entry in parse_vlm_json_data(output_data_list):
        prompt_files = args['prompt_file']
        from_prompt_file = prompt_files is not None and len(prompt_files) > 0
        if from_prompt_file and 'media' in entry:
            if 'video' in entry:
                raise ValueError('media and video cannot be specify in a single prompt file')
            entry['media'] = resolve_media_file_path(entry.get('media'), prompt_files[0])
        if from_prompt_file and 'video' in entry:
            entry['video'] = resolve_media_file_path(entry.get('video'), prompt_files[0])
        resolved_entries.append(entry)
    return resolved_entries

0 commit comments

Comments
 (0)