99import llm_bench_utils .pt_utils
1010import llm_bench_utils .model_utils as model_utils
1111import numpy as np
12- import openvino as ov
1312import hashlib
14- import llm_bench_utils .metrics_print as metrics_print
1513from transformers import set_seed
16- from transformers .image_utils import load_image
1714import llm_bench_utils .output_file
15+ import llm_bench_utils .metrics_print as metrics_print
1816import llm_bench_utils .gen_output_data as gen_output_data
19- import llm_bench_utils .parse_json_data as parse_json_data
20- import llm_bench_utils .prompt_utils as pu
21- from pathlib import Path
17+ from llm_bench_utils .prompt_utils import extract_prompt_issues
18+ from llm_bench_utils .prompt_utils import get_image_text_prompt
19+ import openvino as ov
2220
23- FW_UTILS = {'pt' : llm_bench_utils .pt_utils , 'ov' : llm_bench_utils .ov_utils }
2421
2522DEFAULT_OUTPUT_TOKEN_SIZE = 512
23+ FW_UTILS = {
24+ 'pt' : llm_bench_utils .pt_utils ,
25+ 'ov' : llm_bench_utils .ov_utils
26+ }
2627
2728
2829def run_visual_language_generation_optimum (
@@ -33,32 +34,22 @@ def run_visual_language_generation_optimum(
3334 if args ['batch_size' ] != 1 :
3435 log .warning ("Only batch size 1 available for benchmarking" )
3536 args ["batch_size" ] = 1
36- images = []
37- prompts = []
38- videos = []
39- inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
40- for input_data in inputs :
41- if input_data .get ("video" , None ):
42- entry = Path (input_data ["video" ])
43- video_tensor = pu .make_video_tensor (entry , required_frames )
44- videos .append (video_tensor )
45- elif input_data .get ("media" , None ):
46- entry = Path (input_data ["media" ])
47- if entry .is_dir ():
48- for file in sorted (entry .iterdir ()):
49- images .append (load_image (str (file )))
50- else :
51- images .append (load_image (input_data ["media" ]))
52- prompts .append (input_data ["prompt" ])
53- prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
54- log .info (f'{ prefix } [P{ prompt_index } ] Input image nums:{ len (images )} ' )
37+
38+ prompts , images , videos = extract_prompt_issues (inputs , False , required_frames )
5539 if args ["output_dir" ] is not None and num == 0 :
5640 for bs_index , in_text in enumerate (prompts ):
57- llm_bench_utils .output_file .output_input_text (in_text , args , model_precision , prompt_index , bs_index , proc_id )
41+ llm_bench_utils .output_file .output_input_text (
42+ in_text , args , model_precision ,
43+ prompt_index , bs_index , proc_id )
5844 tok_encode_start = time .perf_counter ()
45+
46+ prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
47+ log .info (f'{ prefix } [P{ prompt_index } ] Input image nums: { len (images )} ' )
48+ log .info (f'{ prefix } [P{ prompt_index } ] Input video nums: { len (videos )} ' )
5949 input_data = model .preprocess_inputs (text = prompts [0 ], image = images [0 ] if images else None , ** processor )
60- if videos :
61- input_data ["videos" ] = videos
50+ if videos : # to check
51+ input_data ["videos" ] = [videos ]
52+
6253 tok_encode_end = time .perf_counter ()
6354 tok_encode_time = (tok_encode_end - tok_encode_start ) * 1000
6455 # Remove `token_type_ids` from inputs
@@ -189,38 +180,20 @@ def run_visual_language_generation_optimum(
189180 bench_hook .clear_mm_embeddins_time_list ()
190181
191182
def load_image_genai(image_path):
    """Load the image at *image_path* and wrap it as a batched OpenVINO tensor.

    The PIL image returned by ``load_image`` is converted to a numpy array and
    given a leading batch axis before being handed to ``ov.Tensor``.
    """
    as_array = np.array(load_image(image_path))
    return ov.Tensor(as_array[None])
197-
198183def run_visual_language_generation_genai (
199184 inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index ,
200185 streamer , model_precision , proc_id , mem_consumption , required_frames = None ):
201186 if args ['batch_size' ] != 1 :
202187 log .warning ("Only batch size 1 available for benchmarking" )
203188 args ["batch_size" ] = 1
204- images = []
205- prompts = []
206- videos = []
207- inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
208- for input_data in inputs :
209- if input_data .get ("video" , None ):
210- entry = Path (input_data ["video" ])
211- video_tensor = pu .make_video_tensor (entry , required_frames )
212- videos .append (video_tensor )
213- elif input_data .get ("media" , None ):
214- entry = Path (input_data ["media" ])
215- if entry .is_dir ():
216- for file in sorted (entry .iterdir ()):
217- images .append (load_image_genai (str (file )))
218- else :
219- images .append (load_image_genai (input_data ["media" ]))
220- prompts .append (input_data ["prompt" ])
189+
190+ prompts , images , videos = extract_prompt_issues (inputs , True , required_frames )
221191 if args ["output_dir" ] is not None and num == 0 :
222192 for bs_index , in_text in enumerate (prompts ):
223- llm_bench_utils .output_file .output_input_text (in_text , args , model_precision , prompt_index , bs_index , proc_id )
193+ llm_bench_utils .output_file .output_input_text (
194+ in_text , args , model_precision ,
195+ prompt_index , bs_index , proc_id )
196+
224197 max_rss_mem_consumption = ''
225198 max_sys_mem_consumption = ''
226199 max_rss_mem_increase = ''
@@ -233,13 +206,17 @@ def run_visual_language_generation_genai(
233206 gen_config .num_beams = args ["num_beams" ]
234207 gen_config .do_sample = False
235208 gen_config .ignore_eos = True
209+
236210 kwargs = {}
211+ prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
212+ log .info (f'{ prefix } [P{ prompt_index } ] Input image nums: { len (images )} ' )
213+ log .info (f'{ prefix } [P{ prompt_index } ] Input video nums: { len (videos )} ' )
214+
237215 if images :
238216 kwargs ["images" ] = images
239217 if videos :
240218 kwargs ["videos" ] = videos
241- prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
242- log .info (f'{ prefix } [P{ prompt_index } ] Input image nums:{ len (images )} ' )
219+
243220 start = time .perf_counter ()
244221 generation_result = model .generate (prompts [0 ], generation_config = gen_config , ** kwargs )
245222 end = time .perf_counter ()
@@ -354,8 +331,8 @@ def run_visual_language_generation_benchmark(
354331 for idx , input_text in enumerate (image_text_list ):
355332 p_idx = prompt_idx_list [idx ]
356333 if num == 0 :
357- metrics_print . print_unicode ( f'[warm-up][P{ p_idx } ] Input text: { input_text } ' ,
358- max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
334+ prefix = f'[warm-up][P{ p_idx } ] Input text: { input_text } '
335+ metrics_print . print_unicode ( prefix , max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
359336 iter_timestamp [num ][p_idx ]['start' ] = datetime .datetime .now ().isoformat ()
360337 gen_fn (
361338 input_text , num , model , processor , args , iter_data_list , md5_list ,
@@ -368,8 +345,8 @@ def run_visual_language_generation_benchmark(
368345 p_idx = prompt_idx_list [idx ]
369346 for num in range (num_iters + 1 ):
370347 if num == 0 :
371- metrics_print . print_unicode ( f'[warm-up][P{ p_idx } ] Input text: { input_text } ' ,
372- max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
348+ prefix = f'[warm-up][P{ p_idx } ] Input text: { input_text } '
349+ metrics_print . print_unicode ( prefix , max_output = metrics_print .MAX_INPUT_TXT_IN_LOG )
373350 iter_timestamp [num ][p_idx ]['start' ] = datetime .datetime .now ().isoformat ()
374351 gen_fn (
375352 input_text , num , model , processor , args , iter_data_list , md5_list , prompt_idx_list [idx ],
@@ -382,20 +359,3 @@ def run_visual_language_generation_benchmark(
382359 return iter_data_list , pretrain_time , iter_timestamp
383360
384361
def get_image_text_prompt(args):
    """Build the list of VLM prompt entries (text plus media/video) for benchmarking.

    Reads the prompt source via ``model_utils.get_param_from_file``. When the
    source is JSON, each parsed entry may carry a ``media`` (image file or
    directory) or a ``video`` path; such paths are resolved relative to the
    prompt file. A single entry must not specify both. Non-JSON input is
    returned as a single pass-through entry.

    :param args: benchmark argument dict; ``prompt_file`` is consulted to
        resolve relative media paths.
    :return: list of prompt dicts (or a one-element list holding the raw
        data for non-JSON input).
    :raises ValueError: if a JSON entry specifies both ``media`` and ``video``.
    """
    vlm_file_list = []
    output_data_list, is_json_data = model_utils.get_param_from_file(args, ["media", "prompt"])
    if is_json_data:
        vlm_param_list = parse_json_data.parse_vlm_json_data(output_data_list)
        # Media paths inside a JSON prompt file are relative to that file;
        # compute the "do we have a prompt file" check once instead of per entry.
        has_prompt_file = args['prompt_file'] is not None and len(args['prompt_file']) > 0
        for vlm_file in vlm_param_list:
            if has_prompt_file and 'media' in vlm_file:
                if 'video' in vlm_file:
                    # Fixed grammar in the user-facing error ("cannot be specify").
                    raise ValueError('media and video cannot be specified in a single prompt file')
                vlm_file['media'] = model_utils.resolve_media_file_path(vlm_file.get('media'), args['prompt_file'][0])
            elif has_prompt_file and 'video' in vlm_file:
                vlm_file['video'] = model_utils.resolve_media_file_path(vlm_file.get('video'), args['prompt_file'][0])
            vlm_file_list.append(vlm_file)
    else:
        vlm_file_list.append(output_data_list)
    return vlm_file_list
0 commit comments