diff --git a/tests/python_tests/samples/conftest.py b/tests/python_tests/samples/conftest.py index b84a6aad80..3aa71fcb3c 100644 --- a/tests/python_tests/samples/conftest.py +++ b/tests/python_tests/samples/conftest.py @@ -26,7 +26,7 @@ # - "name": the model's name or path # - "convert_args": a list of arguments for the conversion command MODELS: Dict[str, Dict[str, Any]] = { - "TinyLlama-1.1B-Chat-v1.0": { + "TinyLlama-1.1B-Chat-v1.0": { "name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "convert_args": ['--weight-format', 'fp16'] }, @@ -46,7 +46,7 @@ "SmolLM2-360M": { "name": "HuggingFaceTB/SmolLM2-360M", "convert_args": ['--trust-remote-code'] - }, + }, "WhisperTiny": { "name": "openai/whisper-tiny", "convert_args": ['--trust-remote-code', '--weight-format', 'fp16'] @@ -84,11 +84,11 @@ "LCM_Dreamshaper_v7-int8-ov": { "name": "OpenVINO/LCM_Dreamshaper_v7-int8-ov", "convert_args": [] - }, + }, "llava-1.5-7b-hf": { "name": "llava-hf/llava-1.5-7b-hf", "convert_args": ['--trust-remote-code', '--weight-format', 'fp16'] - }, + }, "llava-v1.6-mistral-7b-hf": { "name": "llava-hf/llava-v1.6-mistral-7b-hf", "convert_args": ['--trust-remote-code', '--weight-format', 'fp16'] @@ -129,6 +129,10 @@ "name": "katuni4ka/tiny-random-llava", "convert_args": ["--trust-remote-code", "--task", "image-text-to-text"] }, + "tiny-random-qwen2vl": { + "name": "katuni4ka/tiny-random-qwen2vl", + "convert_args": ["--trust-remote-code", "--task", "image-text-to-text"] + }, "bge-small-en-v1.5": { "name": "BAAI/bge-small-en-v1.5", "convert_args": ["--trust-remote-code"] @@ -148,6 +152,10 @@ "tiny-random-SpeechT5ForTextToSpeech": { "name": "hf-internal-testing/tiny-random-SpeechT5ForTextToSpeech", "convert_args": ["--model-kwargs", json.dumps({"vocoder": "fxmarty/speecht5-hifigan-tiny"})] + }, + "tiny-random-llava-next-video": { + "name": "katuni4ka/tiny-random-llava-next-video", + "convert_args": ["--trust-remote-code", "--task", "image-text-to-text"] } } @@ -164,7 +172,8 @@ "cat.png": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png", "cat": "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", "3283_1447_000.tar.gz": "https://huggingface.co/datasets/facebook/multilingual_librispeech/resolve/main/data/mls_polish/train/audio/3283_1447_000.tar.gz", - "cmu_us_awb_arctic-wav-arctic_a0001.bin": "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_a0001.bin" + "cmu_us_awb_arctic-wav-arctic_a0001.bin": "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_a0001.bin", + "video0.mp4": "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4" } SAMPLES_PY_DIR = Path( @@ -182,23 +191,24 @@ ) ) + @pytest.fixture(scope="session", autouse=True) def setup_and_teardown(request, tmp_path_factory): """Fixture to set up and tear down the temporary directories.""" - - ov_cache = get_ov_cache_dir(tmp_path_factory.mktemp("ov_cache")) + + ov_cache = get_ov_cache_dir(tmp_path_factory.mktemp("ov_cache")) downloaded_models_dir = get_ov_cache_downloaded_models_dir() converted_models_dir = get_ov_cache_converted_models_dir() test_data = ov_cache / "test_data" - + logger.info(f"Creating directories: {downloaded_models_dir}, {converted_models_dir}, and {test_data}") test_data.mkdir(parents=True, exist_ok=True) - + request.config.cache.set("OV_CACHE", str(ov_cache)) 
request.config.cache.set("TEST_DATA", str(test_data)) - + yield - + if os.environ.get("CLEANUP_CACHE", "false").lower() != "false": if os.path.exists(ov_cache): logger.info(f"Removing temporary directory: {ov_cache}") @@ -213,9 +223,9 @@ def download_gguf_model(model: Dict[str, Any], model_path: str) -> None: model_name = model["name"] model_gguf_filename = model["gguf_filename"] dest_dir = Path(model_path) - + manager = AtomicDownloadManager(dest_dir) - + def download_to_temp(temp_path: Path) -> None: command = ["huggingface-cli", "download", model_name, model_gguf_filename, "--local-dir", str(temp_path)] logger.info(f"Downloading command: {' '.join(command)}") @@ -325,26 +335,27 @@ def download_to_temp(temp_path: Path) -> None: command = ["huggingface-cli", "download", model_name, "--local-dir", str(temp_path)] logger.info(f"Downloading command: {' '.join(command)}") retry_request(lambda: subprocess.run(command, check=True, capture_output=True, text=True, env=sub_env)) - + manager.execute(download_to_temp) - + yield str(model_path) - + if os.environ.get("CLEANUP_CACHE", "false").lower() == "true": if model_cache.exists(): logger.info(f"Removing downloaded model: {model_cache}") shutil.rmtree(model_cache) + @pytest.fixture(scope="session") def download_test_content(request): """Download the test content from the given URL and return the file path or extracted folder.""" - + test_data = request.config.cache.get("TEST_DATA", None) - + file_name = request.param file_url = TEST_FILES[file_name] file_path = os.path.join(test_data, file_name) - + if not os.path.exists(file_path): logger.info(f"Downloading test content from {file_url} to {file_path}...") os.makedirs(os.path.dirname(file_path), exist_ok=True) @@ -384,9 +395,9 @@ def download_test_content(request): @pytest.fixture(scope="session") def generate_test_content(request): """Generate an image of lines and return the file path.""" - + test_data = request.config.cache.get("TEST_DATA", None) - + file_name = request.param file_path = os.path.join(test_data, file_name) if not os.path.exists(file_path): @@ -412,24 +423,24 @@ def generate_test_content(request): @pytest.fixture(scope="session") def generate_image_generation_jsonl(request): """Generate a JSONL file for image generation prompts.""" - + test_data = request.config.cache.get("TEST_DATA", None) file_name, json_entries = request.param file_path = os.path.join(test_data, file_name) - + if not os.path.exists(file_path): os.makedirs(os.path.dirname(file_path), exist_ok=True) - + with open(file_path, "w", encoding="utf-8") as f: for entry in json_entries: f.write(json.dumps(entry) + "\n") - + logger.info(f"Generated image generation JSONL file at {file_path}") else: logger.info(f"Image generation JSONL file already exists at {file_path}") - + yield file_path - + # Cleanup the JSONL file after tests if os.environ.get("CLEANUP_CACHE", "false").lower() == "true": if os.path.exists(file_path): diff --git a/tests/python_tests/samples/test_tools_llm_benchmark.py b/tests/python_tests/samples/test_tools_llm_benchmark.py index 42f16bd413..4bd1c51a13 100644 --- a/tests/python_tests/samples/test_tools_llm_benchmark.py +++ b/tests/python_tests/samples/test_tools_llm_benchmark.py @@ -14,35 +14,39 @@ convert_draft_model = convert_model download_mask_image = download_test_content -image_generation_prompt = "side profile centered painted portrait, Gandhi rolling a blunt, Gloomhaven, matte painting concept art, art nouveau, 8K HD Resolution, beautifully background" +image_generation_prompt = \ + "side 
profile centered painted portrait, Gandhi rolling a blunt, "\ + "Gloomhaven, matte painting concept art, art nouveau, "\ + "8K HD Resolution, beautifully background" image_generation_json = [ {"steps": 30, "width": 64, "height": 128, "guidance_scale": 1.0, "prompt": image_generation_prompt}, {"steps": 4, "width": 64, "height": 32, "guidance_scale": 7.0, "prompt": image_generation_prompt} ] -image_generation_inpainting_json = [ - {"steps": 30, "width": 64, "height": 128, "guidance_scale": 1.0, "strength": "0.8", "media": "overture-creations.png", "mask_image": "overture-creations-mask.png", "prompt": image_generation_prompt}, -] -image_generation_i2i_prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k" -image_generation_i2i_json = [ - {"steps": 30, "width": 64, "height": 128, "guidance_scale": 1.0, "strength": "0.8", "media": "cat.png", "prompt": image_generation_i2i_prompt}, -] +image_generation_inpainting_json = [{ + "steps": 30, "width": 64, "height": 128, "guidance_scale": 1.0, "strength": "0.8", + "media": "overture-creations.png", "prompt": image_generation_prompt, + "mask_image": "overture-creations-mask.png" +}] +image_generation_i2i_json = [{ + "steps": 30, "width": 64, "height": 128, "guidance_scale": 1.0, "strength": "0.8", "media": "cat.png", + "prompt": "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k" +}] + + class TestBenchmarkLLM: + @pytest.mark.samples - @pytest.mark.parametrize( - "download_model, sample_args", - [ - pytest.param("tiny-dummy-qwen2", ["-d", "cpu", "-n", "1", "-f", "pt", "-ic", "20"]), - ], - indirect=["download_model"], - ) + @pytest.mark.parametrize("download_model, sample_args", [ + pytest.param("tiny-dummy-qwen2", ["-d", "cpu", "-n", "1", "-f", "pt", "-ic", "20"]), + ], indirect=["download_model"]) def test_python_tool_llm_benchmark_download_model(self, download_model, sample_args): # Run Python benchmark benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [sys.executable, benchmark_script, "-m" , download_model] + sample_args run_sample(benchmark_py_command) - - + + @pytest.mark.samples @pytest.mark.parametrize( "convert_model, sample_args", @@ -54,31 +58,28 @@ def test_python_tool_llm_benchmark_download_model(self, download_model, sample_a pytest.param("tiny-random-llava", [ "-ic", "4", "--optimum", "-pf", SAMPLES_PY_DIR / "llm_bench/prompts/llava-1.5-7b.jsonl"]), pytest.param("tiny-random-latent-consistency", [ "-d", "cpu", "-n", "1", "--num_steps", "4", "--static_reshape", "-p", "'an astronaut riding a horse on mars'"]), pytest.param("tiny-random-latent-consistency", [ "-d", "cpu", "-n", "1", "--num_steps", "4", "--static_reshape", "-p", "'an astronaut riding a horse on mars'", "--optimum"]), - ], - indirect=["convert_model"], - ) + ], indirect=["convert_model"]) def test_python_tool_llm_benchmark_convert_model(self, convert_model, sample_args): # Run Python benchmark benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [sys.executable, benchmark_script, "-m" , convert_model] + sample_args - run_sample(benchmark_py_command) - - + run_sample(benchmark_py_command) + + @pytest.mark.samples @pytest.mark.parametrize( "convert_model, sample_args", [ pytest.param("tiny-random-llava", [ "-ic", "20", "--prompt", "'What is unusual on this image?'"]), pytest.param("tiny-random-llava", [ "-ic", "20", "--optimum", "--prompt", "'What is unusual on this image?'"]), - ], - indirect=["convert_model"], - ) + ], 
indirect=["convert_model"]) @pytest.mark.parametrize("download_test_content", ["cat"], indirect=True) def test_python_tool_llm_benchmark_convert_model_media(self, convert_model, download_test_content, sample_args): # Run Python benchmark benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' - benchmark_py_command = [sys.executable, benchmark_script, "-m" , convert_model, "--media", download_test_content] + sample_args - run_sample(benchmark_py_command) + benchmark_py_command = [sys.executable, benchmark_script, "-m" , convert_model, "--media", download_test_content] + benchmark_py_command += sample_args + run_sample(benchmark_py_command) @pytest.mark.samples @@ -102,7 +103,7 @@ def test_python_tool_llm_benchmark_speculative(self, convert_model, convert_draf @pytest.mark.samples - @pytest.mark.parametrize("sample_args", + @pytest.mark.parametrize("sample_args", [ ["-d", "cpu", "-n", "1", "--num_steps", "4", "--optimum"], ["-d", "cpu", "-n", "1", "--num_steps", "4"], @@ -117,14 +118,14 @@ def test_python_tool_llm_benchmark_jsonl(self, convert_model, generate_image_gen # Run Python benchmark benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [ - sys.executable, - benchmark_script, - "-m", convert_model, - "-pf", generate_image_generation_jsonl, + sys.executable, + benchmark_script, + "-m", convert_model, + "-pf", generate_image_generation_jsonl, ] + sample_args run_sample(benchmark_py_command) - - + + @pytest.mark.samples @pytest.mark.parametrize("sample_args", [["-d", "cpu", "-n", "1", "--num_steps", "4"], ["-d", "cpu", "-n", "1", "--num_steps", "4", "--empty_lora"]]) @pytest.mark.parametrize("convert_model", ["tiny-random-latent-consistency"], indirect=True) @@ -132,19 +133,19 @@ def test_python_tool_llm_benchmark_jsonl(self, convert_model, generate_image_gen @pytest.mark.parametrize("generate_image_generation_jsonl", [("image_generation.jsonl", image_generation_json)], indirect=True) def test_python_tool_llm_benchmark_jsonl_lora(self, request, convert_model, download_model, generate_image_generation_jsonl, sample_args): model_name = request.node.callspec.params['download_model'] - + # Run Python benchmark benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [ - sys.executable, - benchmark_script, - "-m", convert_model, + sys.executable, + benchmark_script, + "-m", convert_model, "-pf", generate_image_generation_jsonl, "--lora", f'{download_model}/{model_name}.safetensors', ] + sample_args run_sample(benchmark_py_command) - - + + @pytest.mark.samples @pytest.mark.parametrize("sample_args", [["-d", "cpu", "-n", "1", "--num_steps", "4", "--task", "inpainting"]]) @pytest.mark.parametrize("convert_model", ["tiny-random-latent-consistency"], indirect=True) @@ -152,16 +153,16 @@ def test_python_tool_llm_benchmark_jsonl_lora(self, request, convert_model, down @pytest.mark.parametrize("download_mask_image", ["overture-creations-mask.png"], indirect=True) @pytest.mark.parametrize("generate_image_generation_jsonl", [("image_generation_inpainting.jsonl", image_generation_inpainting_json)], indirect=True) def test_python_tool_llm_benchmark_inpainting(self, convert_model, download_test_content, download_mask_image, generate_image_generation_jsonl, sample_args): - + # to use the relative media and mask_image paths os.chdir(os.path.dirname(download_test_content)) # Run Python benchmark benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [ - sys.executable, - benchmark_script, - "-m", convert_model, + 
sys.executable, + benchmark_script, + "-m", convert_model, "-pf", generate_image_generation_jsonl, ] + sample_args run_sample(benchmark_py_command) @@ -173,31 +174,33 @@ def test_python_tool_llm_benchmark_inpainting(self, convert_model, download_test @pytest.mark.parametrize("download_test_content", ["cat.png"], indirect=True) @pytest.mark.parametrize("generate_image_generation_jsonl", [("image_generation_i2i.jsonl", image_generation_i2i_json)], indirect=True) def test_python_tool_llm_benchmark_i2i(self, convert_model, download_test_content, generate_image_generation_jsonl, sample_args): - + # to use the relative media and mask_image paths os.chdir(os.path.dirname(download_test_content)) # Run Python benchmark benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [ - sys.executable, - benchmark_script, - "-m", convert_model, + sys.executable, + benchmark_script, + "-m", convert_model, "-pf", generate_image_generation_jsonl, ] + sample_args run_sample(benchmark_py_command) @pytest.mark.samples - @pytest.mark.parametrize("sample_args", [["-d", "cpu", "-n", "1", "-p", "'Why is the Sun yellow?'"], ["-d", "cpu", "-n", "1", "-p", "'Why is the Sun yellow?'", "--optimum"]]) + @pytest.mark.parametrize("sample_args", [ + ["-d", "cpu", "-n", "1", "-p", "'Why is the Sun yellow?'"], + ["-d", "cpu", "-n", "1", "-p", "'Why is the Sun yellow?'", "--optimum"]]) @pytest.mark.parametrize("convert_model", ["tiny-random-SpeechT5ForTextToSpeech"], indirect=True) @pytest.mark.parametrize("download_test_content", ["cmu_us_awb_arctic-wav-arctic_a0001.bin"], indirect=True) def test_python_tool_llm_benchmark_tts(self, convert_model, download_test_content, sample_args): # Run Python benchmark benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [ - sys.executable, - benchmark_script, + sys.executable, + benchmark_script, "-m", convert_model, "--speaker_embeddings", download_test_content ] + sample_args @@ -214,9 +217,9 @@ def test_python_tool_llm_benchmark_optimum(self, convert_model, download_test_co # Run Python benchmark benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [ - sys.executable, - benchmark_script, - "-m", convert_model, + sys.executable, + benchmark_script, + "-m", convert_model, "--media", media_path, ] + sample_args run_sample(benchmark_py_command) @@ -232,9 +235,9 @@ def test_python_tool_llm_benchmark_optimum(self, convert_model, download_test_co def test_python_tool_llm_benchmark_text_embeddings(self, convert_model, sample_args): benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [ - sys.executable, - benchmark_script, - "-m", convert_model, + sys.executable, + benchmark_script, + "-m", convert_model, ] + sample_args run_sample(benchmark_py_command) @@ -248,8 +251,8 @@ def test_python_tool_llm_benchmark_text_embeddings(self, convert_model, sample_a def test_python_tool_llm_benchmark_text_embeddings_qwen3(self, convert_model, sample_args): benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [ - sys.executable, - benchmark_script, + sys.executable, + benchmark_script, "-m", convert_model, ] + sample_args run_sample(benchmark_py_command) @@ -283,10 +286,9 @@ def test_python_tool_llm_benchmark_text_reranking_qwen3(self, model_id, sample_a model_schema = download_and_convert_model(model_id) benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py' benchmark_py_command = [ - sys.executable, - benchmark_script, - "-m", - 
model_schema.models_path, + sys.executable, + benchmark_script, + "-m", model_schema.models_path ] + sample_args run_sample(benchmark_py_command) @@ -306,3 +308,24 @@ def test_python_tool_llm_benchmark_gguf_format(self, sample_args): "-m", gguf_full_path, ] + sample_args run_sample(benchmark_py_command) + + + + @pytest.mark.samples + @pytest.mark.parametrize("download_test_content", ["video0.mp4"], indirect=True) + @pytest.mark.parametrize("convert_model, sample_args", [ + pytest.param("tiny-random-llava-next-video", ["-d", "cpu", "-n", "1", "--genai", "-vf", "5"]), + pytest.param("tiny-random-llava-next-video", ["-d", "cpu", "-n", "1", "--genai", "-vf", "-3"]), + pytest.param("tiny-random-llava-next-video", ["-d", "cpu", "-n", "1", "--optimum", "-vf", "5"]), + ], indirect=["convert_model"]) + def test_python_tool_llm_benchmark_video_prompts(self, download_test_content, convert_model, sample_args): + benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_py_command = [ + sys.executable, + benchmark_script, + "-m", convert_model, + "--video", download_test_content, + "--prompt", "What_is_presented_in_the_video?" + ] + benchmark_py_command.extend(sample_args) + run_sample(benchmark_py_command) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index e9de552a70..8168ad86cf 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -313,12 +313,12 @@ def ov_pipe_model(request: pytest.FixtureRequest) -> VlmModelInfo: ids=lambda p: f"{p[0]}/{p[1]}", indirect=["ov_pipe_model"], ) - + @pytest.fixture(scope="module") def ov_continious_batching_pipe() -> ContinuousBatchingPipeline: models_path = _get_ov_model(MODEL_IDS[0]) return ContinuousBatchingPipeline(models_path, SchedulerConfig(), "CPU") - + @pytest.fixture(scope="module") def ov_continious_batching_pipe_gemma() -> ContinuousBatchingPipeline: models_path = _get_ov_model(MODEL_IDS[8]) @@ -433,7 +433,7 @@ def test_images(request: pytest.FixtureRequest): def test_vlm_pipeline(ov_pipe_model: VlmModelInfo, test_images: list[openvino.Tensor]): ov_pipe = ov_pipe_model.pipeline result_from_streamer = [] - + def streamer(word: str) -> bool: nonlocal result_from_streamer result_from_streamer.append(word) diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py index e584d5fd82..f4f63a137c 100644 --- a/tools/llm_bench/benchmark.py +++ b/tools/llm_bench/benchmark.py @@ -78,6 +78,7 @@ def get_argprser(): 'if the value equals 0 (default), execute the warm-up iteration(0th iteration).', ) parser.add_argument('-i', '--images', default=None, help='test images for vision tasks. Can be directory or path to single image') + parser.add_argument('-v', '--video', default=None, help='test video for vision tasks. Can be directory or path to single video') parser.add_argument('-s', '--seed', type=int, default=42, required=False, help='specific random seed to generate fix result. 
Default 42.') parser.add_argument( '-lc', @@ -229,15 +230,17 @@ def get_argprser(): help="Path to .bin or .pt file with speaker embeddings for text to speech scenarios") parser.add_argument("--vocoder_path", type=str, default=None, help="Path to vocoder for text to speech scenarios") + parser.add_argument("-vf", "--video_frames", type=int, default=None, + help="controller of video frames to process (required frame number if positive or decimation factor if negative)") return parser.parse_args() CASE_TO_BENCH = { - 'text_gen': bench_text.run_text_generation_benchmark, - 'image_gen': bench_image.run_image_generation_benchmark, - 'code_gen': bench_text.run_text_generation_benchmark, - 'ldm_super_resolution': bench_ldm_sr.run_ldm_super_resolution_benchmark, - 'speech_to_text': bench_speech.run_speech_2_txt_benchmark, + "text_gen": bench_text.run_text_generation_benchmark, + "image_gen": bench_image.run_image_generation_benchmark, + "code_gen": bench_text.run_text_generation_benchmark, + "ldm_super_resolution": bench_ldm_sr.run_ldm_super_resolution_benchmark, + "speech_to_text": bench_speech.run_speech_2_txt_benchmark, "visual_text_gen": bench_vlm.run_visual_language_generation_benchmark, "text_embed": bench_text_embed.run_text_embddings_benchmark, "text_to_speech": bench_text_to_speech.run_text_2_speech_benchmark, @@ -316,6 +319,7 @@ def main(): else: iter_data_list, pretrain_time, iter_timestamp = CASE_TO_BENCH[model_args['use_case'].task]( model_path, framework, args.device, model_args, args.num_iters, memory_data_collector) + if args.report is not None or args.report_json is not None: model_precision = '' if framework == 'ov': diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py index 7a83d525ec..1a9ccaf237 100644 --- a/tools/llm_bench/llm_bench_utils/model_utils.py +++ b/tools/llm_bench/llm_bench_utils/model_utils.py @@ -43,6 +43,7 @@ def get_param_from_file(args, input_key): elif args[input_key] is not None and args['prompt_file'] is not None: raise RuntimeError(f'== {input_key} and prompt file should not exist together ==') + else: if args[input_key] is not None: if args[input_key] != '': @@ -56,11 +57,15 @@ def get_param_from_file(args, input_key): if "media" in input_key: if args["media"] is None and args["images"] is None: if args["use_case"].task == "visual_text_gen": - log.warn("Input image is not provided. Only text generation part will be evaluated") + if args["video"] is None: + log.warn("Input image/video is not provided. Only text generation part will be evaluated") elif args["use_case"].task != "image_gen": raise RuntimeError("No input image. ImageToImage/Inpainting Models cannot start generation without one. Please, provide an image.") else: data_dict["media"] = args["media"] if args["media"] is not None else args["images"] + if "video" in input_key and args["video"] is not None: + data_dict["video"] = args["video"] + if args["prompt"] is None: if args["use_case"].task == "visual_text_gen": data_dict["prompt"] = "What is OpenVINO?" 
if data_dict.get("media") is None else "Describe image" @@ -112,6 +117,7 @@ def analyze_args(args): model_args["height"] = args.height model_args["width"] = args.width model_args['images'] = args.images + model_args['video'] = args.video model_args['seed'] = args.seed model_args['mem_consumption'] = args.memory_consumption model_args['batch_size'] = args.batch_size @@ -135,7 +141,7 @@ def analyze_args(args): model_args["rerank_texts"] = args.texts model_args["rerank_texts_file"] = args.texts_file model_args["apply_chat_template"] = args.apply_chat_template - + model_args["video_frames"] = args.video_frames optimum = args.optimum if optimum and args.genai: diff --git a/tools/llm_bench/llm_bench_utils/parse_json_data.py b/tools/llm_bench/llm_bench_utils/parse_json_data.py index 4e2a13f841..e93bfe0e45 100644 --- a/tools/llm_bench/llm_bench_utils/parse_json_data.py +++ b/tools/llm_bench/llm_bench_utils/parse_json_data.py @@ -2,60 +2,52 @@ # Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +def create_base_prompt(json_data, key='prompt'): + prompt_data = {} + if key not in json_data: + raise RuntimeError(f"== key word '{key}' does not exist ==") + if json_data[key] == "": + raise RuntimeError(f"== {key} should not be empty string ==") + prompt_data[key] = json_data[key] + return prompt_data + def parse_text_json_data(json_data_list): text_param_list = [] for json_data in json_data_list: - if 'prompt' in json_data: - if json_data['prompt'] != '': - text_param_list.append(json_data['prompt']) - else: - raise RuntimeError('== prompt should not be empty string ==') - else: - raise RuntimeError('== key word "prompt" does not exist ==') + prompt_data = create_base_prompt(json_data) + text_param_list.append(prompt_data["prompt"]) return text_param_list def parse_vlm_json_data(json_data_list): text_param_list = [] for json_data in json_data_list: - prompt_data = {} - if 'prompt' in json_data: - if json_data['prompt'] != '': - prompt_data["prompt"] = json_data['prompt'] - else: - raise RuntimeError('== prompt should not be empty string ==') - else: - raise RuntimeError('== key word "prompt" does not exist ==') + prompt_data = create_base_prompt(json_data) if "media" in json_data: prompt_data["media"] = json_data["media"] + if "video" in json_data: + prompt_data["video"] = json_data["video"] text_param_list.append(prompt_data) return text_param_list def parse_image_json_data(json_data_list): image_param_list = [] - for data in json_data_list: - image_param = {} - if 'prompt' in data: - if data['prompt'] != '': - image_param['prompt'] = data['prompt'] - else: - raise RuntimeError('== prompt should not be empty string ==') - else: - raise RuntimeError('== key word "prompt" does not exist in prompt file ==') - if 'width' in data: - image_param['width'] = int(data['width']) - if 'height' in data: - image_param['height'] = int(data['height']) - if 'steps' in data: - image_param['steps'] = int(data['steps']) - if 'guidance_scale' in data: - image_param['guidance_scale'] = float(data['guidance_scale']) - if 'media' in data: - image_param['media'] = data['media'] - if 'mask_image' in data: - image_param['mask_image'] = data['mask_image'] + for json_data in json_data_list: + image_param = create_base_prompt(json_data) + if 'width' in json_data: + image_param['width'] = int(json_data['width']) + if 'height' in json_data: + image_param['height'] = int(json_data['height']) + if 'steps' in json_data: + image_param['steps'] = int(json_data['steps']) + if 'guidance_scale' in json_data: + 
image_param['guidance_scale'] = float(json_data['guidance_scale']) + if 'media' in json_data: + image_param['media'] = json_data['media'] + if 'mask_image' in json_data: + image_param['mask_image'] = json_data['mask_image'] image_param_list.append(image_param) return image_param_list @@ -63,17 +55,10 @@ def parse_image_json_data(json_data_list): def parse_speech_json_data(json_data_list): speech_param_list = [] for json_data in json_data_list: - speech_param = {} - if 'media' in json_data: - if json_data['media'] != '': - speech_param['media'] = json_data['media'] - else: - raise RuntimeError('== media path should not be empty string ==') - else: - raise RuntimeError('== key word "media" does not exist ==') - if 'language' in json_data: - speech_param['language'] = json_data['language'] - if 'timestamp' in json_data: - speech_param['timestamp'] = json_data['timestamp'] + speech_param = create_base_prompt(json_data, "media") + if "language" in json_data: + speech_param["language"] = json_data["language"] + if "timestamp" in json_data: + speech_param["timestamp"] = json_data["timestamp"] speech_param_list.append(speech_param) return speech_param_list diff --git a/tools/llm_bench/llm_bench_utils/prompt_utils.py b/tools/llm_bench/llm_bench_utils/prompt_utils.py index 5f19cb6d7f..dbff0e87f4 100644 --- a/tools/llm_bench/llm_bench_utils/prompt_utils.py +++ b/tools/llm_bench/llm_bench_utils/prompt_utils.py @@ -2,8 +2,20 @@ # Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + +import os +import numpy as np +from PIL import Image +import logging as log +from transformers.image_utils import load_image from .model_utils import get_param_from_file +from .model_utils import resolve_media_file_path from .parse_json_data import parse_text_json_data +from .parse_json_data import parse_vlm_json_data +from pathlib import Path +import openvino as ov +import math +import cv2 def get_text_prompt(args): @@ -17,3 +29,116 @@ def get_text_prompt(args): else: text_list.append(output_data_list[0]) return text_list + + +def print_video_frames_number_and_convert_to_tensor(func): + def inner(video_path, decim_frames, genai_flag): + log.info(f"Input video file: {video_path}") + if decim_frames is not None: + log.info(f"Requested to reduce into {decim_frames} frames") + out_frames = func(video_path, decim_frames) + log.info(f"Final frames number: {len(out_frames)}") + log.info(f"First frame shape: {out_frames[0].shape}") + log.info(f"First frame dtype: {out_frames[0].dtype}") + if genai_flag: + return ov.Tensor(out_frames) + return np.array(out_frames) + return inner + + +@print_video_frames_number_and_convert_to_tensor +def make_video_tensor(video_path, decim_frames=None): + assert os.path.exists(video_path), f"no input video file: {video_path}" + cap = cv2.VideoCapture(video_path) + + output_frames = [] + while True: + ret, frame = cap.read() + if not ret: + break + + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + pil_image = Image.fromarray(frame_rgb) + + np_img_array = np.array(pil_image) + log.debug(f"Video shape: {np_img_array.shape}") + log.debug(f"Video dtype: {np_img_array.dtype}") + output_frames.append(np_img_array) + + if not decim_frames: + log.info(f"Video decim: no-set: {decim_frames}: skip") + return output_frames + + # decimation procedure + # decim_frames is required max frame number if positive + # or decimation factor if negative + # e.g. 
if input frames number is 100 and decim_fames = 5: + # then number of processed frames are: 0, 20, 40, 60, 80 + # if input frames number is 100 and decim_fames = -5: + # then number of processed frames are: 0, 5, 10, 15, 20, ... + + decim_frames = int(decim_frames) + if decim_frames > 0: + if len(output_frames) <= decim_frames: + log.info(f"Video decim: too short to decim: crop: {decim_frames}") + return list(output_frames[:decim_frames]) + decim_factor_f = float(len(output_frames)) / decim_frames + decim_factor = int(math.ceil(decim_factor_f)) + else: + decim_factor = -decim_frames + log.info(f"Video decim factor: {decim_factor}") + if decim_factor >= 2: + return list(output_frames[::decim_factor]) + log.info("Video decim: too large decim factor: skip") + return output_frames + + +def load_image_genai(image_path): + pil_image = load_image(image_path) + image_data = np.array(pil_image)[None] + return ov.Tensor(image_data) + + +def extract_prompt_data(inputs, required_frames, genai_flag): + prompts, images, videos = [], [], [] + if not isinstance(inputs, (list, tuple, set)): + inputs = [inputs] + for input_data in inputs: + if input_data.get("video") is not None: + entry = Path(input_data["video"]) + if entry.is_dir(): + for filename in sorted(entry.iterdir()): + video_tensor = make_video_tensor(filename, required_frames, genai_flag) + videos.append(video_tensor) + else: + video_tensor = make_video_tensor(entry, required_frames, genai_flag) + videos.append(video_tensor) + if input_data.get("media") is not None: + func_load_image = load_image_genai if genai_flag else load_image + entry = Path(input_data["media"]) + if entry.is_dir(): + for file in sorted(entry.iterdir()): + img = func_load_image(str(file)) + images.append(img) + else: + img = func_load_image(input_data["media"]) + images.append(img) + prompts.append(input_data["prompt"]) + return prompts, images, videos + + +def get_image_text_prompt(args): + vlm_file_list = [] + output_data_list, is_json_data = get_param_from_file(args, ["video", "media", "prompt"]) + if is_json_data: + vlm_param_list = parse_vlm_json_data(output_data_list) + if len(vlm_param_list) > 0: + for vlm_file in vlm_param_list: + if args['prompt_file'] is not None and len(args['prompt_file']) > 0 and 'media' in vlm_file: + vlm_file['media'] = resolve_media_file_path(vlm_file.get('media'), args['prompt_file'][0]) + if args['prompt_file'] is not None and len(args['prompt_file']) > 0 and 'video' in vlm_file: + vlm_file['video'] = resolve_media_file_path(vlm_file.get('video'), args['prompt_file'][0]) + vlm_file_list.append(vlm_file) + else: + vlm_file_list.append(output_data_list) + return vlm_file_list diff --git a/tools/llm_bench/requirements.txt b/tools/llm_bench/requirements.txt index ca1f523560..2829489eda 100644 --- a/tools/llm_bench/requirements.txt +++ b/tools/llm_bench/requirements.txt @@ -8,7 +8,7 @@ pillow torch transformers[sentencepiece]>=4.40.0 diffusers>=0.22.0 -#optimum is in dependency list of optimum-intel +#optimum is in dependency list of optimum-intel optimum-intel[nncf]>=1.25.0 packaging psutil @@ -21,3 +21,4 @@ scipy gguf_parser gguf>=0.10 num2words +opencv-python diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py index 4b626bdfaa..5c513b5ad4 100644 --- a/tools/llm_bench/task/visual_language_generation.py +++ b/tools/llm_bench/task/visual_language_generation.py @@ -9,49 +9,47 @@ import llm_bench_utils.pt_utils import llm_bench_utils.model_utils as model_utils import numpy as np 
-import openvino as ov import hashlib -import llm_bench_utils.metrics_print as metrics_print from transformers import set_seed -from transformers.image_utils import load_image import llm_bench_utils.output_file +import llm_bench_utils.metrics_print as metrics_print import llm_bench_utils.gen_output_data as gen_output_data -import llm_bench_utils.parse_json_data as parse_json_data -from pathlib import Path - +from llm_bench_utils.prompt_utils import extract_prompt_data +from llm_bench_utils.prompt_utils import get_image_text_prompt -FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils} DEFAULT_OUTPUT_TOKEN_SIZE = 512 +FW_UTILS = { + 'pt': llm_bench_utils.pt_utils, + 'ov': llm_bench_utils.ov_utils +} def run_visual_language_generation_optimum( - inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index, bench_hook, model_precision, proc_id, mem_consumption -): + inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index, + bench_hook, model_precision, proc_id, mem_consumption): from optimum.intel.utils.import_utils import is_transformers_version set_seed(args['seed']) if args['batch_size'] != 1: log.warning("Only batch size 1 available for benchmarking") args["batch_size"] = 1 - images = [] - prompts = [] - inputs = [inputs] if not isinstance(inputs, (list, tuple)) else inputs - for input_data in inputs: - if input_data.get("media", None): - entry = Path(input_data["media"]) - if entry.is_dir(): - for file in sorted(entry.iterdir()): - images.append(load_image(str(file))) - else: - images.append(load_image(input_data["media"])) - prompts.append(input_data["prompt"]) - prefix = '[warm-up]' if num == 0 else '[{}]'.format(num) - log.info(f'{prefix}[P{prompt_index}] Input image nums:{len(images)}') + + decim_frames = args["video_frames"] + prompts, images, videos = extract_prompt_data(inputs, decim_frames, False) if args["output_dir"] is not None and num == 0: for bs_index, in_text in enumerate(prompts): - llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) + llm_bench_utils.output_file.output_input_text( + in_text, args, model_precision, + prompt_index, bs_index, proc_id) tok_encode_start = time.perf_counter() - input_data = model.preprocess_inputs(text=prompts[0], image=images[0] if images else None, **processor) + + prefix = '[warm-up]' if num == 0 else '[{}]'.format(num) + log.info(f'{prefix}[P{prompt_index}] Input image nums: {len(images)}') + log.info(f'{prefix}[P{prompt_index}] Input video nums: {len(videos)}') + input_data = model.preprocess_inputs(image=images[0] if images else None, + video=videos[0] if videos else None, + text=prompts[0], **processor) + tok_encode_end = time.perf_counter() tok_encode_time = (tok_encode_end - tok_encode_start) * 1000 # Remove `token_type_ids` from inputs @@ -182,33 +180,21 @@ def run_visual_language_generation_optimum( bench_hook.clear_mm_embeddins_time_list() -def load_image_genai(image_path): - pil_image = load_image(image_path) - image_data = np.array(pil_image)[None] - return ov.Tensor(image_data) - - def run_visual_language_generation_genai( - inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index, streamer, model_precision, proc_id, mem_consumption -): + inputs, num, model, processor, args, iter_data_list, md5_list, + prompt_index, streamer, model_precision, proc_id, mem_consumption): if args['batch_size'] != 1: log.warning("Only batch size 1 available for benchmarking") args["batch_size"] = 1 - images = [] - 
prompts = [] - inputs = [inputs] if not isinstance(inputs, (list, tuple)) else inputs - for input_data in inputs: - if input_data.get("media", None): - entry = Path(input_data["media"]) - if entry.is_dir(): - for file in sorted(entry.iterdir()): - images.append(load_image_genai(str(file))) - else: - images.append(load_image_genai(input_data["media"])) - prompts.append(input_data["prompt"]) + + decim_frames = args["video_frames"] + prompts, images, videos = extract_prompt_data(inputs, decim_frames, True) if args["output_dir"] is not None and num == 0: for bs_index, in_text in enumerate(prompts): - llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) + llm_bench_utils.output_file.output_input_text( + in_text, args, model_precision, + prompt_index, bs_index, proc_id) + max_rss_mem_consumption = '' max_sys_mem_consumption = '' max_rss_mem_increase = '' @@ -221,11 +207,17 @@ def run_visual_language_generation_genai( gen_config.num_beams = args["num_beams"] gen_config.do_sample = False gen_config.ignore_eos = True + kwargs = {} - if len(images) >= 1: - kwargs["images"] = images prefix = '[warm-up]' if num == 0 else '[{}]'.format(num) - log.info(f'{prefix}[P{prompt_index}] Input image nums:{len(images)}') + log.info(f'{prefix}[P{prompt_index}] Input image nums: {len(images)}') + log.info(f'{prefix}[P{prompt_index}] Input video nums: {len(videos)}') + + if images: + kwargs["images"] = images + if videos: + kwargs["videos"] = videos + start = time.perf_counter() generation_result = model.generate(prompts[0], generation_config=gen_config, **kwargs) end = time.perf_counter() @@ -305,7 +297,8 @@ def run_visual_language_generation_genai( def run_visual_language_generation_benchmark(model_path, framework, device, args, num_iters, mem_consumption): - model, processor, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_image_text_gen_model(model_path, device, mem_consumption, **args) + outs = FW_UTILS[framework].create_image_text_gen_model(model_path, device, mem_consumption, **args) + model, processor, pretrain_time, bench_hook, use_genai = outs model_precision = model_utils.get_model_precision(model_path.parts) iter_data_list = [] md5_list = {num : {} for num in range(num_iters + 1)} @@ -325,10 +318,10 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args log.info(f"Numbeams: {args['num_beams']}, benchmarking iter nums(exclude warm-up): {num_iters}, " f'prompt nums: {len(image_text_list)}, prompt idx: {prompt_idx_list}') - if not use_genai: - gen_fn = run_visual_language_generation_optimum - else: + if use_genai: gen_fn = run_visual_language_generation_genai + else: + gen_fn = run_visual_language_generation_optimum proc_id = os.getpid() iter_timestamp = model_utils.init_timestamp(num_iters, image_text_list, prompt_idx_list) @@ -337,42 +330,29 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args for idx, input_text in enumerate(image_text_list): p_idx = prompt_idx_list[idx] if num == 0: - metrics_print.print_unicode(f'[warm-up][P{p_idx}] Input text: {input_text}', max_output=metrics_print.MAX_INPUT_TXT_IN_LOG) + prefix = f'[warm-up][P{p_idx}] Input text: {input_text}' + metrics_print.print_unicode(prefix, max_output=metrics_print.MAX_INPUT_TXT_IN_LOG) iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat() gen_fn( input_text, num, model, processor, args, iter_data_list, md5_list, p_idx, bench_hook, model_precision, proc_id, mem_consumption) 
iter_timestamp[num][p_idx]['end'] = datetime.datetime.now().isoformat() - prefix = '[warm-up]' if num == 0 else '[{}]'.format(num) - log.info(f"{prefix}[P{p_idx}] start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}") + prefix = f"[warm-up][P{p_idx}]" if num == 0 else f"[{num}][P{p_idx}]" + log.info(f"{prefix} start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}") else: for idx, input_text in enumerate(image_text_list): p_idx = prompt_idx_list[idx] for num in range(num_iters + 1): if num == 0: - metrics_print.print_unicode(f'[warm-up][P{p_idx}] Input text: {input_text}', max_output=metrics_print.MAX_INPUT_TXT_IN_LOG) + prefix = f'[warm-up][P{p_idx}] Input text: {input_text}' + metrics_print.print_unicode(prefix, max_output=metrics_print.MAX_INPUT_TXT_IN_LOG) iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat() gen_fn( input_text, num, model, processor, args, iter_data_list, md5_list, prompt_idx_list[idx], bench_hook, model_precision, proc_id, mem_consumption) iter_timestamp[num][p_idx]['end'] = datetime.datetime.now().isoformat() - prefix = '[warm-up]' if num == 0 else '[{}]'.format(num) - log.info(f"{prefix}[P{p_idx}] start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}") + prefix = f"[warm-up][P{p_idx}]" if num == 0 else f"[{num}][P{p_idx}]" + log.info(f"{prefix} start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}") metrics_print.print_average(iter_data_list, prompt_idx_list, args['batch_size'], True) return iter_data_list, pretrain_time, iter_timestamp - - -def get_image_text_prompt(args): - vlm_file_list = [] - output_data_list, is_json_data = model_utils.get_param_from_file(args, ['media', "prompt"]) - if is_json_data: - vlm_param_list = parse_json_data.parse_vlm_json_data(output_data_list) - if len(vlm_param_list) > 0: - for vlm_file in vlm_param_list: - if args['prompt_file'] is not None and len(args['prompt_file']) > 0: - vlm_file['media'] = model_utils.resolve_media_file_path(vlm_file.get("media"), args['prompt_file'][0]) - vlm_file_list.append(vlm_file) - else: - vlm_file_list.append(output_data_list) - return vlm_file_list
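
The frame-selection rule behind the new -vf/--video_frames option, restated as a minimal standalone sketch of the logic in make_video_tensor (assumption: it operates on frame indices instead of decoded cv2 frames, and select_frames is a hypothetical helper used only for illustration):

    import math

    def select_frames(num_frames, video_frames=None):
        # Positive video_frames caps how many frames are kept;
        # negative video_frames is a decimation factor (keep every |value|-th frame);
        # None/0 keeps everything, mirroring the "no-set" branch above.
        indices = list(range(num_frames))
        if not video_frames:
            return indices
        if video_frames > 0:
            if num_frames <= video_frames:
                return indices[:video_frames]  # already short enough: just crop
            factor = math.ceil(num_frames / video_frames)
        else:
            factor = -video_frames
        return indices[::factor] if factor >= 2 else indices

    # select_frames(100, 5)  -> [0, 20, 40, 60, 80]
    # select_frames(100, -5) -> [0, 5, 10, ..., 95]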
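With these changes a VLM prompt-file entry can reference a clip through the new "video" key alongside "prompt" and "media", and get_image_text_prompt resolves the path against the prompt file location via resolve_media_file_path, e.g. a hypothetical JSONL line: {"prompt": "What is presented in the video?", "video": "video0.mp4"}. On the command line the equivalent inputs are --video (a single file or a directory of clips) together with -vf/--video_frames, as exercised by test_python_tool_llm_benchmark_video_prompts above.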