[NPU] Add batch_size support for embedding model (#2986)

mengweiguo · sbalandi · web-flow · commit 1d5d3b84a6c7 · 2025-11-24T12:05:16.000Z
## Description  The `qwen3-embedding-0.6B` model failed the `wwb` test on NPU because its batch size is dynamic. This PR adds `batch_size` option support for this model in `llm_bench` and `wwb`.  [CVS-176378](https://jira.devtools.intel.com/browse/CVS-176378) ## Checklist: - [ ] Tests have been updated or added to cover the new code.  - [ ] This patch fully addresses the ticket.  - [ ] I have made corresponding changes to the documentation.  --------- Co-authored-by: Sofya Balandina <sofya.balandina@intel.com>
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -683,6 +683,7 @@ def create_genai_text_embed_model(model_path, device, memory_data_collector, **k
     if padding_side:
         config.padding_side = padding_side
 
+    config.batch_size = kwargs.get("batch_size", config.batch_size)
     ov_config = kwargs['config']
 
     if kwargs.get("mem_consumption"):
diff --git a/tools/who_what_benchmark/tests/test_cli_embeddings.py b/tools/who_what_benchmark/tests/test_cli_embeddings.py
@@ -92,3 +92,94 @@ def test_embeddings_basic(model_id, model_type, tmp_path):
         model_type,
         "--genai",
     ])
+
+
+@pytest.mark.parametrize(
+    ("model_id", "model_type", "batch_size"),
+    [
+        ("Qwen/Qwen3-Embedding-0.6B", "text-embedding", 1),
+        ("Qwen/Qwen3-Embedding-0.6B", "text-embedding", 12),
+    ],
+)
+def test_embeddings_with_batch(model_id, model_type, batch_size, tmp_path):
+    GT_FILE = tmp_path / f"gt_batch_{batch_size}.csv"  # reference data file, one per batch size
+    MODEL_PATH = tmp_path / model_id.replace("/", "_")  # output directory for the exported model
+
+    result = subprocess.run(["optimum-cli", "export",
+                             "openvino", "-m", model_id,
+                             MODEL_PATH, "--task",
+                             "feature-extraction",
+                             "--trust-remote-code"],
+                            capture_output=True,
+                            text=True,
+                            )
+    assert result.returncode == 0  # the export must succeed before any wwb run below
+
+    # Step 1: collect reference data with the HF base model (--hf); wwb is given GT_FILE via --gt-data
+    run_wwb([
+        "--base-model",
+        model_id,
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+        "--embeds_batch_size",
+        str(batch_size),
+        "--hf",
+    ])
+
+    # Step 2: evaluate the exported model through Optimum (no --genai flag) against GT_FILE
+    run_wwb([
+        "--target-model",
+        MODEL_PATH,
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+        "--embeds_batch_size",
+        str(batch_size),
+    ])
+
+    # Step 3: evaluate the exported model through GenAI (--genai); --output points at tmp_path
+    run_wwb([
+        "--target-model",
+        MODEL_PATH,
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+        "--genai",
+        "--output",
+        tmp_path,
+        "--embeds_batch_size",
+        str(batch_size),
+    ])
+
+    # Step 4: evaluate without a model, from target.csv (presumably written by step 3's --output run - confirm)
+    run_wwb([
+        "--target-data",
+        tmp_path / "target.csv",
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+        "--genai",
+        "--embeds_batch_size",
+        str(batch_size),
+    ])
diff --git a/tools/who_what_benchmark/whowhatbench/embeddings_evaluator.py b/tools/who_what_benchmark/whowhatbench/embeddings_evaluator.py
@@ -1,5 +1,6 @@
 from typing import Any, Union
 
+import itertools
 import os
 import torch
 import numpy as np
@@ -66,7 +67,8 @@ def __init__(
         gen_embeds_fn=None,
         pooling_type=None,
         normalize=None,
-        padding_side=None
+        padding_side=None,
+        batch_size=None
     ) -> None:
         assert (
             base_model is not None or gt_data is not None
@@ -80,6 +82,7 @@ def __init__(
         self.normalize = normalize or False
         self.padding_side = padding_side or 'right'
         self.gt_dir = os.path.dirname(gt_data)
+        self.batch_size = batch_size
 
         if base_model:
             self.gt_data = self._generate_data(
@@ -178,8 +181,19 @@ def default_gen_answer(model, tokenizer, passages, **kwargs):
             kwargs = {'padding_side': self.padding_side,
                       'pooling_type': self.pooling_type,
                       'normalize': self.normalize}
-            result = gen_answer_fn(model, self.tokenizer, data[0], **kwargs)
-            passages.append(data[0])
+
+            batch_size = self.batch_size or len(data[0])
+            data_len = len(data[0])
+
+            if batch_size <= data_len:
+                data_input = data[0][:batch_size]
+            else:
+                # Duplicate data to reach batch_size
+                data_input = list(itertools.islice(itertools.cycle(data[0]), batch_size))
+
+            result = gen_answer_fn(model, self.tokenizer, data_input, **kwargs)
+
+            passages.append(data_input)
             result_path = os.path.join(result_dir, f"embeds_{i}.npy")
             with open(result_path, 'wb') as f:
                 np.save(f, result)
diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -518,6 +518,7 @@ def load_embedding_genai_pipeline(model_dir, device="CPU", ov_config=None, **kwa
     config.max_length = EMBED_DEFAULT_MAX_LENGTH
     config.normalize = kwargs.get("embeds_normalize", False)
     config.pad_to_max_length = True
+    config.batch_size = kwargs.get("embeds_batch_size", config.batch_size)
 
     logger.info("Using OpenVINO GenAI TextEmbeddingPipeline API")
     pipeline = openvino_genai.TextEmbeddingPipeline(model_dir, device.upper(), config, **ov_config)
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -219,6 +219,11 @@ def parse_args():
         choices=["left", "right"],
         default=None,
         help="Side to use for padding 'left' or 'right'. Applicable only for text embeddings")
+    parser.add_argument(
+        "--embeds_batch_size",
+        type=int,
+        default=None,
+        help="Batch size value. Applicable only for text embeddings")
     parser.add_argument(
         "--rag-config",
         type=str,
@@ -261,7 +266,6 @@ def parse_args():
         default=None,
         help="Config option assistant_confidence_threshold for Speculative decoding.",
     )
-
     return parser.parse_args()
 
 
@@ -635,6 +639,7 @@ def create_evaluator(base_model, args):
                 pooling_type=args.embeds_pooling_type,
                 normalize=args.embeds_normalize,
                 padding_side=args.embeds_padding_side,
+                batch_size=args.embeds_batch_size,
             )
         elif task == "text-reranking":
             return EvaluatorCLS(
@@ -771,6 +776,7 @@ def main():
         kwargs["embeds_pooling"] = args.embeds_pooling_type
         kwargs["embeds_normalize"] = args.embeds_normalize
         kwargs["embeds_padding_side"] = args.embeds_padding_side
+        kwargs["embeds_batch_size"] = args.embeds_batch_size
 
     if args.draft_model is not None:
         kwargs["draft_model"] = args.draft_model