Skip to content

Commit 1d5d3b8

Browse files
mengweiguo and sbalandi authored
[NPU] Add batch_size support for embedding model (#2986)
<!-- Keep your pull requests (PRs) as atomic as possible. That increases the likelihood that an individual PR won't be stuck because of adjacent problems, merge conflicts, or code review. Your merged PR is going to appear in the automatically generated release notes on GitHub. So the clearer the title the better. -->

## Description

<!-- Please include a summary of the change. Also include relevant motivation and context. -->
The `qwen3-embedding-0.6B` model failed the `wwb` test on NPU because of the dynamic batch size. This PR adds support for a `batch_size` option for this model in `llm-benchmark` and `wwb`.

<!-- Jira ticket number (e.g., 123). Delete if there's no ticket. -->
[CVS-176378](https://jira.devtools.intel.com/browse/CVS-176378)

## Checklist:

- [ ] Tests have been updated or added to cover the new code. <!-- If the change isn't maintenance related, update the tests at https://github.com/openvinotoolkit/openvino.genai/tree/master/tests or explain in the description why the tests don't need an update. -->
- [ ] This patch fully addresses the ticket. <!--- If follow-up pull requests are needed, specify in description. -->
- [ ] I have made corresponding changes to the documentation. <!-- Run github.com/\<username>/openvino.genai/actions/workflows/deploy_gh_pages.yml on your fork with your branch as a parameter to deploy a test version with the updated content. Replace this comment with the link to the built docs. -->

---------

Co-authored-by: Sofya Balandina <sofya.balandina@intel.com>
1 parent f76e8cb commit 1d5d3b8

File tree

5 files changed

+117
-4
lines changed

5 files changed

+117
-4
lines changed

tools/llm_bench/llm_bench_utils/ov_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,7 @@ def create_genai_text_embed_model(model_path, device, memory_data_collector, **k
683683
if padding_side:
684684
config.padding_side = padding_side
685685

686+
config.batch_size = kwargs.get("batch_size", config.batch_size)
686687
ov_config = kwargs['config']
687688

688689
if kwargs.get("mem_consumption"):

tools/who_what_benchmark/tests/test_cli_embeddings.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,94 @@ def test_embeddings_basic(model_id, model_type, tmp_path):
9292
model_type,
9393
"--genai",
9494
])
95+
96+
97+
@pytest.mark.parametrize(
98+
("model_id", "model_type", "batch_size"),
99+
[
100+
("Qwen/Qwen3-Embedding-0.6B", "text-embedding", 1),
101+
("Qwen/Qwen3-Embedding-0.6B", "text-embedding", 12),
102+
],
103+
)
104+
def test_embeddings_with_batch(model_id, model_type, batch_size, tmp_path):
105+
GT_FILE = tmp_path / f"gt_batch_{batch_size}.csv"
106+
MODEL_PATH = tmp_path / model_id.replace("/", "_")
107+
108+
result = subprocess.run(["optimum-cli", "export",
109+
"openvino", "-m", model_id,
110+
MODEL_PATH, "--task",
111+
"feature-extraction",
112+
"--trust-remote-code"],
113+
capture_output=True,
114+
text=True,
115+
)
116+
assert result.returncode == 0
117+
118+
# Collect reference with HF model
119+
run_wwb([
120+
"--base-model",
121+
model_id,
122+
"--num-samples",
123+
"1",
124+
"--gt-data",
125+
GT_FILE,
126+
"--device",
127+
"CPU",
128+
"--model-type",
129+
model_type,
130+
"--embeds_batch_size",
131+
str(batch_size),
132+
"--hf",
133+
])
134+
135+
# test Optimum
136+
run_wwb([
137+
"--target-model",
138+
MODEL_PATH,
139+
"--num-samples",
140+
"1",
141+
"--gt-data",
142+
GT_FILE,
143+
"--device",
144+
"CPU",
145+
"--model-type",
146+
model_type,
147+
"--embeds_batch_size",
148+
str(batch_size),
149+
])
150+
151+
# test GenAI
152+
run_wwb([
153+
"--target-model",
154+
MODEL_PATH,
155+
"--num-samples",
156+
"1",
157+
"--gt-data",
158+
GT_FILE,
159+
"--device",
160+
"CPU",
161+
"--model-type",
162+
model_type,
163+
"--genai",
164+
"--output",
165+
tmp_path,
166+
"--embeds_batch_size",
167+
str(batch_size),
168+
])
169+
170+
# test w/o models
171+
run_wwb([
172+
"--target-data",
173+
tmp_path / "target.csv",
174+
"--num-samples",
175+
"1",
176+
"--gt-data",
177+
GT_FILE,
178+
"--device",
179+
"CPU",
180+
"--model-type",
181+
model_type,
182+
"--genai",
183+
"--embeds_batch_size",
184+
str(batch_size),
185+
])

tools/who_what_benchmark/whowhatbench/embeddings_evaluator.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import Any, Union
22

3+
import itertools
34
import os
45
import torch
56
import numpy as np
@@ -66,7 +67,8 @@ def __init__(
6667
gen_embeds_fn=None,
6768
pooling_type=None,
6869
normalize=None,
69-
padding_side=None
70+
padding_side=None,
71+
batch_size=None
7072
) -> None:
7173
assert (
7274
base_model is not None or gt_data is not None
@@ -80,6 +82,7 @@ def __init__(
8082
self.normalize = normalize or False
8183
self.padding_side = padding_side or 'right'
8284
self.gt_dir = os.path.dirname(gt_data)
85+
self.batch_size = batch_size
8386

8487
if base_model:
8588
self.gt_data = self._generate_data(
@@ -178,8 +181,19 @@ def default_gen_answer(model, tokenizer, passages, **kwargs):
178181
kwargs = {'padding_side': self.padding_side,
179182
'pooling_type': self.pooling_type,
180183
'normalize': self.normalize}
181-
result = gen_answer_fn(model, self.tokenizer, data[0], **kwargs)
182-
passages.append(data[0])
184+
185+
batch_size = self.batch_size or len(data[0])
186+
data_len = len(data[0])
187+
188+
if batch_size <= data_len:
189+
data_input = data[0][:batch_size]
190+
else:
191+
# Fewer passages than batch_size: repeat them cyclically until the batch is full
192+
data_input = list(itertools.islice(itertools.cycle(data[0]), batch_size))
193+
194+
result = gen_answer_fn(model, self.tokenizer, data_input, **kwargs)
195+
196+
passages.append(data_input)
183197
result_path = os.path.join(result_dir, f"embeds_{i}.npy")
184198
with open(result_path, 'wb') as f:
185199
np.save(f, result)

tools/who_what_benchmark/whowhatbench/model_loaders.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,7 @@ def load_embedding_genai_pipeline(model_dir, device="CPU", ov_config=None, **kwa
518518
config.max_length = EMBED_DEFAULT_MAX_LENGTH
519519
config.normalize = kwargs.get("embeds_normalize", False)
520520
config.pad_to_max_length = True
521+
config.batch_size = kwargs.get("embeds_batch_size", config.batch_size)
521522

522523
logger.info("Using OpenVINO GenAI TextEmbeddingPipeline API")
523524
pipeline = openvino_genai.TextEmbeddingPipeline(model_dir, device.upper(), config, **ov_config)

tools/who_what_benchmark/whowhatbench/wwb.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,11 @@ def parse_args():
219219
choices=["left", "right"],
220220
default=None,
221221
help="Side to use for padding 'left' or 'right'. Applicable only for text embeddings")
222+
parser.add_argument(
223+
"--embeds_batch_size",
224+
type=int,
225+
default=None,
226+
help="Batch size value. Applicable only for text embeddings")
222227
parser.add_argument(
223228
"--rag-config",
224229
type=str,
@@ -261,7 +266,6 @@ def parse_args():
261266
default=None,
262267
help="Config option assistant_confidence_threshold for Speculative decoding.",
263268
)
264-
265269
return parser.parse_args()
266270

267271

@@ -635,6 +639,7 @@ def create_evaluator(base_model, args):
635639
pooling_type=args.embeds_pooling_type,
636640
normalize=args.embeds_normalize,
637641
padding_side=args.embeds_padding_side,
642+
batch_size=args.embeds_batch_size,
638643
)
639644
elif task == "text-reranking":
640645
return EvaluatorCLS(
@@ -771,6 +776,7 @@ def main():
771776
kwargs["embeds_pooling"] = args.embeds_pooling_type
772777
kwargs["embeds_normalize"] = args.embeds_normalize
773778
kwargs["embeds_padding_side"] = args.embeds_padding_side
779+
kwargs["embeds_batch_size"] = args.embeds_batch_size
774780

775781
if args.draft_model is not None:
776782
kwargs["draft_model"] = args.draft_model

0 commit comments

Comments
 (0)