Commit e84e073

fix: revert cast to cpu in MsgpackEncoder._encode_tensor to avoid hidden performance regressions (vllm-project#25738)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Parent: 3edf87d

2 files changed: +6 -1 lines changed

vllm/inputs/preprocess.py

Lines changed: 5 additions & 0 deletions
@@ -278,6 +278,11 @@ def _process_embeds(
             raise ValueError(
                 "prompt_embeds must be of shape (seq_len, hidden_size).")
 
+        # Tensors must be on CPU for serialization between processes
+        # in the MsgpackEncoder. Casting to CPU here ensures that there is no
+        # hidden device transfer in the critical path of generation.
+        prompt_embeds = prompt_embeds.cpu()
+
         return embeds_inputs(prompt_embeds=prompt_embeds,
                              cache_salt=parsed_content.get("cache_salt"))
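
The added cast makes the device-to-host transfer explicit at input-preprocessing time instead of leaving it buried inside the encoder. A minimal sketch of the intended flow, assuming the caller supplies prompt embeddings on a GPU (the shape and device below are illustrative, not taken from the patch):

import torch

# Illustrative shape/device only.
device = "cuda" if torch.cuda.is_available() else "cpu"
prompt_embeds = torch.randn(16, 4096, device=device)

# With this change, the transfer happens here, during input preprocessing,
# before the request crosses the process boundary.
prompt_embeds = prompt_embeds.cpu()

# Downstream, the encoder can view the storage as raw bytes and hand it to
# numpy without triggering a hidden device-to-host copy.
arr = prompt_embeds.flatten().contiguous().view(torch.uint8).numpy()
assert arr.nbytes == prompt_embeds.nbytes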

vllm/v1/serial_utils.py

Lines changed: 1 addition & 1 deletion
@@ -208,7 +208,7 @@ def _encode_tensor(
     ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
         assert self.aux_buffers is not None
         # view the tensor as a contiguous 1D array of bytes
-        arr = obj.flatten().contiguous().cpu().view(torch.uint8).numpy()
+        arr = obj.flatten().contiguous().view(torch.uint8).numpy()
         if obj.nbytes < self.size_threshold:
             # Smaller tensors are encoded inline, just like ndarrays.
             data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)
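
With the .cpu() call removed, _encode_tensor now assumes its input already lives on the host: Tensor.numpy() raises a TypeError for CUDA tensors, so a GPU tensor reaching the encoder fails loudly rather than silently paying for a transfer on the hot path. A rough, standalone sketch of that behaviour (encode_bytes is a hypothetical helper, not the vLLM encoder itself):

import torch

def encode_bytes(obj: torch.Tensor) -> memoryview:
    # Mirrors the patched line: no implicit .cpu() cast before numpy().
    arr = obj.flatten().contiguous().view(torch.uint8).numpy()
    return arr.data

cpu_tensor = torch.randn(4, 8)
print(len(encode_bytes(cpu_tensor)))  # 128 bytes: 4 * 8 float32 elements

if torch.cuda.is_available():
    gpu_tensor = torch.randn(4, 8, device="cuda")
    try:
        encode_bytes(gpu_tensor)
    except TypeError:
        # numpy() cannot consume CUDA tensors; callers must cast to CPU first.
        print("caller must move the tensor to CPU before encoding")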
