Commit e84e073

fix: revert cast to cpu in MsgpackEncoder._encode_tensor to avoid hidden performance regressions (vllm-project#25738)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Parent: 3edf87d

2 files changed: +6 -1 lines changed

vllm/inputs/preprocess.py

Lines changed: 5 additions & 0 deletions
@@ -278,6 +278,11 @@ def _process_embeds(
             raise ValueError(
                 "prompt_embeds must be of shape (seq_len, hidden_size).")
 
+        # Tensors must be on CPU for serialization between processes
+        # in the MsgpackEncoder. Casting to CPU here ensures that there is no
+        # hidden device transfer in the critical path of generation.
+        prompt_embeds = prompt_embeds.cpu()
+
         return embeds_inputs(prompt_embeds=prompt_embeds,
                              cache_salt=parsed_content.get("cache_salt"))
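
The added cast makes the device-to-host transfer explicit at input-preprocessing time instead of leaving it buried inside the encoder. A minimal sketch of the intended flow, assuming the caller supplies prompt embeddings on a GPU (the shape and device below are illustrative, not taken from the patch):

import torch

# Illustrative shape/device only.
device = "cuda" if torch.cuda.is_available() else "cpu"
prompt_embeds = torch.randn(16, 4096, device=device)

# With this change, the transfer happens here, during input preprocessing,
# before the request crosses the process boundary.
prompt_embeds = prompt_embeds.cpu()

# Downstream, the encoder can view the storage as raw bytes and hand it to
# numpy without triggering a hidden device-to-host copy.
arr = prompt_embeds.flatten().contiguous().view(torch.uint8).numpy()
assert arr.nbytes == prompt_embeds.nbytes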

vllm/v1/serial_utils.py

Lines changed: 1 addition & 1 deletion
@@ -208,7 +208,7 @@ def _encode_tensor(
     ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
         assert self.aux_buffers is not None
         # view the tensor as a contiguous 1D array of bytes
-        arr = obj.flatten().contiguous().cpu().view(torch.uint8).numpy()
+        arr = obj.flatten().contiguous().view(torch.uint8).numpy()
         if obj.nbytes < self.size_threshold:
             # Smaller tensors are encoded inline, just like ndarrays.
             data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)
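
With the .cpu() call removed, _encode_tensor now assumes its input already lives on the host: Tensor.numpy() raises a TypeError for CUDA tensors, so a GPU tensor reaching the encoder fails loudly rather than silently paying for a transfer on the hot path. A rough, standalone sketch of that behaviour (encode_bytes is a hypothetical helper, not the vLLM encoder itself):

import torch

def encode_bytes(obj: torch.Tensor) -> memoryview:
    # Mirrors the patched line: no implicit .cpu() cast before numpy().
    arr = obj.flatten().contiguous().view(torch.uint8).numpy()
    return arr.data

cpu_tensor = torch.randn(4, 8)
print(len(encode_bytes(cpu_tensor)))  # 128 bytes: 4 * 8 float32 elements

if torch.cuda.is_available():
    gpu_tensor = torch.randn(4, 8, device="cuda")
    try:
        encode_bytes(gpu_tensor)
    except TypeError:
        # numpy() cannot consume CUDA tensors; callers must cast to CPU first.
        print("caller must move the tensor to CPU before encoding")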
