5 changes: 2 additions & 3 deletions tests/full_tests/ci_gsm8k_tests.sh
@@ -13,12 +13,11 @@ echo $VLLM_GAUDI_PREFIX
# Gemma3 with image input
run_gemma3_test() {
    echo "➡️ Testing gemma-3-4b-it..."
-   #VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-4b-it.yaml"
+   VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-4b-it.yaml"
    echo "✅ Test with multimodal-support with gemma-3-4b-it passed."
    echo "➡️ Testing gemma-3-4b-it with multiple images(applying sliding_window)..."
-   #VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm_multi.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-27b-it.yaml"
+   VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm_multi.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-27b-it.yaml"
    echo "✅ Test with multimodal-support with multiple images gemma-3-27b-it passed."
-   #Test cases are commented because of PR27772
}

# Basic model test
3 changes: 2 additions & 1 deletion vllm_gaudi/__init__.py
@@ -20,9 +20,10 @@ def register_ops():
    import vllm_gaudi.ops.hpu_gptq  # noqa: F401
    import vllm_gaudi.ops.hpu_awq  # noqa: F401
    import vllm_gaudi.ops.hpu_multihead_attn  # noqa: F401
+   import vllm_gaudi.ops.hpu_conv  # noqa: F401


def register_models():
    import vllm_gaudi.models.utils  # noqa: F401
+   import vllm_gaudi.models.interfaces  # noqa: F401
    from .models import register_model
    register_model()
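
Both registrations work purely by import side effect: importing vllm_gaudi.ops.hpu_conv executes the @Conv2dLayer.register_oot class decorator in that module, and importing vllm_gaudi.models.interfaces applies the monkey-patch shown further below. A minimal generic sketch of this register-by-import pattern (hypothetical names; not vLLM's actual register_oot implementation):

# Generic sketch of register-by-import (hypothetical; vLLM's real
# register_oot machinery may differ in detail).
class Layer:
    _oot_cls = None  # out-of-tree override, if any

    @classmethod
    def register_oot(cls, impl):
        cls._oot_cls = impl  # remember the override
        return impl          # leave the decorated class unchanged


@Layer.register_oot  # runs as a side effect of importing this module
class HPULayer(Layer):
    pass


assert Layer._oot_cls is HPULayer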
41 changes: 41 additions & 0 deletions vllm_gaudi/models/interfaces.py
@@ -0,0 +1,41 @@
from collections.abc import Callable
import torch
from torch import Tensor
from vllm.model_executor.models.interfaces import SupportsMultiModal


def _embed_text_input_ids(
    self,
    input_ids: Tensor,
    embed_input_ids: Callable[[Tensor], Tensor],
    *,
    is_multimodal: Tensor | None,
    handle_oov_mm_token: bool,
) -> Tensor:
    if handle_oov_mm_token and is_multimodal is not None:
        is_text = ~is_multimodal

        # The original implementation uses dynamic indexing:
        #
        #   text_embeds = embed_input_ids(input_ids[is_text])
        #   return torch.empty(
        #       (input_ids.shape[0], text_embeds.shape[1]),
        #       dtype=text_embeds.dtype,
        #       device=text_embeds.device,
        #   ).masked_scatter_(is_text.unsqueeze_(-1), text_embeds)
        #
        # Replace it with a fixed-shape computation for HPU: embed every
        # token, then keep only the text positions.
        all_text_embeds = embed_input_ids(input_ids)
        result = torch.zeros_like(all_text_embeds)

        return torch.where(
            is_text.unsqueeze(-1),  # [num_tokens, 1]
            all_text_embeds,        # [num_tokens, embed_dim]
            result,                 # [num_tokens, embed_dim]
        )

    return embed_input_ids(input_ids)


# Monkey-patch the upstream mixin so every multimodal model picks up the
# HPU-friendly implementation.
SupportsMultiModal._embed_text_input_ids = _embed_text_input_ids
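
For intuition, a small self-contained sketch (toy shapes, illustrative only, not part of the PR) showing that the fixed-shape torch.where path agrees with the original masked_scatter_ path at all text positions; the multimodal positions differ only in their placeholder values (zeros vs. uninitialized memory), which the caller is expected to overwrite with image embeddings afterwards:

import torch

# Toy sizes, purely for illustration.
num_tokens, embed_dim = 6, 4
input_ids = torch.arange(num_tokens)
is_multimodal = torch.tensor([False, True, True, False, False, True])
embed = torch.nn.Embedding(num_tokens, embed_dim)

is_text = ~is_multimodal

# Original dynamic-shape path: gather text ids, scatter embeddings back.
text_embeds = embed(input_ids[is_text])
ref = torch.empty(num_tokens, embed_dim).masked_scatter_(
    is_text.unsqueeze(-1), text_embeds)

# Fixed-shape path from the patch: embed everything, mask with where.
out = torch.where(is_text.unsqueeze(-1), embed(input_ids),
                  torch.zeros(num_tokens, embed_dim))

# Text rows agree exactly; multimodal rows are placeholders in both.
assert torch.equal(out[is_text], ref[is_text])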
35 changes: 35 additions & 0 deletions vllm_gaudi/ops/hpu_conv.py
@@ -0,0 +1,35 @@
import torch
import torch.nn.functional as F
from vllm.model_executor.layers.conv import Conv2dLayer


@Conv2dLayer.register_oot
class HPUConv2dLayer(Conv2dLayer):

    def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
        assert x.dim() == 4
[Review comment from a contributor on the assert above]
Suggested change:
-        assert x.dim() == 4
+        assert x.dim() == 4, f"Expected NCHW, got {x.shape}"
slightly clearer version
        B, C, H, W = x.shape
        K1, K2 = self.kernel_size
        H, W = H // K1, W // K2

        # NOTE: HPU doesn't support unfold, so the unfold-based version is
        # implemented with view/permute/reshape instead:
        # x = x.unfold(2, K1, K1).unfold(3, K2, K2)
        # x = x.permute(0, 2, 3, 1, 4, 5).reshape(-1, self.input_size)
        x = x.view(B, C, H, K1, W, K2)
        x = x.permute(0, 2, 4, 1, 3, 5).reshape(-1, self.input_size)  # [B*H*W, C*K1*K2]

        x = F.linear(
            x,
            self.weight.view(self.out_channels, self.input_size),
            self.bias,
        )
        x = x.view(B, H, W, self.out_channels).permute(0, 3, 1, 2)
        return x

    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
        """Expected input shape: (batch_size, in_channels, height, width)"""
        assert x.dim() == 4
        if self.enable_linear:
            return self._forward_mulmat(x)
        else:
            return self._forward_conv(x)
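
As a sanity check of the view/permute/reshape replacement for unfold, a standalone sketch (plain PyTorch, toy sizes, not part of the PR) comparing it against F.conv2d with stride equal to kernel size, the non-overlapping patchify case this linear path assumes:

import torch
import torch.nn.functional as F

# Toy NCHW input and a patchify conv (stride == kernel size), e.g. a
# ViT-style patch embedding. Sizes are arbitrary illustrations.
B, C, H, W = 2, 3, 8, 8
K1, K2 = 4, 4
out_channels = 5
x = torch.randn(B, C, H, W)
weight = torch.randn(out_channels, C, K1, K2)
bias = torch.randn(out_channels)

ref = F.conv2d(x, weight, bias, stride=(K1, K2))

# The same computation via view/permute/reshape + linear, mirroring
# _forward_mulmat above:
Ho, Wo = H // K1, W // K2
patches = (x.view(B, C, Ho, K1, Wo, K2)
            .permute(0, 2, 4, 1, 3, 5)
            .reshape(-1, C * K1 * K2))  # [B*Ho*Wo, C*K1*K2]
out = F.linear(patches, weight.view(out_channels, -1), bias)
out = out.view(B, Ho, Wo, out_channels).permute(0, 3, 1, 2)

assert torch.allclose(ref, out, atol=1e-5)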