Commit fb933d8

jiminha authored and mhelf-intel committed
Fix compile error for Gemma3 multimodal inputs (vllm-project#671)

Due to the latest changes from upstream (vllm-project/vllm#27772, vllm-project/vllm#28842), gemma3 fails to compile on HPU:
- replace unfold with view/reshape
- replace the text-embedding path to avoid dynamic shapes
- remove the merge_multimodal replacement, since the masked_scatter issue is fixed
- re-enable the gemma3 model test

---------

Signed-off-by: Jimin Ha <jimin.ha@intel.com>
1 parent 6fc04ba commit fb933d8
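
Background for the dynamic-shape items above, as a minimal CPU-runnable sketch (not code from this commit; the token id 900 and the sizes are made up): boolean indexing yields a result whose shape depends on tensor values, which is what trips up HPU graph compilation, while a torch.where over the full tensor keeps every shape static.

import torch

ids = torch.tensor([[11, 12, 900, 13]])     # pretend 900 is an image token
is_multimodal = ids == 900
is_text = ~is_multimodal

# Value-dependent shape: the size of the result changes with the mask contents.
dynamic = ids[is_text]
print(dynamic.shape)                         # torch.Size([3]) here, different for other inputs

# Value-independent shape: the output layout matches the input regardless of the mask.
embeds = torch.nn.functional.one_hot(ids, num_classes=1024).float()   # stand-in for an embedding lookup
static = torch.where(is_text.unsqueeze(-1), embeds, torch.zeros_like(embeds))
print(static.shape)                          # always torch.Size([1, 4, 1024])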

File tree

4 files changed: +80 -4 lines changed

tests/full_tests/ci_gsm8k_tests.sh

Lines changed: 2 additions & 3 deletions

@@ -13,12 +13,11 @@ echo $VLLM_GAUDI_PREFIX
 # Gemma3 with image input
 run_gemma3_test() {
     echo "➡️ Testing gemma-3-4b-it..."
-    #VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-4b-it.yaml"
+    VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-4b-it.yaml"
     echo "✅ Test with multimodal-support with gemma-3-4b-it passed."
     echo "➡️ Testing gemma-3-4b-it with multiple images(applying sliding_window)..."
-    #VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm_multi.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-27b-it.yaml"
+    VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm_multi.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-27b-it.yaml"
     echo "✅ Test with multimodal-support with multiple images gemma-3-27b-it passed."
-    #Test cases are commented because of PR27772
 }

 # Basic model test

vllm_gaudi/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -20,9 +20,10 @@ def register_ops():
     import vllm_gaudi.ops.hpu_gptq  # noqa: F401
     import vllm_gaudi.ops.hpu_awq  # noqa: F401
     import vllm_gaudi.ops.hpu_multihead_attn  # noqa: F401
+    import vllm_gaudi.ops.hpu_conv  # noqa: F401


 def register_models():
-    import vllm_gaudi.models.utils  # noqa: F401
+    import vllm_gaudi.models.interfaces  # noqa: F401
     from .models import register_model
     register_model()
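
The two new imports matter only for their side effects: importing vllm_gaudi.ops.hpu_conv runs the @Conv2dLayer.register_oot decorator, and importing vllm_gaudi.models.interfaces executes the module-level assignment that patches SupportsMultiModal._embed_text_input_ids. A minimal, framework-free sketch of that import-for-side-effect pattern (class and function names below are made up, not vLLM code):

class Upstream:
    def embed(self, x):
        return f"upstream({x})"

# In a hypothetical hpu_patch module: merely importing the module applies the override,
# which is why register_models() only needs the bare import statement.
def _hpu_embed(self, x):
    return f"hpu({x})"

Upstream.embed = _hpu_embed

print(Upstream().embed("ids"))   # -> hpu(ids)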

vllm_gaudi/models/interfaces.py

Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
+from collections.abc import Callable
+import torch
+from torch import Tensor
+from vllm.model_executor.models.interfaces import SupportsMultiModal
+
+
+def _embed_text_input_ids(
+    self,
+    input_ids: Tensor,
+    embed_input_ids: Callable[[Tensor], Tensor],
+    *,
+    is_multimodal: Tensor | None,
+    handle_oov_mm_token: bool,
+) -> Tensor:
+    if handle_oov_mm_token and is_multimodal is not None:
+        is_text = ~is_multimodal
+
+        # The original implementation uses dynamic indexing.
+        # Replace it with a fixed-shape computation for HPU, then fill in the text positions.
+        '''
+        text_embeds = embed_input_ids(input_ids[is_text])
+
+        return torch.empty(
+            (input_ids.shape[0], text_embeds.shape[1]),
+            dtype=text_embeds.dtype,
+            device=text_embeds.device,
+        ).masked_scatter_(is_text.unsqueeze_(-1), text_embeds)
+        '''
+        all_text_embeds = embed_input_ids(input_ids)
+        result = torch.zeros_like(all_text_embeds)
+
+        return torch.where(
+            is_text.unsqueeze(-1),  # [batch, seq_len, 1]
+            all_text_embeds,        # [batch, seq_len, embed_dim]
+            result,                 # [batch, seq_len, embed_dim]
+        )
+
+    return embed_input_ids(input_ids)
+
+
+SupportsMultiModal._embed_text_input_ids = _embed_text_input_ids
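
As a standalone sanity check (not part of the commit; the embedding table and token ids below are made up), the torch.where formulation above can be compared on CPU against the commented-out masked-scatter formulation. Zeros are used for the reference buffer so that the non-text rows, which the original left uninitialized via torch.empty, compare equal:

import torch

torch.manual_seed(0)
table = torch.randn(32, 8)                        # stand-in embedding table

def embed(ids):                                   # stand-in for embed_input_ids
    return table[ids]

input_ids = torch.tensor([3, 7, 7, 5, 1])
is_multimodal = torch.tensor([False, True, True, False, False])
is_text = ~is_multimodal

# Original (dynamic-shape) path: embed only the text tokens, scatter them back.
text_embeds = embed(input_ids[is_text])
ref = torch.zeros(input_ids.shape[0], text_embeds.shape[1]).masked_scatter_(
    is_text.unsqueeze(-1), text_embeds)

# HPU-friendly (static-shape) path: embed everything, zero out the non-text positions.
all_text_embeds = embed(input_ids)
out = torch.where(is_text.unsqueeze(-1), all_text_embeds, torch.zeros_like(all_text_embeds))

assert torch.allclose(ref, out)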

vllm_gaudi/ops/hpu_conv.py

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+import torch
+import torch.nn.functional as F
+from vllm.model_executor.layers.conv import Conv2dLayer
+
+
+@Conv2dLayer.register_oot
+class HPUConv2dLayer(Conv2dLayer):
+
+    def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
+        assert x.dim() == 4
+        B, C, H, W = x.shape
+        K1, K2 = self.kernel_size
+        H, W = H // K1, W // K2
+
+        # NOTE: HPU doesn't support unfold; extract the patches with view/reshape instead.
+        # x = x.unfold(2, K1, K1).unfold(3, K2, K2)
+        # x = x.permute(0, 2, 3, 1, 4, 5).reshape(-1, self.input_size)
+        x = x.view(B, C, H, K1, W, K2)
+        x = x.permute(0, 2, 4, 1, 3, 5).reshape(-1, self.input_size)  # [B*H*W, C*K1*K2]
+
+        x = F.linear(
+            x,
+            self.weight.view(self.out_channels, self.input_size),
+            self.bias,
+        )
+        x = x.view(B, H, W, self.out_channels).permute(0, 3, 1, 2)
+        return x
+
+    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
+        """Expected input shape: (batch_size, in_channels, height, width)"""
+        assert x.dim() == 4
+        if self.enable_linear:
+            return self._forward_mulmat(x)
+        else:
+            return self._forward_conv(x)