|
16 | 16 |
|
17 | 17 | from text_generation_server.utils.speculate import get_speculate, set_speculate |
18 | 18 | from text_generation_server.models.model import Model |
19 | | -from text_generation_server.models.causal_lm import CausalLM |
20 | | -from text_generation_server.models.bloom import BLOOM |
21 | | -from text_generation_server.models.starcoder import StarCoder |
22 | 19 | from text_generation_server.models.custom_modeling.flash_phi_moe_modeling import ( |
23 | 20 | PhiMoEConfig, |
24 | 21 | ) |
|
32 | 29 | from text_generation_server.adapters.lora import LoraWeights |
33 | 30 |
|
34 | 31 | from text_generation_server.utils.log import log_master |
35 | | -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi |
36 | 32 |
|
37 | 33 | __all__ = [ |
38 | 34 | "Model", |
|
42 | 38 | ] |
43 | 39 | from text_generation_server.models.globals import ATTENTION |
44 | 40 |
|
| 41 | +VLM_BATCH_TYPES = set() |
45 | 42 | FLASH_ATT_ERROR_MESSAGE = "{} requires Flash Attention enabled models." |
46 | 43 |
|
47 | 44 | FLASH_ATTENTION = False |
|
63 | 60 | from text_generation_server.models.custom_modeling.flash_llama_modeling import ( |
64 | 61 | FlashLlamaForCausalLM, |
65 | 62 | ) |
| 63 | + from text_generation_server.models.custom_modeling.flash_llama4_modeling import ( |
| 64 | + Llama4ForConditionalGeneration, |
| 65 | + ) |
66 | 66 | from text_generation_server.models.custom_modeling.flash_cohere_modeling import ( |
67 | 67 | FlashCohereForCausalLM, |
68 | 68 | ) |
|
140 | 140 | log_master(logger.warning, f"Could not import Flash Attention enabled models: {e}") |
141 | 141 | SUPPORTS_WINDOWING = False |
142 | 142 | FLASH_ATTENTION = False |
| 143 | + VLM_BATCH_TYPES = set() |
143 | 144 |
|
144 | 145 | if FLASH_ATTENTION: |
145 | 146 | __all__.append(FlashCausalLM) |
146 | 147 |
|
| 148 | + from text_generation_server.models.flash_vlm_causal_lm import ( |
| 149 | + FlashVlmCausalLMBatch, |
| 150 | + ) |
| 151 | + |
| 152 | + VLM_BATCH_TYPES = { |
| 153 | + PaliGemmaBatch, |
| 154 | + FlashVlmCausalLMBatch, |
| 155 | + FlashMllamaCausalLMBatch, |
| 156 | + } |
| 157 | + |
| 158 | + |
| 159 | +__all__.append("VLM_BATCH_TYPES")
| 160 | + |
147 | 161 |
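The hunk above introduces `VLM_BATCH_TYPES` as a registry of batch classes that carry image inputs: it is populated only when the Flash Attention model imports succeed and reset to an empty set otherwise. Below is a minimal sketch of how such a registry is typically consumed; the class names and the `needs_image_pipeline` helper are illustrative stand-ins, not the server's actual API.

```python
# Sketch of a batch-class registry; names are illustrative, not the real server API.

class TextOnlyBatch:
    """Stand-in for a text-only batch such as CausalLMBatch."""


class ImageTextBatch:
    """Stand-in for a multimodal batch such as FlashVlmCausalLMBatch."""


# Mirrors VLM_BATCH_TYPES: only batch classes that accept pixel inputs.
VLM_BATCH_TYPES = {ImageTextBatch}


def needs_image_pipeline(batch_cls: type) -> bool:
    # One membership test replaces a chain of isinstance checks; a new
    # multimodal model only needs to add its batch class to the set.
    return batch_cls in VLM_BATCH_TYPES


assert needs_image_pipeline(ImageTextBatch)
assert not needs_image_pipeline(TextOnlyBatch)
```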
|
148 | 162 | class ModelType(enum.Enum): |
149 | 163 | DEEPSEEK_V2 = { |
@@ -179,6 +193,11 @@ class ModelType(enum.Enum): |
179 | 193 | "name": "Llama", |
180 | 194 | "url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f", |
181 | 195 | } |
| 196 | + LLAMA4 = { |
| 197 | + "type": "llama4", |
| 198 | + "name": "Llama4", |
| 199 | + "url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f", |
| 200 | + } |
182 | 201 | PHI3 = { |
183 | 202 | "type": "phi3", |
184 | 203 | "name": "Phi 3", |
@@ -589,6 +608,19 @@ def get_model( |
589 | 608 | trust_remote_code=trust_remote_code, |
590 | 609 | lora_adapter_ids=lora_adapter_ids, |
591 | 610 | ) |
| 611 | + elif model_type == LLAMA4: |
| 612 | + print(f"Llama4 model detected: {model_id}") |
| 613 | + return FlashVlmCausalLM( |
| 614 | + model_id=model_id, |
| 615 | + model_class=Llama4ForConditionalGeneration, |
| 616 | + revision=revision, |
| 617 | + quantize=quantize, |
| 618 | + speculator=speculator, |
| 619 | + dtype=dtype, |
| 620 | + default_dtype=torch.bfloat16, |
| 621 | + trust_remote_code=trust_remote_code, |
| 622 | + lora_adapter_ids=lora_adapter_ids, |
| 623 | + ) |
592 | 624 | elif model_type == BAICHUAN: |
593 | 625 | return FlashCausalLM( |
594 | 626 | model_id=model_id, |
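For orientation, `get_model` selects a model wrapper by walking an `elif` chain over the config's `model_type`; the new branch above routes `llama4` checkpoints to `FlashVlmCausalLM` with a `bfloat16` default dtype. The sketch below compresses that dispatch shape into a lookup table; the builder functions and `DummyModel` type are placeholders rather than the real constructors, and the real function forwards many more arguments (quantize, speculator, LoRA adapters, and so on).

```python
# Hedged sketch of model_type dispatch; all names here are illustrative only.
from dataclasses import dataclass


@dataclass
class DummyModel:
    model_id: str
    default_dtype: str


def build_llama4(model_id: str) -> DummyModel:
    # Mirrors the new branch: a vision-language wrapper defaulting to bfloat16.
    return DummyModel(model_id=model_id, default_dtype="bfloat16")


def build_llama(model_id: str) -> DummyModel:
    return DummyModel(model_id=model_id, default_dtype="float16")


BUILDERS = {"llama4": build_llama4, "llama": build_llama}


def get_model_sketch(model_type: str, model_id: str) -> DummyModel:
    try:
        return BUILDERS[model_type](model_id)
    except KeyError as err:
        raise ValueError(f"Unsupported model_type: {model_type}") from err


print(get_model_sketch("llama4", "some-org/some-llama4-checkpoint"))
```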
@@ -823,20 +855,32 @@ def get_model( |
823 | 855 | trust_remote_code=trust_remote_code, |
824 | 856 | ) |
825 | 857 |
|
| 858 | + from text_generation_server.models.causal_lm import CausalLM |
826 | 859 | from text_generation_server.models.vlm_causal_lm import VlmCausalLM |
827 | 860 | from text_generation_server.models.custom_modeling.mllama import ( |
828 | 861 | MllamaForConditionalGeneration, |
829 | 862 | ) |
830 | 863 | from text_generation_server.models.custom_modeling.llava_next import ( |
831 | 864 | LlavaNextForConditionalGeneration, |
832 | 865 | ) |
| 866 | + from text_generation_server.models.vlm_causal_lm import ( |
| 867 | + VlmCausalLMBatch, |
| 868 | + ) |
| 869 | + |
| 870 | + VLM_BATCH_TYPES.add(VlmCausalLMBatch) |
| 871 | + |
| 872 | + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi |
833 | 873 |
|
834 | 874 | adapt_transformers_to_gaudi() |
835 | 875 | if SDP_ON_BF16 == 1: |
836 | 876 | torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) |
837 | 877 | if model_type == "gpt_bigcode": |
| 878 | + from text_generation_server.models.starcoder import StarCoder |
| 879 | + |
838 | 880 | return StarCoder(model_id=model_id, revision=revision, dtype=dtype) |
839 | 881 | if model_type == "bloom": |
| 882 | + from text_generation_server.models.bloom import BLOOM |
| 883 | + |
840 | 884 | return BLOOM( |
841 | 885 | model_id=model_id, |
842 | 886 | revision=revision, |
|
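The remaining hunks move the `CausalLM`, `StarCoder`, and `BLOOM` imports, along with `adapt_transformers_to_gaudi`, from module level into the branches of `get_model` that actually use them, so importing `text_generation_server.models` no longer pulls in those dependencies up front. The snippet below sketches that deferred-import pattern with standard-library stand-ins for the heavy modules; it illustrates the idea and is not the server code.

```python
# Deferred imports: the costly module is imported only on the code path that
# needs it, keeping module import time and baseline memory low.

def load_parser(kind: str):
    if kind == "json":
        import json  # cheap stand-in for a heavy optional dependency
        return json.loads
    if kind == "csv":
        import csv
        import io
        return lambda text: list(csv.reader(io.StringIO(text)))
    raise ValueError(f"Unknown parser kind: {kind}")


# Only the json module is imported for this call; the csv branch never runs.
parse = load_parser("json")
print(parse('{"model_type": "gpt_bigcode"}'))
```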