@@ -67,6 +67,10 @@
 from text_generation_server.models.custom_modeling.flash_gemma2_modeling import (
     FlashGemma2ForCausalLM,
 )
+from text_generation_server.models.custom_modeling.flash_gemma3_modeling import (
+    Gemma3ForConditionalGeneration,
+    FlashGemma3ForCausalLM,
+)
 from text_generation_server.models.custom_modeling.flash_dbrx_modeling import (
     FlashDbrxForCausalLM,
     DbrxConfig,
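In this file the flash-attention model imports (including the new Gemma3 ones) sit inside a try/except guard that is outside the hunk shown; if any import fails, the flash code paths are disabled. A minimal sketch of that pattern, assuming the surrounding structure rather than quoting it:

```python
# Sketch only: the real guard in __init__.py wraps many imports and logs
# the failure; the import matches the hunk above, the control flow is assumed.
FLASH_ATTENTION = True
try:
    from text_generation_server.models.custom_modeling.flash_gemma3_modeling import (
        Gemma3ForConditionalGeneration,
        FlashGemma3ForCausalLM,
    )
except ImportError:
    # Without a working flash-attention stack the Gemma3 classes are
    # unavailable and non-flash fallbacks are used instead.
    FLASH_ATTENTION = False
```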
@@ -220,6 +224,16 @@ class ModelType(enum.Enum):
         "name": "Gemma2",
         "url": "https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315",
     }
+    GEMMA3 = {
+        "type": "gemma3",
+        "name": "Gemma3",
+        "url": "https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d",
+    }
+    GEMMA3_TEXT = {
+        "type": "gemma3_text",
+        "name": "Gemma3 Text",
+        "url": "https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d",
+    }
     COHERE = {
         "type": "cohere",
         "name": "Cohere",
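The enum entries above are not compared directly in `get_model`; elsewhere in this file each member's `"type"` string is exported as a module-level constant under the member's name, so `GEMMA3` in the dispatch below is the plain string `"gemma3"`. A condensed sketch of that mechanism (paraphrased, not quoted):

```python
import enum

class ModelType(enum.Enum):
    GEMMA3 = {"type": "gemma3", "name": "Gemma3"}
    GEMMA3_TEXT = {"type": "gemma3_text", "name": "Gemma3 Text"}

# Export each member's "type" string under the member's name, so the
# dispatch in get_model can compare `model_type == GEMMA3` against the
# `model_type` field read from a checkpoint's config.json.
__GLOBALS = locals()
for member in ModelType:
    __GLOBALS[member.name] = member.value["type"]

assert GEMMA3 == "gemma3" and GEMMA3_TEXT == "gemma3_text"  # noqa: F821
```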
@@ -630,6 +644,7 @@ def get_model(
             quantize=quantize,
             speculator=speculator,
             dtype=dtype,
+            kv_cache_dtype=kv_cache_dtype,
             default_dtype=torch.bfloat16,
             trust_remote_code=trust_remote_code,
             lora_adapter_ids=lora_adapter_ids,
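This hunk, and the matching ones below, threads the new `kv_cache_dtype` argument into each model constructor, letting the KV cache use a lower-precision dtype than the weights, typically FP8. A hedged sketch of the idea; the lookup table and helper name are hypothetical, not this file's actual code:

```python
from typing import Optional

import torch

# Hypothetical lookup table; TGI exposes FP8 KV-cache options along these
# lines, but the exact accepted strings are not shown in this diff.
KV_CACHE_DTYPES = {
    "fp8_e4m3fn": torch.float8_e4m3fn,
    "fp8_e5m2": torch.float8_e5m2,
}

def resolve_kv_cache_dtype(
    kv_cache_dtype: Optional[str], model_dtype: torch.dtype
) -> torch.dtype:
    """Fall back to the model dtype when no override is requested."""
    if kv_cache_dtype is None:
        return model_dtype
    return KV_CACHE_DTYPES[kv_cache_dtype]
```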
@@ -675,6 +690,34 @@ def get_model(
             trust_remote_code=trust_remote_code,
             lora_adapter_ids=lora_adapter_ids,
         )
+    elif model_type == GEMMA3:
+        return FlashVlmCausalLM(
+            model_id=model_id,
+            model_class=Gemma3ForConditionalGeneration,
+            revision=revision,
+            quantize=quantize,
+            speculator=speculator,
+            dtype=dtype,
+            kv_cache_dtype=kv_cache_dtype,
+            default_dtype=torch.bfloat16,
+            trust_remote_code=trust_remote_code,
+            lora_adapter_ids=lora_adapter_ids,
+            support_chunking=False,
+        )
+    elif model_type == GEMMA3_TEXT:
+        return FlashCausalLM(
+            model_id=model_id,
+            model_class=FlashGemma3ForCausalLM,
+            revision=revision,
+            quantize=quantize,
+            speculator=speculator,
+            dtype=dtype,
+            kv_cache_dtype=kv_cache_dtype,
+            # Works better for these models
+            default_dtype=torch.bfloat16,
+            trust_remote_code=trust_remote_code,
+            lora_adapter_ids=lora_adapter_ids,
+        )
     elif model_type == COHERE:
         return FlashCausalLM(
             model_id=model_id,
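The split mirrors the Gemma 3 release: multimodal checkpoints report `model_type` `"gemma3"` and take the vision-language path, while the text-only checkpoint reports `"gemma3_text"` and reuses the standard flash path. The multimodal branch also sets `support_chunking=False`, presumably so a prefill is never split in the middle of an image's token span. A quick check of which branch a given checkpoint would take (checkpoint ids assumed from the public release):

```python
from transformers import AutoConfig

for model_id in ("google/gemma-3-4b-it", "google/gemma-3-1b-it"):
    model_type = AutoConfig.from_pretrained(model_id).model_type
    branch = "FlashVlmCausalLM" if model_type == "gemma3" else "FlashCausalLM"
    print(f"{model_id}: model_type={model_type} -> {branch}")
```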
@@ -864,6 +907,7 @@ def get_model(
             quantize=quantize,
             speculator=speculator,
             dtype=dtype,
+            kv_cache_dtype=kv_cache_dtype,
             default_dtype=torch.bfloat16,
             trust_remote_code=trust_remote_code,
             lora_adapter_ids=lora_adapter_ids,