
Commit 7199074

[gaudi] Refine rope memory, do not need to keep sin/cos cache per layer (#3274)
1 parent 238fbd4 commit 7199074

26 files changed: +315 -3525 lines
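
In short, each Gaudi model now builds one rotary embedding (and therefore one sin/cos cache) at the top level and passes that same instance into every decoder layer, instead of letting each attention module construct its own copy. The sketch below only illustrates the sharing pattern; the class names SharedRotary, ToyLayer, and ToyModel are invented for illustration and do not appear in the repository.

# Minimal sketch of the pattern this commit applies (not the actual TGI code):
# build the rotary embedding -- and its cos/sin cache -- once in the model,
# then hand the same instance to every layer instead of re-creating it per layer.
import torch


class SharedRotary(torch.nn.Module):
    """Toy stand-in for a rotary embedding: one cos/sin table for all layers."""

    def __init__(self, dim: int, base: float, max_positions: int, device=None):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device).float() / dim))
        t = torch.arange(max_positions, device=device, dtype=torch.float32)
        freqs = torch.outer(t, inv_freq)
        # Precompute once; layers only index into these tensors.
        self.register_buffer("cos_cached", torch.cos(freqs), persistent=False)
        self.register_buffer("sin_cached", torch.sin(freqs), persistent=False)

    def get_cos_sin(self, position_ids: torch.Tensor):
        return (
            torch.index_select(self.cos_cached, 0, position_ids),
            torch.index_select(self.sin_cached, 0, position_ids),
        )


class ToyLayer(torch.nn.Module):
    def __init__(self, rotary_emb: SharedRotary):
        super().__init__()
        self.rotary_emb = rotary_emb  # shared reference, no per-layer cache


class ToyModel(torch.nn.Module):
    def __init__(self, num_layers: int):
        super().__init__()
        rotary_emb = SharedRotary(dim=128, base=10000.0, max_positions=4096)
        self.layers = torch.nn.ModuleList(ToyLayer(rotary_emb) for _ in range(num_layers))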

backends/gaudi/server/text_generation_server/layers/rotary.py

Lines changed: 10 additions & 7 deletions
@@ -36,7 +36,9 @@ def __init__(self, inv_freq, scaling_factor, max_position_embeddings):
         self._sin_k_cached = None
         self.scaling_factor = scaling_factor
         self.dynamic_args = None
-        self.max_position_embeddings = max_position_embeddings
+        self._update_cos_sin_cache(
+            torch.float32, inv_freq.device, max_position_embeddings
+        )

     def forward(
         self,
@@ -268,9 +270,7 @@ def _update_cos_sin_cache(self, dtype, device, seqlen):
             self._sin_cached = torch.sin(freqs).to(dtype)

     def get_cos_sin(self, position_ids: torch.Tensor):
-        self._update_cos_sin_cache(
-            torch.float32, position_ids.device, seqlen=self.max_position_embeddings
-        )
+
         cos = torch.index_select(self._cos_cached, 0, position_ids)
         sin = torch.index_select(self._sin_cached, 0, position_ids)

@@ -298,6 +298,9 @@ def __init__(
         self._cos_k_cached = None
         self._sin_k_cached = None
         self.dynamic_args = None
+        self._update_cos_sin_cache(
+            torch.float32, short_inv_freq.device, max_position_embeddings
+        )

     def _update_cos_sin_cache(self, dtype, device, seqlen):
         # Reset the tables if the sequence length has changed,
@@ -351,6 +354,9 @@ def __init__(
         self._cos_k_cached = None
         self._sin_k_cached = None
         self.dynamic_args = None
+        self._update_cos_sin_cache(
+            torch.float32, short_inv_freq.device, max_position_embeddings
+        )

     def _update_cos_sin_cache(self, dtype, device, seqlen):
         if (
@@ -592,9 +598,6 @@ def get_cos_sin(
         position_ids: torch.Tensor,
     ):
         slen = position_ids.shape[0]
-        self._update_cos_sin_cache(
-            torch.float32, position_ids.device, seqlen=self.max_position_embeddings
-        )

         cos = self._cos_cached[position_ids].gather(1, self._sections[:slen])
         sin = self._sin_cached[position_ids].gather(1, self._sections[:slen])
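
With the cos/sin tables now filled once in __init__ for the full max_position_embeddings, get_cos_sin reduces to a pure table lookup. The snippet below is a stand-alone illustration of that lookup under assumed shapes; only the torch.index_select call mirrors the diff, the tensor names and sizes are invented.

import torch

max_position_embeddings, half_dim = 4096, 64
cos_cached = torch.rand(max_position_embeddings, half_dim)  # built once in __init__
sin_cached = torch.rand(max_position_embeddings, half_dim)

position_ids = torch.tensor([0, 1, 2, 7, 7, 9])          # one position id per token
cos = torch.index_select(cos_cached, 0, position_ids)    # (num_tokens, half_dim)
sin = torch.index_select(sin_cached, 0, position_ids)
assert cos.shape == (position_ids.numel(), half_dim)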

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py

Lines changed: 14 additions & 8 deletions
@@ -160,18 +160,14 @@ def __init__(
         prefix: str,
         config,
         weights,
+        rotary_emb,
     ):
         super().__init__()
         self.num_heads = config.num_attention_heads
         self.hidden_size = config.hidden_size
         self.head_size = self.hidden_size // self.num_heads

-        self.rotary_emb = CohereRotary.static(
-            config=config,
-            dim=self.head_size,
-            base=config.rope_theta,
-            device=weights.device,
-        )
+        self.rotary_emb = rotary_emb

         self.softmax_scale = self.head_size**-0.5

@@ -325,11 +321,14 @@ def forward(self, hidden_states):


 class FlashCohereLayer(nn.Module):
-    def __init__(self, prefix: str, layer_id, config, weights):
+    def __init__(self, prefix: str, layer_id, config, weights, rotary_emb):
         super().__init__()
         prefix = f"{prefix}.layers.{layer_id}"
         self.self_attn = FlashCohereAttention(
-            prefix=f"{prefix}.self_attn", config=config, weights=weights
+            prefix=f"{prefix}.self_attn",
+            config=config,
+            weights=weights,
+            rotary_emb=rotary_emb,
         )
         self.mlp = CohereMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

@@ -385,13 +384,20 @@ def __init__(self, prefix: str, config, weights):
         self.embed_tokens = TensorParallelEmbedding(
             prefix=f"{prefix}.embed_tokens", weights=weights
         )
+        rotary_emb = CohereRotary.static(
+            config=config,
+            dim=config.hidden_size // config.num_attention_heads,
+            base=config.rope_theta,
+            device=weights.device,
+        )
         self.layers = nn.ModuleList(
             [
                 FlashCohereLayer(
                     prefix,
                     layer_id,
                     config,
                     weights,
+                    rotary_emb,
                 )
                 for layer_id in range(config.num_hidden_layers)
             ]

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py

Lines changed: 19 additions & 9 deletions
@@ -263,19 +263,15 @@ def __init__(
         prefix: str,
         config,
         weights,
+        rotary_emb,
     ):
         super().__init__()
         self.clip_qkv = config.attn_config.clip_qkv
         self.num_heads = config.n_heads
         self.hidden_size = config.d_model
         self.head_size = self.hidden_size // self.num_heads

-        self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config,
-            dim=self.head_size,
-            base=config.attn_config.rope_theta,
-            device=weights.device,
-        )
+        self.rotary_emb = rotary_emb

         self.softmax_scale = self.head_size**-0.5

@@ -370,13 +366,17 @@ def __init__(
         prefix: str,
         config,
         weights,
+        rotary_emb,
     ):
         super().__init__()
         self.norm_1 = FastLayerNorm.load_no_bias(
             prefix=f"{prefix}.norm_1", weights=weights, eps=1e-5
         )
         self.self_attn = DbrxAttention(
-            prefix=f"{prefix}.attn", config=config, weights=weights
+            prefix=f"{prefix}.attn",
+            config=config,
+            weights=weights,
+            rotary_emb=rotary_emb,
         )
         self.norm_2 = FastLayerNorm.load_no_bias(
             prefix=f"{prefix}.norm_2",
@@ -601,12 +601,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:


 class DbrxLayer(nn.Module):
-    def __init__(self, prefix: str, layer_id, config, weights):
+    def __init__(self, prefix: str, layer_id, config, weights, rotary_emb):
         super().__init__()
         prefix = f"{prefix}.blocks.{layer_id}"

         self.attn = DbrxNormAttentionNorm(
-            prefix=f"{prefix}.norm_attn_norm", config=config, weights=weights
+            prefix=f"{prefix}.norm_attn_norm",
+            config=config,
+            weights=weights,
+            rotary_emb=rotary_emb,
         )

         moe_cls = BlockSparseMoE if config.quantize is None else DenseMoE
@@ -649,6 +652,12 @@ def __init__(self, prefix: str, config, weights):
         self.embed_tokens = TensorParallelEmbedding(
             prefix=f"{prefix}.wte", weights=weights
         )
+        rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=config.d_model // config.n_heads,
+            base=config.attn_config.rope_theta,
+            device=weights.device,
+        )

         self.layers = nn.ModuleList(
             [
@@ -657,6 +666,7 @@ def __init__(self, prefix: str, config, weights):
                     layer_id,
                     config,
                     weights,
+                    rotary_emb,
                 )
                 for layer_id in range(config.n_layers)
             ]

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py

Lines changed: 11 additions & 8 deletions
@@ -156,6 +156,7 @@ def __init__(
         prefix: str,
         config,
         weights: Weights,
+        rotary_emb,
     ):
         super().__init__()
         self.num_heads = config.num_attention_heads
@@ -167,13 +168,7 @@ def __init__(
         self.head_size = config.qk_nope_head_dim + config.qk_rope_head_dim
         self.value_head_size = config.v_head_dim
         self.head_pad_size = max(self.head_size, self.value_head_size)
-
-        self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config,
-            dim=self.qk_rope_head_dim,
-            base=config.rope_theta,
-            device=weights.device,
-        )
+        self.rotary_emb = rotary_emb

         mscale = get_mscale(
             self.rotary_emb.scaling_factor, self.rotary_emb.mscale_all_dim
@@ -459,14 +454,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:


 class DeepseekV2Layer(nn.Module):
-    def __init__(self, prefix, layer_id, config, weights):
+    def __init__(self, prefix, layer_id, config, weights, rotary_emb):
         super().__init__()
         prefix = f"{prefix}.layers.{layer_id}"

         self.self_attn = DeepseekV2Attention(
             prefix=f"{prefix}.self_attn",
             config=config,
             weights=weights,
+            rotary_emb=rotary_emb,
         )

         if (
@@ -541,13 +537,20 @@ def __init__(self, prefix: str, config, weights: Weights):
             prefix=f"{prefix}.embed_tokens", weights=weights
         )

+        rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=config.qk_rope_head_dim,
+            base=config.rope_theta,
+            device=weights.device,
+        )
         self.layers = nn.ModuleList(
             [
                 DeepseekV2Layer(
                     prefix,
                     layer_id,
                     config,
                     weights,
+                    rotary_emb,
                 )
                 for layer_id in range(config.num_hidden_layers)
             ]

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py

Lines changed: 11 additions & 8 deletions
@@ -169,6 +169,7 @@ def __init__(
         prefix: str,
         config,
         weights: Weights,
+        rotary_emb,
     ):
         super().__init__()
         self.num_heads = config.num_attention_heads
@@ -180,13 +181,7 @@ def __init__(
         self.head_size = config.qk_nope_head_dim + config.qk_rope_head_dim
         self.value_head_size = config.v_head_dim
         self.head_pad_size = max(self.head_size, self.value_head_size)
-
-        self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config,
-            dim=self.qk_rope_head_dim,
-            base=config.rope_theta,
-            device=weights.device,
-        )
+        self.rotary_emb = rotary_emb

         mscale = get_mscale(
             self.rotary_emb.scaling_factor, self.rotary_emb.mscale_all_dim
@@ -535,14 +530,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:


 class DeepseekV3Layer(nn.Module):
-    def __init__(self, prefix, layer_id, config, weights):
+    def __init__(self, prefix, layer_id, config, weights, rotary_emb):
         super().__init__()
         prefix = f"{prefix}.layers.{layer_id}"

         self.self_attn = DeepseekV3Attention(
             prefix=f"{prefix}.self_attn",
             config=config,
             weights=weights,
+            rotary_emb=rotary_emb,
         )

         if (
@@ -616,6 +612,12 @@ def __init__(self, prefix: str, config, weights: Weights):
         self.embed_tokens = TensorParallelEmbedding(
             prefix=f"{prefix}.embed_tokens", weights=weights
         )
+        rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=config.qk_rope_head_dim,
+            base=config.rope_theta,
+            device=weights.device,
+        )

         self.layers = nn.ModuleList(
             [
@@ -624,6 +626,7 @@ def __init__(self, prefix: str, config, weights: Weights):
                     layer_id,
                     config,
                     weights,
+                    rotary_emb,
                 )
                 for layer_id in range(config.num_hidden_layers)
             ]

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py

Lines changed: 26 additions & 9 deletions
@@ -166,7 +166,14 @@ def _load_gqa(config, prefix: str, weights):

 class FlashGemma2Attention(torch.nn.Module):
     def __init__(
-        self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool
+        self,
+        prefix: str,
+        config,
+        weights,
+        layer_id,
+        causal: bool,
+        is_sliding: bool,
+        rotary_emb,
     ):
         super().__init__()
         self.num_heads = config.num_attention_heads
@@ -176,13 +183,7 @@ def __init__(
             self.window_size = config.sliding_window
         else:
             self.window_size = -1
-
-        self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config,
-            dim=self.head_size,
-            base=config.rope_theta,
-            device=weights.device,
-        )
+        self.rotary_emb = rotary_emb

         # self.softmax_scale = self.head_size**-0.5
         self.softmax_scale = config.query_pre_attn_scalar**-0.5
@@ -354,7 +355,14 @@ def forward(self, hidden_states, adapter_data):

 class FlashGemma2Layer(nn.Module):
     def __init__(
-        self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool
+        self,
+        prefix: str,
+        config,
+        weights,
+        layer_id,
+        causal: bool,
+        is_sliding: bool,
+        rotary_emb,
     ):
         super().__init__()
         self.self_attn = FlashGemma2Attention(
@@ -364,6 +372,7 @@ def __init__(
             layer_id=layer_id,
             causal=causal,
             is_sliding=is_sliding,
+            rotary_emb=rotary_emb,
         )
         self.mlp = Gemma2MLP(
             prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id
@@ -435,6 +444,13 @@ def __init__(self, prefix: str, config, weights, causal: bool):
         process_group = weights.process_group
         self.tp_rank = process_group.rank()
         self.tp_world_size = process_group.size()
+        rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=config.head_dim,
+            base=config.rope_theta,
+            device=weights.device,
+        )
+
         self.layers = nn.ModuleList(
             [
                 FlashGemma2Layer(
@@ -444,6 +460,7 @@ def __init__(self, prefix: str, config, weights, causal: bool):
                     layer_id=layer_id,
                     causal=causal,
                     is_sliding=layer_id % 2 == 0,
+                    rotary_emb=rotary_emb,
                 )
                 for layer_id in range(config.num_hidden_layers)
             ]
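
The payoff is easiest to see with a rough size estimate. The figures below (layer count, context length, rotary dimension) are assumed for illustration and are not taken from the commit:

# Back-of-the-envelope estimate of the memory this saves (illustrative numbers,
# not measured): with per-layer caches, every decoder layer kept its own float32
# cos and sin tables of shape (max_position_embeddings, rotary_dim // 2).
num_layers = 32                    # e.g. a Llama-7B-sized model
max_position_embeddings = 131072
rotary_dim = 128                   # head size
bytes_per_cache = 2 * max_position_embeddings * (rotary_dim // 2) * 4  # cos + sin, fp32
total_before = num_layers * bytes_per_cache
total_after = bytes_per_cache      # one shared cache for the whole model
print(f"before: {total_before / 2**20:.0f} MiB, after: {total_after / 2**20:.0f} MiB")
# before: 2048 MiB, after: 64 MiB

Whatever the exact model dimensions, the saving scales linearly with the number of decoder layers, since only one cache remains.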
