Add truncate arg to yarn to match openai implementation of gpt-oss (vllm-project#28244)

ashors1 · heheda12345 · kitaekatt · commit 018940f301b2 · 2025-12-01T11:35:02.000-06:00
Signed-off-by: ashors1 <ashors@nvidia.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py
@@ -197,6 +197,7 @@ def get_rope(
                     "beta_fast",
                     "beta_slow",
                     "apply_yarn_scaling",
+                    "truncate",
                 )
             }
             if "mrope_section" in rope_parameters:
diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py
@@ -117,13 +117,13 @@ def yarn_find_correction_range(
     dim: int,
     base: float = 10000,
     max_position_embeddings: int = 2048,
-) -> tuple[int, int]:
-    low = math.floor(
-        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
-    )
-    high = math.ceil(
-        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
-    )
+    truncate: bool = True,
+) -> tuple[float | int, float | int]:
+    low = yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
+    high = yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
+    if truncate:
+        low = math.floor(low)
+        high = math.ceil(high)
     return max(low, 0), min(high, dim - 1)  # Clamp values just in case
 
 
diff --git a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py
@@ -28,12 +28,14 @@ def __init__(
         beta_fast: int = 32,
         beta_slow: int = 1,
         apply_yarn_scaling: bool = True,
+        truncate: bool = True,
     ) -> None:
         self.scaling_factor = scaling_factor
         self.extrapolation_factor = extrapolation_factor
         self.attn_factor = attn_factor
         self.beta_fast = beta_fast
         self.beta_slow = beta_slow
+        self.truncate = truncate
         # Get n-d magnitude scaling corrected for interpolation
         self.mscale = (
             float(yarn_get_mscale(self.scaling_factor) * attn_factor)
@@ -57,6 +59,7 @@ def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
             self.rotary_dim,
             self.base,
             self.max_position_embeddings,
+            self.truncate,
         )
         # Get n-d rotational scaling corrected for extrapolation
         inv_freq_mask = (
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
@@ -78,6 +78,7 @@ def __init__(
                 ],
                 "beta_fast": config.rope_parameters["beta_fast"],
                 "beta_slow": config.rope_parameters["beta_slow"],
+                "truncate": config.rope_parameters.get("truncate", True),
             },
             is_neox_style=True,
         )

Original file line number	Diff line number	Diff line change
`@@ -197,6 +197,7 @@ def get_rope(`
`197`	`197`	`"beta_fast",`
`198`	`198`	`"beta_slow",`
`199`	`199`	`"apply_yarn_scaling",`
	`200`	`+ "truncate",`
`200`	`201`	`)`
`201`	`202`	`}`
`202`	`203`	`if "mrope_section" in rope_parameters:`
Original file line number	Diff line number	Diff line change
`@@ -78,6 +78,7 @@ def __init__(`
`78`	`78`	`],`
`79`	`79`	`"beta_fast": config.rope_parameters["beta_fast"],`
`80`	`80`	`"beta_slow": config.rope_parameters["beta_slow"],`
	`81`	`+ "truncate": config.rope_parameters.get("truncate", True),`
`81`	`82`	`},`
`82`	`83`	`is_neox_style=True,`
`83`	`84`	`)`