Commit 5699050

Add Settings.persistent_reserved_sms (#1129)
1 parent fe970d6 commit 5699050

File tree

5 files changed: +60 −7 lines changed

- docs/api/settings.md
- helion/_compiler/program_id.py
- helion/runtime/__init__.py
- helion/runtime/settings.py
- test/test_persistent_kernels.py

docs/api/settings.md

Lines changed: 6 additions & 0 deletions
````diff
@@ -92,6 +92,11 @@ def my_kernel(x: torch.Tensor) -> torch.Tensor:
 
 When enabled, tensor shapes are treated as compile-time constants for optimization. Default is ``True``.
 Set ``HELION_STATIC_SHAPES=0`` to override the default if you need a compiled kernel instance to serve many shape variants.
+
+.. autoattribute:: Settings.persistent_reserved_sms
+
+Reserve this many streaming multiprocessors when launching persistent kernels. Default is ``0`` (use all SMs).
+Configure globally with ``HELION_PERSISTENT_RESERVED_SMS`` or per-kernel via ``@helion.kernel(..., persistent_reserved_sms=N)``.
 ```
 
 ### Autotuning Settings
@@ -251,6 +256,7 @@ Built-in values for ``HELION_AUTOTUNER`` include ``"PatternSearch"``, ``"Differe
 | ``TRITON_F32_DEFAULT`` | ``dot_precision`` | Sets default floating-point precision for Triton dot products (``"tf32"``, ``"tf32x3"``, ``"ieee"``). |
 | ``HELION_INDEX_DTYPE`` | ``index_dtype`` | Choose the default index dtype (accepts any ``torch.<dtype>`` name, e.g. ``int64``). |
 | ``HELION_STATIC_SHAPES`` | ``static_shapes`` | Set to ``0``/``false`` to disable global static shape specialization. |
+| ``HELION_PERSISTENT_RESERVED_SMS`` | ``persistent_reserved_sms`` | Reserve this many streaming multiprocessors when launching persistent kernels (``0`` uses all available SMs). |
 | ``HELION_FORCE_AUTOTUNE`` | ``force_autotune`` | Force the autotuner to run even when explicit configs are provided. |
 | ``HELION_DISALLOW_AUTOTUNING`` | ``check_autotuning_disabled`` | Hard-disable autotuning; kernels must supply explicit configs when this is ``1``. |
 | ``HELION_AUTOTUNE_COMPILE_TIMEOUT`` | ``autotune_compile_timeout`` | Maximum seconds to wait for Triton compilation during autotuning. |
````
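
For orientation, a minimal usage sketch of the new setting (the kernel body, the reserved count of 2, and the name `copy_kernel` are illustrative, not from this commit):

```python
import torch

import helion
import helion.language as hl


# Keep 2 SMs free for concurrent work (e.g. communication kernels);
# the persistent kernel's grid shrinks by the reserved amount.
@helion.kernel(persistent_reserved_sms=2)
def copy_kernel(x: torch.Tensor) -> torch.Tensor:
    out = x.new_empty(x.size())
    for tile in hl.tile(x.size()):
        out[tile] = x[tile]
    return out
```

Setting `HELION_PERSISTENT_RESERVED_SMS=2` instead applies the same reservation process-wide.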

helion/_compiler/program_id.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -420,9 +420,11 @@ def __init__(self, is_blocked: bool = False) -> None:
             "step": NUM_SM_VAR,
         }
         if device_function.constexpr_arg(NUM_SM_VAR):
+            reserved_sms = CompileEnvironment.current().settings.persistent_reserved_sms
+            reserved_arg = f", reserved_sms={reserved_sms}" if reserved_sms > 0 else ""
             device_function.codegen.host_statements.append(
                 statement_from_string(
-                    f"{NUM_SM_VAR} = helion.runtime.get_num_sm({self.get_device_str()})"
+                    f"{NUM_SM_VAR} = helion.runtime.get_num_sm({self.get_device_str()}{reserved_arg})"
                 )
             )
 
```
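
The keyword argument is only emitted when a reservation is actually requested, so generated host code is unchanged at the default of ``0``. With `persistent_reserved_sms=3`, the emitted statement looks roughly like the line below (`_NUM_SM` is an assumed stand-in for whatever `NUM_SM_VAR` expands to; the test added below asserts only that the `reserved_sms=3` substring appears):

```python
# Sketch of the generated host statement when persistent_reserved_sms=3;
# `_NUM_SM` and `x` are illustrative names, not the exact generated ones.
_NUM_SM = helion.runtime.get_num_sm(x.device, reserved_sms=3)
```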

helion/runtime/__init__.py

Lines changed: 19 additions & 6 deletions
```diff
@@ -38,27 +38,40 @@ def set_triton_allocator() -> None:
     set_allocator(_alloc_fn)
 
 
-def get_num_sm(device: torch.device) -> int:
+def get_num_sm(device: torch.device, *, reserved_sms: int = 0) -> int:
     """
     Get the number of streaming multiprocessors (SMs) for the specified device.
 
     Args:
         device: Device to query.
+        reserved_sms: Number of SMs to keep free for other work (e.g., communication
+            kernels). Defaults to 0, meaning all device SMs are available to Helion.
 
     Returns:
-        Grid size to use for a persistent kernel on the device.
+        Grid size to use for a persistent kernel on the device after accounting
+        for any reserved SMs. Always at least 1.
     """
     assert device.type in ["cuda", "xpu", "cpu"], "TODO: implement for other devices"
+    available_sms: int
     if device.type == "cpu":
         try:
             num_threads = int(torch.get_num_threads())
         except Exception:
             num_threads = 0
-        return num_threads if num_threads > 0 else int(os.cpu_count() or 1)
-    if device.type == "cuda":
-        return torch.cuda.get_device_properties(device.index).multi_processor_count
+        available_sms = num_threads if num_threads > 0 else int(os.cpu_count() or 1)
+    elif device.type == "cuda":
+        available_sms = torch.cuda.get_device_properties(
+            device.index
+        ).multi_processor_count
     # TODO(EikanWang): gpu_subslice_count is an out-of-date term; we should update it to the XeCore count.
-    return torch.xpu.get_device_properties(device.index).gpu_subslice_count
+    elif device.type == "xpu":
+        available_sms = torch.xpu.get_device_properties(device.index).gpu_subslice_count
+    else:
+        raise AssertionError("TODO: implement for other devices")
+
+    if reserved_sms <= 0:
+        return available_sms
+    return max(available_sms - reserved_sms, 1)
 
 
 def default_launcher(
```
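
The clamp to at least 1 guarantees a persistent kernel always gets a non-empty grid, even if the reservation exceeds the device's SM count. A quick sketch of the arithmetic (the SM count of 132 is illustrative, matching an H100):

```python
import torch

from helion.runtime import get_num_sm

device = torch.device("cuda", 0)

get_num_sm(device)                     # e.g. 132: all SMs, same behavior as before
get_num_sm(device, reserved_sms=8)     # 132 - 8 = 124 persistent programs
get_num_sm(device, reserved_sms=1000)  # clamped to 1, never 0
```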

helion/runtime/settings.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -278,6 +278,13 @@ class _Settings:
     static_shapes: bool = dataclasses.field(
         default_factory=functools.partial(_env_get_bool, "HELION_STATIC_SHAPES", True)
     )
+    persistent_reserved_sms: int = dataclasses.field(
+        default_factory=functools.partial(
+            _env_get_int,
+            "HELION_PERSISTENT_RESERVED_SMS",
+            0,
+        )
+    )
     autotune_log_level: int = dataclasses.field(default_factory=_get_autotune_log_level)
     autotune_log: str | None = dataclasses.field(default_factory=_get_autotune_log_path)
     autotune_compile_timeout: int = dataclasses.field(
@@ -401,6 +408,10 @@ class Settings(_Settings):
         "If True, use static shapes for all tensors. This is a performance optimization. "
         "Set HELION_STATIC_SHAPES=0 to disable."
     ),
+    "persistent_reserved_sms": (
+        "Number of streaming multiprocessors to reserve when launching persistent kernels. "
+        "Set HELION_PERSISTENT_RESERVED_SMS=N (default 0) or pass persistent_reserved_sms=N to helion.kernel."
+    ),
     "autotune_log_level": (
         "Log level for autotuning using Python logging levels. Default is logging.INFO. "
         "Use HELION_AUTOTUNE_LOG_LEVEL to override or set 0 to disable output."
```

test/test_persistent_kernels.py

Lines changed: 21 additions & 0 deletions
```diff
@@ -604,6 +604,27 @@ def simple_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         self.assertIn("helion.runtime.get_num_sm(", code_interleaved)
         self.assertIn("for virtual_pid in tl.range", code_interleaved)
 
+    def test_persistent_reserved_sms_setting_applies(self):
+        """Ensure persistent_reserved_sms is threaded into host code for persistent kernels."""
+
+        @helion.kernel(autotune_effort="none", persistent_reserved_sms=3)
+        def reserved_kernel(x: torch.Tensor) -> torch.Tensor:
+            out = x.new_empty(x.size())
+            for tile in hl.tile(x.size(), block_size=[32, 16]):
+                out[tile] = x[tile]
+            return out
+
+        (x,) = (torch.randn([32, 32], device=DEVICE),)
+
+        code_reserved, result_reserved = code_and_output(
+            reserved_kernel,
+            (x,),
+            pid_type="persistent_blocked",
+        )
+
+        torch.testing.assert_close(result_reserved, x)
+        self.assertIn("reserved_sms=3", code_reserved)
+
     def test_multi_loop_persistent_with_shared_program_id(self):
         """Test that multi-loop persistent kernels with ForEachProgramID work correctly.
```

0 commit comments