Skip to content

Commit 2eb9154

Browse files
authored
Suggest HELION_AUTOTUNE_PRECOMPILE=spawn when an illegal memory access (IMA) happens (#984)
1 parent fc69870 commit 2eb9154

File tree

3 files changed, with 37 additions and 4 deletions.

helion/autotuner/base_search.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from .logger import LambdaLogger
4848
from .logger import classify_triton_exception
4949
from .logger import format_triton_compile_failure
50+
from .logger import match_unrecoverable_runtime_error
5051
from .progress_bar import iter_with_progress
5152

5253
if TYPE_CHECKING:
@@ -300,6 +301,14 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
300301
self.best_perf_so_far = res
301302
return res
302303
except Exception as e:
304+
if match_unrecoverable_runtime_error(e):
305+
raise exc.TritonUnrecoverableRuntimeError(
306+
reason=str(e),
307+
decorator=self.kernel.format_kernel_decorator(
308+
config, self.settings
309+
),
310+
error=f"{type(e).__qualname__}: {e}",
311+
) from e
303312
action = classify_triton_exception(e)
304313
if self.settings.autotune_ignore_errors:
305314
pass

helion/autotuner/logger.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,15 +121,30 @@ def format_triton_compile_failure(
121121
"triton.compiler.errors.CompilationError", # Triton CompilationError
122122
"out of resource: shared memory", # Triton shared memory OOM
123123
"ZE_RESULT_ERROR_INVALID_KERNEL_NAME", # Level Zero compile failed
124-
"an illegal memory access was encountered", # workaround triton bugs
125-
"misaligned address", # workaround triton bugs
126-
"unspecified launch failure", # workaround ptxas bugs
127124
"exceeds triton maximum tensor numel", # needs smaller config
128125
],
129126
)
130127
)
131128
)
132129

130+
_UNRECOVERABLE_RUNTIME_ERROR_RE: re.Pattern[str] = re.compile(
131+
"|".join(
132+
map(
133+
re.escape,
134+
[
135+
"illegal memory access",
136+
"misaligned address",
137+
"unspecified launch failure",
138+
],
139+
)
140+
),
141+
re.IGNORECASE,
142+
)
143+
144+
145+
def match_unrecoverable_runtime_error(err: BaseException) -> bool:
146+
return bool(_UNRECOVERABLE_RUNTIME_ERROR_RE.search(str(err)))
147+
133148

134149
def classify_triton_exception(err: BaseException) -> Literal["raise", "warn", "debug"]:
135150
"""
@@ -150,6 +165,6 @@ def classify_triton_exception(err: BaseException) -> Literal["raise", "warn", "d
150165
msg = str(err)
151166
if "PassManager::run failed" in msg:
152167
return "warn"
153-
if _EXPECTED_TRITON_ERRORS_RE.search(msg):
168+
if _EXPECTED_TRITON_ERRORS_RE.search(msg) or match_unrecoverable_runtime_error(err):
154169
return "debug"
155170
return "raise"

helion/exc.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,15 @@ class TritonError(BaseError):
345345
Set autotune_ignore_errors=True or HELION_AUTOTUNE_IGNORE_ERRORS=1 to ignore Triton errors in autotuning."""
346346

347347

348+
class TritonUnrecoverableRuntimeError(BaseError):
349+
message = """\
350+
An unrecoverable Triton runtime error occurred: {reason}.
351+
This likely indicates a bug in Triton and cannot be recovered from.
352+
{decorator}
353+
Original error: {error}
354+
Set HELION_AUTOTUNE_PRECOMPILE="spawn" to isolate these errors in a subprocess so tuning can continue."""
355+
356+
348357
class BaseWarning(_FixedMessage):
349358
message = "A warning occurred."
350359

0 commit comments

Comments
 (0)