Added dynamic-shape 0/1 bucketing: "zero_nonzero" env var #1053
base: main
Changes from all commits
a51afd4
a81fb12
eb13f0c
7723054
ad805e8
20234fd
```python
@@ -188,7 +188,10 @@ def _compute_baseline(self) -> tuple[object, bool, Sequence[object] | None]:
                baseline_config,
                prefix=f"Generated Triton code for {decorator}:",
            )
<<<<<<< HEAD
            self.kernel.maybe_log_repro(self.log.error, new_args, baseline_config)
=======
>>>>>>> 69f3405 (Add `settings.autotune_baseline_fn` to allow passing in custom baseline function to autotuner (#1054))
            raise exc.InvalidConfig(
                "Default config failed while computing baseline.\n"
                f"Default config: {decorator}\n"
```

Contributor: Whoops?

Author: let me clear this up.
```python
@@ -397,6 +397,13 @@ def configs(self) -> list[Config]:

    def format_kernel_decorator(self, config: Config, settings: Settings) -> str:
        """Return the @helion.kernel decorator snippet capturing configs and settings that influence Triton code generation."""
        # Include shape_bucketing only when non-default to keep logs compact
        if getattr(settings, "shape_bucketing", "min2") != "min2":
            return (
                f"@helion.kernel(config={config.__repr__()}, "
                f"static_shapes={settings.static_shapes}, "
                f"shape_bucketing='{settings.shape_bucketing}')"
            )
        return f"@helion.kernel(config={config.__repr__()}, static_shapes={settings.static_shapes})"

    def to_triton_code(
```

Contributor: Why getattr?
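For illustration, here is a standalone sketch of the same branching, runnable without Helion (the config repr strings are made up for the example):

```python
def format_decorator(
    config_repr: str, static_shapes: bool, shape_bucketing: str = "min2"
) -> str:
    # Mirrors the logic above: mention shape_bucketing only when it is non-default.
    if shape_bucketing != "min2":
        return (
            f"@helion.kernel(config={config_repr}, "
            f"static_shapes={static_shapes}, "
            f"shape_bucketing='{shape_bucketing}')"
        )
    return f"@helion.kernel(config={config_repr}, static_shapes={static_shapes})"


print(format_decorator("Config(block_sizes=[32], num_warps=4)", False))
print(format_decorator("Config(block_sizes=[32], num_warps=4)", False, "zero_nonzero"))
```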
```diff
@@ -830,11 +837,15 @@ def _tensor_key(fn: Kernel, obj: torch.Tensor) -> Hashable:
             (*obj.size(),),
             (*obj.stride(),),
         )
+    # Non-static path: bucket sizes for specialization. Default is 0/1/>=2 (as 2).
+    vals = tuple([min(s, 2) for s in obj.size()])
+    if getattr(fn.settings, "shape_bucketing", "min2") == "zero_nonzero":
+        # Keep zero distinct; unify 1 with >=2 to reduce variant churn
+        vals = tuple(0 if v == 0 else 2 for v in vals)
     return (
         obj.dtype,
         obj.device.type,
-        # 0, 1, or >=2 specialization
-        tuple([min(s, 2) for s in obj.size()]),
+        vals,
     )
```

Contributor: Same
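To make the two policies concrete, here is a small standalone sketch of how a size tuple is bucketed on the non-static path (plain Python; the helper name is mine, not part of the PR):

```python
from typing import Literal


def bucket_sizes(
    sizes: tuple[int, ...], policy: Literal["min2", "zero_nonzero"] = "min2"
) -> tuple[int, ...]:
    # Default "min2": 0 stays 0, 1 stays 1, anything >=2 collapses to 2.
    vals = tuple(min(s, 2) for s in sizes)
    if policy == "zero_nonzero":
        # Keep 0 distinct; fold 1 into the >=2 bucket so size-1 dims reuse a variant.
        vals = tuple(0 if v == 0 else 2 for v in vals)
    return vals


assert bucket_sizes((0, 1, 7)) == (0, 1, 2)
assert bucket_sizes((0, 1, 7), "zero_nonzero") == (0, 2, 2)
# Under "zero_nonzero", a (1, 128) and a (2, 128) tensor share one specialization key.
assert bucket_sizes((1, 128), "zero_nonzero") == bucket_sizes((2, 128), "zero_nonzero")
```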
```python
@@ -232,6 +232,21 @@ def _get_autotune_random_seed() -> int:
    return int(time.time() * 1000) % 2**32


def _get_shape_bucketing() -> Literal["min2", "zero_nonzero"]:
    val = _env_get_literal(
        "HELION_SHAPE_BUCKETING",
        "min2",
        mapping={
            "min2": "min2",
            "zero_nonzero": "zero_nonzero",
        },
    )
    # Narrow to Literal explicitly
    if val == "zero_nonzero":
        return "zero_nonzero"
    return "min2"


def _get_ref_mode() -> RefMode:
    interpret = _env_get_bool("HELION_INTERPRET", False)
    return RefMode.EAGER if interpret else RefMode.OFF
```
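A hedged usage sketch of the environment override; the variable is read when a Settings object is constructed (via the default_factory below), so it must be set before that happens:

```python
import os

# Equivalent to: HELION_SHAPE_BUCKETING=zero_nonzero python my_script.py
# Values other than "min2" or "zero_nonzero" are not in the mapping above, so they
# would not select a policy (exact fallback behavior depends on _env_get_literal).
os.environ["HELION_SHAPE_BUCKETING"] = "zero_nonzero"
```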
```python
@@ -347,6 +362,12 @@ class _Settings:
            _env_get_bool, "HELION_DEBUG_DTYPE_ASSERTS", False
        )
    )
    # Controls non-static shape specialization bucketing. When "min2" (default),
    # we bucket dynamic sizes per-dimension into 0, 1, or >=2 (represented as 2).
    # When "zero_nonzero", we keep 0 distinct and unify 1 with >=2 to reduce churn.
    shape_bucketing: Literal["min2", "zero_nonzero"] = dataclasses.field(
        default_factory=_get_shape_bucketing
    )
    ref_mode: RefMode = dataclasses.field(default_factory=_get_ref_mode)
    autotuner_fn: AutotunerFunction = default_autotuner_fn
    autotune_baseline_fn: Callable[..., object] | None = None
```

Comment on lines +365 to +370:

Contributor: After some thought, perhaps instead of adding a new config we should make static_shapes an enum of "all", "ones", "none", since setting static_shapes=True makes this new option do nothing. We will need backcompat for True/False, but that might result in a cleaner config.

Author: Okay, so I was thinking we can do something like this: to keep backcompat for True/False, we can map True -> "all" and False -> "none", and then
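To illustrate the direction discussed in the thread above, a minimal hypothetical sketch of an enum-style static_shapes with True/False backcompat (names and placement are mine, not part of this PR):

```python
from typing import Literal, Union

StaticShapesMode = Literal["all", "ones", "none"]


def _normalize_static_shapes(value: Union[bool, StaticShapesMode]) -> StaticShapesMode:
    # Backcompat: map the legacy boolean onto the proposed enum.
    if value is True:
        return "all"
    if value is False:
        return "none"
    if value not in ("all", "ones", "none"):
        raise ValueError(f"invalid static_shapes value: {value!r}")
    return value


assert _normalize_static_shapes(True) == "all"
assert _normalize_static_shapes(False) == "none"
assert _normalize_static_shapes("ones") == "ones"
```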
```python
@@ -401,6 +422,12 @@ class Settings(_Settings):
        ),
        "allow_warp_specialize": "If True, allow warp specialization for tl.range calls on CUDA devices.",
        "debug_dtype_asserts": "If True, emit tl.static_assert checks for dtype after each device node.",
        "shape_bucketing": (
            "Dynamic-shape specialization policy when static_shapes=False. "
            "'min2' buckets each dimension into 0,1,>=2 (current behavior). "
            "'zero_nonzero' keeps 0 distinct and unifies 1 with >=2 to reduce variants. "
            "Override with HELION_SHAPE_BUCKETING=min2|zero_nonzero."
        ),
        "ref_mode": "Reference mode for kernel execution. Can be RefMode.OFF or RefMode.EAGER.",
        "autotuner_fn": (
            "Function to create an autotuner. "
```
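A hedged per-kernel usage sketch, mirroring the decorator form that format_kernel_decorator logs above; the kernel body and names are illustrative, and shape_bucketing only has an effect on the non-static (static_shapes=False) path:

```python
import torch
import helion
import helion.language as hl


@helion.kernel(static_shapes=False, shape_bucketing="zero_nonzero")
def add_one(x: torch.Tensor, out: torch.Tensor) -> None:
    # With "zero_nonzero", calls where a dimension is 1 and calls where it is >=2
    # hash to the same specialization key, so they reuse one compiled variant.
    for i in hl.grid(x.size(0)):
        out[i] = x[i] + 1.0
```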
```python
@@ -597,7 +597,10 @@ def wrong_fn(*fn_args, **fn_kwargs):
        run_mode("fork", expect_error=False)
        run_mode("spawn", expect_error=True)

<<<<<<< HEAD
    @skipIfCpu("fails on Triton CPU backend")
=======
>>>>>>> 69f3405 (Add `settings.autotune_baseline_fn` to allow passing in custom baseline function to autotuner (#1054))
    def test_autotune_baseline_fn(self) -> None:
        """Test that custom baseline function is used for accuracy checking."""
        config1 = helion.Config(block_sizes=[32], num_warps=4)
```

Contributor: Whoops?

```python
@@ -638,7 +641,10 @@ def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        # Verify the result is correct
        torch.testing.assert_close(result, args[0] + args[1])

<<<<<<< HEAD
    @skipIfCpu("fails on Triton CPU backend")
=======
>>>>>>> 69f3405 (Add `settings.autotune_baseline_fn` to allow passing in custom baseline function to autotuner (#1054))
    def test_autotune_baseline_fn_filters_bad_config(self) -> None:
        """Test that custom baseline function correctly filters incorrect configs."""
        bad_config = helion.Config(block_sizes=[1], num_warps=8)
```

```python
@@ -737,7 +743,10 @@ def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        ):
            add(*args)

<<<<<<< HEAD
    @skipIfCpu("fails on Triton CPU backend")
=======
>>>>>>> 69f3405 (Add `settings.autotune_baseline_fn` to allow passing in custom baseline function to autotuner (#1054))
    def test_max_generations(self):
        """Autotuner max generation respects explicit kwargs then setting override."""
```
```
@@ -0,0 +1,62 @@
This file is automatically generated by assertExpectedJournal calls in test_shape_bucketing.py.
Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environment variable set.

--- assertExpectedJournal(TestShapeBucketing.test_zero_nonzero_codegen_identical_m1_vs_m2)
from __future__ import annotations

import torch
import triton
import triton.language as tl
from helion.runtime import default_launcher as _default_launcher

import test.test_shape_bucketing as _source_module

@triton.jit
def _helion_pw_add_fn(x, out, x_size_1, out_stride_0, out_stride_1, x_stride_0, x_stride_1):
    # src[test_shape_bucketing.py:N]: for i in grid(x.size(0)):
    pid_0 = tl.program_id(0)
    offset_0 = pid_0
    # src[test_shape_bucketing.py:N]: for j in grid(x.size(1)):
    # src[test_shape_bucketing.py:N]: out[i, j] = x[i, j] + 1.0
    for offset_1 in tl.range(0, x_size_1):
        # src[test_shape_bucketing.py:N]: out[i, j] = x[i, j] + 1.0
        load = tl.load(x + (offset_0 * x_stride_0 + offset_1 * x_stride_1), None)
        v_0 = 1.0
        v_1 = load + v_0
        tl.store(out + (offset_0 * out_stride_0 + offset_1 * out_stride_1), v_1, None)

def pw_add_fn(x: torch.Tensor, out: torch.Tensor, *, _launcher=_default_launcher):
    # src[test_shape_bucketing.py:N]: for i in grid(x.size(0)):
    # src[test_shape_bucketing.py:N]: for j in grid(x.size(1)):
    # src[test_shape_bucketing.py:N]: out[i, j] = x[i, j] + 1.0
    _launcher(_helion_pw_add_fn, (x.size(0),), x, out, x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), num_warps=4, num_stages=1)

--- assertExpectedJournal(TestShapeBucketing.test_zero_nonzero_codegen_identical_m1_vs_m2)
from __future__ import annotations

import torch
import triton
import triton.language as tl
from helion.runtime import default_launcher as _default_launcher

import test.test_shape_bucketing as _source_module

@triton.jit
def _helion_pw_add_fn(x, out, x_size_1, out_stride_0, out_stride_1, x_stride_0, x_stride_1):
    # src[test_shape_bucketing.py:N]: for i in grid(x.size(0)):
    pid_0 = tl.program_id(0)
    offset_0 = pid_0
    # src[test_shape_bucketing.py:N]: for j in grid(x.size(1)):
    # src[test_shape_bucketing.py:N]: out[i, j] = x[i, j] + 1.0
    for offset_1 in tl.range(0, x_size_1):
        # src[test_shape_bucketing.py:N]: out[i, j] = x[i, j] + 1.0
        load = tl.load(x + (offset_0 * x_stride_0 + offset_1 * x_stride_1), None)
        v_0 = 1.0
        v_1 = load + v_0
        tl.store(out + (offset_0 * out_stride_0 + offset_1 * out_stride_1), v_1, None)

def pw_add_fn(x: torch.Tensor, out: torch.Tensor, *, _launcher=_default_launcher):
    # src[test_shape_bucketing.py:N]: for i in grid(x.size(0)):
    # src[test_shape_bucketing.py:N]: for j in grid(x.size(1)):
    # src[test_shape_bucketing.py:N]: out[i, j] = x[i, j] + 1.0
    _launcher(_helion_pw_add_fn, (x.size(0),), x, out, x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), num_warps=4, num_stages=1)
```
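The two journal entries above are identical, which is what the test name asserts: with shape_bucketing="zero_nonzero", the M=1 and M=2 inputs produce the same generated Triton code. For context, a hedged reconstruction of the kernel source implied by the src[...] comments; the real test_shape_bucketing.py is not part of this diff, so the decorator arguments and imports are assumptions:

```python
import torch
import helion
from helion.language import grid


# Reconstructed from the src[...] comments in the journal; the exact config and
# settings the test pins (beyond shape_bucketing="zero_nonzero") are assumptions.
@helion.kernel(static_shapes=False, shape_bucketing="zero_nonzero")
def pw_add_fn(x: torch.Tensor, out: torch.Tensor) -> None:
    for i in grid(x.size(0)):
        for j in grid(x.size(1)):
            out[i, j] = x[i, j] + 1.0
```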
Contributor: You shouldn't need getattr here.

Author: yeah, I was just trying to be on the safe side. Did the same in `_kernel_type` but it's really not necessary.