From ff591451e821d47ec7daafe6c36491d0793d7433 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Mon, 13 Oct 2025 12:44:40 -0400
Subject: [PATCH 1/4] defaults

Signed-off-by: Kyle Sayers
---
 .../quantization/quant_args.py   | 43 -------------------
 .../quantization/quant_scheme.py | 34 +++++++++++++++
 2 files changed, 34 insertions(+), 43 deletions(-)

diff --git a/src/compressed_tensors/quantization/quant_args.py b/src/compressed_tensors/quantization/quant_args.py
index d9a92d0d..3a3c93a4 100644
--- a/src/compressed_tensors/quantization/quant_args.py
+++ b/src/compressed_tensors/quantization/quant_args.py
@@ -263,8 +263,6 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
         block_structure = model.block_structure
         actorder = model.actorder
         dynamic = model.dynamic
-        observer = model.observer
-        dynamic = model.dynamic
 
         # infer strategy
         if strategy is None:
@@ -316,45 +314,8 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
                 "activation ordering"
             )
 
-        # infer observer w.r.t. dynamic
-        if dynamic:
-            supported_strategies = (
-                QuantizationStrategy.TOKEN,
-                QuantizationStrategy.TENSOR,
-                QuantizationStrategy.TENSOR_GROUP,
-                QuantizationStrategy.GROUP,
-            )
-            if strategy not in supported_strategies:
-                raise ValueError(
-                    f"One of {supported_strategies} must be used for dynamic quant."
-                )
-
-            if (
-                dynamic == DynamicType.LOCAL
-                and strategy != QuantizationStrategy.TENSOR_GROUP
-            ):
-                raise ValueError("local is only supported for strategy tensor_group")
-
-            if observer is not None:
-                if dynamic is True:  # checking if dynamic is True, not "local"
-                    if (
-                        observer != "memoryless"
-                    ):  # avoid annoying users with old configs
-                        warnings.warn(
-                            "No observer is used for dynamic quant., setting to None"
-                        )
-                    observer = None
-            else:
-                if dynamic == DynamicType.LOCAL:
-                    observer = "minmax"
-
-        elif observer is None:
-            # default to minmax for non-dynamic cases
-            observer = "minmax"
-
         # write back modified values
         model.strategy = strategy
-        model.observer = observer
 
         return model
 
@@ -373,10 +334,6 @@ def pytorch_dtype(self) -> torch.dtype:
         else:
             raise ValueError(f"Invalid quantization type {self.type}")
 
-    @deprecated("QuantizationArgs.observer")
-    def get_observer(self) -> str:
-        return self.observer
-
     model_config = ConfigDict(extra="forbid")
 
diff --git a/src/compressed_tensors/quantization/quant_scheme.py b/src/compressed_tensors/quantization/quant_scheme.py
index 79db8d28..af703b41 100644
--- a/src/compressed_tensors/quantization/quant_scheme.py
+++ b/src/compressed_tensors/quantization/quant_scheme.py
@@ -59,6 +59,7 @@ def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
         weights = model.weights
         format = model.format
 
+        # validate input args
         if inputs is not None:
             if inputs.strategy not in (
                 QuantizationStrategy.TOKEN,
@@ -84,15 +85,21 @@ def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
             if inputs.actorder is not None:
                 raise ValueError("Cannot apply actorder to input activations")
 
+            if inputs.observer is None:
+                inputs.observer
+
+        # validate output args
         if outputs is not None:
             if outputs.actorder is not None:
                 raise ValueError("Cannot apply actorder to output activations")
 
+        # validate format
         if format == CompressionFormat.mixed_precision.value:
             raise ValueError(
                 "mixed-precision cannot be set as a format for a QuantizationScheme"
             )
 
+        # validate matching group sizes
         if (
             inputs
             and weights
@@ -110,8 +117,35 @@ def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
                 stacklevel=2,
             )
 
+        # set observer defaults
+        model._validate_observers()
+
         return model
 
+    def _validate_observers(self):
+        inputs = self.input_activations
+        weights = self.weights
+        outputs = self.output_activations
+
+        if inputs is not None and inputs.observer is None:
+            if inputs.dynamic:
+                inputs.observer = "memoryless-minmax"
+            else:
+                inputs.observer = "static-minmax"
+
+        if weights is not None and weights.observer is None:
+            weights.observer = "memoryless-minmax"
+
+        if outputs is not None and outputs.observer is None:
+            if outputs.dynamic:
+                outputs.observer = "memoryless-minmax"
+            else:
+                outputs.observer = "static-minmax"
+
+        self.input_activations = inputs
+        self.weights = weights
+        self.output_activations = outputs
+
     model_config = ConfigDict(extra="forbid")
 

From d95cc6bbfb8829ba0b554bbdf7ef047daf712580 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Mon, 13 Oct 2025 14:18:30 -0400
Subject: [PATCH 2/4] change name

Signed-off-by: Kyle Sayers
---
 src/compressed_tensors/quantization/quant_scheme.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/compressed_tensors/quantization/quant_scheme.py b/src/compressed_tensors/quantization/quant_scheme.py
index af703b41..9ff9a826 100644
--- a/src/compressed_tensors/quantization/quant_scheme.py
+++ b/src/compressed_tensors/quantization/quant_scheme.py
@@ -129,18 +129,18 @@ def _validate_observers(self):
 
         if inputs is not None and inputs.observer is None:
             if inputs.dynamic:
-                inputs.observer = "memoryless-minmax"
+                inputs.observer = "memoryless_minmax"
             else:
-                inputs.observer = "static-minmax"
+                inputs.observer = "static_minmax"
 
         if weights is not None and weights.observer is None:
-            weights.observer = "memoryless-minmax"
+            weights.observer = "memoryless_minmax"
 
         if outputs is not None and outputs.observer is None:
             if outputs.dynamic:
-                outputs.observer = "memoryless-minmax"
+                outputs.observer = "memoryless_minmax"
             else:
-                outputs.observer = "static-minmax"
+                outputs.observer = "static_minmax"
 
         self.input_activations = inputs
         self.weights = weights

From 93560e090e4af4384ea8c8724175f7084d85b0f0 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 15 Oct 2025 13:37:38 -0400
Subject: [PATCH 3/4] reduce diff

Signed-off-by: Kyle Sayers
---
 src/compressed_tensors/quantization/quant_args.py   | 2 --
 src/compressed_tensors/quantization/quant_scheme.py | 5 -----
 2 files changed, 7 deletions(-)

diff --git a/src/compressed_tensors/quantization/quant_args.py b/src/compressed_tensors/quantization/quant_args.py
index 3a3c93a4..2675d9b8 100644
--- a/src/compressed_tensors/quantization/quant_args.py
+++ b/src/compressed_tensors/quantization/quant_args.py
@@ -12,13 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import warnings
 from enum import Enum
 from typing import Any, Dict, List, Optional, Union
 
 import torch
 from compressed_tensors.utils import Aliasable
-from compressed_tensors.utils.helpers import deprecated
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
diff --git a/src/compressed_tensors/quantization/quant_scheme.py b/src/compressed_tensors/quantization/quant_scheme.py
index 9ff9a826..f7901481 100644
--- a/src/compressed_tensors/quantization/quant_scheme.py
+++ b/src/compressed_tensors/quantization/quant_scheme.py
@@ -85,9 +85,6 @@ def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
             if inputs.actorder is not None:
                 raise ValueError("Cannot apply actorder to input activations")
 
-            if inputs.observer is None:
-                inputs.observer
-
         # validate output args
         if outputs is not None:
             if outputs.actorder is not None:
                 raise ValueError("Cannot apply actorder to output activations")
@@ -206,7 +203,6 @@ def is_preset_scheme(name: str) -> bool:
         symmetric=True,
         dynamic=False,
         group_size=16,
-        observer="static_minmax",
     ),
     input_activations=QuantizationArgs(
         num_bits=4,
@@ -215,7 +211,6 @@ def is_preset_scheme(name: str) -> bool:
         symmetric=True,
         dynamic=DynamicType.LOCAL,
         group_size=16,
-        observer="static_minmax",
     ),
 )

From 6fc343a7fcd9223768b829705c09cee5e6e72b2a Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 15 Oct 2025 13:40:12 -0400
Subject: [PATCH 4/4] use static_minmax for locally dynamic

Signed-off-by: Kyle Sayers
---
 src/compressed_tensors/quantization/quant_scheme.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/compressed_tensors/quantization/quant_scheme.py b/src/compressed_tensors/quantization/quant_scheme.py
index f7901481..09f5901f 100644
--- a/src/compressed_tensors/quantization/quant_scheme.py
+++ b/src/compressed_tensors/quantization/quant_scheme.py
@@ -125,7 +125,7 @@ def _validate_observers(self):
         outputs = self.output_activations
 
         if inputs is not None and inputs.observer is None:
-            if inputs.dynamic:
+            if inputs.dynamic is True:
                 inputs.observer = "memoryless_minmax"
             else:
                 inputs.observer = "static_minmax"
@@ -134,7 +134,7 @@ def _validate_observers(self):
             weights.observer = "memoryless_minmax"
 
         if outputs is not None and outputs.observer is None:
-            if outputs.dynamic:
+            if outputs.dynamic is True:
                 outputs.observer = "memoryless_minmax"
             else:
                 outputs.observer = "static_minmax"
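
Reviewer sketch (not part of the patch series): the example below shows how observer defaults are expected to resolve once PATCH 4/4 is applied. The field names and the default strings come from the diffs above; the import paths, the string forms of strategy/type values, and the exact constructor calls are assumptions about the surrounding compressed-tensors API rather than something this series defines.

    from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs
    from compressed_tensors.quantization.quant_scheme import QuantizationScheme

    # Fully dynamic input activations (dynamic=True): scales are recomputed at
    # runtime, so _validate_observers defaults their observer to
    # "memoryless_minmax". Weights with no observer also default to
    # "memoryless_minmax".
    w4a8_dynamic = QuantizationScheme(
        targets=["Linear"],
        weights=QuantizationArgs(num_bits=4, symmetric=True, group_size=128),
        input_activations=QuantizationArgs(num_bits=8, strategy="token", dynamic=True),
    )
    print(w4a8_dynamic.weights.observer)            # expected: memoryless_minmax
    print(w4a8_dynamic.input_activations.observer)  # expected: memoryless_minmax

    # Locally dynamic activations (DynamicType.LOCAL) fail the `dynamic is True`
    # check added in PATCH 4/4, so they fall back to "static_minmax" and keep a
    # calibrated global scale, mirroring the NVFP4-style preset edited above.
    nvfp4_like = QuantizationScheme(
        targets=["Linear"],
        weights=QuantizationArgs(
            num_bits=4,
            type="float",
            strategy="tensor_group",
            symmetric=True,
            dynamic=False,
            group_size=16,
        ),
        input_activations=QuantizationArgs(
            num_bits=4,
            type="float",
            strategy="tensor_group",
            symmetric=True,
            dynamic=DynamicType.LOCAL,
            group_size=16,
        ),
    )
    print(nvfp4_like.input_activations.observer)    # expected: static_minmax

If the observer names are resolved differently by the calibration layer that consumes them, only the printed strings change; the defaulting logic pinned down by this series lives entirely in _validate_observers.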