Add more tests for attention and KV cache quantization

kylesayrs · kylesayrs · commit 05237b2d7218 · 2025-10-14T00:19:13.000-04:00
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
diff --git a/src/compressed_tensors/quantization/lifecycle/initialize.py b/src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -289,11 +289,11 @@ def initialize_attn_qparams(
     kv_cache: Optional[QuantizedKVCache] = getattr(module, KV_CACHE_ATTR, None)
 
     if impl is None and kv_cache is None:
-        raise ValueError("Attention module has quantization scheme but no attached ")
+        raise ValueError("Attention module has quantization scheme but no attached")
 
-    config: PretrainedConfig = getattr(impl, "config", None) or getattr(
-        kv_cache, "config", None
-    )
+    _validate_attention_scheme(scheme)
+
+    config: PretrainedConfig = getattr(kv_cache, "config")
     head_dim = get_head_dim(config)
     observed_shape = (head_dim,)  # (batch_size, num_attention_heads, slen, head_dim)
     observed_dtype = next(module.parameters()).dtype
@@ -325,3 +325,19 @@ def initialize_attn_qparams(
             observed_dtype=observed_dtype,
             force_zero_point=force_zero_point,
         )
+
+
+def _validate_attention_scheme(scheme: QuantizationScheme):
+    if scheme.weights is not None:
+        raise ValueError(
+            "Cannot apply weight quantization to attention. "
+            "Instead, target (q|k|v)_proj"
+        )
+
+    if scheme.input_activations is None:
+        raise ValueError(
+            "Cannot apply attention quantization without specifying input activations"
+        )
+
+    if scheme.output_activations is not None:
+        raise ValueError("Cannot apply output quantization to attention")
diff --git a/tests/test_modeling/test_attention_and_cache.py b/tests/test_modeling/test_attention_and_cache.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from compressed_tensors.modeling import (
+    IMPL_ATTR,
+    KV_CACHE_ATTR,
+    QuantizedAttentionImpl,
+    QuantizedKVCache,
+    initialize_hooked_attention,
+    initialize_hooked_kv_cache,
+    register_key_hook,
+    register_query_hook,
+    register_value_hook,
+)
+from tests.testing_utils import requires_gpu
+from transformers import AutoModelForCausalLM
+
+
+@requires_gpu
+def test_attention_cache():
+    model = AutoModelForCausalLM.from_pretrained(
+        "nm-testing/llama2.c-stories15M", device_map="cuda"
+    )
+    inputs = {key: value.to("cuda") for key, value in model.dummy_inputs.items()}
+    true_outputs = model(**inputs)
+    layers = model.model.layers
+
+    # check if hooks work
+    k_called = [False for _ in range(len(layers))]
+    v_called = [False for _ in range(len(layers))]
+
+    # apply kv cache quantization
+    _apply_kv_cache(model, layers, k_called, v_called)
+
+    # check kv cache quantization
+    outputs = model(**inputs)
+    assert torch.equal(outputs.logits, true_outputs.logits)
+    assert all(k_called) and all(v_called)
+
+    ## apply attention quantization after kv cache quantization ##
+
+    # check if hooks work
+    q_called = [False for _ in range(len(layers))]
+    k_called = [False for _ in range(len(layers))]
+    v_called = [False for _ in range(len(layers))]
+
+    _apply_attention(model, layers, q_called, k_called, v_called)
+    outputs = model(**inputs)
+    assert torch.equal(outputs.logits, true_outputs.logits)
+    assert all(q_called) and all(k_called) and all(v_called)
+
+
+def _apply_kv_cache(model, layers, k_called, v_called):
+    for layer_index, layer in enumerate(layers):
+        module = layer.self_attn
+        initialize_hooked_kv_cache(model, module)
+        assert isinstance(getattr(module, KV_CACHE_ATTR), QuantizedKVCache)
+
+        # reapply is no-op
+        initialize_hooked_kv_cache(model, module)
+
+        def k_hook(_module, _states, layer_index=layer_index):  # NOTE: capture by value
+            k_called[layer_index] = True
+
+        def v_hook(_module, _states, layer_index=layer_index):
+            my_index = layer_index
+            v_called[my_index] = True
+
+        register_key_hook(module, k_hook)
+        register_value_hook(module, v_hook)
+
+
+def _apply_attention(model, layers, q_called, k_called, v_called):
+    for layer_index, layer in enumerate(layers):
+        module = layer.self_attn
+        initialize_hooked_attention(model, module)
+        assert isinstance(getattr(module, IMPL_ATTR), QuantizedAttentionImpl)
+
+        # reapply is no-op
+        initialize_hooked_attention(model, module)
+
+        def q_hook(_module, _states, layer_index=layer_index):
+            q_called[layer_index] = True
+
+        def k_hook(_module, _states, layer_index=layer_index):
+            k_called[layer_index] = True
+
+        def v_hook(_module, _states, layer_index=layer_index):
+            v_called[layer_index] = True
+
+        register_query_hook(module, q_hook)
+        register_key_hook(module, k_hook)
+        register_value_hook(module, v_hook)
diff --git a/tests/test_quantization/lifecycle/test_apply.py b/tests/test_quantization/lifecycle/test_apply.py
@@ -366,3 +366,42 @@ def test_multi_apply_quantization_config():
                 weight_zero_point is not None
                 and weight_zero_point.shape == torch.Size([1])
             )
+
+@requires_accelerate()
+def test_apply_kv_cache():
+    from accelerate import init_empty_weights
+
+    with init_empty_weights():
+        model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
+
+    args = QuantizationArgs(num_bits=8, type="float", strategy="tensor")
+    config = QuantizationConfig(config_groups={}, kv_cache_scheme=args)
+    
+    apply_quantization_config(model, config)
+
+    for layer in model.model.layers:
+        assert getattr(layer.self_attn, "quantization_scheme").input_activations == args
+        assert hasattr(layer.self_attn, "k_scale")
+        assert hasattr(layer.self_attn, "v_scale")
+
+
+@requires_accelerate()
+def test_apply_attention():
+    from accelerate import init_empty_weights
+
+    with init_empty_weights():
+        model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
+
+    scheme = QuantizationScheme(
+        targets=["LlamaAttention"],
+        input_activations=QuantizationArgs(num_bits=8, type="float", strategy="tensor"),
+    )
+    config = QuantizationConfig(config_groups={"attention": scheme})
+    
+    apply_quantization_config(model, config)
+
+    for layer in model.model.layers:
+        assert getattr(layer.self_attn, "quantization_scheme") == scheme
+        assert hasattr(layer.self_attn, "q_scale")
+        assert hasattr(layer.self_attn, "k_scale")
+        assert hasattr(layer.self_attn, "v_scale")