Commit 4199d06

Add test for fp8 semi-sparse vs. dense
Signed-off-by: Benji Beck <benjibeck@meta.com>
1 parent fc80e43 commit 4199d06

File tree

3 files changed: +75 -80 lines

test/quantization/quantize_/workflows/float8/test_float8_semi_sparse.py
torchao/quantization/quantize_/workflows/float8/float8_semi_sparse_tensor.py
torchao/quantization/quantize_/workflows/float8/float8_tensor.py

test/quantization/quantize_/workflows/float8/test_float8_semi_sparse.py

Lines changed: 19 additions & 62 deletions
@@ -4,49 +4,38 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 
-import tempfile
 import unittest
-
+from torchao.quantization.quantize_.workflows.float8.float8_semi_sparse_tensor import Float8SemiSparseTensor
+from torchao.quantization.quantize_.workflows.float8.float8_tensor import Float8Tensor
+from torchao.float8.inference import Float8MMConfig
 import torch
 from torch.testing._internal.common_utils import (
     TestCase,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
 )
-
-from torchao.quantization import (
-    Float8WeightOnlyConfig,
-    quantize_,
-)
-from torchao.quantization.utils import compute_error
 from torchao.sparsity.sparse_api import apply_fake_sparsity
 from torchao.testing.utils import skip_if_rocm
-from torchao.utils import torch_version_at_least
+from torchao.utils import is_sm_at_least_90
 
-
-BF16_ACT_CONFIG = Float8WeightOnlyConfig(
-    group_size=128,
-    packing_format="cutlass_semi_sparse",
-)
 
-
-@unittest.skipIf(not torch_version_at_least("2.8.0"), "Need pytorch 2.8+")
+@unittest.skipIf(not is_sm_at_least_90(), "Need H100+ to run")
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
 class TestFloat8SemiSparseTensor(TestCase):
     def setUp(self):
         self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []
 
     @skip_if_rocm("ROCm enablement in progress")
-    @parametrize("config", [BF16_ACT_CONFIG])
     @parametrize(
         "sizes",
         [
             ((128,), 256, 128),
             ((32, 128), 512, 128),
-            ((2, 32, 128), 256, 12),
+            ((2, 32, 128), 256, 128),
         ],
     )
-    def test_linear(self, config, sizes):
+    def test_sparse_vs_dense_fp8(self, sizes):
         dtype = torch.bfloat16
         device = "cuda"
 
@@ -55,52 +44,20 @@ def test_linear(self, config, sizes):
         linear = torch.nn.Linear(K, N, dtype=dtype, device=device)
 
         apply_fake_sparsity(linear)
-        original = linear(input)
-        quantize_(linear, config)
-        quantized = linear(input)
-        self.assertTrue(compute_error(original, quantized) > 20)
-
-        compiled_linear = torch.compile(linear)
-        quantized_and_compiled = compiled_linear(input)
-        self.assertTrue(compute_error(original, quantized_and_compiled) > 20)
-
-    @skip_if_rocm("ROCm enablement in progress")
-    @unittest.skip("Fix later")
-    @parametrize("config", [BF16_ACT_CONFIG])
-    def test_to_device(self, config):
-        for device in self.GPU_DEVICES:
-            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, config)
-            linear.to(device)
-
-            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, config)
-            linear.to(device=device)
-
-            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, config)
-            linear.to(device)
-
-    @skip_if_rocm("ROCm enablement in progress")
-    @parametrize("config", [BF16_ACT_CONFIG])
-    def test_module_path(self, config):
-        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-        quantize_(linear.cuda(), config)
-        self.assertEqual(
-            str(type(linear.weight)),
-            "<class 'torchao.quantization.Float8SemiSparseTensor'>",
+
+        mm_config = Float8MMConfig(use_fast_accum=True)
+        input_fp8 = Float8Tensor.from_hp(input, float8_dtype=torch.float8_e4m3fn, mm_config=mm_config)
+
+        weight_fp8 = Float8Tensor.from_hp(linear.weight.data, float8_dtype=torch.float8_e4m3fn, mm_config=mm_config)
+        dense_output = torch.nn.functional.linear(input_fp8, weight_fp8, linear.bias)
+
+        weight_sparse_fp8 = Float8SemiSparseTensor.from_hp(linear.weight.data, [1, K])
+        sparse_output = torch.nn.functional.linear(input_fp8, weight_sparse_fp8, linear.bias)
+
+        torch.testing.assert_close(
+            dense_output, sparse_output, atol=3e-1, rtol=3e-1
         )
 
-        with tempfile.NamedTemporaryFile() as f:
-            torch.save(linear.state_dict(), f)
-            f.seek(0)
-            state_dict = torch.load(f)
-            self.assertEqual(
-                str(type(state_dict["weight"])),
-                "<class 'torchao.quantization.Float8SemiSparseTensor'>",
-            )
-
 instantiate_parametrized_tests(TestFloat8SemiSparseTensor)
 
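Note: the new test builds both weight representations directly instead of going through quantize_. A minimal standalone sketch of the same dense-vs-sparse comparison, assuming an SM 9.0+ GPU and the from_hp signatures shown in the diff above (the [1, K] argument is the per-row scale granularity used by the test):

import torch
from torchao.float8.inference import Float8MMConfig
from torchao.quantization.quantize_.workflows.float8.float8_semi_sparse_tensor import Float8SemiSparseTensor
from torchao.quantization.quantize_.workflows.float8.float8_tensor import Float8Tensor
from torchao.sparsity.sparse_api import apply_fake_sparsity

K, N = 128, 256
linear = torch.nn.Linear(K, N, dtype=torch.bfloat16, device="cuda")
apply_fake_sparsity(linear)  # prune the weight to a 2:4 pattern so semi-sparse packing is valid

x = torch.randn(32, K, dtype=torch.bfloat16, device="cuda")
mm_config = Float8MMConfig(use_fast_accum=True)
x_fp8 = Float8Tensor.from_hp(x, float8_dtype=torch.float8_e4m3fn, mm_config=mm_config)

# Dense fp8 reference vs. CUTLASS semi-sparse fp8
w_dense = Float8Tensor.from_hp(linear.weight.data, float8_dtype=torch.float8_e4m3fn, mm_config=mm_config)
w_sparse = Float8SemiSparseTensor.from_hp(linear.weight.data, [1, K])

dense = torch.nn.functional.linear(x_fp8, w_dense, linear.bias)
sparse = torch.nn.functional.linear(x_fp8, w_sparse, linear.bias)
torch.testing.assert_close(dense, sparse, atol=3e-1, rtol=3e-1)
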
torchao/quantization/quantize_/workflows/float8/float8_semi_sparse_tensor.py

Lines changed: 52 additions & 15 deletions
@@ -19,7 +19,7 @@
 
 
 class Float8SemiSparseTensor(TorchAOBaseTensor):
-    tensor_data_names = ["sparse", "scale", "meta"]
+    tensor_data_names = ["sparse", "meta", "scale"]
 
     def __new__(
         cls,
@@ -83,29 +83,66 @@ def from_hp(
 implements_torch_function = Float8SemiSparseTensor.implements_torch_function
 
 
-@implements(aten.linear.default)
-@implements_torch_function(torch.nn.functional.linear)
+@implements(aten.t.default)
 def _(func, types, args, kwargs):
-    from torchao.ops import rowwise_scaled_linear_sparse_cutlass_f8f8
-
-    input_tensor, weight_tensor, bias = (
-        args[0],
-        args[1],
-        args[2] if len(args) > 2 else None,
+    from torch.utils._python_dispatch import return_and_correct_aliasing
+
+    self = args[0]
+    new = Float8SemiSparseTensor(
+        sparse=self.sparse,
+        meta=self.meta,
+        scale=self.scale,
     )
+    return return_and_correct_aliasing(func, args, kwargs, new)
+
 
-    input = input_tensor.qdata
-    input_scale = input_tensor.scale
+def _linear_fp8_semi_sparse(input_tensor, weight_tensor, bias):
+    from torchao.ops import rowwise_scaled_linear_sparse_cutlass_f8f8
+    from torchao.quantization.quantize_.workflows.float8.float8_tensor import Float8Tensor
+
+    if isinstance(input_tensor, Float8Tensor):
+        input = input_tensor.qdata
+        input_scale = input_tensor.scale
+        out_dtype = input_tensor.dtype
+    else:
+        input = input_tensor.qdata
+        input_scale = input_tensor.scale
+        out_dtype = input_tensor.dtype
+
     weight = weight_tensor.sparse
     weight_meta = weight_tensor.meta
     weight_scale = weight_tensor.scale
-    out_dtype = input_tensor.dtype
-
-    out = rowwise_scaled_linear_sparse_cutlass_f8f8(
+
+    # Reshape input_scale if needed: kernel expects scale to match input shape minus last dim
+    # For input [B, K], scale should be [B] not [B, 1]
+    if input_scale.dim() > input.dim() - 1:
+        input_scale = input_scale.squeeze(-1)
+
+    return rowwise_scaled_linear_sparse_cutlass_f8f8(
         input, input_scale, weight, weight_meta, weight_scale, bias, out_dtype
     )
 
-    return out
+
+@implements([aten.mm.default, aten.addmm.default])
+def _(func, types, args, kwargs):
+    if func == aten.addmm.default:
+        bias, input_tensor, weight_tensor = args
+    else: # aten.mm.default
+        input_tensor, weight_tensor = args
+        bias = None
+
+    return _linear_fp8_semi_sparse(input_tensor, weight_tensor, bias)
+
+
+@implements(aten.linear.default)
+@implements_torch_function(torch.nn.functional.linear)
+def _(func, types, args, kwargs):
+    input_tensor, weight_tensor, bias = (
+        args[0],
+        args[1],
+        args[2] if len(args) > 2 else None,
+    )
+    return _linear_fp8_semi_sparse(input_tensor, weight_tensor, bias)
 
 
 Float8SemiSparseTensor.__module__ = "torchao.quantization"
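
Aside: registering aten.t.default alongside aten.mm.default/aten.addmm.default matters because traced or compiled graphs often decompose F.linear(x, w, b) into addmm(b, x, w.t()) rather than calling the linear op. The transpose handler returns a new Float8SemiSparseTensor wrapping the same packed sparse/meta/scale data (the CUTLASS kernel consumes the weight in its packed form, so no data movement is needed); its job is only to keep the subclass alive so the following mm/addmm still dispatches here. A rough sketch of the two call paths, reusing x_fp8 and w_sparse from the test sketch above (illustrative; the exact dispatch order between the two subclasses depends on their handlers):

# Eager path: the torch_function override intercepts F.linear directly.
y_eager = torch.nn.functional.linear(x_fp8, w_sparse, linear.bias)

# Decomposed path: t() hits aten.t.default and returns another
# Float8SemiSparseTensor, then aten.addmm.default unpacks its args as
# (bias, input, weight) and routes into _linear_fp8_semi_sparse.
y_decomposed = torch.addmm(linear.bias, x_fp8, w_sparse.t())
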

torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 4 additions & 3 deletions
@@ -256,9 +256,10 @@ def _(func, types, args, kwargs):
         args[1],
         args[2] if len(args) > 2 else None,
     )
-    assert isinstance(weight_tensor, Float8Tensor), (
-        f"Don't expect to reach here with an override other than weight currently, {type(input_tensor)} {type(weight_tensor)}"
-    )
+
+    # If weight is not Float8Tensor, return NotImplemented to allow weight's dispatch to handle it
+    if not isinstance(weight_tensor, Float8Tensor):
+        return NotImplemented
 
     act_quant_kwargs = weight_tensor.act_quant_kwargs
     # quantizing activation, if `act_quant_kwargs` is specified
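
The NotImplemented return follows Python's standard multiple-dispatch convention: when Float8Tensor's linear override sees a weight of a different subclass, declining lets PyTorch re-dispatch to the weight's own override. That is what allows a Float8Tensor activation paired with a Float8SemiSparseTensor weight to reach the semi-sparse kernel registered in the previous file. For example (illustrative, using the names from the sketches above):

# x_fp8 is a Float8Tensor activation, w_sparse a Float8SemiSparseTensor weight.
# Previously this tripped the assert in Float8Tensor's linear override; now
# Float8Tensor bows out and Float8SemiSparseTensor's override handles the call.
out = torch.nn.functional.linear(x_fp8, w_sparse, linear.bias)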
