[wip] float8 rowwise quant along row 1 of tensor rank 2

vkuzo · vkuzo · commit 07eb188b02fa · 2025-11-06T06:43:38.000-08:00
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: c40d96b ghstack-comment-id: 3497584430 Pull-Request: #3303
diff --git a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
@@ -18,6 +18,7 @@
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
     Float8WeightOnlyConfig,
+    PerAxis,
     PerBlock,
     PerRow,
     PerTensor,
@@ -466,6 +467,39 @@ def forward(self, x):
         sqnr = compute_error(original, quantized)
         self.assertTrue(sqnr > 20)
 
+    @unittest.skipIf(not is_sm_at_least_90(), "Need sm90+")
+    @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai")
+    def test_bmm_weight_in_bkn_layout(self):
+        # Tests rowwise quantization of a 3d weight stored with shape (B, K, N)
+        # and contiguous with that shape. Since the `K` dimension is not last, we
+        # need to specify the weight granularity with `PerAxis(1)`.
+
+        # granularity is [activation, weight]: PerRow for x, PerAxis(1) for weight
+        granularity = [PerRow(), PerAxis(1)]
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
+
+        class Model(torch.nn.Module):
+            def __init__(self, weight):
+                super().__init__()
+                self.weight = weight
+
+            def forward(self, x):
+                return torch.bmm(x, self.weight)
+
+        dtype = torch.bfloat16
+        device = "cuda"
+
+        B, M, K, N = 10, 32, 128, 256
+
+        x = torch.randn(B, M, K, dtype=dtype, device=device)
+        weight = torch.randn(B, K, N, dtype=dtype, device=device)
+        m = Model(weight).eval()
+        original = m(x)
+        quantize_(m, config, filter_fn=lambda mod, fqn: True)
+        quantized = m(x)
+        sqnr = compute_error(original, quantized)
+        self.assertTrue(sqnr > 20)
+
     @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
     @common_utils.parametrize(
         "sizes",
diff --git a/torchao/float8/inference.py b/torchao/float8/inference.py
@@ -15,6 +15,7 @@
 from torchao.float8.float8_utils import is_row_major, pad_tensor_for_matmul
 from torchao.float8.types import FP8Granularity
 from torchao.quantization.granularity import (
+    PerAxis,
     PerBlock,
     PerRow,
     PerTensor,
@@ -247,13 +248,21 @@ def _normalize_granularity(
             granularity[1], PerTensor
         )
         is_per_row = isinstance(granularity[0], PerRow) and isinstance(
-            granularity[1], PerRow
+            granularity[1], (PerRow, PerAxis)
         )
         is_a_1_128_w_128_128 = _granularity_is_a_1_128_w_128_128(granularity)
 
         if not (is_per_tensor or is_per_row or is_a_1_128_w_128_128):
             raise ValueError(f"Unsupported granularity types: {granularity}.")
-        if not isinstance(granularity[0], type(granularity[1])):
+
+        a_w_granularities_match = (
+            # direct match
+            isinstance(granularity[0], type(granularity[1]))
+            # PerAxis is a more general version of PerRow
+            or (isinstance(granularity[0], PerRow) and isinstance(granularity[1], PerAxis))
+        )
+
+        if not a_w_granularities_match:
             raise ValueError(
                 f"Different granularities for activation and weight are not supported: {granularity}."
             )
@@ -280,7 +289,7 @@ def _check_hardware_support(
         granularities[1], PerTensor
     )
     is_per_row = isinstance(granularities[0], PerRow) and isinstance(
-        granularities[1], PerRow
+        granularities[1], (PerRow, PerAxis)
     )
     is_a_1_128_w_128_128 = _granularity_is_a_1_128_w_128_128(granularities)
 
diff --git a/torchao/quantization/quantize_/workflows/float8/float8_tensor.py b/torchao/quantization/quantize_/workflows/float8/float8_tensor.py
@@ -423,6 +423,8 @@ def _(func, types, args, kwargs):
 
         b_data = weight_tensor.qdata
         b_scale = weight_tensor.scale
+        print('a', a_data.shape, a_scale.shape, input_tensor.block_size)
+        print('b', b_data.shape, b_scale.shape, weight_tensor.block_size)
 
         assert (
             weight_tensor.block_size[0] == 1