
Commit 969f547

Add second draft of mm_fp4 backend
1 parent 531967e commit 969f547

4 files changed: 105 additions & 25 deletions

benchmarks/routines/flashinfer_benchmark_utils.py

Lines changed: 3 additions & 3 deletions
@@ -238,9 +238,9 @@ def dtype_str_to_torch_dtype(dtype_str):
         "8.6": [],
         "8.9": [],
         "9.0": [],
-        "10.0": ["cudnn", "trtllm", "cutlass"],
-        "10.3": ["cudnn", "trtllm", "cutlass"],
-        "12.0": ["cudnn", "cutlass"],
+        "10.0": ["cudnn", "trtllm", "cutlass", "auto"],
+        "10.3": ["cudnn", "trtllm", "cutlass", "auto"],
+        "12.0": ["cudnn", "cutlass", "auto"],
     },
     # MOE
     "trtllm_fp4_block_scale_moe": {

benchmarks/routines/gemm.py

Lines changed: 10 additions & 3 deletions
@@ -131,7 +131,7 @@ def parse_gemm_args(line, parser):
         required=False,
         nargs="+",
         default=["cudnn"],
-        choices=["cudnn", "cublas", "trtllm", "cutlass"],
+        choices=["cudnn", "cublas", "trtllm", "cutlass", "auto"],
         help="Kernel backends to test. Default: cudnn",
     )
     parser.add_argument(
@@ -823,7 +823,7 @@ def testMmFp4(args):
             print(
                 "[INFO] cutlass backend does not support mxfp4 quantization (use_nvfp4=False)"
             )
-            backends.remove("cutlass")
+            remove_cutlass = True
         if remove_cutlass:
             backends.remove("cutlass")
     if "cudnn" in backends:
@@ -833,6 +833,13 @@ def testMmFp4(args):
             remove_cudnn = True
         if remove_cudnn:
             backends.remove("cudnn")
+    if "auto" in backends:
+        remove_auto = False
+        if not use_128x4_sf_layout:
+            print("[INFO] auto backend does not support use_128x4_sf_layout=False")
+            remove_auto = True
+        if remove_auto:
+            backends.remove("auto")
     if getattr(args, "autotune", False):
         backends_to_remove = []
         for cur_backend in backends:
@@ -889,7 +896,7 @@ def testMmFp4(args):
     # res = torch.empty([m, n], device="cuda", dtype=res_dtype)
 
     def run_backend(backend):
-        if backend in ["cudnn", "trtllm", "cutlass"]:
+        if backend in ["cudnn", "trtllm", "cutlass", "auto"]:
             return flashinfer.gemm.mm_fp4(
                 a=input_fp4,
                 b=mat2_fp4.T if backend != "trtllm" else mat2_fp4_trtllm.T,
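The testMmFp4 hunks all use the same flag-then-remove pattern: each support check only sets a boolean, and the backend list is mutated in a single place afterwards. A small self-contained sketch of that pattern (placeholder values, not code from the commit):

# Placeholder inputs standing in for the benchmark's parsed arguments.
backends = ["cudnn", "cutlass", "auto"]
use_128x4_sf_layout = False

if "auto" in backends:
    remove_auto = False
    if not use_128x4_sf_layout:
        print("[INFO] auto backend does not support use_128x4_sf_layout=False")
        remove_auto = True
    if remove_auto:
        backends.remove("auto")

print(backends)  # ['cudnn', 'cutlass']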

flashinfer/gemm.py

Lines changed: 77 additions & 19 deletions
@@ -17,7 +17,7 @@
 import functools
 from enum import Enum
 from types import SimpleNamespace
-from typing import List, Literal, Optional, Tuple
+from typing import List, Literal, Optional, Tuple, cast
 
 from flashinfer.trtllm_low_latency_gemm import trtllm_low_latency_gemm
 import torch
@@ -1703,7 +1703,7 @@ def _check_mm_fp4_problem_size(
     out: Optional[torch.Tensor] = None,
     block_size: int = 16,
     use_8x4_sf_layout: bool = False,
-    backend: Literal["cudnn", "trtllm", "cutlass"] = "cudnn",
+    backend: Literal["cudnn", "trtllm", "cutlass", "auto"] = "auto",
     use_nvfp4: bool = True,
 ):
     # Generic checks
@@ -1743,8 +1743,8 @@
 
     if backend != "trtllm" and use_8x4_sf_layout:
         raise ValueError("Only TRTLLM FP4 GEMM supports 8x4 scale factor layout.")
-    if backend != "cudnn" and not use_nvfp4:
-        raise ValueError("Only cudnn FP4 GEMM supports mxfp4 quantization.")
+    if backend not in ["cudnn", "auto"] and not use_nvfp4:
+        raise ValueError("Only the cudnn and auto FP4 GEMM backends support mxfp4 quantization.")
 
     if use_nvfp4 and block_size != 16:
         raise ValueError("nvfp4 only supports block_size = 16.")
@@ -1765,7 +1765,7 @@ def _cudnn_gemm_fp4_requirement(
     out: Optional[torch.Tensor] = None,
     block_size: int = 16,
     use_8x4_sf_layout: bool = False,
-    backend: Literal["cudnn", "trtllm", "cutlass"] = "cudnn",
+    backend: Literal["cudnn", "trtllm", "cutlass", "auto"] = "auto",
     use_nvfp4: bool = True,
 ):
     if (
@@ -1823,7 +1823,7 @@ def _trtllm_gemm_fp4_requirement(
     out: Optional[torch.Tensor] = None,
     block_size: int = 16,
     use_8x4_sf_layout: bool = False,
-    backend: Literal["cudnn", "trtllm", "cutlass"] = "cudnn",
+    backend: Literal["cudnn", "trtllm", "cutlass", "auto"] = "auto",
     use_nvfp4: bool = True,
 ):
     if out_dtype != torch.bfloat16:
@@ -1845,17 +1845,57 @@ def _cutlass_gemm_fp4_requirement(
     out: Optional[torch.Tensor] = None,
     block_size: int = 16,
     use_8x4_sf_layout: bool = False,
-    backend: Literal["cudnn", "trtllm", "cutlass"] = "cudnn",
+    backend: Literal["cudnn", "trtllm", "cutlass", "auto"] = "auto",
     use_nvfp4: bool = True,
 ):
     return True
 
 
+@supported_compute_capability([100, 103, 110, 120])
+def _auto_gemm_fp4_requirement(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    a_descale: torch.Tensor,
+    b_descale: torch.Tensor,
+    alpha: Optional[torch.Tensor] = None,
+    out_dtype: torch.dtype = torch.bfloat16,
+    out: Optional[torch.Tensor] = None,
+    block_size: int = 16,
+    use_8x4_sf_layout: bool = False,
+    backend: Literal["cudnn", "trtllm", "cutlass", "auto"] = "auto",
+    use_nvfp4: bool = True,
+):
+    # The auto backend requires at least one concrete backend to be supported
+    # on the current device.
+    cc_major, cc_minor = get_compute_capability(a.device)
+    cc_arch = cc_major * 10 + cc_minor
+
+    # Check whether at least one backend supports this compute capability.
+    # trtllm is not considered here due to its different interface.
+    candidate_backends = ["cudnn", "cutlass"]
+    backend_checkers = {
+        "cudnn": _cudnn_gemm_fp4_requirement,
+        "cutlass": _cutlass_gemm_fp4_requirement,
+    }
+
+    for candidate in candidate_backends:
+        checker = backend_checkers[candidate]
+        if hasattr(
+            checker, "is_compute_capability_supported"
+        ) and checker.is_compute_capability_supported(cc_arch):
+            # At least one backend is supported
+            print(f"Backend {candidate} is supported on this device.")
+            return True
+
+    # No backend is supported on this device
+    return False
+
+
 @backend_requirement(
     {
         "cudnn": _cudnn_gemm_fp4_requirement,  # Each backend has its own requirement function
         "trtllm": _trtllm_gemm_fp4_requirement,
         "cutlass": _cutlass_gemm_fp4_requirement,
+        "auto": _auto_gemm_fp4_requirement,  # Requires at least one concrete backend supported on the current device
     },
     common_check=_check_mm_fp4_problem_size,  # Shape checks common to all backends
 )
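To illustrate the capability check that _auto_gemm_fp4_requirement relies on, here is a standalone sketch, not flashinfer's actual implementation, of a decorator that records the supported compute capabilities and exposes is_compute_capability_supported on the wrapped requirement function:

from typing import Callable, List


def supported_compute_capability_sketch(archs: List[int]) -> Callable:
    # Toy stand-in: remember which SM versions a requirement function supports
    # and expose a query helper on the function object itself.
    def wrap(fn: Callable) -> Callable:
        fn.is_compute_capability_supported = lambda cc: cc in archs
        return fn
    return wrap


@supported_compute_capability_sketch([100, 103, 110, 120])
def _toy_requirement(*args, **kwargs) -> bool:
    return True


# The auto requirement only needs this attribute, not a full backend call:
print(_toy_requirement.is_compute_capability_supported(120))  # True
print(_toy_requirement.is_compute_capability_supported(90))   # False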
@@ -1950,22 +1990,40 @@ def mm_fp4(
     if backend == "auto":
         cuda_major, _ = get_cuda_version(a.device)
         cc_major, cc_minor = get_compute_capability(a.device)
-        cc_arch = cc_major * 10 + cc_minor
         # If cuda version is 13 or greater AND cudnn version is 9.X or greater, prioritize cudnn.
         if cuda_major >= 13:  # to-do add cudnn version threshold
-            candidate_backends = ["cudnn", "cutlass"]
+            candidate_backends = ("cudnn", "cutlass")
         # Otherwise, prioritize cutlass
         else:
-            candidate_backends = ["cutlass", "cudnn"]
-
-        # Support check
-        backends_to_delete = []
-        for candidate_backend in candidate_backends:
-            if not mm_fp4.is_backend_supported(candidate_backend, cc_arch):
-                backends_to_delete.append(candidate_backend)
-        for backend_to_delete in backends_to_delete:
-            candidate_backends.remove(backend_to_delete)
-        selected_backend = candidate_backends[0]
+            candidate_backends = ("cutlass", "cudnn")
+
+        # Filter to only supported backends for this compute capability.
+        # Note: the requirement function already validated that at least one backend is supported.
+        supported_backends = []
+        for candidate in candidate_backends:
+            # mypy requires explicit type casting for the backend literal
+            backend_literal = cast(
+                Literal["cudnn", "trtllm", "cutlass", "auto"], candidate
+            )
+            try:
+                _check_mm_fp4_problem_size(
+                    a,
+                    b,
+                    a_descale,
+                    b_descale,
+                    alpha,
+                    out_dtype,
+                    out,
+                    block_size,
+                    use_8x4_sf_layout,
+                    backend_literal,
+                    use_nvfp4,
+                )
+                supported_backends.append(candidate)
+            except Exception:
+                pass
+        print(f"Supported backends: {supported_backends}")
+        selected_backend = supported_backends[0]
         print(
             f"Selected backend: {selected_backend} for cuda version {cuda_major} and compute capability {cc_major}{cc_minor}"
         )
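For reference, a minimal usage sketch of the new default path (not part of the commit). The keyword names mirror the _check_mm_fp4_problem_size signature above; the FP4-quantized operands and their block scale factors are assumed to be prepared by the caller, the way the benchmark builds input_fp4 and mat2_fp4:

import torch
import flashinfer


def fp4_gemm_auto(a_fp4, b_fp4, a_scale, b_scale, alpha=None):
    # backend="auto" resolves to cudnn or cutlass: cudnn is preferred on
    # CUDA 13+, cutlass otherwise, keeping only candidates whose problem-size
    # checks pass on the current device.
    return flashinfer.gemm.mm_fp4(
        a=a_fp4,
        b=b_fp4,
        a_descale=a_scale,
        b_descale=b_scale,
        alpha=alpha,
        out_dtype=torch.bfloat16,
        backend="auto",
    )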

tests/gemm/test_mm_fp4.py

Lines changed: 15 additions & 0 deletions
@@ -105,5 +105,20 @@ def test_mm_fp4(
         pytest.fail(str(e))
 
 
+# Split tests for checking auto functionality
+@pytest.mark.parametrize("m", [1, 48, 256, 512])
+@pytest.mark.parametrize("n", [256, 512])
+@pytest.mark.parametrize("k", [256, 512])
+@pytest.mark.parametrize("res_dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("backend", ["auto"])
+@pytest.mark.parametrize("use_128x4_sf_layout", [False, True])
+@pytest.mark.parametrize("auto_tuning", [False, True])
+@pytest.mark.parametrize("fp4_type", ["nvfp4", "mxfp4", "mxfp4_alpha"])
+def test_mm_fp4_backend_auto(
+    m, n, k, res_dtype, backend, use_128x4_sf_layout, auto_tuning, fp4_type
+):
+    test_mm_fp4(m, n, k, res_dtype, "auto", use_128x4_sf_layout, auto_tuning, fp4_type)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
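To run only the new auto-backend parametrizations, the file's existing pytest.main entry point can be narrowed with a -k filter, for example (test path taken from the commit's file list):

import pytest

# Select just the auto-backend tests added above.
pytest.main(["-k", "test_mm_fp4_backend_auto", "tests/gemm/test_mm_fp4.py"])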
