|
7 | 7 | from pathlib import Path |
8 | 8 | import random |
9 | 9 | import time |
10 | | -import contextlib |
11 | 10 |
|
12 | 11 | # Third Party |
13 | 12 | from aiu_fms_testing_utils.utils import aiu_setup, warmup_model |
|
104 | 103 | type=str, |
105 | 104 | default=None, |
106 | 105 | choices=["bf16", "fp16", "fp32"], |
107 | | - help="If set to one of the choices, overrides the model checkpoint weight format by setting the default pytorch format", |
| 106 | +    help="If set to one of the choices, overrides the model checkpoint weight format by setting the default PyTorch dtype. This will break quantized checkpoints.",
| 107 | +) |
| 108 | +parser.add_argument( |
| 109 | + "--cast_bf16_to_fp16", |
| 110 | + action="store_true", |
| 111 | +    help="If set, cast any bf16 weights in the model to fp16 for the AIU compiler. fp32 and quantized weights are left untouched.",
| 112 | +) |
| 113 | +parser.add_argument( |
| 114 | + "--cast_fp16_to_bf16", |
| 115 | + action="store_true", |
| 116 | +    help="If set, cast any fp16 weights in the model to bf16 for GPU execution. fp32 and quantized weights are left untouched.",
108 | 117 | ) |
109 | 118 | parser.add_argument( |
110 | 119 | "--compile", |
|
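Since the two new cast flags request opposite conversions, it may be worth rejecting them when passed together; a minimal sketch assuming the script's existing `args` namespace (this guard is not part of the diff):

    if args.cast_bf16_to_fp16 and args.cast_fp16_to_bf16:
        raise ValueError(
            "--cast_bf16_to_fp16 and --cast_fp16_to_bf16 are mutually exclusive; pass at most one."
        )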
221 | 230 | parser.add_argument( |
222 | 231 | "--attention_type", |
223 | 232 | type=str, |
224 | | - choices=["sdpa", "paged"], |
| 233 | + choices=["sdpa", "paged", "math_fp8", "paged_fp8"], |
225 | 234 | default="sdpa", |
226 | 235 | help="which backend attention to use in mha", |
227 | 236 | ) |
228 | 237 | args = parser.parse_args() |
229 | 238 |
|
230 | | -if args.attention_type == "paged": |
| 239 | +attention_map = { |
| 240 | + "sdpa": "sdpa_causal", |
| 241 | + "paged": "spyre_paged_attn", |
| 242 | + "math_fp8": "math_fp8", |
| 243 | + "paged_fp8": "spyre_paged_attn_fp8", |
| 244 | +} |
| 245 | + |
| 246 | +attn_name = attention_map[args.attention_type] |
| 247 | + |
| 248 | +if "paged" in attn_name: |
231 | 249 | from aiu_fms_testing_utils.utils.paged import generate |
232 | 250 | else: |
233 | 251 | from fms.utils.generation import generate |
234 | 252 |
|
| 253 | +if "fp8" in attn_name: |
| 254 | +    import fms_mo.aiu_addons.fp8.fp8_attn  # noqa: F401 (imported for its side effects: makes the fp8 attention ops available; not referenced directly)
| 255 | + |
235 | 256 | if args.quantization == "gptq": |
236 | 257 | if "aiu" in args.device_type: |
237 | 258 | try: |
|
329 | 350 | print("must set AIU_WORLD_RANK_0") |
330 | 351 | exit() |
331 | 352 | os.environ.setdefault("FLEX_COMPUTE", "SENTIENT") |
332 | | - os.environ.setdefault("FLEX_DEVICE", "VFIO") |
| 353 | + os.environ.setdefault("FLEX_DEVICE", "PF") |
333 | 354 |
|
334 | 355 | device = torch.device("cpu") |
335 | 356 | else: |
@@ -463,6 +484,38 @@ def select_int8_module( |
463 | 484 | fused_weights=fused_weights, |
464 | 485 | ) |
465 | 486 |
|
| 487 | +### Quantization |
| 488 | + |
| 489 | +# FP8 model checks |
| 490 | +has_fp8_weights = False |
| 491 | +has_bf16_weights = False |
| 492 | +has_fp16_weights = False |
| 493 | +for param in model.parameters(): |
| 494 | + if param.dtype == torch.float8_e4m3fn: |
| 495 | + has_fp8_weights = True |
| 496 | + elif param.dtype == torch.bfloat16: |
| 497 | + has_bf16_weights = True |
| 498 | + elif param.dtype == torch.float16: |
| 499 | + has_fp16_weights = True |
| 500 | + |
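For reference, the flag-setting loop above can be collapsed into a single set-building pass; a functionally equivalent sketch, not part of the diff:

    # Collect the distinct parameter dtypes once, then test membership.
    param_dtypes = {p.dtype for p in model.parameters()}
    has_fp8_weights = torch.float8_e4m3fn in param_dtypes
    has_bf16_weights = torch.bfloat16 in param_dtypes
    has_fp16_weights = torch.float16 in param_dtypes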
| 501 | +if has_fp8_weights: |
| 502 | + if is_aiu_backend and has_bf16_weights and not args.cast_bf16_to_fp16: |
| 503 | + raise ValueError("FP8 checkpoints on AIU with bf16 weights require casting to fp16 using --cast_bf16_to_fp16. Do not use --default_dtype!") |
| 504 | + elif device.type == "cuda" and has_fp16_weights and not args.cast_fp16_to_bf16: |
| 505 | + raise ValueError("FP8 checkpoints on GPU with fp16 weights require casting to bf16 using --cast_fp16_to_bf16. Do not use --default_dtype!") |
| 506 | + |
| 507 | +if args.cast_bf16_to_fp16: |
| 508 | + for name, param in model.named_parameters(): |
| 509 | + if param.dtype == torch.bfloat16: |
| 510 | +            if param.abs().max() > torch.finfo(torch.float16).max:
| 511 | +                dprint(f"[WARNING] Param {name} has values outside the fp16 range; casting it to fp16 will overflow them. Ignore this warning if this is intended.")
| 512 | + param.data = param.data.to(dtype=torch.float16) |
| 513 | + |
| 514 | +if args.cast_fp16_to_bf16: |
| 515 | + for param in model.parameters(): |
| 516 | + if param.dtype == torch.float16: |
| 517 | + param.data = param.data.to(dtype=torch.bfloat16) |
| 518 | + |
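The overflow check in the bf16 branch above exists because bf16 shares fp32's exponent range while fp16 tops out at 65504; a minimal, self-contained illustration (not part of the diff):

    import torch

    # bf16 can hold magnitudes well beyond the fp16 maximum of 65504.
    big = torch.tensor([70000.0, -70000.0], dtype=torch.bfloat16)
    print(torch.finfo(torch.float16).max)  # 65504.0
    print(big.to(torch.float16))           # tensor([inf, -inf], dtype=torch.float16)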
466 | 519 | if args.quantization in ["gptq", "int8"]: |
467 | 520 | if rank == 0 and args.verbose > 0: |
468 | 521 | dprint("PARAMS:\n" + "\n".join(f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" for k,v in model.named_parameters())) |
@@ -606,7 +659,9 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length): |
606 | 659 | ids = prompts |
607 | 660 | if isinstance(ids, list) and len(ids) == 1: |
608 | 661 | ids = ids[0].unsqueeze(0) |
609 | | - extra_generation_kwargs = None |
| 662 | + extra_generation_kwargs = {} |
| 663 | + |
| 664 | +extra_generation_kwargs["attn_name"] = attn_name |
610 | 665 |
|
611 | 666 |
|
612 | 667 | def print_result(result, result_idx: int): |
@@ -648,19 +703,15 @@ def infer(use_cache, do_sample, warmup): |
648 | 703 | global extra_generation_kwargs |
649 | 704 | if extra_generation_kwargs is None: |
650 | 705 | extra_generation_kwargs = {} |
651 | | - extra_generation_kwargs["only_last_token"] = args.attention_type != "paged" |
652 | | - |
653 | | - if args.device_type == "cpu": |
654 | | - # Bug in 2.3.1 fixed in 2.4.1 for SDPA flash cpu impl when padding too much |
655 | | - extra_generation_kwargs["attn_algorithm"] = "math" |
| 706 | + extra_generation_kwargs["only_last_token"] = "paged" not in attn_name |
656 | 707 |
|
657 | 708 | if not args.no_early_termination and not warmup: |
658 | 709 | eos_token_id = tokenizer.eos_token_id |
659 | 710 | else: |
660 | 711 | eos_token_id = None |
661 | 712 |
|
662 | 713 | attention_specific_kwargs = {} |
663 | | - if args.attention_type == "sdpa": |
| 714 | + if attn_name == "sdpa_causal": |
664 | 715 | attention_specific_kwargs["contiguous_cache"] = True |
665 | 716 |
|
666 | 717 | result = generate( |
@@ -706,7 +757,8 @@ def infer(use_cache, do_sample, warmup): |
706 | 757 | dprint(f"compilation warmup") |
707 | 758 | pt_compile_model_time = time.time() |
708 | 759 | if args.device_type == "aiu": # only run warmup for AIU, no need for senulator |
709 | | - warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, attn_type=args.attention_type, **extra_generation_kwargs) |
| 760 | + for cache in use_cache: |
| 761 | + warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, **extra_generation_kwargs) |
710 | 762 | aiu_warmup_time = time.time() |
711 | 763 | for sample, cache in itertools.product(do_sample, use_cache): |
712 | 764 | infer(cache, sample, True) |
|