
Commit dff0aa2

Merge pull request #119 from foundation-model-stack/log_version_modified
Include version in validation info outputs
2 parents aca31a9 + b5a5c31

3 files changed (+279 -42 lines)


aiu_fms_testing_utils/testing/validation.py

Lines changed: 101 additions & 2 deletions
```diff
@@ -3,6 +3,7 @@
 
 import torch
 from aiu_fms_testing_utils.utils.aiu_setup import dprint
+from aiu_fms_testing_utils._version import version_tuple
 import os
 
 
@@ -130,8 +131,22 @@ def get_default_validation_prefix(
     seq_length: int,
     dtype: str,
     attn_type: str,
+    aftu_version: str,
 ):
-    return f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}"
+    """
+    Args:
+        model_id (str): model name used
+        max_new_tokens (int): maximum number of new tokens to generate
+        batch_size (int): batch size used
+        seq_length (int): sequence length used
+        dtype (str): data type
+        attn_type (str): type of attention
+        aftu_version (str): introduced in v0.3.0 to track changes in logs
+
+    Returns:
+        str: A prefix that will be prepended to the file name
+    """
+    return f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}.{aftu_version}"
 
 
 def load_validation_information(
```
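
As a quick illustration of the new naming scheme, the sketch below calls the updated helper with hypothetical values (the model id and version string are examples, not values from this commit):

```python
from aiu_fms_testing_utils.testing.validation import get_default_validation_prefix

# Hypothetical inputs, for illustration only
prefix = get_default_validation_prefix(
    model_id="ibm-granite/granite-3.1-8b-instruct",
    max_new_tokens=128,
    batch_size=1,
    seq_length=64,
    dtype="fp16",
    attn_type="sdpa",
    aftu_version="0.3.0",
)
print(prefix)
# ibm-granite--granite-3.1-8b-instruct_max-new-tokens-128_batch-size-1_seq-length-64_dtype-fp16_attn-type-sdpa.0.3.0
```

The only behavioral change is the trailing `.{aftu_version}` component, so validation files written by different releases of aiu-fms-testing-utils no longer collide.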
```diff
@@ -246,7 +261,7 @@ def extract_validation_information(
     **extra_kwargs,
 ):
     attention_specific_kwargs = {}
-    if "paged" in extra_kwargs["attn_name"]:
+    if "paged" in extra_kwargs.get("attn_name", "sdpa"):
         from aiu_fms_testing_utils.utils.paged import generate
     else:
         # TODO: Add a unified generation dependent on attn_type
```
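
The switch to `.get("attn_name", "sdpa")` means callers that never set `attn_name` now fall through to the default generation path instead of raising. A minimal sketch of the difference, assuming an empty kwargs dict:

```python
extra_kwargs = {}

# Before this commit: extra_kwargs["attn_name"] raises KeyError
# After: a missing attn_name defaults to "sdpa", so the paged path is skipped
use_paged_generate = "paged" in extra_kwargs.get("attn_name", "sdpa")
assert use_paged_generate is False
```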
```diff
@@ -388,3 +403,87 @@ def print_failed_cases(failed_cases, aiu_tokens, validation_tokens, tokenizer):
         print(
             f"In sentence {sentence_index + 1}/{len(aiu_tokens)}, token {token_index}, AIU outputs {aiu_token} instead of {validation_token} -- AIU val={aiu_str} -- CPU val={validation_str}"
         )
+
+
+def get_validation_info_path(
+    validation_info_dir: str,
+    model_variant: str,
+    batch_size: int,
+    seq_length: int,
+    max_new_tokens: int,
+    seed: int,
+    attn_type: str,
+    aftu_version: Optional[Tuple[int, int, int]] = None,
+    device_type: str = "cpu",
+    dtype: str = "fp16",
+):
+    if aftu_version is None:
+        aftu_version = version_tuple
+
+    validation_file_name = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]))}.{device_type}_validation_info.{seed}.out"
+    full_path = os.path.join(validation_info_dir, validation_file_name)
+    return full_path
+
+
+def __decrement_version(version: Tuple[int, int, int]):
+    """
+    Step a (major, minor, patch) version down by one component, avoiding a triple-nested loop
+    """
+    major, minor, patch = version
+    if patch > 0:
+        return (major, minor, patch - 1)
+    elif minor > 0:
+        return (major, minor - 1, 0)
+    elif major > 0:
+        return (major - 1, 0, 0)
+    else:
+        return None
+
+
+def find_validation_info_path(
+    validation_info_dir: str,
+    model_variant: str,
+    batch_size: int,
+    seq_length: int,
+    max_new_tokens: int,
+    seed: int,
+    attn_type: str,
+    aftu_version: Optional[Tuple[int, int, int]] = None,
+    version_allow_decrement: bool = False,
+    device_type: str = "cpu",
+    dtype: str = "fp16",
+):
+    """
+    Find the validation info path if it exists, otherwise return None
+    """
+
+    if aftu_version is None:
+        loc_version_tuple = version_tuple[:3]
+    else:
+        loc_version_tuple = aftu_version
+
+    result_path: Optional[str] = None
+
+    while result_path is None and loc_version_tuple is not None:
+        full_path = get_validation_info_path(
+            validation_info_dir,
+            model_variant,
+            batch_size,
+            seq_length,
+            max_new_tokens,
+            seed,
+            attn_type,
+            loc_version_tuple,
+            device_type,
+            dtype,
+        )
+        # if the path is found, we are done searching and can return
+        if os.path.exists(full_path):
+            result_path = full_path
+        # if version decrements are allowed, decrement the version and continue
+        elif version_allow_decrement:
+            loc_version_tuple = __decrement_version(loc_version_tuple)
+        # if the path is not found and decrementing is not allowed, finish with no result
+        else:
+            loc_version_tuple = None
+    return result_path
```
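
To make the fallback order concrete, the sketch below replays the search sequence that `find_validation_info_path` walks when `version_allow_decrement=True`; the starting version is a hypothetical example, and `decrement_version` mirrors the private helper above:

```python
from typing import Optional, Tuple

def decrement_version(version: Tuple[int, int, int]) -> Optional[Tuple[int, int, int]]:
    # Same stepping rule as __decrement_version: patch first, then minor, then major
    major, minor, patch = version
    if patch > 0:
        return (major, minor, patch - 1)
    elif minor > 0:
        return (major, minor - 1, 0)
    elif major > 0:
        return (major - 1, 0, 0)
    return None

version: Optional[Tuple[int, int, int]] = (0, 3, 1)  # hypothetical current version
while version is not None:
    print(version)  # each tuple yields one candidate file path to probe
    version = decrement_version(version)
# (0, 3, 1) -> (0, 3, 0) -> (0, 2, 0) -> (0, 1, 0) -> (0, 0, 0), then the search stops
```

Note that stepping down the minor or major component resets the lower components to zero, so the search probes `x.y.0`-style versions rather than every patch release in between.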

tests/models/test_decoders.py

Lines changed: 32 additions & 40 deletions
```diff
@@ -4,8 +4,6 @@
 from fms.models import get_model
 from fms.utils.generation import pad_input_ids
 import itertools
-import warnings
-import re
 import torch
 from torch import distributed as dist
 from aiu_fms_testing_utils.testing.validation import (
@@ -14,10 +12,11 @@
     GoldenTokenHook,
     capture_level_1_metrics,
     filter_failed_level_1_cases,
-    get_default_validation_prefix,
+    get_validation_info_path,
     load_validation_information,
     validate_level_0,
     top_k_loss_calculator,
+    find_validation_info_path,
 )
 from aiu_fms_testing_utils.utils import (
     warmup_model,
```
```diff
@@ -80,6 +79,8 @@
 }
 ATTN_NAME = attention_map[ATTN_TYPE]
 
+CPU_DTYPE = "fp8" if "fp8" in ATTN_TYPE else "fp32"
+
 FORCE_VALIDATION_LEVEL_1 = (
     os.environ.get("FMS_TEST_SHAPES_FORCE_VALIDATION_LEVEL_1", "0") == "1"
 )
```
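
`CPU_DTYPE` keys the CPU reference dtype off the attention type so that fp8 attention variants are compared against fp8 baselines. A tiny sketch of the selection, with hypothetical attention-type strings:

```python
for attn_type in ("sdpa", "paged", "paged_fp8"):  # hypothetical names, for illustration
    print(attn_type, "->", "fp8" if "fp8" in attn_type else "fp32")
# sdpa -> fp32, paged -> fp32, paged_fp8 -> fp8
```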
```diff
@@ -356,45 +357,26 @@ def __filter_before_eos(metrics, filter_indexes):
     return [item for sublist in filtered_results for item in sublist]
 
 
-def __get_validation_info_full_path(
-    model_path,
-    batch_size,
-    seq_length,
-    max_new_tokens,
-    seed,
-    attn_type: str,
-    device_type="cpu",
-):
-    validation_file_name = f"{get_default_validation_prefix(model_path, max_new_tokens, batch_size, seq_length, 'fp16', attn_type)}.{device_type}_validation_info.{seed}.out"
-    full_path = os.path.join(validation_info_dir, validation_file_name)
-    return full_path
-
-
 def __load_validation_info(
     model_path, batch_size, seq_length, max_new_tokens, tokenizer, seed, attn_type: str
 ):
     # if path doesn't exist and paged isn't in the attention name, remove `attn_type` and recheck again, warn that we will no longer in the future have paths without 'attn_type'
-    full_path = __get_validation_info_full_path(
-        model_path, batch_size, seq_length, max_new_tokens, seed, attn_type
+    full_path = find_validation_info_path(
+        validation_info_dir,
+        model_path,
+        batch_size,
+        seq_length,
+        max_new_tokens,
+        seed,
+        attn_type,
+        version_allow_decrement=True,
+        dtype=CPU_DTYPE,
     )
-
-    if os.path.exists(full_path):
+    if full_path is not None:
         dprint(f"cpu validation info found for seed={seed} -- loading it")
         return load_validation_information(full_path, "logits", batch_size, tokenizer)
-    elif "paged" not in attn_type:
-        # This regex applies to a very specific file name format
-        modified_full_path = re.sub(r"_attn-type[^.]*", "", full_path)
-
-        if os.path.exists(modified_full_path):
-            warnings.warn(
-                f"All future paths should contain attn_type prefix information in path name, please modify {full_path=} to {modified_full_path=}",
-                stacklevel=2,
-            )
-            dprint(f"cpu validation info found for seed={seed} -- loading it")
-            return load_validation_information(
-                modified_full_path, "logits", batch_size, tokenizer
-            )
-    return None
+    else:
+        return None
 
 
 class PersistentModel:
@@ -568,8 +550,15 @@ def test_common_shapes(
 
     if save_validation_info_outputs:
         cpu_validation_info.save(
-            __get_validation_info_full_path(
-                model_path, batch_size, seq_length, max_new_tokens, 0, ATTN_NAME
+            get_validation_info_path(
+                validation_info_dir,
+                model_path,
+                batch_size,
+                seq_length,
+                max_new_tokens,
+                0,
+                ATTN_NAME,
+                dtype=CPU_DTYPE,
             )
         )
         cpu_static_tokens = cpu_validation_info.get_info("tokens")
@@ -654,13 +643,15 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor):
         )
         if save_validation_info_outputs:
             cpu_validation_info.save(
-                __get_validation_info_full_path(
+                get_validation_info_path(
+                    validation_info_dir,
                     model_path,
                     batch_size,
                     seq_length,
                     max_new_tokens,
                     i,
                     ATTN_NAME,
+                    dtype=CPU_DTYPE,
                 )
             )
             cpu_static_tokens = cpu_validation_info.get_info("tokens")
@@ -684,14 +675,15 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor):
         dprint(f"aiu validation info extracted for validation level 1 - iter={i}")
         if save_validation_info_outputs:
             aiu_validation_info.save(
-                __get_validation_info_full_path(
+                get_validation_info_path(
+                    validation_info_dir,
                     model_path,
                     batch_size,
                     seq_length,
                     max_new_tokens,
                     i,
                     ATTN_NAME,
-                    "aiu",
+                    device_type="aiu",
                 )
             )
```
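
Taken together, the call sites now save with `get_validation_info_path` (an exact, version-stamped name) and load with `find_validation_info_path` (a tolerant lookup). A minimal end-to-end sketch with hypothetical directory, model, and shape values:

```python
from aiu_fms_testing_utils.testing.validation import (
    find_validation_info_path,
    get_validation_info_path,
)

# Hypothetical parameters, for illustration only
params = dict(
    validation_info_dir="/tmp/validation_info",
    model_variant="ibm-granite/granite-3.1-8b-instruct",
    batch_size=1,
    seq_length=64,
    max_new_tokens=128,
    seed=0,
    attn_type="sdpa",
    dtype="fp32",
)

# Writing: an exact path stamped with the current aiu-fms-testing-utils version
save_path = get_validation_info_path(**params)

# Reading: accept a baseline written by this or any older version
load_path = find_validation_info_path(version_allow_decrement=True, **params)
if load_path is None:
    print("no saved validation info found; regenerate the CPU baseline")
```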
