Skip to content

Commit 2e9d6d8

Browse files
committed
addressed pr comments; added constant for kv-cache hint
Signed-off-by: Joshua Rosenkranz <jmrosenk@us.ibm.com>
1 parent dc95321 commit 2e9d6d8

File tree

4 files changed

+22
-10
lines changed

4 files changed

+22
-10
lines changed

aiu_fms_testing_utils/utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -468,7 +468,7 @@ def __sample_requests(
468468
return filtered_dataset
469469

470470

471-
def sample_granite_3_3_long_answerable_requests(
471+
def sample_rag_factoid_requests(
472472
dataset_path: str,
473473
num_requests: int,
474474
tokenizer: PreTrainedTokenizerBase,

aiu_fms_testing_utils/utils/paged.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,7 @@ def generate(
416416
if post_iteration_hook is not None:
417417
_logits = logits
418418
_next_val = next_val
419-
# since we cannot handle batch size 1 and mimic with batch size 2, we need to only pass in the first logits/next_val
419+
# since we cannot handle batch size 1 for fp8 and mimic with batch size 2, we need to only pass in the first logits/next_val
420420
if is_fp8 and not is_batch:
421421
_logits = logits[0].unsqueeze(0)
422422
_next_val = _next_val[0].unsqueeze(0)
@@ -464,6 +464,11 @@ def generate(
464464
return result
465465

466466

467+
# this value defaults to 2080 to be consistent with vLLM for granite 3.3 8b instruct
468+
KVCACHE_NUM_BLOCKS_HINT = int(
469+
os.environ.get("AFTU_PAGED_KVCACHE_NUM_BLOCKS_HINT", 2080)
470+
)
471+
467472
VLLM_DT_MAX_BATCH_TKV_LIMIT = int(os.environ.get("VLLM_DT_MAX_BATCH_TKV_LIMIT", 131072))
468473

469474

scripts/drive_paged_programs.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,17 @@
2424
top_k_loss_calculator,
2525
)
2626
from aiu_fms_testing_utils.utils import (
27-
sample_granite_3_3_long_answerable_requests,
27+
sample_rag_factoid_requests,
2828
sample_sharegpt_requests,
2929
stagger_region,
3030
warmup_model,
3131
)
3232
from aiu_fms_testing_utils.utils.aiu_setup import aiu_dist_setup, dprint, local_rank
33-
from aiu_fms_testing_utils.utils.paged import ProgramCriteria, get_programs_prompts
33+
from aiu_fms_testing_utils.utils.paged import (
34+
ProgramCriteria,
35+
get_programs_prompts,
36+
KVCACHE_NUM_BLOCKS_HINT,
37+
)
3438

3539
parser = argparse.ArgumentParser(
3640
description="Script which will drive paged programs for debugging"
@@ -167,7 +171,7 @@
167171
save_validation_info_outputs = args.save_validation_info_outputs
168172

169173
if args.dataset_type == "rag_factoid":
170-
sampler = sample_granite_3_3_long_answerable_requests
174+
sampler = sample_rag_factoid_requests
171175
allow_truncation = False
172176
elif args.dataset_type == "sharegpt":
173177
sampler = sample_sharegpt_requests
@@ -335,7 +339,7 @@ def __load_validation_info(
335339
and USE_DISTRIBUTED
336340
and dist.get_world_size() == 4
337341
):
338-
extra_kwargs["_kvcache_num_blocks_hint"] = 2080
342+
extra_kwargs["_kvcache_num_blocks_hint"] = KVCACHE_NUM_BLOCKS_HINT
339343
warmup_model(
340344
model,
341345
input_ids,
@@ -347,6 +351,7 @@ def __load_validation_info(
347351

348352
if USE_DISTRIBUTED:
349353
# wait for rank0 to be finished as it is the only one generating the criteria json
354+
# this is needed since otherwise we may run into a race condition
350355
torch.distributed.barrier()
351356

352357
with open(args.program_criteria_json_path, "r") as f:
@@ -434,7 +439,8 @@ def __metric_calculator(r: torch.Tensor, t: torch.Tensor):
434439

435440

436441
failed_cases = []
437-
for program_id, valid_prompt in valid_prompts: # for each program
442+
# for each program and valid prompt (batch size, sequence length)
443+
for program_id, valid_prompt in valid_prompts:
438444
input_ids, extra_kwargs = __prepare_inputs(
439445
valid_prompt[0], valid_prompt[1], tokenizer, enforce_sizes=[valid_prompt[1]]
440446
)
@@ -444,7 +450,7 @@ def __metric_calculator(r: torch.Tensor, t: torch.Tensor):
444450
and USE_DISTRIBUTED
445451
and dist.get_world_size() == 4
446452
):
447-
extra_kwargs["_kvcache_num_blocks_hint"] = 2080
453+
extra_kwargs["_kvcache_num_blocks_hint"] = KVCACHE_NUM_BLOCKS_HINT
448454

449455
if local_rank == 0:
450456
dprint(f"*** testing program {program_id} ***")

tests/models/test_decoders.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
warmup_model,
2525
sample_sharegpt_requests,
2626
)
27+
from aiu_fms_testing_utils.utils.paged import KVCACHE_NUM_BLOCKS_HINT
2728
import json
2829
from transformers import AutoTokenizer
2930

@@ -538,7 +539,7 @@ def test_common_shapes(
538539
and USE_DISTRIBUTED
539540
and dist.get_world_size() == 4
540541
):
541-
extra_kwargs["_kvcache_num_blocks_hint"] = 2080
542+
extra_kwargs["_kvcache_num_blocks_hint"] = KVCACHE_NUM_BLOCKS_HINT
542543

543544
# warmup aiu model
544545
warmup_model(
@@ -637,7 +638,7 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor):
637638
and USE_DISTRIBUTED
638639
and dist.get_world_size() == 4
639640
):
640-
extra_kwargs["_kvcache_num_blocks_hint"] = 2080
641+
extra_kwargs["_kvcache_num_blocks_hint"] = KVCACHE_NUM_BLOCKS_HINT
641642

642643
cpu_validation_info = __load_validation_info(
643644
model_path,

0 commit comments

Comments (0)