
Commit ab4f7a2

Merge pull request #135 from foundation-model-stack/fix_inference_script_static_graphs_show_more_symbols_than_expected
fixed inference.py for batch size 1 symbolic sdpa
2 parents ea529c5 + 9f6bd71

File tree

2 files changed: +39 -12 lines changed

scripts/inference.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -588,7 +588,7 @@ def select_int8_module(
 
 if args.compile:
     dprint("compiling model")
-    fx_config.backed_size_oblivious = True
+    fx_config.backed_size_oblivious = "paged" in attn_name
     if is_aiu_backend:
         model.compile(
             backend="sendnn", options={"sendnn.dynamic": args.compile_dynamic_sendnn}
```

tests/models/test_scripts.py

Lines changed: 38 additions & 11 deletions
```diff
@@ -36,11 +36,6 @@
 
 
 def execute_script(execute_cmd):
-    # using these options temporarily
-    current_env["VLLM_DT_MAX_BATCH_TKV_LIMIT"] = "16384"
-    current_env["VLLM_DT_MAX_BATCH_SIZE"] = "4"
-    current_env["VLLM_DT_MAX_CONTEXT_LEN"] = "4096"
-
     with Popen(
         execute_cmd,
         stdin=PIPE,
@@ -56,11 +51,25 @@ def execute_script(execute_cmd):
         raise Exception(error)
 
 
-def execute_inference(model_path, batch_size, seq_length, max_new_tokens, attn_type):
+def execute_inference(
+    model_path, batch_size, seq_length, max_new_tokens, attn_type, allow_symbolic_shapes
+):
     extra_args = []
     if attn_type == "paged":
-        extra_args.append("--compile_dynamic_sendnn")
+        # paged needs symbolic shapes
         extra_args.append("--attention_type=paged")
+        # using these options temporarily
+        current_env.setdefault("VLLM_DT_MAX_BATCH_TKV_LIMIT", "16384")
+        current_env.setdefault("VLLM_DT_MAX_BATCH_SIZE", "4")
+        current_env.setdefault("VLLM_DT_MAX_CONTEXT_LEN", "4096")
+    else:
+        # added in case symbolic shapes used with sdpa
+        current_env.setdefault("_PROMPT_LEN", "64")
+        current_env.setdefault("_MAX_DECODE_TOKENS", "8")
+        current_env.setdefault("_MAX_CONTEXT_LEN", "71")
+
+    if allow_symbolic_shapes is not None and allow_symbolic_shapes:
+        extra_args.append("--compile_dynamic_sendnn")
 
     execute_cmd = [
         "python3",
```
```diff
@@ -97,20 +106,38 @@ def __repeat_batch_asserts(bs: int) -> list[str]:
 # add the asserts based on batch size
 # for batches greater than common_asserts, repeat common_asserts since this follows inference behavior
 common_inference_params = [
-    common_param + (__repeat_batch_asserts(common_param[1]),)
+    common_param + (__repeat_batch_asserts(common_param[1]), None)
     for common_param in common_params
 ]
+# adding special case where we allow symbolic shapes for batch size 1 using sdpa
+common_inference_params.append(
+    (common_model_paths[0], 1, 64, 8, "sdpa", [common_asserts[0]], True)
+)
 
 
 @pytest.mark.parametrize(
-    "model_path,batch_size,seq_length,max_new_tokens,attn_type,asserts",
+    "model_path,batch_size,seq_length,max_new_tokens,attn_type,asserts,allow_symbolic_shapes",
     common_inference_params,
 )
 def test_inference_script(
-    model_path, batch_size, seq_length, max_new_tokens, attn_type, asserts
+    model_path,
+    batch_size,
+    seq_length,
+    max_new_tokens,
+    attn_type,
+    asserts,
+    allow_symbolic_shapes,
 ):
+    # force symbolic shapes if paged
+    if "paged" in attn_type:
+        allow_symbolic_shapes = True
     result_text = execute_inference(
-        model_path, batch_size, seq_length, max_new_tokens, attn_type
+        model_path,
+        batch_size,
+        seq_length,
+        max_new_tokens,
+        attn_type,
+        allow_symbolic_shapes,
     )
 
     for common_assert in asserts:
```
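Putting the two pieces together: the test forces `allow_symbolic_shapes` to `True` whenever the attention type is paged, and `execute_inference` turns that into the `--compile_dynamic_sendnn` flag, while the `None` appended to the common parameter tuples leaves sdpa static by default; only the new special-case tuple opts a batch-size-1 sdpa run into symbolic shapes. A condensed sketch of the combined decision (the helper name is hypothetical; the flag strings come from the diff):

```python
# Hypothetical helper condensing the flag logic that the diff splits
# between test_inference_script and execute_inference.
def build_extra_args(attn_type: str, allow_symbolic_shapes) -> list[str]:
    extra_args = []
    if attn_type == "paged":
        extra_args.append("--attention_type=paged")
        allow_symbolic_shapes = True  # paged always needs symbolic shapes
    if allow_symbolic_shapes is not None and allow_symbolic_shapes:
        extra_args.append("--compile_dynamic_sendnn")
    return extra_args


assert build_extra_args("sdpa", None) == []  # default: static sdpa graph
assert build_extra_args("sdpa", True) == ["--compile_dynamic_sendnn"]  # new special case
assert build_extra_args("paged", None) == [
    "--attention_type=paged",
    "--compile_dynamic_sendnn",
]
```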
