
Commit 47227f4

remi-or and McPatate authored
Add prefix sharing to continuous batching (#42094)
* Fix a bug in the CB memory calculation
* Nit in example
* Replace _free_blocks with a proper object BlockManager
* Removed dead code
* Added hashing mechanism (wip)
* Added de-duplication
* Add de-initialization mechanism
* Add prefix detection
* Ensure we always keep 1 token for decode start
* Removed some todos and small fix
* Update src/transformers/generation/continuous_batching/cache.py
  Co-authored-by: Luc Georges <McPatate@users.noreply.github.com>
* Update src/transformers/generation/continuous_batching/continuous_api.py
  Co-authored-by: Luc Georges <McPatate@users.noreply.github.com>
* Docs
* Review comments
* Style
* Added a flag to allow prefix sharing
* [IMPORTANT] bug fix for prefix length memoization
* Added a test for CB prefix sharing
* Example, start of refactor
* End of refactor for example script
* Added a do-sample arg
* Added reporting on prefix sharing
* Added a context manager option for CB manager
* Nit and style
* Review comment from ArthurZucker

---------

Co-authored-by: Luc Georges <McPatate@users.noreply.github.com>
1 parent 7f9f4d9 commit 47227f4
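The commit message above sketches the mechanism: the KV cache is split into fixed-size blocks managed by a BlockManager, each block is identified by a hash of the whole prefix it covers, identical blocks are de-duplicated so requests that share a prompt prefix reuse the same physical memory, and at least one token is always kept private so every request has a block to start decoding into. The code below is a minimal conceptual sketch of that idea, assuming a fixed block size and cumulative prefix hashes; SimpleBlockManager, BLOCK_SIZE and allocate_prompt are illustrative names, not the API added in src/transformers/generation/continuous_batching/cache.py.

# Conceptual sketch of prefix sharing via block hashing and de-duplication.
# NOT the transformers BlockManager; names and details are illustrative only.
from dataclasses import dataclass, field

BLOCK_SIZE = 16  # tokens per KV-cache block (illustrative value)


@dataclass
class SimpleBlockManager:
    hash_to_block: dict[int, int] = field(default_factory=dict)  # cumulative prefix hash -> physical block id
    ref_counts: dict[int, int] = field(default_factory=dict)  # physical block id -> number of requests using it
    next_block_id: int = 0

    def allocate_prompt(self, prompt_ids: list[int]) -> list[int]:
        """Return the physical blocks backing a prompt, reusing any full block whose
        cumulative prefix hash has already been seen (i.e. an identical shared prefix)."""
        blocks: list[int] = []
        prefix_hash = hash(())
        shareable = len(prompt_ids) - 1  # always keep at least 1 token un-shared for decode start
        for start in range(0, len(prompt_ids), BLOCK_SIZE):
            chunk = tuple(prompt_ids[start : start + BLOCK_SIZE])
            prefix_hash = hash((prefix_hash, chunk))  # hash depends on the whole prefix, not just this chunk
            sharable_block = len(chunk) == BLOCK_SIZE and start + BLOCK_SIZE <= shareable
            if sharable_block and prefix_hash in self.hash_to_block:
                block_id = self.hash_to_block[prefix_hash]  # de-duplication: reuse the existing block
            else:
                block_id = self.next_block_id
                self.next_block_id += 1
                if sharable_block:
                    self.hash_to_block[prefix_hash] = block_id
            self.ref_counts[block_id] = self.ref_counts.get(block_id, 0) + 1
            blocks.append(block_id)
        return blocks


manager = SimpleBlockManager()
shared_prefix = list(range(40))  # two requests that share a 40-token prefix
blocks_a = manager.allocate_prompt(shared_prefix + [100, 101, 102])
blocks_b = manager.allocate_prompt(shared_prefix + [200, 201])
print(blocks_a, blocks_b)  # the first two blocks are reused; only the tails get private blocks

Hashing the cumulative prefix rather than each chunk in isolation is what makes reuse safe: a block is shared only when every token before it matches as well.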

File tree

7 files changed: +570 -231 lines changed


examples/pytorch/continuous_batching.py

Lines changed: 95 additions & 81 deletions
@@ -17,6 +17,7 @@
 import json
 import os
 import time
+from itertools import cycle
 from typing import Optional
 
 import datasets
@@ -29,42 +30,32 @@
 from transformers.generation.continuous_batching.requests import logger
 
 
-# MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"
-SLIDING_WINDOW = 0
-MODEL_ID = "google/gemma-2-2b-it" if SLIDING_WINDOW > 0 else "meta-llama/Meta-Llama-3-8B"
-FORCE_MAX_LENGTH = False  # should be False unless you are debugging sliding window features
-SKIP_SPECIAL_TOKENS = False
-
-
-def generate_simple(
-    attn_impl: str, simple_batch_inputs: list[int], generation_config: GenerationConfig
+def generate_without_cb(
+    model_id: str, sliding_window: int, attn_impl: str, batched_inputs: list[int], generation_config: GenerationConfig
 ) -> dict[str, str]:
-    attn_impl = {
-        "sdpa": "sdpa",
-        "eager": "eager",
-        "paged_attention": "eager",  # TODO: this does not work on AMD docker
-        "flash_paged": "flash_attention_2",  # TODO: this does not work on AMD docker
-        "kernels-community/flash-attn": "eager",
-    }[attn_impl]
-
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype=torch.bfloat16, attn_implementation=attn_impl)
+    # Setup model and tokenizer
+    model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, attn_implementation=attn_impl)
     model = model.cuda().eval()
-    if getattr(model.config, "sliding_window", None) is not None:
-        model.config.sliding_window = SLIDING_WINDOW
-
+    if sliding_window > 0 and getattr(model.config, "sliding_window", None) is not None:
+        model.config.sliding_window = sliding_window
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    # Generate one by one
     decoded_outputs = {}
-    for input_ids in tqdm(simple_batch_inputs, desc="Generating outputs without CB"):
+    for input_ids in tqdm(batched_inputs, desc="Generating outputs without CB"):
         key = " ".join(map(str, input_ids))  # This will be used to identify the output after batched generation
         input_ids = torch.tensor([input_ids]).to("cuda")
-        # attention_mask = torch.ones_like(input_ids)
-        outputs = model.generate(input_ids, generation_config=generation_config, use_model_defaults=False)
+        attention_mask = torch.ones_like(input_ids)
+        outputs = model.generate(
+            input_ids, attention_mask=attention_mask, generation_config=generation_config, use_model_defaults=False
+        )
         generated_tokens = outputs[0][input_ids.shape[1] :]
-        decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=SKIP_SPECIAL_TOKENS)
-        decoded_outputs[key] = decoded_output
+        decoded_outputs[key] = tokenizer.decode(generated_tokens, skip_special_tokens=False)
     return decoded_outputs
 
 
-def setup_metrics():
+def maybe_setup_metrics(use_metrics: bool) -> None:
+    if not use_metrics:
+        return
     try:
         from opentelemetry import metrics, trace
         from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
@@ -119,16 +110,14 @@ def batch_generate(
     token_count = 0
     data = []
     for i, request in enumerate(batch_outputs):
-        input_text = tokenizer.decode(batch_outputs[request].prompt_ids, skip_special_tokens=SKIP_SPECIAL_TOKENS)
+        input_text = tokenizer.decode(batch_outputs[request].prompt_ids, skip_special_tokens=False)
         # The key is used to tie back to the output of unbatched generation
         key = " ".join(map(str, batch_outputs[request].prompt_ids))
         data.append({"input": input_text, "key": key})
 
         # Try to decode the output
         try:
-            output_text = tokenizer.decode(
-                batch_outputs[request].generated_tokens, skip_special_tokens=SKIP_SPECIAL_TOKENS
-            )
+            output_text = tokenizer.decode(batch_outputs[request].generated_tokens, skip_special_tokens=False)
             token_count += len(batch_outputs[request].generated_tokens[1:])
             data[-1]["cb_outputs"] = output_text
         except Exception as e:
@@ -138,14 +127,7 @@ def batch_generate(
 
         # Display sample if asked
         if i < displayed_samples:
-            if len(output_text) > 0:
-                print("-" * 20)
-                print(f"{request} Input: {input_text}")
-                print(f"{request} Output: {output_text}")
-            else:
-                print(f"{request} Input: {input_text}")
-                print("[WARN]")
-                print(f"{request} Output was empty!")
+            print("-" * 20, f"{request} Input: {input_text}", f"{request} Output: {output_text}", sep="\n")
 
         # Compare with classic generate if asked
         if expected_outputs is not None:
@@ -182,83 +164,115 @@ def batch_generate(
 
 
 if __name__ == "__main__":
-    # Parse args
     parser = argparse.ArgumentParser()
+
+    # Continuous batching parameters
     parser.add_argument("--num-blocks", "-n", type=int, default=None)
     parser.add_argument("--max-batch-tokens", "-b", type=int, default=None)
 
+    # Model parameters
+    parser.add_argument("--sliding-window", type=int, default=0)
     parser.add_argument("--attn", type=str, default="kernels-community/flash-attn", help="Attention implementation")
+
+    # Performance parameters
     parser.add_argument("--matmul-precision", "-mp", type=str, default="high")  # set to "none" to disable
     parser.add_argument("--cuda-graph", "-cg", help="Use cuda graphs", type=str, default=None)
     parser.add_argument("--compile", action="store_true", help="Compile the model using torch.compile")
+    parser.add_argument("--do-sample", action="store_true", help="Activate sampling")
 
+    # Benchmark parameters
     parser.add_argument("--samples", type=int, default=500, help="Number of samples to generate")
+    parser.add_argument("--add-prefix", action="store_true", help="Add a prefix to the samples")
+    parser.add_argument("--compare", action="store_true", help="Compare CB generation with classic generate")
+    parser.add_argument("--profile", type=str, default=None)
+    parser.add_argument("--metrics", action="store_true")
+    parser.add_argument("--force-max-length", action="store_true", help="Force generation to stop at max length")
+
+    # Display parameters
     parser.add_argument("--displayed", type=int, default=0, help="Number of samples to display")
     parser.add_argument("--log-level", type=str, default="INFO")
     parser.add_argument("--output-file", type=str, default=None)
-    parser.add_argument("--compare", action="store_true")
-    parser.add_argument("--metrics", action="store_true")
-    parser.add_argument("--profile", type=str, default=None)
+
     args = parser.parse_args()
 
-    # Set log level
-    logger.setLevel(args.log_level.upper())
+    # Create model
+    model_id = "google/gemma-2-2b-it" if args.sliding_window > 0 else "meta-llama/Llama-3.1-8B-Instruct"
+    has_system_role = args.sliding_window == 0
+
+    model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation=args.attn, dtype=torch.bfloat16)
+    model = model.cuda().eval()
 
-    # If turned on, we setup metrics
-    if args.metrics:
-        setup_metrics()
+    if args.sliding_window > 0 and getattr(model.config, "sliding_window", None) is not None:
+        print(f"Setting sliding window from {model.config.sliding_window} to {args.sliding_window}")
+        model.config.sliding_window = args.sliding_window
 
-    # Set matmul precision if not none
+    # Set up diagnostics
+    logger.setLevel(args.log_level.upper())
+    maybe_setup_metrics(args.metrics)
+
+    # Set up performance
     if args.matmul_precision != "none":
         torch.set_float32_matmul_precision(args.matmul_precision)
-    # Parse cuda graph argument
-    if args.cuda_graph is not None:
-        use_cuda_graph = {
-            "none": None,
-            "yes": True, "y": True, "true": True, "t": True, "1": True,
-            "no": False, "n": False, "false": False, "f": False, "0": False,
-        }[args.cuda_graph.lower()]  # fmt: skip
-    else:
-        use_cuda_graph = None
 
-    # Prepare model
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        attn_implementation=args.attn,
-        dtype=torch.bfloat16,
-    )
-    model = model.cuda().eval()
-    if getattr(model.config, "sliding_window", None) is not None:
-        print(f"Setting sliding window from {model.config.sliding_window} to {SLIDING_WINDOW}")
-        model.config.sliding_window = SLIDING_WINDOW
+    cuda_graph_arg = args.cuda_graph.lower() if args.cuda_graph is not None else None
+    use_cuda_graph = {
+        "none": None, None: None,
+        "yes": True, "y": True, "true": True, "t": True, "1": True,
+        "no": False, "n": False, "false": False, "f": False, "0": False,
+    }[cuda_graph_arg]  # fmt: skip
 
-    # If turned on, we compile the model
     if args.compile:
         model.forward = torch.compile(model.forward, mode="max-autotune-no-cudagraphs")
 
     # Prepare tokenizer and dataset
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
+    tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
 
     dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
     dataset = dataset.select(range(args.samples))
 
-    simple_batch_inputs = [tokenizer(item["question"])["input_ids"] for item in dataset]
+    if args.add_prefix:
+        possible_prefixes = [
+            None,
+            "You are a bot that solves math problems.",
+            "You are a bot who solves math problems. Try to make your answer clear and understandable, and include your stages of reasoning.",
+            "You are a bot with the aim to solves math problems. Try to make your answer clear and understandable, and include your stages of reasoning. No loud words or emojis, all responses must be readable by a child. Here is now the problem:",
+        ]  # fmt: skip
+    else:
+        possible_prefixes = [None]
+
+    batched_inputs = []
+    for item, prefix in zip(dataset, cycle(possible_prefixes)):
+        messages = []
+        question = item["question"]
+        if prefix is not None:
+            if has_system_role:
+                messages.append({"role": "system", "content": prefix})
+            else:
+                question = prefix + "\n\n" + question
+        messages.append({"role": "user", "content": question})
+        inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+        batched_inputs.append(inputs["input_ids"])
 
     # Prepare generation config
-    generation_config = GenerationConfig(
+    generation_cfg = GenerationConfig(
         max_new_tokens=512,
         use_cuda_graph=use_cuda_graph,
-        eos_token_id=tokenizer.pad_token_id if FORCE_MAX_LENGTH else tokenizer.eos_token_id,
+        eos_token_id=tokenizer.pad_token_id if args.force_max_length else tokenizer.eos_token_id,
         pad_token_id=tokenizer.pad_token_id,
-        do_sample=not args.compare,
+        do_sample=args.do_sample,
         temperature=0.8,
         top_p=0.9,
         num_blocks=args.num_blocks,
         max_batch_tokens=args.max_batch_tokens,
     )
 
     # If we need to compare, we need to generate the reference outputs
-    expected_outputs = generate_simple(args.attn, simple_batch_inputs, generation_config) if args.compare else None
+    if args.compare:
+        expected_outputs = generate_without_cb(
+            model_id, args.sliding_window, args.attn, batched_inputs, generation_cfg
+        )
+    else:
+        expected_outputs = None
 
     # If no output file is provided, we pick a name based on the args
     if args.output_file is None:
@@ -271,8 +285,8 @@ def batch_generate(
     # Run warmup batch generation  # TODO: understand why warmup incurs a large overhead during cache creation
     batch_generate(
         model,
-        simple_batch_inputs[: min(5, args.samples)],
-        generation_config,
+        batched_inputs[: min(5, args.samples)],
+        generation_cfg,
         tokenizer,
         displayed_samples=-1,
     )
@@ -285,8 +299,8 @@ def batch_generate(
    # Run batch generation
     gen_time, tok_per_sec = batch_generate(
         model,
-        simple_batch_inputs,
-        generation_config,
+        batched_inputs,
+        generation_cfg,
         tokenizer,
         displayed_samples=args.displayed,
         output_file=args.output_file,
@@ -297,5 +311,5 @@ def batch_generate(
         prof.export_chrome_trace(filename)
 
 # Example usage:
-# python examples/pytorch/continuous_batching.py --attn sdpa_paged -mp none --samples 3 --compare
-# python examples/pytorch/continuous_batching.py --num-blocks 369 --max-batch-tokens 23 --attn sdpa_paged -mp none --samples 1 --displayed 0 --output-file sliced.json
+# python examples/pytorch/continuous_batching.py --attn sdpa --add-prefix --samples 10 --compare
+# python examples/pytorch/continuous_batching.py --attn flash_attention_2 -mp none --add-prefix --samples 500
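The example commands above pair with the commit's new reporting on prefix sharing. As a rough offline estimate of how much the --add-prefix prompts can benefit, one can count how many block-aligned prompt prefixes repeat across the batch. The helper below is a hypothetical sketch, not part of the script; shared_prefix_tokens and block_size are made-up names, and it assumes block-aligned sharing with the final token of each prompt kept private, mirroring the behaviour described in the commit message.

# Hypothetical helper: estimate how many prompt tokens could be served from shared KV-cache blocks.
from collections.abc import Sequence


def shared_prefix_tokens(batched_inputs: Sequence[Sequence[int]], block_size: int = 16) -> int:
    """Count prompt tokens whose block-aligned prefix was already seen in an earlier prompt."""
    seen: set[tuple[int, ...]] = set()
    shared = 0
    for ids in batched_inputs:
        # Only full blocks that do not contain the final token are candidates for sharing.
        shareable_len = ((len(ids) - 1) // block_size) * block_size
        for end in range(block_size, shareable_len + 1, block_size):
            prefix = tuple(ids[:end])  # cumulative prefix up to this block boundary
            if prefix in seen:
                shared += block_size
            else:
                seen.add(prefix)
    return shared


# Example: report the estimate after building `batched_inputs` as in the script above.
# print(f"~{shared_prefix_tokens(batched_inputs)} prompt tokens could be served from shared blocks")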
