Add configurable sample requests to prepare inputs

flaviabeo · flaviabeo · commit d2e3d98d5e32 · 2025-07-11T13:44:50.000-03:00
Signed-off-by: Flavia Beo <flavia.beo@ibm.com>
diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py
@@ -168,15 +168,39 @@ def sample_squad_v2_qa_requests(
         seed,
     )
 
-def prepare_inputs(batch_size, seq_length, tokenizer, sharegpt_path, seed=0):
-    prompts_and_sizes = sample_sharegpt_requests(
-        sharegpt_path,
-        batch_size,
-        tokenizer,
-        int(seq_length / 2),
-        seq_length,
-        seed,
-    )
+def prepare_inputs(batch_size, seq_length, tokenizer, ds_path, seed=0, ds_type="sharegpt"):
+    """
+    Prepare input IDs and padding kwargs for a batch of questions.
+
+    Args:
+        batch_size (int): The number of questions in the batch.
+        seq_length (int): The maximum length of the input sequence.
+        tokenizer (Tokenizer): A tokenizer object to tokenize the questions.
+        ds_path (str): The path to the dataset file.
+        seed (int, optional): The random seed for reproducibility. Defaults to 0.
+        ds_type (str, optional): The type of dataset to use. Can be "sharegpt" or any other supported dataset type. Defaults to "sharegpt".
+
+    Returns:
+        tuple: A tuple containing the input IDs and padding kwargs.
+    """
+    if not "sharegpt" in ds_type:
+        prompts_and_sizes = sample_squad_v2_qa_requests(
+            ds_path, 
+            batch_size, 
+            tokenizer, 
+            int(seq_length / 2), 
+            seq_length, 
+            seed,
+        )
+    else:
+        prompts_and_sizes = sample_sharegpt_requests(
+            ds_path,
+            batch_size,
+            tokenizer,
+            int(seq_length / 2),
+            seq_length,
+            seed,
+        )
     prompt_list = []
     for prompt, _ in prompts_and_sizes:
         prompt_list.append(ids_for_prompt(prompt, tokenizer))