Merge pull request #121 from kcirred/sampling_mod

JRosenkranz · web-flow · commit 178bc894c546 · 2025-09-09T13:01:18.000-04:00
Modify enforce_sizes sampling to start accepting other prompts before all enforced sizes have been found, as long as enough space remains reserved for them
diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py
@@ -378,7 +378,10 @@ def __sample_requests(
         random.Random(seed).shuffle(dataset)
 
     for prompt, prompt_len in dataset:
-        if len(filtered_dataset) == num_requests and not enforce_sizes:
+        if (
+            len(filtered_dataset) + len(enforced_dataset) == num_requests
+            and not enforce_sizes
+        ):
             break
 
         # NOTE: This section is for enforce heterogeneous, does not work with enforce_sizes
@@ -387,7 +390,6 @@ def __sample_requests(
             and max_heterogeneous_combinations > len(filtered_dataset)
             and len(filtered_dataset) < num_requests
         ):
-            # for _, size in filtered_dataset:
             current_padded_size = pad_size_dict[prompt_len]
 
             if current_padded_size not in seen_sizes:
@@ -403,6 +405,7 @@ def __sample_requests(
             # NOTE: this should not be `elif` despite enforce_sizes and enforce_sizes_with_truncation
             # are mutually exclusive because we allow same prompt to be used in enforce_sizes_with_truncation
             # even if it is taken from enforce_sizes
+            truncation_found = None
             if enforce_sizes_with_truncation:
                 truncation_found: Tuple[int, int] = next(
                     (
@@ -422,6 +425,16 @@ def __sample_requests(
                     )
                     enforced_dataset.append((truncated_prompt, truncate_to_size))
                     enforce_sizes_with_truncation.remove(truncation_found)
+            # Accept other prompts early, but only while enough slots stay reserved
+            # for the sizes that still have to be enforced (with or without truncation).
+            if (
+                not truncation_found
+                and current_padded_size not in enforce_sizes
+                and len(filtered_dataset) + len(enforced_dataset)
+                < num_requests
+                - (len(enforce_sizes) + len(enforce_sizes_with_truncation))
+            ):
+                filtered_dataset.append((prompt, prompt_len))
 
         # when not enforcing heterogeneous or when exhausted all possible prompt_lengths
         else:
@@ -441,10 +454,12 @@ def __sample_requests(
         print(
             f"There may be prompt size repeats because {num_requests=} while {max_heterogeneous_combinations=}"
         )
-    if enforced_dataset:
+    if enforced_dataset and enforce_heterogeneous:
         filtered_dataset = _merge_enforce_keep_heterogeneous(
             enforced_dataset, filtered_dataset, num_requests
         )
+    elif enforced_dataset:
+        filtered_dataset = enforced_dataset + filtered_dataset
 
     if len(filtered_dataset) != num_requests:
         warnings.warn("Returning dataset not equal to number requested", stacklevel=2)
diff --git a/tests/utils/test_sampling.py b/tests/utils/test_sampling.py
@@ -13,6 +13,7 @@
 
 BATCH_SIZES = [0, 1, 2, 3, 4, 8]
 ENFORCE_HETEROGENEOUS = [True, False]
+LEN_ENFORCE_SIZES = [0, 1, 2, 3, 4]
 TRUNCATION = [True, False]
 ENFORCE_TRUNCATION_SIZE = [
     [],
@@ -58,6 +59,39 @@
 TOKENIZER = AutoTokenizer.from_pretrained("ibm-granite/granite-3.3-8b-instruct")
 
 
+def _replace_begin_mid_end(
+    prompt_list: list[str], target_count: int = 1, target_length: int = 128
+):
+    """Returns three variants of the list, each with one run of slots replaced:
+        - First `target_count` slots
+        - Middle `target_count` slots
+        - Last `target_count` slots
+
+    Args:
+        prompt_list (list[str]): a list of dummy strings.
+        target_count (int, optional): how many slots to replace. Defaults to 1.
+        target_length (int, optional): "enforce" repeats per string. Defaults to 128.
+    """
+
+    replacement_block = ["enforce" * target_length] * target_count
+
+    if target_count >= 1:
+        beginning = replacement_block + prompt_list[target_count:]
+        mid = len(prompt_list) // 2
+        pointer = max(0, mid - target_count // 2)
+        middle = (
+            prompt_list[:pointer]
+            + replacement_block
+            + prompt_list[pointer + target_count :]
+        )
+        end = prompt_list[:-target_count] + replacement_block
+    else:
+        beginning = prompt_list
+        middle = prompt_list
+        end = prompt_list
+    return (beginning, middle, end)
+
+
 def _prepare_sub_sharegpt_dataset(prompt_length_min, prompt_length_max, tokenizer):
     dataset_path = os.environ.get(
         "SHARE_GPT_DATASET_PATH", os.path.expanduser("~/share_gpt.json")
@@ -247,3 +281,64 @@ def test_get_truncation(enforce_truncation_size, available_sizes):
         assert "size_to_enforce" in f"{e}"
     except Exception as e:
         pytest.fail(f"Unexpeced exception: {e}")
+
+
+ENFORCE_SIZES_COMBO = list(product(BATCH_SIZES, LEN_ENFORCE_SIZES))
+
+
+@pytest.mark.parametrize("batch_size, target_count", ENFORCE_SIZES_COMBO)
+def test_enforce_sizes(batch_size, target_count):
+    print(f"{batch_size=}, {target_count=}")
+    base_text = "base"
+    basic_seq_len = 64
+    prompt_list = [base_text * basic_seq_len] * batch_size
+    enforce_len = 128
+    list_of_prompt_list = _replace_begin_mid_end(prompt_list, target_count, enforce_len)
+    print(list_of_prompt_list)
+    reference = None
+    for prompt_list in list_of_prompt_list:
+        try:
+            prompts_and_sizes = __sample_requests(
+                prompt_list,
+                batch_size,
+                TOKENIZER,
+                32,
+                enforce_len,
+                None,
+                False,
+                [enforce_len] * target_count,
+                False,
+            )
+        except ValueError as e:
+            assert "is smaller than" in f"{e}"
+            continue
+
+        # For this test case the number of returned prompts should equal batch_size
+        assert len(prompts_and_sizes) == batch_size
+        if reference is None:
+            reference = prompts_and_sizes.copy()
+        # all three placements should yield the same result (without seed it should be sorted)
+        assert prompts_and_sizes == reference
+        num_found = 0
+        for _, sizes in prompts_and_sizes:
+            if sizes == 128:
+                num_found += 1
+        # Verify that all inserted enforceable_sizes are found
+        assert num_found == target_count
+
+        try:
+            half_batch_prompts_and_sizes = __sample_requests(
+                prompt_list,
+                batch_size // 2,
+                TOKENIZER,
+                32,
+                enforce_len,
+                None,
+                False,
+                [enforce_len] * target_count,
+                False,
+            )
+        except ValueError as e:
+            assert "is smaller than" in f"{e}"
+            continue
+        assert len(half_batch_prompts_and_sizes) == batch_size // 2