
Commit b774201

avery-blanchard authored and alex-jw-brooks committed
Add test case for caching
Signed-off-by: Avery Blanchard <avery.blanchard@ibm.com>
1 parent bd1090e commit b774201


tests/models/test_decoders.py

Lines changed: 57 additions & 2 deletions
@@ -175,6 +175,8 @@
 )
 os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(max(common_batch_sizes), 2))
 
+cache_params = list(itertools.product([common_model_paths[0]], [common_batch_sizes[0]], [common_seq_lengths[0]], [common_max_new_tokens[0]], ["miss", "hit"]))
+
 # thresholds are chosen based on 1024 tokens per sequence
 # 1% error threshold rate between cpu fp32 and cuda fp16
 # if a model's failure thresholds do not exist in this dict, default to the default_metrics_threshold defined above
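
The cache_params list added here feeds the parametrized test introduced at the end of this diff: itertools.product combines a single model path, batch size, sequence length, and max_new_tokens value with the two cache states, so pytest generates exactly one "miss" case followed by one "hit" case. A minimal, self-contained sketch of that expansion (the placeholder values below are illustrative, not taken from the test module):

import itertools

import pytest

# hypothetical stand-ins for the module-level lists used in test_decoders.py
common_model_paths = ["/tmp/example-model"]
common_batch_sizes = [1]
common_seq_lengths = [64]
common_max_new_tokens = [8]

cache_params = list(
    itertools.product(
        [common_model_paths[0]],
        [common_batch_sizes[0]],
        [common_seq_lengths[0]],
        [common_max_new_tokens[0]],
        ["miss", "hit"],
    )
)
# cache_params == [("/tmp/example-model", 1, 64, 8, "miss"),
#                  ("/tmp/example-model", 1, 64, 8, "hit")]

@pytest.mark.parametrize(
    "model_path,batch_size,seq_length,max_new_tokens,cache_status", cache_params
)
def test_cache_param_expansion(model_path, batch_size, seq_length, max_new_tokens, cache_status):
    # pytest runs this body twice: first with cache_status == "miss", then "hit"
    assert cache_status in ("miss", "hit")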
@@ -246,7 +248,7 @@ def reset_compiler():
     torch.compiler.reset()
     torch._dynamo.reset()
     os.environ.pop("COMPILATION_MODE", None)
-
+    os.environ.pop('TORCH_SENDNN_CACHE_ENABLE', None)
 
 # TODO: Currently, gptq does not have the same level of support as non-gptq models for get_model. This method provides the extra requirements for gptq for get_model,
 # however ideally, these fixes should be done in foundation-model-stack.
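
The one-line change to reset_compiler makes the cleanup also drop TORCH_SENDNN_CACHE_ENABLE alongside COMPILATION_MODE, so the variable exported inside the new test_cache does not leak into tests that run afterwards. A minimal sketch of the same set-then-clean-up pattern, with hypothetical fixture and test names (not the repository's):

import os

import pytest

@pytest.fixture(autouse=True)
def clean_env():
    # run the test first, then remove any flags it may have exported
    yield
    os.environ.pop("COMPILATION_MODE", None)
    os.environ.pop("TORCH_SENDNN_CACHE_ENABLE", None)

def test_enables_cache():
    os.environ["TORCH_SENDNN_CACHE_ENABLE"] = "1"
    assert os.environ["TORCH_SENDNN_CACHE_ENABLE"] == "1"
    # clean_env pops the variable after this test returns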
@@ -292,7 +294,6 @@ def __maybe_get_gptq_kwargs(model_path):
         pass
     return gptq_kwargs_aiu, gptq_kwargs_cpu
 
-
 def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
     prompts_and_sizes = sample_sharegpt_requests(
         SHARE_GPT_DATASET_PATH,
@@ -674,3 +675,57 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor):
         print("passed validation level 1")
     else:
         print("passed validation level 0")
+
+@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens,cache_status", cache_params)
+def test_cache(model_path, batch_size, seq_length, max_new_tokens, cache_status):
+    torch.manual_seed(42)
+    os.environ["TORCH_SENDNN_CACHE_ENABLE"] = "1"
+    os.environ["COMPILATION_MODE"] = "offline_decoder"
+
+    dprint(f"testing with cache: model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}, cache={cache_status}")
+
+    if USE_MICRO_MODELS:
+        micro_model_kwargs = {"architecture": "hf_configured", "nlayers": 3}
+    else:
+        micro_model_kwargs = {"architecture": "hf_pretrained"}
+
+    if not USE_MICRO_MODELS and os.path.exists(model_path):
+        model_path_kwargs = {"model_path": model_path}
+    else:
+        model_path_kwargs = {"variant": model_path}
+
+    distributed_kwargs = {}
+    if USE_DISTRIBUTED:
+        distributed_kwargs["distr_param"] = "tp"
+        distributed_kwargs["group"] = dist.group.WORLD
+    get_model_kwargs = {**model_path_kwargs, **micro_model_kwargs, **distributed_kwargs}
+
+    tokenizer = tokenizers.get_tokenizer(model_path)
+
+    # prepare the AIU model
+    model = get_model(
+        device_type="cpu",
+        fused_weights=False,
+        **get_model_kwargs
+    )
+
+    model.eval()
+    torch.set_grad_enabled(False)
+    model.compile(backend="sendnn_decoder")
+
+
+    # prepare input_ids
+    input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer)
+
+    # warmup aiu model
+    warmup_model(model, input_ids, max_new_tokens, **padding_kwargs)
+
+    # aiu validation
+    aiu_validation_info = extract_validation_information(
+        model,
+        input_ids,
+        max_new_tokens,
+        None,
+        only_last_token=True,
+        **padding_kwargs
+    )
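
In the test body above, cache_status is only logged via dprint; the "miss"/"hit" distinction comes from the parametrization order, so the first run presumably compiles against an empty torch_sendnn cache and the second run exercises the populated one. One way to run only the new cases, sketched with pytest's programmatic entry point (the -k expression is just the test name from this diff):

import pytest

# select only test_cache, with verbose output
pytest.main(["tests/models/test_decoders.py", "-k", "test_cache", "-v"])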
