 )
 os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(max(common_batch_sizes), 2))

+cache_params = list(itertools.product([common_model_paths[0]], [common_batch_sizes[0]], [common_seq_lengths[0]], [common_max_new_tokens[0]], ["miss", "hit"]))
+
 # thresholds are chosen based on 1024 tokens per sequence
 # 1% error threshold rate between cpu fp32 and cuda fp16
 # if a model's failure thresholds do not exist in this dict, default to the default_metrics_threshold defined above
@@ -246,7 +248,7 @@ def reset_compiler(): |
     torch.compiler.reset()
     torch._dynamo.reset()
     os.environ.pop("COMPILATION_MODE", None)
-
+    os.environ.pop('TORCH_SENDNN_CACHE_ENABLE', None)

 # TODO: Currently, gptq does not have the same level of support as non-gptq models for get_model. This method provides the extra requirements for gptq for get_model,
 # however ideally, these fixes should be done in foundation-model-stack.
@@ -292,7 +294,6 @@ def __maybe_get_gptq_kwargs(model_path): |
         pass
     return gptq_kwargs_aiu, gptq_kwargs_cpu

-
 def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
     prompts_and_sizes = sample_sharegpt_requests(
         SHARE_GPT_DATASET_PATH,
@@ -674,3 +675,57 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor): |
         print("passed validation level 1")
     else:
         print("passed validation level 0")
+
+@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens,cache_status", cache_params)
+def test_cache(model_path, batch_size, seq_length, max_new_tokens, cache_status):
+    torch.manual_seed(42)
+    os.environ["TORCH_SENDNN_CACHE_ENABLE"] = "1"
+    os.environ["COMPILATION_MODE"] = "offline_decoder"
+
+    dprint(f"testing with cache: model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}, cache={cache_status}")
+
+    if USE_MICRO_MODELS:
+        micro_model_kwargs = {"architecture": "hf_configured", "nlayers": 3}
+    else:
+        micro_model_kwargs = {"architecture": "hf_pretrained"}
+
+    if not USE_MICRO_MODELS and os.path.exists(model_path):
+        model_path_kwargs = {"model_path": model_path}
+    else:
+        model_path_kwargs = {"variant": model_path}
+
+    distributed_kwargs = {}
+    if USE_DISTRIBUTED:
+        distributed_kwargs["distr_param"] = "tp"
+        distributed_kwargs["group"] = dist.group.WORLD
+    get_model_kwargs = {**model_path_kwargs, **micro_model_kwargs, **distributed_kwargs}
+
+    tokenizer = tokenizers.get_tokenizer(model_path)
+
+    # prepare the AIU model
+    model = get_model(
+        device_type="cpu",
+        fused_weights=False,
+        **get_model_kwargs
+    )
+
+    model.eval()
+    torch.set_grad_enabled(False)
+    model.compile(backend="sendnn_decoder")
+
+
+    # prepare input_ids
+    input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer)
+
+    # warmup aiu model
+    warmup_model(model, input_ids, max_new_tokens, **padding_kwargs)
+
+    # aiu validation
+    aiu_validation_info = extract_validation_information(
+        model,
+        input_ids,
+        max_new_tokens,
+        None,
+        only_last_token=True,
+        **padding_kwargs
+    )
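For reference, a minimal, self-contained sketch of what the new cache_params parametrization expands to. The common_* values below are hypothetical placeholders (the real lists are defined earlier in the test module) and the model id is illustrative only; the point is that each configuration runs twice, first with a cold torch_sendnn compile cache ("miss") and then expecting reuse of that cache ("hit").

import itertools

# Hypothetical placeholder values; the actual lists live earlier in the test module.
common_model_paths = ["ibm-granite/granite-3b-code-instruct"]  # illustrative model id
common_batch_sizes = [1]
common_seq_lengths = [64]
common_max_new_tokens = [8]

cache_params = list(
    itertools.product(
        [common_model_paths[0]],
        [common_batch_sizes[0]],
        [common_seq_lengths[0]],
        [common_max_new_tokens[0]],
        ["miss", "hit"],
    )
)

# Two cases per configuration: the first populates the compile cache ("miss"),
# the second is expected to be served from it ("hit").
for model_path, batch_size, seq_length, max_new_tokens, cache_status in cache_params:
    print(model_path, batch_size, seq_length, max_new_tokens, cache_status)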