From 1cd87efb2e9cc76b46c8654a7125c97b32024a24 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Wed, 22 Oct 2025 05:56:19 +0000 Subject: [PATCH 1/2] Models test config in single Config file Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/constants.py | 4 +- tests/conftest.py | 35 -- .../models/custom_tiny_model_configs.json | 348 ------------ .../models/test_audio_embedding_models.py | 9 +- .../models/test_causal_lm_models.py | 126 ++--- .../models/test_embedding_models.py | 10 +- .../models/test_image_text_to_text_models.py | 350 ++++-------- .../models/test_model_configs.json | 532 ++++++++++++++++++ .../models/test_prefix_caching.py | 9 +- .../models/test_speech_seq2seq_models.py | 9 +- 10 files changed, 703 insertions(+), 729 deletions(-) delete mode 100644 tests/transformers/models/custom_tiny_model_configs.json create mode 100644 tests/transformers/models/test_model_configs.json diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 57fba282b..114a6fc11 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -17,7 +17,7 @@ ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32 ONNX_EXPORT_EXAMPLE_FBS = 4 ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep -ONNX_EXPORT_OPSET = 13 +ONNX_EXPORT_OPSET = 17 ONNX_EXPORT_MAX_NUM_IMAGES = 1 ONNX_EXPORT_MAX_IMAGE_TILES = 4 ONNX_EXPORT_IMAGE_WIDTH = 560 @@ -84,7 +84,7 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS = 512 ONNX_EXPORT_EXAMPLE_TOP_PS = 0.80 ONNX_EXPORT_EXAMPLE_MIN_PS = 0.99 -ONNX_EXPORT_OPSET = 13 +ONNX_EXPORT_OPSET = 17 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw"] DEFAULT_AIC_HW_VERSION = "ai100" diff --git a/tests/conftest.py b/tests/conftest.py index ba0f341fe..051701036 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,46 +5,11 @@ # # ----------------------------------------------------------------------------- -import json import os import shutil -import pytest -from transformers import AutoConfig - from QEfficient.utils.constants import QEFF_MODELS_DIR from QEfficient.utils.logging_utils import logger -from QEfficient.utils.test_utils import ModelConfig - - -def get_custom_model_config_dict(configs): - """ - Converts a list of custom model configuration dictionaries into a dictionary - mapping model names to their corresponding AutoConfig objects. - - Args: - configs (List[Dict]): A list of dictionaries, each containing model configuration parameters. - - Returns: - Dict[str, AutoConfig]: A dictionary where keys are model names and values are AutoConfig objects. - """ - config_dict = {} - for config in configs: - model_name = config["model_name"] - config_dict[model_name] = AutoConfig.from_pretrained( - model_name, - trust_remote_code=config["model_name"] in ModelConfig.EXTERNAL_MODELS, - **config.get("additional_params", {}), - ) - return config_dict - - -# Pytest fixture to load custom model configs from a JSON file -@pytest.fixture(scope="session") -def custom_causal_model_config_dict(): - with open("tests/transformers/models/custom_tiny_model_configs.json", "r") as f: - custom_model_configs_data = json.load(f) - return get_custom_model_config_dict(custom_model_configs_data) def qeff_models_clean_up(): diff --git a/tests/transformers/models/custom_tiny_model_configs.json b/tests/transformers/models/custom_tiny_model_configs.json deleted file mode 100644 index 03a9541fd..000000000 --- a/tests/transformers/models/custom_tiny_model_configs.json +++ /dev/null @@ -1,348 +0,0 @@ -[ - { - "model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "gpt2", - "model_type": "gpt2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50257, - "num_key_value_heads": 1 - } - }, - { - "model_name": "allenai/OLMo-2-0425-1B", - "model_type": "olmo2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 100352, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Salesforce/codegen-350M-mono", - "model_type": "codegen", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 4, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 51200, - "num_key_value_heads": 1, - "rotary_dim": 16 - } - }, - - { - "model_name": "microsoft/Phi-3-mini-4k-instruct", - "model_type": "phi3", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32064, - "num_key_value_heads": 1 - } - }, - { - "model_name": "tiiuae/falcon-7b", - "model_type": "falcon", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 65024, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", - "model_type": "qwen3_moe", - "additional_params": { - "hidden_size": 256, - "intermediate_size": 256, - "max_position_embeddings": 128, - "max_window_layers": 48, - "moe_intermediate_size": 768, - "num_attention_heads": 2, - "num_experts": 4, - "num_experts_per_tok": 2, - "num_hidden_layers": 1, - "num_key_value_heads": 1, - "vocab_size": 151936 - } - }, - { - "model_name": "Qwen/Qwen2-0.5B", - "model_type": "qwen2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 151936, - "num_key_value_heads": 1 - } - }, - { - "model_name": "bigcode/starcoder2-3b", - "model_type": "starcoder2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49152, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Felladrin/Minueza-32M-Base", - "model_type": "mistral", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32002, - "num_key_value_heads": 1 - } - }, - { - "model_name": "wtang06/mpt-125m-c4", - "model_type": "mpt", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50368 - } - }, - { - "model_name": "hakurei/gpt-j-random-tinier", - "model_type": "gptj", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50400, - "num_key_value_heads": 1, - "rotary_dim": 16 - } - }, - { - "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "model_type": "mixtral", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "meta-llama/Llama-3.2-1B", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 128256, - "num_key_value_heads": 1, - "rope_scaling": { - "factor": 32.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - } - } - }, - { - "model_name": "unsloth/gemma-2b", - "model_type": "gemma", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 256000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "unsloth/gemma-2-2b", - "model_type": "gemma2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 256000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32003 - } - }, - { - "model_name": "TheBloke/Llama-2-7B-GPTQ", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000 - } - }, - { - "model_name": "ibm-granite/granite-20b-code-base", - "model_type": "gpt_bigcode", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49152, - "num_key_value_heads": 1, - "activation_function": "gelu", - "architectures": [ - "GPTBigCodeForCausalLM" - ] - } - }, - { - "model_name": "neuralmagic/Llama-3.2-3B-Instruct-FP8", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 128256 - } - }, - { - "model_name": "neuralmagic/Qwen2-0.5B-Instruct-FP8", - "model_type": "qwen2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 151936 - } - }, - { - "model_name": "ibm-granite/granite-3.1-2b-instruct", - "model_type": "granite", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49155, - "num_key_value_heads": 1 - } - }, - { - "model_name": "ibm-granite/granite-guardian-3.1-2b", - "model_type": "granite", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49155, - "num_key_value_heads": 1 - } - }, - { - "model_name": "hpcai-tech/grok-1", - "model_type": null, - "additional_params":{ - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 131072, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", - "model_type": null, - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "hidden_size": 256, - "intermediate_size": 256, - "vocab_size": 128256, - "num_key_value_layers": 1, - "num_key_value_heads": 1, - "rope_scaling": { - "factor": 8.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - } - } - } -] diff --git a/tests/transformers/models/test_audio_embedding_models.py b/tests/transformers/models/test_audio_embedding_models.py index da30c76b0..75f9fac08 100644 --- a/tests/transformers/models/test_audio_embedding_models.py +++ b/tests/transformers/models/test_audio_embedding_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os from typing import List, Optional @@ -23,9 +24,11 @@ from QEfficient.utils.constants import WAV2VEC2_MAX_SEQ_LEN, QnnConstants from QEfficient.utils.device_utils import get_available_device_id -test_models = [ - "facebook/wav2vec2-base-960h", -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + test_models = config_data["audio_embedding_models"] def load_ctc_model(model_config): diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 86bce4441..81f710f09 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- import copy +import json import os from typing import Optional @@ -24,47 +25,40 @@ from QEfficient.utils.run_utils import ApiRunner from QEfficient.utils.test_utils import ModelConfig -test_models_causal = [ - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "gpt2", - "Salesforce/codegen-350M-mono", - "microsoft/Phi-3-mini-4k-instruct", - "tiiuae/falcon-7b", - "Qwen/Qwen2-0.5B", - "Qwen/Qwen3-0.6B", - "bigcode/starcoder2-3b", - "Qwen/Qwen3-30B-A3B-Instruct-2507", - "Felladrin/Minueza-32M-Base", - "wtang06/mpt-125m-c4", - "hakurei/gpt-j-random-tinier", - "mistralai/Mixtral-8x7B-Instruct-v0.1", - "meta-llama/Llama-3.2-1B", - "unsloth/gemma-2b", - "unsloth/gemma-2-2b", - "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", # AWQ model - "TheBloke/Llama-2-7B-GPTQ", # GPTQ model - "ibm-granite/granite-20b-code-base", - # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic", # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations - "neuralmagic/Llama-3.2-3B-Instruct-FP8", # float quantized compressed-tensor per tensor both weight and activations - "neuralmagic/Qwen2-0.5B-Instruct-FP8", # fp8 quant method, static, with lm head ignored - "ibm-granite/granite-3.1-2b-instruct", - "ibm-granite/granite-guardian-3.1-2b", - "hpcai-tech/grok-1", - "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", - "allenai/OLMo-2-0425-1B", -] - -test_models_qnn = [ - "mistralai/Mixtral-8x7B-Instruct-v0.1", - "meta-llama/Llama-3.2-1B", - "unsloth/gemma-2b", - "ibm-granite/granite-guardian-3.1-2b", -] - -test_models_spd = [ - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "Qwen/Qwen2-0.5B", -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + causal_lm_models = config_data["causal_lm_models"] + spd_models = config_data["spd_causal_lm_models"] + qnn_models = config_data["qnn_causal_lm_models"] + + +# Create a list of model names for parameterization +test_models_causal = [model["model_name"] for model in causal_lm_models] +test_models_spd = [model["model_name"] for model in spd_models] +test_models_qnn = [model["model_name"] for model in qnn_models] + +# Create a dictionary mapping model names to their configs +model_config_dict = {model["model_name"]: model for model in causal_lm_models} + + +def get_hf_config_from_custom_config(model_name): + """ + Function to get HF config from custom config file + -------- + :model_name: str + + :return config + """ + custom_config = model_config_dict[model_name] + + hf_config = AutoConfig.from_pretrained( + model_name, + trust_remote_code=model_name in ModelConfig.EXTERNAL_MODELS, + **custom_config.get("additional_params", {}), + ) + return hf_config def get_custom_n_layers(model_name): @@ -101,7 +95,6 @@ def load_causal_lm_model(model_name, n_layer=1, config=None): ) if config is None: # If custom config is not provided, load the model config from Hugging Face if n_layer is not None: - # If n_layer is specified, load the model with that many layers model_hf = AutoModelForCausalLM.from_pretrained( model_path, use_cache=True, @@ -145,7 +138,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, config: Optional[AutoConfig] = None, - pytorch_hf_tokens: Optional[list] = None, ): """ Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. @@ -172,7 +164,8 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.PROMPT_LEN, Constants.CTX_LEN, ) - if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: + + if model_name not in ModelConfig.SWIFTKV_MODELS: pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) is_tlm = False if num_speculative_tokens is None else True @@ -191,8 +184,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output." - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, @@ -232,14 +223,10 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.CTX_LEN, full_batch_size, ) - - if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: + if model_name not in ModelConfig.SWIFTKV_MODELS: pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf) pytorch_hf_tokens = np.vstack(pytorch_hf_tokens) - if model_name in ModelConfig.EXTERNAL_MODELS: - pytorch_hf_tokens = [pytorch_hf_tokens for _ in range(full_batch_size)] - qeff_model = QEFFAutoModelForCausalLM( model_hf, continuous_batching=True, is_tlm=is_tlm, pretrained_model_name_or_path=model_name ) @@ -261,7 +248,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( qnn_config=qnn_config, ) exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) - if model_name in ModelConfig.SWIFTKV_MODELS: assert all( [ @@ -315,25 +301,19 @@ def test_causal_lm_export_with_deprecated_api(model_name): @pytest.mark.on_qaic @pytest.mark.regular @pytest.mark.parametrize("model_name", test_models_causal) -def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, custom_causal_model_config_dict): +def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ Test function to validate the dummy PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - config = custom_causal_model_config_dict.get(model_name) - - # Using fixed reference tokens for external models for specific test cases. - # These tokens are hardcoded, therefore will not match if the model config changes. - pytorch_hf_tokens = None - if model_name in ModelConfig.EXTERNAL_MODELS: - pytorch_hf_tokens = ModelConfig.EXTERNAL_MODELS[model_name]["pytorch_hf_tokens_custom_case"] + hf_config = get_hf_config_from_custom_config(model_name) if model_name in ModelConfig.QUANTIZED_MODELS: n_layer = get_custom_n_layers(model_name) - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, pytorch_hf_tokens=pytorch_hf_tokens) + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer) else: - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, config=config, pytorch_hf_tokens=pytorch_hf_tokens) + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, config=hf_config) @pytest.mark.nightly @@ -347,34 +327,26 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ n_layer = get_custom_n_layers(model_name) - # Using fixed reference tokens for external models for specific test cases. - # These tokens are hardcoded, therefore will not match if the model config changes. - pytorch_hf_tokens = None - if model_name in ModelConfig.EXTERNAL_MODELS: - pytorch_hf_tokens = ModelConfig.EXTERNAL_MODELS[model_name]["pytorch_hf_tokens_normal_case"] - - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( - model_name=model_name, n_layer=n_layer, pytorch_hf_tokens=pytorch_hf_tokens - ) + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) @pytest.mark.on_qaic @pytest.mark.regular @pytest.mark.qnn @pytest.mark.parametrize("model_name", test_models_qnn) -def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, custom_causal_model_config_dict): +def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): """ QNN Setup Test function to validate the dummy PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - config = custom_causal_model_config_dict.get(model_name) + hf_config = get_hf_config_from_custom_config(model_name) qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, enable_qnn=True, qnn_config=qnn_config_json_path, config=config + model_name, enable_qnn=True, qnn_config=qnn_config_json_path, config=hf_config ) @@ -402,18 +374,18 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.parametrize("model_name", test_models_spd) -def test_custom_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, custom_causal_model_config_dict): +def test_custom_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ Test function to validate the dummy PyTorch model for speculative decoding, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - config = custom_causal_model_config_dict.get(model_name) + hf_config = get_hf_config_from_custom_config(model_name) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS, - config=config, + config=hf_config, ) diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 2d110faeb..e9a636d71 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os from typing import Optional @@ -19,10 +20,11 @@ from QEfficient.utils._utils import create_json from QEfficient.utils.constants import Constants, QnnConstants -embed_test_models = [ - {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"}, - {"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"}, -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + embed_test_models = config_data["embedding_models"] def check_embed_pytorch_vs_ort_vs_ai100( diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index a7b4162aa..673d60f96 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -5,6 +5,7 @@ # # ---------------------------------------------------------------------------- +import json import os from io import BytesIO from typing import List, Optional @@ -32,167 +33,21 @@ from QEfficient.utils.test_utils import InternProcessor NEW_GENERATION_TOKENS = 10 -test_models_config = [ - # CONFIG PARAMS NEEDED FOR A MODEL TO BE TESTED - # ( - # model_name, - # kv_offload, - # batch_size, - # prompt_len, - # ctx_len, - # img_size, - # img_url", - # text_prompt, - # number of layers of the model, - # ), - ( - "llava-hf/llava-1.5-7b-hf", - True, - 1, - 784, - 1024, - 336, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - 1, - ), - ( - "llava-hf/llava-1.5-7b-hf", - False, - 1, - 784, - 1024, - 336, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - 1, - ), - # Disabled in CI due to performance issues - # ( - # "meta-llama/Llama-4-Scout-17B-16E-Instruct", - # True, - # 1, - # 128, - # 3072, - # 336, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - # "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - # 4, - # ), - # ( - # "meta-llama/Llama-4-Scout-17B-16E-Instruct", - # False, - # 1, - # 128, - # 3072, - # 336, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - # "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - # 4, - # ), - ( - "google/gemma-3-4b-it", - True, - 1, - 128, - 3072, - 896, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - ( - "google/gemma-3-4b-it", - False, - 1, - 128, - 3072, - 896, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - ( - "mistralai/Mistral-Small-3.1-24B-Instruct-2503", - True, - 1, - 128, - 4096, - 1540, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - ( - "mistralai/Mistral-Small-3.1-24B-Instruct-2503", - False, - 1, - 128, - 4096, - 1540, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - # ( - # "meta-llama/Llama-3.2-11B-Vision-Instruct", - # True, - # 1, - # 32, - # 512, - # 560, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", - # "Explain this image", - # 7, - # ), -] - -intern_model_config = [ - ( - "OpenGVLab/InternVL2_5-1B", - True, - 1, - 384, - 512, - "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - "Please describe the image in detail.", - 2, - ), - ( - "OpenGVLab/InternVL3_5-1B", - True, - 1, - 384, - 512, - "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - "Please describe the image in detail.", - 2, - ), - # ( - # "OpenGVLab/InternVL2_5-1B", - # False, - # 1, - # 384, - # 512, - # "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - # "Please describe the image in detail.", - # 2, - # ), # commented becuase QNN Convertor is not supported for this model yet. -] - -molmo_model_config = [ - # Disabled in CI due to HF issues - # ( - # "allenai/Molmo-7B-D-0924", - # True, - # 1, - # 128, - # 4096, - # "https://picsum.photos/id/237/536/354", - # "Can you describe the image in detail.", - # 2, - # ), -] + +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + multimodal_models = config_data["multimodal_models"] + intern_models = config_data["intern_models"] + +test_mm_models = [model_config["model_name"] for model_config in multimodal_models] +test_intern_models = [model_config["model_name"] for model_config in intern_models] + +test_mm_models_config = {model["model_name"]: model for model in multimodal_models} +test_intern_config = {model["model_name"]: model for model in intern_models} + +model_config_dict = {**test_mm_models_config, **test_intern_config} def load_image_text_to_text_model(model_config): @@ -218,6 +73,28 @@ def load_image_text_to_text_model(model_config): return model_hf, params +def load_image_text_to_text_model_from_config(model_name, config): + torch.manual_seed(42) + model_path = hf_download( + repo_id=model_name, + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + try: + model_hf = AutoModelForImageTextToText.from_config( + config, + ) + except ValueError: + model_hf = AutoModelForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, + ) + params = sum(p.numel() for p in model_hf.parameters()) + model_hf.eval() + return model_hf, params + + def set_num_layers(config, n_layer=1): ## -1 indicates use all the layers of the model. if n_layer == -1: @@ -252,14 +129,16 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( num_devices: int = 1, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + config: Optional[AutoConfig] = None, ): - model_config = {"model_name": model_name} - model_config["img_size"] = img_size - config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True, padding=True) - config = set_num_layers(config, n_layer=n_layer) - model_hf, _ = load_image_text_to_text_model(config) - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + if config is None: + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, padding=True) + config = set_num_layers(config, n_layer=n_layer) + model_hf, _ = load_image_text_to_text_model(config) + else: + model_hf, _ = load_image_text_to_text_model_from_config(model_name, config) + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) n_layer = get_num_layers_vlm(config) image = Image.open(requests.get(img_url, stream=True).raw) if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": @@ -293,25 +172,12 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) streamer = TextStreamer(processor.tokenizer) pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs) - qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( - model_config["model_name"], - kv_offload=kv_offload, - config=config, - ) - - # pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model) - # assert (pytorch_kv_tokens == pytorch_hf_tokens).all(), ( - # "Tokens don't match for pytorch HF output and pytorch KV output" - # ) + qeff_model = QEFFAutoModelForImageTextToText(model_hf, kv_offload=kv_offload) qeff_model.export() - # onnx_model_path = qeff_model.export() - # ort_tokens = api_runner.run_vlm_kv_model_on_ort(onnx_model_path) - # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") + qeff_model.compile( - img_size=model_config["img_size"], + img_size=img_size, num_devices=num_devices, prefill_seq_len=prompt_len, ctx_len=ctx_len, @@ -491,8 +357,7 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( # onnx_model_path = qeff_model.export() # ort_tokens = api_runner.run_vlm_kv_model_on_ort(onnx_model_path) # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") + qeff_model.compile( num_patches=1, num_devices=num_devices, @@ -511,27 +376,26 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config -) -def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer -): +@pytest.mark.parametrize("model_name", test_mm_models) +@pytest.mark.parametrize("kv_offload", [True, False]) +def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ + if model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct": + pytest.skip("Performance issue: Skipping the test for Llama-4-Scout-17B-16E-Instruct model.") check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, + prompt_len=model_config_dict[model_name]["prompt_len"], + ctx_len=model_config_dict[model_name]["ctx_len"], max_gen_len=NEW_GENERATION_TOKENS, - img_size=img_size, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, + img_size=model_config_dict[model_name]["img_size"], + img_url=model_config_dict[model_name]["img_url"], + query=model_config_dict[model_name]["text_prompt"], + n_layer=model_config_dict[model_name]["num_layers"], + batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, ) @@ -539,12 +403,9 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config -) -def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer -): +@pytest.mark.parametrize("model_name", test_mm_models) +@pytest.mark.parametrize("kv_offload", [True, False]) +def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, kv_offload): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. ``Mandatory`` Args: @@ -558,14 +419,14 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, + prompt_len=model_config_dict[model_name]["prompt_len"], + ctx_len=model_config_dict[model_name]["ctx_len"], max_gen_len=NEW_GENERATION_TOKENS, - img_size=img_size, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, + img_size=model_config_dict[model_name]["img_size"], + img_url=model_config_dict[model_name]["img_url"], + query=model_config_dict[model_name]["text_prompt"], + n_layer=model_config_dict[model_name]["num_layers"], + batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, enable_qnn=True, qnn_config=qnn_config_json_path, @@ -574,42 +435,20 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( @pytest.mark.on_qaic @pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config -) -def test_image_text_to_text_molmo_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): - check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, - max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, - kv_offload=kv_offload, - ) - - -@pytest.mark.on_qaic -@pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config -) -def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): +@pytest.mark.parametrize("model_name", test_intern_models) +@pytest.mark.parametrize("kv_offload", [True, False]) +def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload): + if not kv_offload: + pytest.skip("Single Qpc is not supported for InternVL without kv_offload.") check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, + prompt_len=model_config_dict[model_name]["prompt_len"], + ctx_len=model_config_dict[model_name]["ctx_len"], max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, + img_url=model_config_dict[model_name]["img_url"], + query=model_config_dict[model_name]["text_prompt"], + n_layer=model_config_dict[model_name]["num_layers"], + batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, ) @@ -617,24 +456,23 @@ def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config -) -def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100_qnn( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): +@pytest.mark.parametrize("model_name", test_intern_models) +@pytest.mark.parametrize("kv_offload", [True, False]) +def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, kv_offload): + if not kv_offload: + pytest.skip("Single Qpc is not supported for InternVL without kv_offload.") qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, + prompt_len=model_config_dict[model_name]["prompt_len"], + ctx_len=model_config_dict[model_name]["ctx_len"], max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, + img_url=model_config_dict[model_name]["img_url"], + query=model_config_dict[model_name]["text_prompt"], + n_layer=model_config_dict[model_name]["num_layers"], + batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, enable_qnn=True, qnn_config=qnn_config_json_path, diff --git a/tests/transformers/models/test_model_configs.json b/tests/transformers/models/test_model_configs.json new file mode 100644 index 000000000..63cb429d5 --- /dev/null +++ b/tests/transformers/models/test_model_configs.json @@ -0,0 +1,532 @@ +{ + "causal_lm_models": [ + { + "model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "gpt2", + "model_type": "gpt2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50257, + "num_key_value_heads": 1 + } + }, + { + "model_name": "allenai/OLMo-2-0425-1B", + "model_type": "olmo2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 100352, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Salesforce/codegen-350M-mono", + "model_type": "codegen", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 4, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 51200, + "num_key_value_heads": 1, + "rotary_dim": 16 + } + }, + + { + "model_name": "microsoft/Phi-3-mini-4k-instruct", + "model_type": "phi3", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32064, + "num_key_value_heads": 1 + } + }, + { + "model_name": "tiiuae/falcon-7b", + "model_type": "falcon", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 65024, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "model_type": "qwen3_moe", + "additional_params": { + "hidden_size": 256, + "intermediate_size": 256, + "max_position_embeddings": 128, + "max_window_layers": 48, + "moe_intermediate_size": 768, + "num_attention_heads": 2, + "num_experts": 4, + "num_experts_per_tok": 2, + "num_hidden_layers": 1, + "num_key_value_heads": 1, + "vocab_size": 151936 + } + }, + { + "model_name": "Qwen/Qwen2-0.5B", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936, + "num_key_value_heads": 1 + } + }, + { + "model_name": "bigcode/starcoder2-3b", + "model_type": "starcoder2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49152, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Felladrin/Minueza-32M-Base", + "model_type": "mistral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32002, + "num_key_value_heads": 1 + } + }, + { + "model_name": "wtang06/mpt-125m-c4", + "model_type": "mpt", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50368 + } + }, + { + "model_name": "hakurei/gpt-j-random-tinier", + "model_type": "gptj", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50400, + "num_key_value_heads": 1, + "rotary_dim": 16 + } + }, + { + "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_type": "mixtral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "meta-llama/Llama-3.2-1B", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + }, + { + "model_name": "unsloth/gemma-2b", + "model_type": "gemma", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "unsloth/gemma-2-2b", + "model_type": "gemma2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32003 + } + }, + { + "model_name": "TheBloke/Llama-2-7B-GPTQ", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000 + } + }, + { + "model_name": "ibm-granite/granite-20b-code-base", + "model_type": "gpt_bigcode", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49152, + "num_key_value_heads": 1, + "activation_function": "gelu", + "architectures": [ + "GPTBigCodeForCausalLM" + ] + } + }, + { + "model_name": "neuralmagic/Llama-3.2-3B-Instruct-FP8", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256 + } + }, + { + "model_name": "neuralmagic/Qwen2-0.5B-Instruct-FP8", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936 + } + }, + { + "model_name": "ibm-granite/granite-3.1-2b-instruct", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + }, + { + "model_name": "ibm-granite/granite-guardian-3.1-2b", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + }, + { + "model_name": "hpcai-tech/grok-1", + "model_type": null, + "additional_params":{ + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 131072, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", + "model_type": null, + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "hidden_size": 256, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_layers": 1, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + } + ], + "multimodal_models": [ + { + "model_name": "llava-hf/llava-1.5-7b-hf", + "model_type": "llava", + "batch_size": 1, + "prompt_len": 784, + "ctx_len": 1024, + "img_size": 336, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", + "num_layers": 1, + "additional_params": { + } + }, + { + "model_name": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "model_type": "llama4", + "batch_size": 1, + "prompt_len": 32, + "ctx_len": 3072, + "img_size": 336, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", + "num_layers": 4, + "additional_params": { + } + }, + { + "model_name": "google/gemma-3-4b-it", + "model_type": "gemma3", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 3072, + "img_size": 896, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "additional_params": { + } + }, + { + "model_name": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + "model_type": "mistral3", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": 1540, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "additional_params": { + } + } + + ], + "intern_models": [ + { + "model_name": "OpenGVLab/InternVL2_5-1B", + "batch_size": 1, + "prompt_len": 384, + "ctx_len": 512, + "img_url": "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", + "text_prompt": "Please describe the image in detail.", + "num_layers": 2, + "additional_params": { + } + } + ], + "speech_seq2seq_models": [ + "openai/whisper-tiny" + ], + "embedding_models": [ + {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"}, + {"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"} + ], + "spd_causal_lm_models": [ + { + "model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Qwen/Qwen2-0.5B", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936, + "num_key_value_heads": 1 + } + } + ], + "qnn_causal_lm_models": [ + { + "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_type": "mixtral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "meta-llama/Llama-3.2-1B", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + }, + { + "model_name": "unsloth/gemma-2b", + "model_type": "gemma", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "ibm-granite/granite-guardian-3.1-2b", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + } + ], + "prefix_caching_models": [ + { + "model_name": "gpt2", + "model_type": "gpt2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50257, + "num_key_value_heads": 1 + } + } + ], + "audio_embedding_models": [ + "facebook/wav2vec2-base-960h" + ] +} diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py index 88862fce7..a9662cc73 100644 --- a/tests/transformers/models/test_prefix_caching.py +++ b/tests/transformers/models/test_prefix_caching.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os import numpy as np @@ -16,7 +17,13 @@ from QEfficient.utils._utils import create_json from QEfficient.utils.constants import QnnConstants -test_models = ["gpt2"] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + prefix_caching_models = config_data["prefix_caching_models"] + +test_models = [model["model_name"] for model in prefix_caching_models] # The test should first generate output with some prefix+suffix1 or batch_id and then confirm that we are still able to execute of prefix+suffix2 on same batch id and getting correct output. diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 4ae8928b7..52a96d7fe 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os from importlib import reload from typing import List, Optional @@ -25,9 +26,11 @@ from QEfficient.utils.constants import Constants, QnnConstants from QEfficient.utils.device_utils import get_available_device_id -test_models = [ - "openai/whisper-tiny", -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + test_models = config_data["speech_seq2seq_models"] def load_seq2seq_model(model_config): From d7e045e5c7f8c74a2452ec31f2fa86d0c71aab70 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 23 Oct 2025 08:11:36 +0000 Subject: [PATCH 2/2] Conflict Resolved Signed-off-by: Abukhoyer Shaik --- tests/transformers/models/test_causal_lm_models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 81f710f09..369c9ac8d 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -165,7 +165,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.CTX_LEN, ) - if model_name not in ModelConfig.SWIFTKV_MODELS: + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) is_tlm = False if num_speculative_tokens is None else True @@ -174,7 +174,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ) pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) - if model_name not in ModelConfig.SWIFTKV_MODELS: + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( "Tokens don't match for HF PyTorch model output and KV PyTorch model output" ) @@ -223,7 +223,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.CTX_LEN, full_batch_size, ) - if model_name not in ModelConfig.SWIFTKV_MODELS: + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf) pytorch_hf_tokens = np.vstack(pytorch_hf_tokens) @@ -248,7 +248,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( qnn_config=qnn_config, ) exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) - if model_name in ModelConfig.SWIFTKV_MODELS: + if model_name in ModelConfig.SWIFTKV_MODELS or model_name in ModelConfig.EXTERNAL_MODELS: assert all( [ all(ort_token[:24] == cloud_token[:24])