From 0f3c2ef8739cda14fd879f698a91fa548c270967 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Wed, 22 Oct 2025 14:53:55 +0000
Subject: [PATCH 01/10] Adding new MoE e2e tests [wip]

Summary:
new e2e tests for MoE + various techniques

Signed-off-by: HDCharles
---
 tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml             | 7 +++++++
 tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml | 4 ++++
 tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml   | 7 +++++++
 3 files changed, 18 insertions(+)
 create mode 100644 tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml
 create mode 100644 tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml
 create mode 100644 tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml

diff --git a/tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml b/tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml
new file mode 100644
index 0000000000..02dbed8ab4
--- /dev/null
+++ b/tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen3-30B-A3B
+scheme: NVFP4
+num_calibration_samples: 20
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
diff --git a/tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml b/tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml
new file mode 100644
index 0000000000..2a8fd2bd05
--- /dev/null
+++ b/tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml
@@ -0,0 +1,4 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen3-30B-A3B
+scheme: FP8_DYNAMIC
diff --git a/tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml b/tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml
new file mode 100644
index 0000000000..8d85dedd03
--- /dev/null
+++ b/tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen3-30B-A3B
+scheme: W4A16
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+quant_type: "GPTQ"
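Note (not part of the patch series): the configs above are plain YAML, so a
minimal standalone check with PyYAML, using the field names from the file this
patch adds, looks like the sketch below.

    # Load one of the new e2e configs and confirm its fields (requires PyYAML).
    import yaml

    with open("tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml") as f:
        cfg = yaml.safe_load(f)

    assert cfg["model"] == "Qwen/Qwen3-30B-A3B"
    assert cfg["scheme"] == "NVFP4"
    assert cfg["num_calibration_samples"] == 20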
From 4e7f7a1d42cd49b32ff8d64f70ea24f5e89ceb3b Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Tue, 4 Nov 2025 15:16:52 +0000
Subject: [PATCH 02/10] making tests run faster

Summary:
rename the MoE configs per model and quantize only the first 20 layers in the
W4A16 test to cut e2e runtime

Signed-off-by: HDCharles
---
 ...p4_nvfp4_moe.yaml => qwen3_fp4_nvfp4.yaml} |  4 +++-
 ....yaml => qwen3_fp8_dynamic_per_token.yaml} |  1 +
 ...oe.yaml => qwen3_w4a16_grouped_quant.yaml} |  6 ++++--
 ...ipe_w4a16_group_quant_first_20_layers.yaml | 20 +++++++++++++++++++
 4 files changed, 28 insertions(+), 3 deletions(-)
 rename tests/e2e/vLLM/configs/{fp4_nvfp4_moe.yaml => qwen3_fp4_nvfp4.yaml} (98%)
 rename tests/e2e/vLLM/configs/{fp8_dynamic_per_token_moe.yaml => qwen3_fp8_dynamic_per_token.yaml} (98%)
 rename tests/e2e/vLLM/configs/{w4a16_grouped_quant_moe.yaml => qwen3_w4a16_grouped_quant.yaml} (54%)
 create mode 100644 tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml

diff --git a/tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml b/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
similarity index 98%
rename from tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml
rename to tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
index 02dbed8ab4..b260864dd3 100644
--- a/tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
@@ -1,7 +1,9 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
+
 scheme: NVFP4
-num_calibration_samples: 20
+
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
+num_calibration_samples: 20
diff --git a/tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml b/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
similarity index 98%
rename from tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml
rename to tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
index 2a8fd2bd05..4c7d26e2d4 100644
--- a/tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
@@ -1,4 +1,5 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
+
 scheme: FP8_DYNAMIC
diff --git a/tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
similarity index 54%
rename from tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml
rename to tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
index 8d85dedd03..a86d7e812f 100644
--- a/tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
@@ -1,7 +1,9 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
-scheme: W4A16
+
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
-quant_type: "GPTQ"
+num_calibration_samples: 20
+
+recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
new file mode 100644
index 0000000000..afa5b5aa3a
--- /dev/null
+++ b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
@@ -0,0 +1,20 @@
+quant_stage:
+  quant_modifiers:
+    GPTQModifier:
+      ignore: [
+        "lm_head",
+        # Ignore layers (20+)
+        "re:.*model\\.layers\\.([2-9][0-9])\\..*",
+      ]
+      actorder: null
+      config_groups:
+        group_0:
+          weights:
+            num_bits: 4
+            type: "int"
+            symmetric: False
+            strategy: "group"
+            group_size: 128
+            input_activations: null
+            output_activations: null
+          targets: ["Linear"]

From 18dfa5784085cbd3969e71d554f72f418985d5cd Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Thu, 6 Nov 2025 04:20:57 +0000
Subject: [PATCH 03/10] further shaving int4 layers to improve e2e test time

Summary:
quantize only the first 10 layers instead of 20

Signed-off-by: HDCharles
---
 .../configs/qwen3_w4a16_grouped_quant.yaml    |  2 +-
 ...ipe_w4a16_group_quant_first_20_layers.yaml | 20 -------------------
 2 files changed, 1 insertion(+), 21 deletions(-)
 delete mode 100644 tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml

diff --git a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
index a86d7e812f..c618a070ef 100644
--- a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
@@ -6,4 +6,4 @@ dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 num_calibration_samples: 20
 
-recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
+recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
deleted file mode 100644
index afa5b5aa3a..0000000000
--- a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-quant_stage:
-  quant_modifiers:
-    GPTQModifier:
-      ignore: [
-        "lm_head",
-        # Ignore layers (20+)
-        "re:.*model\\.layers\\.([2-9][0-9])\\..*",
-      ]
-      actorder: null
-      config_groups:
-        group_0:
-          weights:
-            num_bits: 4
-            type: "int"
-            symmetric: False
-            strategy: "group"
-            group_size: 128
-            input_activations: null
-            output_activations: null
-          targets: ["Linear"]
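Note (not part of the patch series): the ignore regex in these recipes can be
sanity-checked in plain Python. With the "re:" prefix stripped, "([1-9][0-9])"
matches two-digit layer indices 10-99, so only layers 0-9 stay quantized; the
"([2-9][0-9])" variant above matches 20-99 and keeps layers 0-19. The module
names below are illustrative Qwen3-style names.

    # Standalone check of the recipe's ignore pattern.
    import re

    ignored = re.compile(r".*model\.layers\.([1-9][0-9])\..*")

    assert ignored.match("model.layers.10.self_attn.q_proj")        # ignored
    assert ignored.match("model.layers.47.mlp.experts.0.up_proj")   # ignored
    assert not ignored.match("model.layers.9.self_attn.q_proj")     # quantized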
From cc847382b0c8e38ec3be05dd3838c637c701c5ca Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Thu, 6 Nov 2025 15:02:39 +0000
Subject: [PATCH 04/10] formatting

Summary:
remove stray blank lines from the configs and add the first-10-layers recipe
that the previous patch pointed to

Signed-off-by: HDCharles
---
 tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml   |  2 --
 .../configs/qwen3_fp8_dynamic_per_token.yaml  |  1 -
 .../configs/qwen3_w4a16_grouped_quant.yaml    |  2 --
 ...ipe_w4a16_group_quant_first_10_layers.yaml | 20 +++++++++++++++++++
 4 files changed, 20 insertions(+), 5 deletions(-)
 create mode 100644 tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml

diff --git a/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml b/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
index b260864dd3..81fa03ad7d 100644
--- a/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
@@ -1,9 +1,7 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
-
 scheme: NVFP4
-
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 num_calibration_samples: 20
diff --git a/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml b/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
index 4c7d26e2d4..2a8fd2bd05 100644
--- a/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
@@ -1,5 +1,4 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
-
 scheme: FP8_DYNAMIC
diff --git a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
index c618a070ef..ce49930255 100644
--- a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
@@ -1,9 +1,7 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
-
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 num_calibration_samples: 20
-
 recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
new file mode 100644
index 0000000000..15e9f4f943
--- /dev/null
+++ b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
@@ -0,0 +1,20 @@
+quant_stage:
+  quant_modifiers:
+    GPTQModifier:
+      ignore: [
+        "lm_head",
+        # Ignore layers (10+)
+        "re:.*model\\.layers\\.([1-9][0-9])\\..*",
+      ]
+      actorder: null
+      config_groups:
+        group_0:
+          weights:
+            num_bits: 4
+            type: "int"
+            symmetric: False
+            strategy: "group"
+            group_size: 128
+            input_activations: null
+            output_activations: null
+          targets: ["Linear"]

From 92961e4a38d75fd772313be993b08e5acfe07e9c Mon Sep 17 00:00:00 2001
From: Charles Hernandez
Date: Thu, 6 Nov 2025 20:31:20 +0000
Subject: [PATCH 05/10] fix error

Summary:
guard against `scheme` being None before the substring check in run_vllm.py

Signed-off-by: HDCharles
---
 tests/e2e/vLLM/run_vllm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/vLLM/run_vllm.py b/tests/e2e/vLLM/run_vllm.py
index 4daa93db10..aed83f2e56 100644
--- a/tests/e2e/vLLM/run_vllm.py
+++ b/tests/e2e/vLLM/run_vllm.py
@@ -18,7 +18,7 @@ def parse_args():
     except json.JSONDecodeError as e:
         raise ValueError(f"Invalid JSON input: {e}")
 
-    if "W4A16_2of4" in scheme:
+    if scheme is not None and "W4A16_2of4" in scheme:
         # required by the kernel
         llm_kwargs["dtype"] = torch.float16
 
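Note (not part of the patch series): the guard above matters because a
membership test on None raises TypeError, so any config that supplies a recipe
instead of a scheme would crash on this line. A minimal standalone repro:

    # Repro of the bug fixed in patch 05: `in` on None raises TypeError.
    scheme = None  # e.g. a recipe-driven config with no scheme key

    try:
        "W4A16_2of4" in scheme
    except TypeError as e:
        print(e)  # argument of type 'NoneType' is not iterable

    # The fixed form short-circuits before the membership test runs.
    if scheme is not None and "W4A16_2of4" in scheme:
        print("would force torch.float16 for the 2:4 kernel")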
From 75ad5a7d5182ebb17066367f75498e93a1d12112 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Fri, 7 Nov 2025 01:56:27 +0000
Subject: [PATCH 06/10] needs to be symmetric for MoE

Summary:
switch the W4A16 recipe to symmetric weight quantization, which MoE requires

Signed-off-by: HDCharles
---
 .../recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
index 15e9f4f943..0351195ce4 100644
--- a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
+++ b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
@@ -12,7 +12,7 @@ quant_stage:
           weights:
             num_bits: 4
             type: "int"
-            symmetric: False
+            symmetric: True
             strategy: "group"
             group_size: 128
             input_activations: null

From 41f4481336091149bb75b59a248f46e58a2b5da4 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Tue, 11 Nov 2025 03:42:30 +0000
Subject: [PATCH 07/10] removing int4 test

The other tests are working fine; the int4 path needs additional debugging.

Signed-off-by: HDCharles
---
 .../configs/qwen3_w4a16_grouped_quant.yaml    |  7 -------
 ...ipe_w4a16_group_quant_first_10_layers.yaml | 20 -------------------
 2 files changed, 27 deletions(-)
 delete mode 100644 tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
 delete mode 100644 tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml

diff --git a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
deleted file mode 100644
index ce49930255..0000000000
--- a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-cadence: "nightly"
-test_type: "regression"
-model: Qwen/Qwen3-30B-A3B
-dataset_id: HuggingFaceH4/ultrachat_200k
-dataset_split: train_sft
-num_calibration_samples: 20
-recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
deleted file mode 100644
index 0351195ce4..0000000000
--- a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-quant_stage:
-  quant_modifiers:
-    GPTQModifier:
-      ignore: [
-        "lm_head",
-        # Ignore layers (10+)
-        "re:.*model\\.layers\\.([1-9][0-9])\\..*",
-      ]
-      actorder: null
-      config_groups:
-        group_0:
-          weights:
-            num_bits: 4
-            type: "int"
-            symmetric: True
-            strategy: "group"
-            group_size: 128
-            input_activations: null
-            output_activations: null
-          targets: ["Linear"]

From 80c344e1a9d9454933e070b062e5222877c18c16 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Fri, 14 Nov 2025 04:26:57 +0000
Subject: [PATCH 08/10] skip gate

Summary:
ignore MoE router gate modules when quantizing in e2e_utils.py

Signed-off-by: HDCharles
---
 tests/e2e/e2e_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index 92a272737f..6404f0c854 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -84,11 +84,11 @@ def data_collator(batch):
             targets="Linear",
             scheme=scheme,
             actorder=None,  # added for consistency with past testing configs
-            ignore=["lm_head"],
+            ignore=["lm_head", "re:.*mlp.gate.*"],
         )
     else:
         oneshot_kwargs["recipe"] = QuantizationModifier(
-            targets="Linear", scheme=scheme, ignore=["lm_head"]
+            targets="Linear", scheme=scheme, ignore=["lm_head", "re:.*mlp.gate.*"]
         )
 
     # Apply quantization.
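Note (not part of the patch series): the unescaped dots make this pattern
broader than intended. In "re:.*mlp.gate.*" each "." matches any character, so
the pattern also matches "mlp.gate_proj" in dense models, which is what the
next patch fixes by requiring a literal dot after "gate". A standalone check:

    # Compare the broad pattern from this patch with the narrowed one.
    import re

    broad = re.compile(r".*mlp.gate.*")      # "." is a wildcard here
    narrow = re.compile(r".*mlp.gate[.].*")  # "[.]" is a literal dot

    assert broad.match("model.layers.0.mlp.gate_proj")       # unintended match
    assert not narrow.match("model.layers.0.mlp.gate_proj")  # quantized again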
From a575e8651d49c5283e5d766a6d2d93760391ae76 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Fri, 14 Nov 2025 15:10:51 +0000
Subject: [PATCH 09/10] don't ignore gate_proj

Summary:
require a literal dot after "gate" so gate_proj layers are still quantized

Signed-off-by: HDCharles
---
 tests/e2e/e2e_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index 6404f0c854..8fdad05963 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -84,11 +84,11 @@ def data_collator(batch):
             targets="Linear",
             scheme=scheme,
             actorder=None,  # added for consistency with past testing configs
-            ignore=["lm_head", "re:.*mlp.gate.*"],
+            ignore=["lm_head", "re:.*mlp.gate[.].*"],
         )
     else:
         oneshot_kwargs["recipe"] = QuantizationModifier(
-            targets="Linear", scheme=scheme, ignore=["lm_head", "re:.*mlp.gate.*"]
+            targets="Linear", scheme=scheme, ignore=["lm_head", "re:.*mlp.gate[.].*"]
         )
 
     # Apply quantization.

From 530ceaea3f9b02e15a3a4148dd1d6a8be61010e4 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Fri, 14 Nov 2025 15:29:23 +0000
Subject: [PATCH 10/10] format

Summary:
wrap the long QuantizationModifier call

Signed-off-by: HDCharles
---
 tests/e2e/e2e_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index 8fdad05963..765d864cc4 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -88,7 +88,9 @@ def data_collator(batch):
         )
     else:
         oneshot_kwargs["recipe"] = QuantizationModifier(
-            targets="Linear", scheme=scheme, ignore=["lm_head", "re:.*mlp.gate[.].*"]
+            targets="Linear",
+            scheme=scheme,
+            ignore=["lm_head", "re:.*mlp.gate[.].*"],
         )
 
     # Apply quantization.
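Note (not part of the patch series): standalone, the final recipe built in
e2e_utils.py amounts to roughly the sketch below. The oneshot entrypoint and
QuantizationModifier come from llm-compressor, though exact signatures may
vary between versions, and the output_dir name is illustrative.

    # Hedged sketch of the data-free FP8_DYNAMIC path with the final ignore list.
    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    recipe = QuantizationModifier(
        targets="Linear",
        scheme="FP8_DYNAMIC",
        ignore=["lm_head", "re:.*mlp.gate[.].*"],  # skip output head and MoE router
    )

    oneshot(
        model="Qwen/Qwen3-30B-A3B",  # the MoE model used by these e2e configs
        recipe=recipe,
        output_dir="Qwen3-30B-A3B-FP8-DYNAMIC",
    )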