From 0f3c2ef8739cda14fd879f698a91fa548c270967 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Wed, 22 Oct 2025 14:53:55 +0000
Subject: [PATCH 01/10] Adding new MoE e2e tests [wip]

Summary:
new e2e tests for MoE + various techniques

Signed-off-by: HDCharles
---
 tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml             | 7 +++++++
 tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml | 4 ++++
 tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml   | 7 +++++++
 3 files changed, 18 insertions(+)
 create mode 100644 tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml
 create mode 100644 tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml
 create mode 100644 tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml

diff --git a/tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml b/tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml
new file mode 100644
index 0000000000..02dbed8ab4
--- /dev/null
+++ b/tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen3-30B-A3B
+scheme: NVFP4
+num_calibration_samples: 20
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
diff --git a/tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml b/tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml
new file mode 100644
index 0000000000..2a8fd2bd05
--- /dev/null
+++ b/tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml
@@ -0,0 +1,4 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen3-30B-A3B
+scheme: FP8_DYNAMIC
diff --git a/tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml b/tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml
new file mode 100644
index 0000000000..8d85dedd03
--- /dev/null
+++ b/tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen3-30B-A3B
+scheme: W4A16
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+quant_type: "GPTQ"
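Note (not part of the patch series): the configs above are plain YAML, so a
minimal standalone check with PyYAML, using the field names from the file this
patch adds, looks like the sketch below.

    # Load one of the new e2e configs and confirm its fields (requires PyYAML).
    import yaml

    with open("tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml") as f:
        cfg = yaml.safe_load(f)

    assert cfg["model"] == "Qwen/Qwen3-30B-A3B"
    assert cfg["scheme"] == "NVFP4"
    assert cfg["num_calibration_samples"] == 20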
From 4e7f7a1d42cd49b32ff8d64f70ea24f5e89ceb3b Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Tue, 4 Nov 2025 15:16:52 +0000
Subject: [PATCH 02/10] making tests run faster

Summary:
rename the MoE configs per model and quantize only the first 20 layers in the
W4A16 test to cut e2e runtime

Signed-off-by: HDCharles
---
 ...p4_nvfp4_moe.yaml => qwen3_fp4_nvfp4.yaml} |  4 +++-
 ....yaml => qwen3_fp8_dynamic_per_token.yaml} |  1 +
 ...oe.yaml => qwen3_w4a16_grouped_quant.yaml} |  6 ++++--
 ...ipe_w4a16_group_quant_first_20_layers.yaml | 20 +++++++++++++++++++
 4 files changed, 28 insertions(+), 3 deletions(-)
 rename tests/e2e/vLLM/configs/{fp4_nvfp4_moe.yaml => qwen3_fp4_nvfp4.yaml} (98%)
 rename tests/e2e/vLLM/configs/{fp8_dynamic_per_token_moe.yaml => qwen3_fp8_dynamic_per_token.yaml} (98%)
 rename tests/e2e/vLLM/configs/{w4a16_grouped_quant_moe.yaml => qwen3_w4a16_grouped_quant.yaml} (54%)
 create mode 100644 tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml

diff --git a/tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml b/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
similarity index 98%
rename from tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml
rename to tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
index 02dbed8ab4..b260864dd3 100644
--- a/tests/e2e/vLLM/configs/fp4_nvfp4_moe.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
@@ -1,7 +1,9 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
+
 scheme: NVFP4
-num_calibration_samples: 20
+
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
+num_calibration_samples: 20
diff --git a/tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml b/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
similarity index 98%
rename from tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml
rename to tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
index 2a8fd2bd05..4c7d26e2d4 100644
--- a/tests/e2e/vLLM/configs/fp8_dynamic_per_token_moe.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
@@ -1,4 +1,5 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
+
 scheme: FP8_DYNAMIC
diff --git a/tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
similarity index 54%
rename from tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml
rename to tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
index 8d85dedd03..a86d7e812f 100644
--- a/tests/e2e/vLLM/configs/w4a16_grouped_quant_moe.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
@@ -1,7 +1,9 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
-scheme: W4A16
+
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
-quant_type: "GPTQ"
+num_calibration_samples: 20
+
+recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
new file mode 100644
index 0000000000..afa5b5aa3a
--- /dev/null
+++ b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
@@ -0,0 +1,20 @@
+quant_stage:
+  quant_modifiers:
+    GPTQModifier:
+      ignore: [
+        "lm_head",
+        # Ignore layers (20+)
+        "re:.*model\\.layers\\.([2-9][0-9])\\..*",
+      ]
+      actorder: null
+      config_groups:
+        group_0:
+          weights:
+            num_bits: 4
+            type: "int"
+            symmetric: False
+            strategy: "group"
+            group_size: 128
+            input_activations: null
+            output_activations: null
+          targets: ["Linear"]

From 18dfa5784085cbd3969e71d554f72f418985d5cd Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Thu, 6 Nov 2025 04:20:57 +0000
Subject: [PATCH 03/10] further shaving int4 layers to improve e2e test time

Summary:
quantize only the first 10 layers instead of 20

Signed-off-by: HDCharles
---
 .../configs/qwen3_w4a16_grouped_quant.yaml    |  2 +-
 ...ipe_w4a16_group_quant_first_20_layers.yaml | 20 -------------------
 2 files changed, 1 insertion(+), 21 deletions(-)
 delete mode 100644 tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml

diff --git a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
index a86d7e812f..c618a070ef 100644
--- a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
@@ -6,4 +6,4 @@ dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 num_calibration_samples: 20
 
-recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
+recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
deleted file mode 100644
index afa5b5aa3a..0000000000
--- a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_20_layers.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-quant_stage:
-  quant_modifiers:
-    GPTQModifier:
-      ignore: [
-        "lm_head",
-        # Ignore layers (20+)
-        "re:.*model\\.layers\\.([2-9][0-9])\\..*",
-      ]
-      actorder: null
-      config_groups:
-        group_0:
-          weights:
-            num_bits: 4
-            type: "int"
-            symmetric: False
-            strategy: "group"
-            group_size: 128
-            input_activations: null
-            output_activations: null
-          targets: ["Linear"]
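Note (not part of the patch series): the ignore regex in these recipes can be
sanity-checked in plain Python. With the "re:" prefix stripped, "([1-9][0-9])"
matches two-digit layer indices 10-99, so only layers 0-9 stay quantized; the
"([2-9][0-9])" variant above matches 20-99 and keeps layers 0-19. The module
names below are illustrative Qwen3-style names.

    # Standalone check of the recipe's ignore pattern.
    import re

    ignored = re.compile(r".*model\.layers\.([1-9][0-9])\..*")

    assert ignored.match("model.layers.10.self_attn.q_proj")        # ignored
    assert ignored.match("model.layers.47.mlp.experts.0.up_proj")   # ignored
    assert not ignored.match("model.layers.9.self_attn.q_proj")     # quantized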
From cc847382b0c8e38ec3be05dd3838c637c701c5ca Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Thu, 6 Nov 2025 15:02:39 +0000
Subject: [PATCH 04/10] formatting

Summary:
remove stray blank lines from the configs and add the first-10-layers recipe
that the previous patch pointed to

Signed-off-by: HDCharles
---
 tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml   |  2 --
 .../configs/qwen3_fp8_dynamic_per_token.yaml  |  1 -
 .../configs/qwen3_w4a16_grouped_quant.yaml    |  2 --
 ...ipe_w4a16_group_quant_first_10_layers.yaml | 20 +++++++++++++++++++
 4 files changed, 20 insertions(+), 5 deletions(-)
 create mode 100644 tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml

diff --git a/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml b/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
index b260864dd3..81fa03ad7d 100644
--- a/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml
@@ -1,9 +1,7 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
-
 scheme: NVFP4
-
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 num_calibration_samples: 20
diff --git a/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml b/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
index 4c7d26e2d4..2a8fd2bd05 100644
--- a/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml
@@ -1,5 +1,4 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
-
 scheme: FP8_DYNAMIC
diff --git a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
index c618a070ef..ce49930255 100644
--- a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
+++ b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
@@ -1,9 +1,7 @@
 cadence: "nightly"
 test_type: "regression"
 model: Qwen/Qwen3-30B-A3B
-
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 num_calibration_samples: 20
-
 recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
new file mode 100644
index 0000000000..15e9f4f943
--- /dev/null
+++ b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
@@ -0,0 +1,20 @@
+quant_stage:
+  quant_modifiers:
+    GPTQModifier:
+      ignore: [
+        "lm_head",
+        # Ignore layers (10+)
+        "re:.*model\\.layers\\.([1-9][0-9])\\..*",
+      ]
+      actorder: null
+      config_groups:
+        group_0:
+          weights:
+            num_bits: 4
+            type: "int"
+            symmetric: False
+            strategy: "group"
+            group_size: 128
+            input_activations: null
+            output_activations: null
+          targets: ["Linear"]

From 92961e4a38d75fd772313be993b08e5acfe07e9c Mon Sep 17 00:00:00 2001
From: Charles Hernandez
Date: Thu, 6 Nov 2025 20:31:20 +0000
Subject: [PATCH 05/10] fix error

Summary:
guard against `scheme` being None before the substring check in run_vllm.py

Signed-off-by: HDCharles
---
 tests/e2e/vLLM/run_vllm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/vLLM/run_vllm.py b/tests/e2e/vLLM/run_vllm.py
index 4daa93db10..aed83f2e56 100644
--- a/tests/e2e/vLLM/run_vllm.py
+++ b/tests/e2e/vLLM/run_vllm.py
@@ -18,7 +18,7 @@ def parse_args():
     except json.JSONDecodeError as e:
         raise ValueError(f"Invalid JSON input: {e}")
 
-    if "W4A16_2of4" in scheme:
+    if scheme is not None and "W4A16_2of4" in scheme:
         # required by the kernel
         llm_kwargs["dtype"] = torch.float16
 
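Note (not part of the patch series): the guard above matters because a
membership test on None raises TypeError, so any config that supplies a recipe
instead of a scheme would crash on this line. A minimal standalone repro:

    # Repro of the bug fixed in patch 05: `in` on None raises TypeError.
    scheme = None  # e.g. a recipe-driven config with no scheme key

    try:
        "W4A16_2of4" in scheme
    except TypeError as e:
        print(e)  # argument of type 'NoneType' is not iterable

    # The fixed form short-circuits before the membership test runs.
    if scheme is not None and "W4A16_2of4" in scheme:
        print("would force torch.float16 for the 2:4 kernel")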
From 75ad5a7d5182ebb17066367f75498e93a1d12112 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Fri, 7 Nov 2025 01:56:27 +0000
Subject: [PATCH 06/10] needs to be symmetric for MoE

Summary:
switch the W4A16 recipe to symmetric weight quantization, which MoE requires

Signed-off-by: HDCharles
---
 .../recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
index 15e9f4f943..0351195ce4 100644
--- a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
+++ b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
@@ -12,7 +12,7 @@ quant_stage:
           weights:
             num_bits: 4
             type: "int"
-            symmetric: False
+            symmetric: True
             strategy: "group"
             group_size: 128
             input_activations: null

From 41f4481336091149bb75b59a248f46e58a2b5da4 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Tue, 11 Nov 2025 03:42:30 +0000
Subject: [PATCH 07/10] removing int4 test

The other tests are working fine; the int4 path needs additional debugging.

Signed-off-by: HDCharles
---
 .../configs/qwen3_w4a16_grouped_quant.yaml    |  7 -------
 ...ipe_w4a16_group_quant_first_10_layers.yaml | 20 -------------------
 2 files changed, 27 deletions(-)
 delete mode 100644 tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
 delete mode 100644 tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml

diff --git a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml b/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
deleted file mode 100644
index ce49930255..0000000000
--- a/tests/e2e/vLLM/configs/qwen3_w4a16_grouped_quant.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-cadence: "nightly"
-test_type: "regression"
-model: Qwen/Qwen3-30B-A3B
-dataset_id: HuggingFaceH4/ultrachat_200k
-dataset_split: train_sft
-num_calibration_samples: 20
-recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
deleted file mode 100644
index 0351195ce4..0000000000
--- a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_first_10_layers.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-quant_stage:
-  quant_modifiers:
-    GPTQModifier:
-      ignore: [
-        "lm_head",
-        # Ignore layers (10+)
-        "re:.*model\\.layers\\.([1-9][0-9])\\..*",
-      ]
-      actorder: null
-      config_groups:
-        group_0:
-          weights:
-            num_bits: 4
-            type: "int"
-            symmetric: True
-            strategy: "group"
-            group_size: 128
-            input_activations: null
-            output_activations: null
-          targets: ["Linear"]

From 80c344e1a9d9454933e070b062e5222877c18c16 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Fri, 14 Nov 2025 04:26:57 +0000
Subject: [PATCH 08/10] skip gate

Summary:
ignore MoE router gate modules when quantizing in e2e_utils.py

Signed-off-by: HDCharles
---
 tests/e2e/e2e_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index 92a272737f..6404f0c854 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -84,11 +84,11 @@ def data_collator(batch):
             targets="Linear",
             scheme=scheme,
             actorder=None,  # added for consistency with past testing configs
-            ignore=["lm_head"],
+            ignore=["lm_head", "re:.*mlp.gate.*"],
         )
     else:
         oneshot_kwargs["recipe"] = QuantizationModifier(
-            targets="Linear", scheme=scheme, ignore=["lm_head"]
+            targets="Linear", scheme=scheme, ignore=["lm_head", "re:.*mlp.gate.*"]
         )
 
     # Apply quantization.
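Note (not part of the patch series): the unescaped dots make this pattern
broader than intended. In "re:.*mlp.gate.*" each "." matches any character, so
the pattern also matches "mlp.gate_proj" in dense models, which is what the
next patch fixes by requiring a literal dot after "gate". A standalone check:

    # Compare the broad pattern from this patch with the narrowed one.
    import re

    broad = re.compile(r".*mlp.gate.*")      # "." is a wildcard here
    narrow = re.compile(r".*mlp.gate[.].*")  # "[.]" is a literal dot

    assert broad.match("model.layers.0.mlp.gate_proj")       # unintended match
    assert not narrow.match("model.layers.0.mlp.gate_proj")  # quantized again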
From a575e8651d49c5283e5d766a6d2d93760391ae76 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Fri, 14 Nov 2025 15:10:51 +0000
Subject: [PATCH 09/10] don't ignore gate_proj

Summary:
require a literal dot after "gate" so gate_proj layers are still quantized

Signed-off-by: HDCharles
---
 tests/e2e/e2e_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index 6404f0c854..8fdad05963 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -84,11 +84,11 @@ def data_collator(batch):
             targets="Linear",
             scheme=scheme,
             actorder=None,  # added for consistency with past testing configs
-            ignore=["lm_head", "re:.*mlp.gate.*"],
+            ignore=["lm_head", "re:.*mlp.gate[.].*"],
         )
     else:
         oneshot_kwargs["recipe"] = QuantizationModifier(
-            targets="Linear", scheme=scheme, ignore=["lm_head", "re:.*mlp.gate.*"]
+            targets="Linear", scheme=scheme, ignore=["lm_head", "re:.*mlp.gate[.].*"]
         )
 
     # Apply quantization.

From 530ceaea3f9b02e15a3a4148dd1d6a8be61010e4 Mon Sep 17 00:00:00 2001
From: HDCharles
Date: Fri, 14 Nov 2025 15:29:23 +0000
Subject: [PATCH 10/10] format

Summary:
wrap the long QuantizationModifier call

Signed-off-by: HDCharles
---
 tests/e2e/e2e_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index 8fdad05963..765d864cc4 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -88,7 +88,9 @@ def data_collator(batch):
         )
     else:
         oneshot_kwargs["recipe"] = QuantizationModifier(
-            targets="Linear", scheme=scheme, ignore=["lm_head", "re:.*mlp.gate[.].*"]
+            targets="Linear",
+            scheme=scheme,
+            ignore=["lm_head", "re:.*mlp.gate[.].*"],
         )
 
     # Apply quantization.
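Note (not part of the patch series): standalone, the final recipe built in
e2e_utils.py amounts to roughly the sketch below. The oneshot entrypoint and
QuantizationModifier come from llm-compressor, though exact signatures may
vary between versions, and the output_dir name is illustrative.

    # Hedged sketch of the data-free FP8_DYNAMIC path with the final ignore list.
    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    recipe = QuantizationModifier(
        targets="Linear",
        scheme="FP8_DYNAMIC",
        ignore=["lm_head", "re:.*mlp.gate[.].*"],  # skip output head and MoE router
    )

    oneshot(
        model="Qwen/Qwen3-30B-A3B",  # the MoE model used by these e2e configs
        recipe=recipe,
        output_dir="Qwen3-30B-A3B-FP8-DYNAMIC",
    )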