@@ -81,8 +81,33 @@ vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h /* mulhi(h, zh) */
8181 */
8282vpsubd %ymm\h,%ymm\l,%ymm12 /* l - mulhi(h, zh)
8383 * = h' - mulhi(q, mullo(h, zl)) */
84- vpaddd %ymm\h,%ymm\l,%ymm\l /* l + mulhi(h, zh)
84+ /*
85+ * VEX Encoding Optimization for Platform-Independent Code
86+ *
87+ * Some assemblers (notably clang) will automatically swap operands of
88+ * commutative instructions like VPADDD to use shorter encodings, while others
89+ * (like gcc) may not. This causes different machine code across platforms.
90+ *
91+ * VEX prefixes come in two forms:
92+ * - 2-byte VEX (0xC5): Can only be used when the ModR/M.rm operand is ymm0-7
93+ * - 3-byte VEX (0xC4): Required when ModR/M.rm operand is ymm8-15
94+ *
95+ * When one operand is in ymm0-7 and another is in ymm8-15, we explicitly
96+ * place the lower-numbered register (ymm0-7) as the second source operand
97+ * to enable the 2-byte VEX encoding. Since VPADDD is commutative, this
98+ * produces identical results while ensuring consistent machine code across
99+ * different assemblers.
100+ *
101+ * Example:
102+ * VPADDD ymm4, ymm4, ymm8 -> 3-byte VEX (0xC4 0xC1 0x5D 0xFE 0xE0)
103+ * VPADDD ymm4, ymm8, ymm4 -> 2-byte VEX (0xC5 0xBD 0xFE 0xE4) ✓ preferred
104+ */
105+ .if (\l < 8) && (\h >= 8)
106+ vpaddd %ymm\l,%ymm\h,%ymm\l /* l + mulhi(h, zh)
85107 * = l' + mulhi(q, mullo(h, zl)) */
108+ .else
109+ vpaddd %ymm\h,%ymm\l,%ymm\l /* same sum; rm operand already in ymm0-7
110+ * (or both high), so keep default order */
110+ .endif
86111
87112vmovshdup %ymm13 ,%ymm13
88113vpblendd $0xAA ,%ymm14 ,%ymm13 ,%ymm13 /* mulhi(q, mullo(h, zl)) */
0 commit comments