@@ -81,8 +81,33 @@ vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h /* mulhi(h, zh) */
8181 */
8282vpsubd %ymm\h,%ymm\l,%ymm12 /* l - mulhi(h, zh)
8383 * = h' - mulhi(q, mullo(h, zl)) */
84- vpaddd %ymm\h,%ymm\l,%ymm\l /* l + mulhi(h, zh)
84+ /*
85+ * VEX Encoding Optimization for Platform-Independent Code
86+ *
87+ * Some assemblers (notably clang) will automatically swap operands of
88+ * commutative instructions like VPADDD to use shorter encodings, while others
89+ * (like gcc) may not. This causes different machine code across platforms.
90+ *
91+ * VEX prefixes come in two forms:
92+ * - 2-byte VEX (0xC5): Can only be used when the ModR/M.rm operand is ymm0-7
93+ * - 3-byte VEX (0xC4): Required when ModR/M.rm operand is ymm8-15
94+ *
95+ * When one operand is in ymm0-7 and another is in ymm8-15, we explicitly
96+ * place the lower-numbered register (ymm0-7) as the second source operand
97+ * to enable the 2-byte VEX encoding. Since VPADDD is commutative, this
98+ * produces identical results while ensuring consistent machine code across
99+ * different assemblers.
100+ *
101+ * Example:
102+ * VPADDD ymm4, ymm4, ymm8 -> 3-byte VEX (0xC4 0xC1 0x5D 0xFE 0xE0)
103+ * VPADDD ymm4, ymm8, ymm4 -> 2-byte VEX (0xC5 0xBD 0xFE 0xE4) ✓ preferred
104+ */
105+ .if (\l < 8) && (\h >= 8)
106+ vpaddd %ymm\l,%ymm\h,%ymm\l /* l + mulhi(h, zh)
85107 * = l' + mulhi(q, mullo(h, zl)) */
108+ .else
109+ vpaddd %ymm\h,%ymm\l,%ymm\l /* same sum; rm operand already in ymm0-7
110+ * (or both high), so keep default order */
110+ .endif
86111
87112vmovshdup %ymm13 ,%ymm13
88113vpblendd $0xAA ,%ymm14 ,%ymm13 ,%ymm13 /* mulhi(q, mullo(h, zl)) */
0 commit comments