Skip to content

Commit 5756943

Browse files
jakemashanno-becker
authored andcommitted
HOL-Light: ML-DSA NTT Platform independent code
Signed-off-by: Jake Massimo <jakemas@amazon.com>
1 parent b3f5140 commit 5756943

File tree

4 files changed

+173
-196
lines changed

4 files changed

+173
-196
lines changed

dev/x86_64/src/ntt.S

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,33 @@ vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h /* mulhi(h, zh) */
8181
*/
8282
vpsubd %ymm\h,%ymm\l,%ymm12 /* l - mulhi(h, zh)
8383
* = h' - mulhi(q, mullo(h, zl)) */
84-
vpaddd %ymm\h,%ymm\l,%ymm\l /* l + mulhi(h, zh)
84+
/*
85+
* VEX Encoding Optimization for Platform-Independent Code
86+
*
87+
* Some assemblers (notably clang) will automatically swap operands of
88+
* commutative instructions like VPADDD to use shorter encodings, while others
89+
* (like gcc) may not. This causes different machine code across platforms.
90+
*
91+
* VEX prefixes come in two forms:
92+
* - 2-byte VEX (0xC5): Can only be used when the ModR/M.rm operand is ymm0-7
93+
* - 3-byte VEX (0xC4): Required when ModR/M.rm operand is ymm8-15
94+
*
95+
* When one operand is in ymm0-7 and another is in ymm8-15, we explicitly
96+
* place the lower-numbered register (ymm0-7) as the second source operand
97+
* to enable the 2-byte VEX encoding. Since VPADDD is commutative, this
98+
* produces identical results while ensuring consistent machine code across
99+
* different assemblers.
100+
*
101+
* Example:
102+
* VPADDD ymm4, ymm4, ymm8 -> 3-byte VEX (0xC4 0xC1 0x5D 0xFE 0xE0)
103+
* VPADDD ymm4, ymm8, ymm4 -> 2-byte VEX (0xC5 0xBD 0xFE 0xE4) ✓ preferred
104+
*/
105+
.if (\l < 8) && (\h >= 8)
106+
vpaddd %ymm\l,%ymm\h,%ymm\l /* l + mulhi(h, zh)
85107
* = l' + mulhi(q, mullo(h, zl)) */
108+
.else
109+
vpaddd %ymm\h,%ymm\l,%ymm\l
110+
.endif
86111

87112
vmovshdup %ymm13,%ymm13
88113
vpblendd $0xAA,%ymm14,%ymm13,%ymm13 /* mulhi(q, mullo(h, zl)) */

0 commit comments

Comments
 (0)