;-----------------------------------------------------------------------
; AMX mnemonic coverage listing.
;
; Syntax: NASM (Intel operand order), 64-bit mode.
;
; The file defines one macro, `amx`, that emits every AMX-family
; mnemonic listed below once, and then expands that macro eight times
; with rotating tile-register numbers.  Nothing here is meant to be
; executed as a program; it is a straight-line instruction listing.
;
; The tag at the end of each instruction line is the feature label
; carried with that mnemonic in this listing.
;-----------------------------------------------------------------------

bits 64

;-----------------------------------------------------------------------
; amx  a, b, c
;
;   %1 (a)  number used to form tmm<a> (first tile operand) and
;           zmm<a> (vector destination); also passed literally as the
;           third operand of the row-convert / row-move forms.
;   %2 (b)  number used to form tmm<b> (second tile operand).
;   %3 (c)  number used to form tmm<c> (third tile operand).
;
; Fixed operands: rsi/rdi for the tile-config load/store, rax as base
; and rdx as index for every tile memory operand, eax as the register
; form of the row selector.
;
; NOTE: treg/treg2/treg3/zreg are ordinary single-line macros, so they
; are file-global and simply get redefined on every expansion.
;-----------------------------------------------------------------------
%macro amx 3
  %define treg  tmm %+ %1
  %define treg2 tmm %+ %2
  %define treg3 tmm %+ %3
  %define zreg  zmm %+ %1

        ; --- tile configuration -------------------------------------
        ldtilecfg          [rsi]                    ; AMX-TILE
        sttilecfg          [rdi]                    ; AMX-TILE

        tilezero           treg                     ; AMX-TILE

        ; --- tile loads: base, base+index, base+index*2 -------------
        tileloadd          treg, [rax]              ; AMX-TILE
        tileloadd          treg, [rax+rdx]          ; AMX-TILE
        tileloadd          treg, [rax+rdx*2]        ; AMX-TILE

        tileloaddt1        treg, [rax]              ; AMX-TILE
        tileloaddt1        treg, [rax+rdx]          ; AMX-TILE
        tileloaddt1        treg, [rax+rdx*2]        ; AMX-TILE

        tileloaddrs        treg, [rax]              ; AMX-MOVRS
        tileloaddrs        treg, [rax+rdx]          ; AMX-MOVRS
        tileloaddrs        treg, [rax+rdx*2]        ; AMX-MOVRS

        tileloaddrst1      treg, [rax]              ; AMX-MOVRS
        tileloaddrst1      treg, [rax+rdx]          ; AMX-MOVRS
        tileloaddrst1      treg, [rax+rdx*2]        ; AMX-MOVRS

        ; --- three-tile arithmetic forms ----------------------------
        tdpbf16ps          treg, treg2, treg3       ; AMX-BF16
        tdpbssd            treg, treg2, treg3       ; AMX-INT8
        tdpbusd            treg, treg2, treg3       ; AMX-INT8
        tdpbsud            treg, treg2, treg3       ; AMX-INT8
        tdpbuud            treg, treg2, treg3       ; AMX-INT8
        tdpfp16ps          treg, treg2, treg3       ; AMX-FP16
        tcmmimfp16ps       treg, treg2, treg3       ; AMX-COMPLEX
        tcmmrlfp16ps       treg, treg2, treg3       ; AMX-COMPLEX

        tmmultf32ps        treg, treg2, treg3       ; AMX-TF32

        tdpbf8ps           treg, treg2, treg3       ; AMX-FP8
        tdpbhf8ps          treg, treg2, treg3       ; AMX-FP8
        tdphbf8ps          treg, treg2, treg3       ; AMX-FP8
        tdphf8ps           treg, treg2, treg3       ; AMX-FP8

        ; --- tile row -> zmm: register selector, then literal %1 ----
        tcvtrowd2ps        zreg, treg, eax          ; AMX-AVX512
        tcvtrowd2ps        zreg, treg, %1           ; AMX-AVX512
        tcvtrowps2bf16h    zreg, treg, eax          ; AMX-AVX512
        tcvtrowps2bf16h    zreg, treg, %1           ; AMX-AVX512
        tcvtrowps2bf16l    zreg, treg, eax          ; AMX-AVX512
        tcvtrowps2bf16l    zreg, treg, %1           ; AMX-AVX512
        tcvtrowps2phh      zreg, treg, eax          ; AMX-AVX512
        tcvtrowps2phh      zreg, treg, %1           ; AMX-AVX512
        tcvtrowps2phl      zreg, treg, eax          ; AMX-AVX512
        tcvtrowps2phl      zreg, treg, %1           ; AMX-AVX512
        tilemovrow         zreg, treg, eax          ; AMX-AVX512
        tilemovrow         zreg, treg, %1           ; AMX-AVX512

        ; --- transpose-family loads ---------------------------------
        t2rpntlvwz0        treg, [rax]              ; AMX-TRANSPOSE
        t2rpntlvwz0        treg, [rax+rdx]          ; AMX-TRANSPOSE
        t2rpntlvwz0        treg, [rax+rdx*2]        ; AMX-TRANSPOSE

        t2rpntlvwz0t1      treg, [rax]              ; AMX-TRANSPOSE
        t2rpntlvwz0t1      treg, [rax+rdx]          ; AMX-TRANSPOSE
        t2rpntlvwz0t1      treg, [rax+rdx*2]        ; AMX-TRANSPOSE

        t2rpntlvwz1        treg, [rax]              ; AMX-TRANSPOSE
        t2rpntlvwz1        treg, [rax+rdx]          ; AMX-TRANSPOSE
        t2rpntlvwz1        treg, [rax+rdx*2]        ; AMX-TRANSPOSE

        t2rpntlvwz1t1      treg, [rax]              ; AMX-TRANSPOSE
        t2rpntlvwz1t1      treg, [rax+rdx]          ; AMX-TRANSPOSE
        t2rpntlvwz1t1      treg, [rax+rdx*2]        ; AMX-TRANSPOSE

        ttransposed        treg, treg               ; AMX-TRANSPOSE

        t2rpntlvwz0rs      treg, [rax]              ; AMX-TRANSPOSE + AMX-MOVRS
        t2rpntlvwz0rs      treg, [rax+rdx]          ; AMX-TRANSPOSE + AMX-MOVRS
        t2rpntlvwz0rs      treg, [rax+rdx*2]        ; AMX-TRANSPOSE + AMX-MOVRS

        t2rpntlvwz0rst1    treg, [rax]              ; AMX-TRANSPOSE + AMX-MOVRS
        t2rpntlvwz0rst1    treg, [rax+rdx]          ; AMX-TRANSPOSE + AMX-MOVRS
        t2rpntlvwz0rst1    treg, [rax+rdx*2]        ; AMX-TRANSPOSE + AMX-MOVRS

        t2rpntlvwz1rs      treg, [rax]              ; AMX-TRANSPOSE + AMX-MOVRS
        t2rpntlvwz1rs      treg, [rax+rdx]          ; AMX-TRANSPOSE + AMX-MOVRS
        t2rpntlvwz1rs      treg, [rax+rdx*2]        ; AMX-TRANSPOSE + AMX-MOVRS

        t2rpntlvwz1rst1    treg, [rax]              ; AMX-TRANSPOSE + AMX-MOVRS
        t2rpntlvwz1rst1    treg, [rax+rdx]          ; AMX-TRANSPOSE + AMX-MOVRS
        t2rpntlvwz1rst1    treg, [rax+rdx*2]        ; AMX-TRANSPOSE + AMX-MOVRS

        ; --- transpose-family arithmetic ----------------------------
        ttdpbf16ps         treg, treg2, treg3       ; AMX-TRANSPOSE + AMX-BF16
        ttdpfp16ps         treg, treg2, treg3       ; AMX-TRANSPOSE + AMX-FP16
        ttcmmimfp16ps      treg, treg2, treg3       ; AMX-TRANSPOSE + AMX-COMPLEX
        ttcmmrlfp16ps      treg, treg2, treg3       ; AMX-TRANSPOSE + AMX-COMPLEX
        tconjtcmmimfp16ps  treg, treg2, treg3       ; AMX-TRANSPOSE + AMX-COMPLEX
        tconjtfp16         treg, treg               ; AMX-TRANSPOSE + AMX-COMPLEX

        ttmmultf32ps       treg, treg2, treg3       ; AMX-TRANSPOSE + AMX-TF32

        ; --- tile stores: written base+index like the loads above ---
        tilestored         [rax], treg              ; AMX-TILE
        tilestored         [rax+rdx], treg          ; AMX-TILE
        tilestored         [rax+rdx*2], treg        ; AMX-TILE

        tilerelease                                 ; AMX-TILE
%endmacro

;-----------------------------------------------------------------------
; Driver: expand `amx` eight times.  n, m and l start at 0, 1 and 2 and
; each advances by one (mod 8) per iteration, so the expansions use
; (0,1,2), (1,2,3), ... (6,7,0), (7,0,1).  The modulo keeps m and l in
; the range 0..7 when they step past 7.
;-----------------------------------------------------------------------
%assign n 0
%assign m 1
%assign l 2
%rep 8
        amx n, m, l
  %assign n ((n+1) % 8)
  %assign m ((m+1) % 8)
  %assign l ((l+1) % 8)
%endrep