|
| 1 | +// Copyright 2025 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +//go:build loong64 && gc && !purego |
| 6 | + |
| 7 | +#include "textflag.h" |
| 8 | + |
| 9 | +DATA ·iv0<>+0(SB)/4, $0x6a09e667 // iv0[0]; NOTE(review): presumably BLAKE2s IV words 0..3 - confirm against the generic Go implementation
| 10 | +DATA ·iv0<>+4(SB)/4, $0xbb67ae85 // iv0[1]
| 11 | +DATA ·iv0<>+8(SB)/4, $0x3c6ef372 // iv0[2]
| 12 | +DATA ·iv0<>+12(SB)/4, $0xa54ff53a // iv0[3]
| 13 | +GLOBL ·iv0<>(SB), RODATA|NOPTR, $16 // file-local, read-only, 16 bytes; hashBlocksVX loads it whole into V2 for every block
| 14 | + |
| 15 | +DATA ·iv1<>+0(SB)/4, $0x510e527f // iv1[0]; hashBlocksVX XORs this word with counter word c0
| 16 | +DATA ·iv1<>+4(SB)/4, $0x9b05688c // iv1[1]; XORed with counter word c1
| 17 | +DATA ·iv1<>+8(SB)/4, $0x1f83d9ab // iv1[2]; XORed with flag
| 18 | +DATA ·iv1<>+12(SB)/4, $0x5be0cd19 // iv1[3]; used unmodified
| 19 | +GLOBL ·iv1<>(SB), RODATA|NOPTR, $16 // file-local, read-only, 16 bytes; read word by word to build V3
| 20 | + |
| 21 | +#define SHUFFLE_1 \ // permute the 32-bit lanes of V1, V2, V3 in place (V0 is left alone); SHUFFLE_2 undoes it
| 22 | + VSHUF4IW $57, V1, V1; \ // 57 = 0b00_11_10_01: new lanes 0..3 take old lanes 1,2,3,0
| 23 | + VSHUF4IW $78, V2, V2; \ // 78 = 0b01_00_11_10: new lanes 0..3 take old lanes 2,3,0,1
| 24 | + VSHUF4IW $147, V3, V3; \ // 147 = 0b10_01_00_11: new lanes 0..3 take old lanes 3,0,1,2
| 25 | + |
| 26 | +#define SHUFFLE_2 \ // inverse of SHUFFLE_1: same selectors, with those for V1 and V3 swapped
| 27 | + VSHUF4IW $147, V1, V1; \ // new lanes 0..3 take old lanes 3,0,1,2 (undoes selector 57)
| 28 | + VSHUF4IW $78, V2, V2; \ // swapping lane pairs is its own inverse
| 29 | + VSHUF4IW $57, V3, V3; \ // new lanes 0..3 take old lanes 1,2,3,0 (undoes selector 147)
| 30 | + |
| 31 | +#define LOAD_M(a, b, c, d, e, f, g, h) \ // pack eight general registers into V8 = {a, b, c, d} and V9 = {e, f, g, h}
| 32 | + VMOVQ a, V8.W[0]; \ // V8 holds the word added first in ROUND_0/ROUND_8
| 33 | + VMOVQ b, V8.W[1]; \
| 34 | + VMOVQ c, V8.W[2]; \
| 35 | + VMOVQ d, V8.W[3]; \
| 36 | + VMOVQ e, V9.W[0]; \ // V9 holds the word added second
| 37 | + VMOVQ f, V9.W[1]; \
| 38 | + VMOVQ g, V9.W[2]; \
| 39 | + VMOVQ h, V9.W[3]; \
| 40 | + |
| 41 | +#define ROUND_0 \ // mixing step on four lanes at once; below a=V0, b=V1, c=V2, d=V3, inputs V8 then V9
| 42 | + VADDW V0, V8, V0; \ // a += V8
| 43 | + VADDW V0, V1, V0; \ // a += b
| 44 | + VXORV V3, V0, V3; \ // d ^= a
| 45 | + VROTRW $16, V3, V3; \ // d = rotate-right(d, 16) per 32-bit lane
| 46 | + VADDW V2, V3, V2; \ // c += d
| 47 | + VXORV V1, V2, V1; \ // b ^= c
| 48 | + VROTRW $12, V1, V1; \ // b = rotate-right(b, 12)
| 49 | + VADDW V0, V9, V0; \ // a += V9
| 50 | + VADDW V0, V1, V0; \ // a += b
| 51 | + VXORV V3, V0, V3; \ // d ^= a
| 52 | + VROTRW $8, V3, V3; \ // d = rotate-right(d, 8)
| 53 | + VADDW V2, V3, V2; \ // c += d
| 54 | + VXORV V1, V2, V1; \ // b ^= c
| 55 | + VROTRW $7, V1, V1; \ // b = rotate-right(b, 7)
| 56 | + |
| 57 | +#define ROUND_8 ROUND_0 // alias with an identical body; hashBlocksVX invokes it after the second LOAD_M of each round
| 58 | + |
| 59 | +// func hashBlocksVX(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
| 60 | +TEXT ·hashBlocksVX(SB), NOSPLIT, $0-48 // no local frame, 48 bytes of arguments; updates *h and *c in place
| 61 | + MOVV h+0(FP), R4 // R4 = h, the eight-word state
| 62 | + MOVV c+8(FP), R5 // R5 = c, the two-word counter
| 63 | + MOVWU flag+16(FP), R6 // R6 = flag, zero-extended
| 64 | + MOVV blocks_base+24(FP), R7 // R7 = address of the current 64-byte block
| 65 | + MOVV blocks_len+32(FP), R8 // R8 = bytes left; NOTE(review): no entry check - assumes len(blocks) is a non-zero multiple of 64, confirm callers
| 66 | + MOVW (R5), R9 // c0: counter word that receives +64 per block
| 67 | + MOVW 4(R5), R10 // c1: counter word that receives the carry from c0
| 68 | +
| 69 | +loop: // one iteration per 64-byte block
| 70 | + ADD $0x40, R9 // c0 += 64
| 71 | + SGTU $0x40, R9, R11 // R11 = 1 if the updated c0 is below 64, i.e. the add wrapped; else 0
| 72 | + ADD R10, R11, R10 // c1 += carry
| 73 | +
| 74 | + MOVV $·iv0<>(SB), R11 // R11 = &iv0
| 75 | + MOVV $·iv1<>(SB), R12 // R12 = &iv1
| 76 | + MOVWU 0(R12), R13 // v12 = iv1[0]
| 77 | + MOVWU 4(R12), R14 // v13 = iv1[1]
| 78 | + MOVWU 8(R12), R15 // v14 = iv1[2]
| 79 | + MOVWU 12(R12), R16 // v15 = iv1[3]
| 80 | + XOR R13, R9, R13 // v12 ^= c0
| 81 | + XOR R14, R10, R14 // v13 ^= c1
| 82 | + XOR R15, R6, R15 // v14 ^= flag
| 83 | +
| 84 | + VMOVQ (R4), V0 // V0 = h[0..3]
| 85 | + VMOVQ 16(R4), V1 // V1 = h[4..7]
| 86 | + VMOVQ (R11), V2 // V2 = iv0[0..3]
| 87 | + VMOVQ R16, V3.W[3] // V3 = {v12, v13, v14, v15}, inserted one lane at a time
| 88 | + VMOVQ R13, V3.W[0]
| 89 | + VMOVQ R14, V3.W[1]
| 90 | + VMOVQ R15, V3.W[2]
| 91 | +
| 92 | + MOVWU (R7), R11 // m0: the 16 words of the block stay in general registers for all rounds
| 93 | + MOVWU 4(R7), R12 // m1
| 94 | + MOVWU 8(R7), R13 // m2
| 95 | + MOVWU 12(R7), R14 // m3
| 96 | + MOVWU 16(R7), R15 // m4
| 97 | + MOVWU 20(R7), R16 // m5
| 98 | + MOVWU 24(R7), R17 // m6
| 99 | + MOVWU 28(R7), R18 // m7
| 100 | + MOVWU 32(R7), R19 // m8
| 101 | + MOVWU 36(R7), R24 // m9
| 102 | + MOVWU 40(R7), R25 // m10
| 103 | + MOVWU 44(R7), R26 // m11
| 104 | + MOVWU 48(R7), R27 // m12
| 105 | + MOVWU 52(R7), R28 // m13
| 106 | + MOVWU 56(R7), R29 // m14
| 107 | + MOVWU 60(R7), R30 // m15; NOTE(review): R30 is believed to be the loong64 assembler temporary (REGTMP) - confirm no instruction below is expanded through it
| 108 | +
| 109 | + LOAD_M(R11, R13, R15, R17, R12, R14, R16, R18) // round 1, step 1: V8 = {m0, m2, m4, m6}, V9 = {m1, m3, m5, m7}
| 110 | + ROUND_0
| 111 | + SHUFFLE_1
| 112 | + LOAD_M(R19, R25, R27, R29, R24, R26, R28, R30) // round 1, step 2: V8 = {m8, m10, m12, m14}, V9 = {m9, m11, m13, m15}
| 113 | + ROUND_8
| 114 | + SHUFFLE_2
| 115 | +
| 116 | + LOAD_M(R29, R15, R24, R28, R25, R19, R30, R17) // round 2, step 1: V8 = {m14, m4, m9, m13}, V9 = {m10, m8, m15, m6}
| 117 | + ROUND_0
| 118 | + SHUFFLE_1
| 119 | + LOAD_M(R12, R11, R26, R16, R27, R13, R18, R14) // round 2, step 2: V8 = {m1, m0, m11, m5}, V9 = {m12, m2, m7, m3}
| 120 | + ROUND_8
| 121 | + SHUFFLE_2
| 122 | +
| 123 | + LOAD_M(R26, R27, R16, R30, R19, R11, R13, R28) // round 3, step 1: V8 = {m11, m12, m5, m15}, V9 = {m8, m0, m2, m13}
| 124 | + ROUND_0
| 125 | + SHUFFLE_1
| 126 | + LOAD_M(R25, R14, R18, R24, R29, R17, R12, R15) // round 3, step 2: V8 = {m10, m3, m7, m9}, V9 = {m14, m6, m1, m4}
| 127 | + ROUND_8
| 128 | + SHUFFLE_2
| 129 | +
| 130 | + LOAD_M(R18, R14, R28, R26, R24, R12, R27, R29) // round 4, step 1: V8 = {m7, m3, m13, m11}, V9 = {m9, m1, m12, m14}
| 131 | + ROUND_0
| 132 | + SHUFFLE_1
| 133 | + LOAD_M(R13, R16, R15, R30, R17, R25, R11, R19) // round 4, step 2: V8 = {m2, m5, m4, m15}, V9 = {m6, m10, m0, m8}
| 134 | + ROUND_8
| 135 | + SHUFFLE_2
| 136 | +
| 137 | + LOAD_M(R24, R16, R13, R25, R11, R18, R15, R30) // round 5, step 1: V8 = {m9, m5, m2, m10}, V9 = {m0, m7, m4, m15}
| 138 | + ROUND_0
| 139 | + SHUFFLE_1
| 140 | + LOAD_M(R29, R26, R17, R14, R12, R27, R19, R28) // round 5, step 2: V8 = {m14, m11, m6, m3}, V9 = {m1, m12, m8, m13}
| 141 | + ROUND_8
| 142 | + SHUFFLE_2
| 143 | +
| 144 | + LOAD_M(R13, R17, R11, R19, R27, R25, R26, R14) // round 6, step 1: V8 = {m2, m6, m0, m8}, V9 = {m12, m10, m11, m3}
| 145 | + ROUND_0
| 146 | + SHUFFLE_1
| 147 | + LOAD_M(R15, R18, R30, R12, R28, R16, R29, R24) // round 6, step 2: V8 = {m4, m7, m15, m1}, V9 = {m13, m5, m14, m9}
| 148 | + ROUND_8
| 149 | + SHUFFLE_2
| 150 | +
| 151 | + LOAD_M(R27, R12, R29, R15, R16, R30, R28, R25) // round 7, step 1: V8 = {m12, m1, m14, m4}, V9 = {m5, m15, m13, m10}
| 152 | + ROUND_0
| 153 | + SHUFFLE_1
| 154 | + LOAD_M(R11, R17, R24, R19, R18, R14, R13, R26) // round 7, step 2: V8 = {m0, m6, m9, m8}, V9 = {m7, m3, m2, m11}
| 155 | + ROUND_8
| 156 | + SHUFFLE_2
| 157 | +
| 158 | + LOAD_M(R28, R18, R27, R14, R26, R29, R12, R24) // round 8, step 1: V8 = {m13, m7, m12, m3}, V9 = {m11, m14, m1, m9}
| 159 | + ROUND_0
| 160 | + SHUFFLE_1
| 161 | + LOAD_M(R16, R30, R19, R13, R11, R15, R17, R25) // round 8, step 2: V8 = {m5, m15, m8, m2}, V9 = {m0, m4, m6, m10}
| 162 | + ROUND_8
| 163 | + SHUFFLE_2
| 164 | +
| 165 | + LOAD_M(R17, R29, R26, R11, R30, R24, R14, R19) // round 9, step 1: V8 = {m6, m14, m11, m0}, V9 = {m15, m9, m3, m8}
| 166 | + ROUND_0
| 167 | + SHUFFLE_1
| 168 | + LOAD_M(R27, R28, R12, R25, R13, R18, R15, R16) // round 9, step 2: V8 = {m12, m13, m1, m10}, V9 = {m2, m7, m4, m5}
| 169 | + ROUND_8
| 170 | + SHUFFLE_2
| 171 | +
| 172 | + LOAD_M(R25, R19, R18, R12, R13, R15, R17, R16) // round 10, step 1: V8 = {m10, m8, m7, m1}, V9 = {m2, m4, m6, m5}
| 173 | + ROUND_0
| 174 | + SHUFFLE_1
| 175 | + LOAD_M(R30, R24, R14, R28, R26, R29, R27, R11) // round 10, step 2: V8 = {m15, m9, m3, m13}, V9 = {m11, m14, m12, m0}
| 176 | + ROUND_8
| 177 | + SHUFFLE_2
| 178 | +
| 179 | + VMOVQ (R4), V8 // reload the incoming h[0..3]; V8/V9 are free now that the rounds are done
| 180 | + VMOVQ 16(R4), V9 // reload the incoming h[4..7]
| 181 | + VXORV V8, V0, V8 // h[0..3] ^ V0
| 182 | + VXORV V9, V1, V9 // h[4..7] ^ V1
| 183 | + VXORV V8, V2, V8 // ... ^ V2
| 184 | + VXORV V9, V3, V9 // ... ^ V3
| 185 | + VMOVQ V8, (R4) // store new h[0..3]
| 186 | + VMOVQ V9, 16(R4) // store new h[4..7]
| 187 | +
| 188 | + SUBV $64, R8 // one block consumed
| 189 | + ADDV $64, R7 // advance to the next block
| 190 | + BNE R8, R0, loop // repeat until the remaining byte count reaches exactly zero
| 191 | +
| 192 | + MOVW R9, (R5) // write the updated counter back: c[0]
| 193 | + MOVW R10, 4(R5) // c[1]
| 194 | +
| 195 | + RET
| 196 | + |
0 commit comments