1 | | -/* SPDX-License-Identifier: GPL-2.0 */ |
| 1 | +/* SPDX-License-Identifier: GPL-2.0-only */ |
| 2 | +/* |
| 3 | + * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com> |
| 4 | + */ |
2 | 5 | |
3 | 6 | #include <linux/linkage.h> |
4 | 7 | #include <asm/asm.h> |
5 | 8 | |
6 | | -ENTRY(__memmove) |
7 | | -WEAK(memmove) |
8 | | - move t0, a0 |
9 | | - move t1, a1 |
10 | | - |
11 | | - beq a0, a1, exit_memcpy |
12 | | - beqz a2, exit_memcpy |
13 | | - srli t2, a2, 0x2 |
14 | | - |
15 | | - slt t3, a0, a1 |
16 | | - beqz t3, do_reverse |
17 | | - |
18 | | - andi a2, a2, 0x3 |
19 | | - li t4, 1 |
20 | | - beqz t2, byte_copy |
21 | | - |
22 | | -word_copy: |
23 | | - lw t3, 0(a1) |
24 | | - addi t2, t2, -1 |
25 | | - addi a1, a1, 4 |
26 | | - sw t3, 0(a0) |
27 | | - addi a0, a0, 4 |
28 | | - bnez t2, word_copy |
29 | | - beqz a2, exit_memcpy |
30 | | - j byte_copy |
31 | | - |
32 | | -do_reverse: |
33 | | - add a0, a0, a2 |
34 | | - add a1, a1, a2 |
35 | | - andi a2, a2, 0x3 |
36 | | - li t4, -1 |
37 | | - beqz t2, reverse_byte_copy |
38 | | - |
39 | | -reverse_word_copy: |
40 | | - addi a1, a1, -4 |
41 | | - addi t2, t2, -1 |
42 | | - lw t3, 0(a1) |
43 | | - addi a0, a0, -4 |
44 | | - sw t3, 0(a0) |
45 | | - bnez t2, reverse_word_copy |
46 | | - beqz a2, exit_memcpy |
47 | | - |
48 | | -reverse_byte_copy: |
49 | | - addi a0, a0, -1 |
50 | | - addi a1, a1, -1 |
| 9 | +SYM_FUNC_START(__memmove) |
| 10 | +SYM_FUNC_START_WEAK(memmove) |
| 11 | + /* |
| 12 | + * Returns |
| 13 | + * a0 - dest |
| 14 | + * |
| 15 | + * Parameters |
| 16 | + * a0 - Inclusive first byte of dest |
| 17 | + * a1 - Inclusive first byte of src |
| 18 | + * a2 - Length of copy n |
| 19 | + * |
| 20 | + * Because the return matches the parameter register a0, |
| 21 | + * we will not clobber or modify that register. |
| 22 | + * |
| 23 | + * Note: This currently only works on little-endian. |
| 24 | + * To port to big-endian, reverse the direction of shifts |
| 25 | + * in the 2 misaligned fixup copy loops. |
| 26 | + */ |
51 | 27 | |
| 28 | + /* Return if nothing to do */ |
| 29 | + beq a0, a1, return_from_memmove |
| 30 | + beqz a2, return_from_memmove |
| 31 | + |
| 32 | + /* |
| 33 | + * Register Uses |
| 34 | + * Forward Copy: a1 - Index counter of src |
| 35 | + * Reverse Copy: a4 - Index counter of src |
| 36 | + * Forward Copy: t3 - Index counter of dest |
| 37 | + * Reverse Copy: t4 - Index counter of dest |
| 38 | + * Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest |
| 39 | + * Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest |
| 40 | + * Both Copy Modes: t0 - Link / Temporary for load-store |
| 41 | + * Both Copy Modes: t1 - Temporary for load-store |
| 42 | + * Both Copy Modes: t2 - Temporary for load-store |
| 43 | + * Both Copy Modes: a5 - dest to src alignment offset |
| 44 | + * Both Copy Modes: a6 - Shift amount |
| 45 | + * Both Copy Modes: a7 - Inverse Shift amount |
| 46 | + * Both Copy Modes: a2 - Alternate breakpoint for unrolled loops |
| 47 | + */ |
| 48 | + |
| 49 | + /* |
| 50 | + * Solve for some register values now. |
| 51 | + * Byte copy does not need t5 or t6. |
| 52 | + */ |
| 53 | + mv t3, a0 |
| 54 | + add t4, a0, a2 |
| 55 | + add a4, a1, a2 |
| 56 | + |
| 57 | + /* |
| 58 | + * Byte copy if copying less than (2 * SZREG) bytes. This can |
| 59 | + * cause problems with the bulk copy implementation and is |
| 60 | + * small enough not to bother. |
| 61 | + */ |
| 62 | + andi t0, a2, -(2 * SZREG) |
| 63 | + beqz t0, byte_copy |
| 64 | + |
| 65 | + /* |
| 66 | + * Now solve for t5 and t6. |
| 67 | + */ |
| 68 | + andi t5, t3, -SZREG |
| 69 | + andi t6, t4, -SZREG |
| 70 | + /* |
| 71 | + * If dest (register t3) rounded down to the nearest naturally |
| 72 | + * aligned SZREG address does not equal dest, then add SZREG |
| 73 | + * to find the low-bound of SZREG alignment in the dest memory |
| 74 | + * region. Note that this could overshoot the dest memory |
| 75 | + * region if n is less than SZREG. This is one reason why |
| 76 | + * we always byte copy if n is less than SZREG. |
| 77 | + * Otherwise, dest is already naturally aligned to SZREG. |
| 78 | + */ |
| 79 | + beq t5, t3, 1f |
| 80 | + addi t5, t5, SZREG |
| 81 | + 1: |
| 82 | + |
| 83 | + /* |
| 84 | + * If the dest and src are co-aligned to SZREG, then there is |
| 85 | + * no need for the full rigmarole of a full misaligned fixup copy. |
| 86 | + * Instead, do a simpler co-aligned copy. |
| 87 | + */ |
| 88 | + xor t0, a0, a1 |
| 89 | + andi t1, t0, (SZREG - 1) |
| 90 | + beqz t1, coaligned_copy |
| 91 | + /* Fall through to misaligned fixup copy */ |
| 92 | + |
| 93 | +misaligned_fixup_copy: |
| 94 | + bltu a1, a0, misaligned_fixup_copy_reverse |
| 95 | + |
| 96 | +misaligned_fixup_copy_forward: |
| 97 | + jal t0, byte_copy_until_aligned_forward |
| 98 | + |
| 99 | + andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */ |
| 100 | + slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */ |
| 101 | + sub a5, a1, t3 /* Find the difference between src and dest */ |
| 102 | + andi a1, a1, -SZREG /* Align the src pointer */ |
| 103 | + addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */ |
| 104 | + |
| 105 | + /* |
| 106 | + * Compute The Inverse Shift |
| 107 | + * a7 = XLEN - a6 = XLEN + -a6 |
| 108 | + * 2s complement negation to find the negative: -a6 = ~a6 + 1 |
| 109 | + * Add that to XLEN. XLEN = SZREG * 8. |
| 110 | + */ |
| 111 | + not a7, a6 |
| 112 | + addi a7, a7, (SZREG * 8 + 1) |
| 113 | + |
| 114 | + /* |
| 115 | + * Fix Misalignment Copy Loop - Forward |
| 116 | + * load_val0 = load_ptr[0]; |
| 117 | + * do { |
| 118 | + * load_val1 = load_ptr[1]; |
| 119 | + * store_ptr += 2; |
| 120 | + * store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7}); |
| 121 | + * |
| 122 | + * if (store_ptr == {a2}) |
| 123 | + * break; |
| 124 | + * |
| 125 | + * load_val0 = load_ptr[2]; |
| 126 | + * load_ptr += 2; |
| 127 | + * store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7}); |
| 128 | + * |
| 129 | + * } while (store_ptr != store_ptr_end); |
| 130 | + * store_ptr = store_ptr_end; |
| 131 | + */ |
| 132 | + |
| 133 | + REG_L t0, (0 * SZREG)(a1) |
| 134 | + 1: |
| 135 | + REG_L t1, (1 * SZREG)(a1) |
| 136 | + addi t3, t3, (2 * SZREG) |
| 137 | + srl t0, t0, a6 |
| 138 | + sll t2, t1, a7 |
| 139 | + or t2, t0, t2 |
| 140 | + REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3) |
| 141 | + |
| 142 | + beq t3, a2, 2f |
| 143 | + |
| 144 | + REG_L t0, (2 * SZREG)(a1) |
| 145 | + addi a1, a1, (2 * SZREG) |
| 146 | + srl t1, t1, a6 |
| 147 | + sll t2, t0, a7 |
| 148 | + or t2, t1, t2 |
| 149 | + REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3) |
| 150 | + |
| 151 | + bne t3, t6, 1b |
| 152 | + 2: |
| 153 | + mv t3, t6 /* Fix the dest pointer in case the loop was broken */ |
| 154 | + |
| 155 | + add a1, t3, a5 /* Restore the src pointer */ |
| 156 | + j byte_copy_forward /* Copy any remaining bytes */ |
| 157 | + |
| 158 | +misaligned_fixup_copy_reverse: |
| 159 | + jal t0, byte_copy_until_aligned_reverse |
| 160 | + |
| 161 | + andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */ |
| 162 | + slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */ |
| 163 | + sub a5, a4, t4 /* Find the difference between src and dest */ |
| 164 | + andi a4, a4, -SZREG /* Align the src pointer */ |
| 165 | + addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */ |
| 166 | + |
| 167 | + /* |
| 168 | + * Compute The Inverse Shift |
| 169 | + * a7 = XLEN - a6 = XLEN + -a6 |
| 170 | + * 2s complement negation to find the negative: -a6 = ~a6 + 1 |
| 171 | + * Add that to XLEN. XLEN = SZREG * 8. |
| 172 | + */ |
| 173 | + not a7, a6 |
| 174 | + addi a7, a7, (SZREG * 8 + 1) |
| 175 | + |
| 176 | + /* |
| 177 | + * Fix Misalignment Copy Loop - Reverse |
| 178 | + * load_val1 = load_ptr[0]; |
| 179 | + * do { |
| 180 | + * load_val0 = load_ptr[-1]; |
| 181 | + * store_ptr -= 2; |
| 182 | + * store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7}); |
| 183 | + * |
| 184 | + * if (store_ptr == {a2}) |
| 185 | + * break; |
| 186 | + * |
| 187 | + * load_val1 = load_ptr[-2]; |
| 188 | + * load_ptr -= 2; |
| 189 | + * store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7}); |
| 190 | + * |
| 191 | + * } while (store_ptr != store_ptr_end); |
| 192 | + * store_ptr = store_ptr_end; |
| 193 | + */ |
| 194 | + |
| 195 | + REG_L t1, ( 0 * SZREG)(a4) |
| 196 | + 1: |
| 197 | + REG_L t0, (-1 * SZREG)(a4) |
| 198 | + addi t4, t4, (-2 * SZREG) |
| 199 | + sll t1, t1, a7 |
| 200 | + srl t2, t0, a6 |
| 201 | + or t2, t1, t2 |
| 202 | + REG_S t2, ( 1 * SZREG)(t4) |
| 203 | + |
| 204 | + beq t4, a2, 2f |
| 205 | + |
| 206 | + REG_L t1, (-2 * SZREG)(a4) |
| 207 | + addi a4, a4, (-2 * SZREG) |
| 208 | + sll t0, t0, a7 |
| 209 | + srl t2, t1, a6 |
| 210 | + or t2, t0, t2 |
| 211 | + REG_S t2, ( 0 * SZREG)(t4) |
| 212 | + |
| 213 | + bne t4, t5, 1b |
| 214 | + 2: |
| 215 | + mv t4, t5 /* Fix the dest pointer in case the loop was broken */ |
| 216 | + |
| 217 | + add a4, t4, a5 /* Restore the src pointer */ |
| 218 | + j byte_copy_reverse /* Copy any remaining bytes */ |
| 219 | + |
| 220 | +/* |
| 221 | + * Simple copy loops for SZREG co-aligned memory locations. |
| 222 | + * These also make calls to do byte copies for any unaligned |
| 223 | + * data at their terminations. |
| 224 | + */ |
| 225 | +coaligned_copy: |
| 226 | + bltu a1, a0, coaligned_copy_reverse |
| 227 | + |
| 228 | +coaligned_copy_forward: |
| 229 | + jal t0, byte_copy_until_aligned_forward |
| 230 | + |
| 231 | + 1: |
| 232 | + REG_L t1, ( 0 * SZREG)(a1) |
| 233 | + addi a1, a1, SZREG |
| 234 | + addi t3, t3, SZREG |
| 235 | + REG_S t1, (-1 * SZREG)(t3) |
| 236 | + bne t3, t6, 1b |
| 237 | + |
| 238 | + j byte_copy_forward /* Copy any remaining bytes */ |
| 239 | + |
| 240 | +coaligned_copy_reverse: |
| 241 | + jal t0, byte_copy_until_aligned_reverse |
| 242 | + |
| 243 | + 1: |
| 244 | + REG_L t1, (-1 * SZREG)(a4) |
| 245 | + addi a4, a4, -SZREG |
| 246 | + addi t4, t4, -SZREG |
| 247 | + REG_S t1, ( 0 * SZREG)(t4) |
| 248 | + bne t4, t5, 1b |
| 249 | + |
| 250 | + j byte_copy_reverse /* Copy any remaining bytes */ |
| 251 | + |
| 252 | +/* |
| 253 | + * These are basically sub-functions within the function. They |
| 254 | + * are used to byte copy until the dest pointer is in alignment. |
| 255 | + * At which point, a bulk copy method can be used by the |
| 256 | + * calling code. These work on the same registers as the bulk |
| 257 | + * copy loops. Therefore, the register values can be picked |
| 258 | + * up from where they were left and we avoid code duplication |
| 259 | + * without any overhead except the call in and return jumps. |
| 260 | + */ |
| 261 | +byte_copy_until_aligned_forward: |
| 262 | + beq t3, t5, 2f |
| 263 | + 1: |
| 264 | + lb t1, 0(a1) |
| 265 | + addi a1, a1, 1 |
| 266 | + addi t3, t3, 1 |
| 267 | + sb t1, -1(t3) |
| 268 | + bne t3, t5, 1b |
| 269 | + 2: |
| 270 | + jalr zero, 0x0(t0) /* Return to multibyte copy loop */ |
| 271 | + |
| 272 | +byte_copy_until_aligned_reverse: |
| 273 | + beq t4, t6, 2f |
| 274 | + 1: |
| 275 | + lb t1, -1(a4) |
| 276 | + addi a4, a4, -1 |
| 277 | + addi t4, t4, -1 |
| 278 | + sb t1, 0(t4) |
| 279 | + bne t4, t6, 1b |
| 280 | + 2: |
| 281 | + jalr zero, 0x0(t0) /* Return to multibyte copy loop */ |
| 282 | + |
| 283 | +/* |
| 284 | + * Simple byte copy loops. |
| 285 | + * These will byte copy until they reach the end of data to copy. |
| 286 | + * At that point, they will call to return from memmove. |
| 287 | + */ |
52 | 288 | byte_copy: |
53 | | - lb t3, 0(a1) |
54 | | - addi a2, a2, -1 |
55 | | - sb t3, 0(a0) |
56 | | - add a1, a1, t4 |
57 | | - add a0, a0, t4 |
58 | | - bnez a2, byte_copy |
59 | | - |
60 | | -exit_memcpy: |
61 | | - move a0, t0 |
62 | | - move a1, t1 |
63 | | - ret |
64 | | -END(__memmove) |
| 289 | + bltu a1, a0, byte_copy_reverse |
| 290 | + |
| 291 | +byte_copy_forward: |
| 292 | + beq t3, t4, 2f |
| 293 | + 1: |
| 294 | + lb t1, 0(a1) |
| 295 | + addi a1, a1, 1 |
| 296 | + addi t3, t3, 1 |
| 297 | + sb t1, -1(t3) |
| 298 | + bne t3, t4, 1b |
| 299 | + 2: |
| 300 | + ret |
| 301 | + |
| 302 | +byte_copy_reverse: |
| 303 | + beq t4, t3, 2f |
| 304 | + 1: |
| 305 | + lb t1, -1(a4) |
| 306 | + addi a4, a4, -1 |
| 307 | + addi t4, t4, -1 |
| 308 | + sb t1, 0(t4) |
| 309 | + bne t4, t3, 1b |
| 310 | + 2: |
| 311 | + |
| 312 | +return_from_memmove: |
| 313 | + ret |
| 314 | + |
| 315 | +SYM_FUNC_END(memmove) |
| 316 | +SYM_FUNC_END(__memmove) |
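
For orientation, here is a compact C model of the overall flow implemented above: return early when there is nothing to do, pick a copy direction, byte copy anything shorter than 2 * SZREG, byte copy until dest reaches SZREG alignment, bulk copy the aligned middle, then finish the tail byte by byte. This is a minimal sketch under stated assumptions, not the kernel code: memmove_model is an illustrative name, the SZREG macro here stands in for asm/asm.h's SZREG (XLEN / 8), and a memcpy through a temporary word stands in for the aligned load/store pairs.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SZREG sizeof(uintptr_t)	/* stands in for asm/asm.h's SZREG (XLEN / 8) */

static void *memmove_model(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	/* Return if nothing to do. */
	if (d == s || n == 0)
		return dest;

	if (s < d) {
		/*
		 * dest sits above src: copy from the end so an overlap is
		 * never clobbered. The assembly has bulk reverse paths too;
		 * a byte loop keeps the model short.
		 */
		while (n--)
			d[n] = s[n];
		return dest;
	}

	/* Byte copy if copying less than (2 * SZREG) bytes. */
	if (n < 2 * SZREG) {
		while (n--)
			*d++ = *s++;
		return dest;
	}

	/* Byte copy until dest is SZREG-aligned (t3 up to t5 in the asm). */
	while ((uintptr_t)d & (SZREG - 1)) {
		*d++ = *s++;
		n--;
	}

	/*
	 * Bulk copy of the SZREG-aligned middle of dest (t5 up to t6).
	 * The assembly uses aligned loads plus a shift-and-or fixup when
	 * src is not co-aligned; a word-sized memcpy stands in for both.
	 */
	while (n >= SZREG) {
		uintptr_t w;

		memcpy(&w, s, sizeof(w));
		memcpy(d, &w, sizeof(w));
		d += SZREG;
		s += SZREG;
		n -= SZREG;
	}

	/* Trailing bytes (t3 up to t4 in the asm). */
	while (n--)
		*d++ = *s++;

	return dest;
}
```

The assembly additionally distinguishes co-aligned from misaligned src/dest with `xor t0, a0, a1` / `andi t1, t0, (SZREG - 1)` and takes the simpler coaligned_copy path when the low bits match; the model glosses over that split to stay short.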
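The core of the bulk path is the misaligned fixup loop: when src and dest do not share SZREG alignment, every load and store stays naturally aligned, and each destination word is rebuilt from two adjacent aligned source words with a shift-and-or, exactly as the C-style comment inside the loop describes. The sketch below shows the forward direction without the two-word unrolling, under the same little-endian assumption as the assembly; fixup_copy_forward, reg_t and REG_BITS are illustrative names, and strict-aliasing concerns are ignored since this only models machine loads.

```c
#include <stddef.h>
#include <stdint.h>

typedef uintptr_t reg_t;		/* one SZREG-sized register (illustrative) */
#define REG_BITS (8 * sizeof(reg_t))	/* XLEN */

/*
 * dst .. dst_end delimit the SZREG-aligned middle of dest (t5 .. t6 in the
 * asm); src is the not-co-aligned source pointer after the leading bytes
 * have already been copied. Little-endian only, like the assembly.
 */
static void fixup_copy_forward(reg_t *dst, reg_t *dst_end,
			       const unsigned char *src)
{
	size_t off = (uintptr_t)src & (sizeof(reg_t) - 1);	/* 1 .. SZREG-1 */
	unsigned int shift = 8 * off;				/* a6 */
	unsigned int ishift = REG_BITS - shift;			/* a7 */
	const reg_t *s = (const reg_t *)
			 ((uintptr_t)src & ~(uintptr_t)(sizeof(reg_t) - 1));
	reg_t lo = *s;						/* t0 */

	while (dst != dst_end) {
		reg_t hi = *++s;				/* t1 */
		/*
		 * The wanted SZREG source bytes straddle two aligned words:
		 * take the top bytes of 'lo' and the bottom bytes of 'hi'.
		 */
		*dst++ = (lo >> shift) | (hi << ishift);
		lo = hi;
	}
}
```

The reverse loop mirrors this with the pointers walking downward and the roles of the lower and upper source words swapped. As in the assembly, the final aligned load may touch a few bytes past the last source byte actually needed, but never beyond the aligned word that contains it.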
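The "Compute The Inverse Shift" comments rest on a small two's-complement identity: because -a6 == ~a6 + 1, the pair `not a7, a6` / `addi a7, a7, (SZREG * 8 + 1)` produces a7 == SZREG * 8 - a6 without a separate subtraction. A quick self-check of that identity, using xlen = 64 (RV64) purely as an example:

```c
#include <assert.h>

int main(void)
{
	const unsigned int xlen = 64;	/* example: RV64, SZREG * 8 */
	unsigned int a6;

	for (a6 = 8; a6 < xlen; a6 += 8) {
		/* not a7, a6 ; addi a7, a7, (SZREG * 8 + 1) */
		unsigned int a7 = ~a6 + (xlen + 1);

		assert(a7 == xlen - a6);
	}
	return 0;
}
```

The same holds with xlen = 32 on RV32; in this path a6 is always a nonzero multiple of 8 strictly below XLEN, so neither shift amount ever reaches the register width.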