
Commit d804db3

echelonxray authored and gregkh committed
riscv: Fixed misaligned memory access. Fixed pointer comparison.
[ Upstream commit 9d1f0ec ]

Rewrote the RISC-V memmove() assembly implementation. The previous implementation did not check memory alignment and it compared 2 pointers with a signed comparison. The misaligned memory access would cause the kernel to crash on systems that did not emulate it in firmware and did not support it in hardware. Firmware emulation is slow and may not exist. The RISC-V spec does not guarantee that support for misaligned memory accesses will exist. It should not be depended on.

This patch now checks for XLEN granularity of co-alignment between the pointers. Failing that, copying is done by loading from the 2 contiguous and naturally aligned XLEN memory locations containing the overlapping XLEN sized data to be copied. The data is shifted into the correct place and binary or'ed together on each iteration. The result is then stored into the corresponding naturally aligned XLEN sized location in the destination. For unaligned data at the terminations of the regions to be copied or for copies less than (2 * XLEN) in size, byte copy is used.

This patch also now uses unsigned comparison for the pointers and migrates to the newer assembler annotations from the now deprecated ones.

Signed-off-by: Michael T. Kloos <michael@michaelkloos.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent 6ca7107 commit d804db3
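To make the copy strategy in the commit message concrete, here is a minimal C sketch of the forward "misaligned fixup" idea: when src and dest do not share the same offset within an XLEN-sized word, each aligned destination word is assembled from two adjacent aligned source words, shifted into place and OR'ed together. This is an illustration only, not the kernel code: the function name, types, and loop shape are invented here, it assumes a little-endian machine and a nonzero source misalignment (the co-aligned case is handled separately), and the real implementation is the RISC-V assembly in the diff below.

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative sketch (hypothetical helper, not from the patch):
 * forward copy of nwords aligned words into dst when src is not
 * word-aligned.  Mirrors the patch's shift-and-OR loop.
 */
static void misaligned_fixup_copy_forward_sketch(unsigned long *dst,
						 const unsigned char *src,
						 size_t nwords)
{
	const size_t wsize = sizeof(unsigned long);           /* SZREG */
	size_t off = (uintptr_t)src % wsize;                   /* assumed nonzero */
	const unsigned long *s = (const unsigned long *)(src - off);
	unsigned int shift = 8 * off;                          /* a6 in the asm */
	unsigned int ishift = 8 * wsize - shift;               /* a7, inverse shift */
	unsigned long lo = s[0];                               /* first aligned load */
	size_t i;

	for (i = 0; i < nwords; i++) {
		unsigned long hi = s[i + 1];                   /* next aligned load */
		/* little-endian: low bytes come from lo, high bytes from hi */
		dst[i] = (lo >> shift) | (hi << ishift);
		lo = hi;
	}
}

Bytes before the first aligned destination word, bytes after the last one, and copies shorter than 2 * SZREG are handled by plain byte copies in the patch, which is why this sketch only deals with whole aligned words.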

File tree

1 file changed (+310, -58 lines)


arch/riscv/lib/memmove.S

@@ -1,64 +1,316 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
+ */

 #include <linux/linkage.h>
 #include <asm/asm.h>

-ENTRY(__memmove)
-WEAK(memmove)
-	move t0, a0
-	move t1, a1
-
-	beq a0, a1, exit_memcpy
-	beqz a2, exit_memcpy
-	srli t2, a2, 0x2
-
-	slt t3, a0, a1
-	beqz t3, do_reverse
-
-	andi a2, a2, 0x3
-	li t4, 1
-	beqz t2, byte_copy
-
-word_copy:
-	lw t3, 0(a1)
-	addi t2, t2, -1
-	addi a1, a1, 4
-	sw t3, 0(a0)
-	addi a0, a0, 4
-	bnez t2, word_copy
-	beqz a2, exit_memcpy
-	j byte_copy
-
-do_reverse:
-	add a0, a0, a2
-	add a1, a1, a2
-	andi a2, a2, 0x3
-	li t4, -1
-	beqz t2, reverse_byte_copy
-
-reverse_word_copy:
-	addi a1, a1, -4
-	addi t2, t2, -1
-	lw t3, 0(a1)
-	addi a0, a0, -4
-	sw t3, 0(a0)
-	bnez t2, reverse_word_copy
-	beqz a2, exit_memcpy
-
-reverse_byte_copy:
-	addi a0, a0, -1
-	addi a1, a1, -1
+SYM_FUNC_START(__memmove)
+SYM_FUNC_START_WEAK(memmove)
+	/*
+	 * Returns
+	 * a0 - dest
+	 *
+	 * Parameters
+	 * a0 - Inclusive first byte of dest
+	 * a1 - Inclusive first byte of src
+	 * a2 - Length of copy n
+	 *
+	 * Because the return matches the parameter register a0,
+	 * we will not clobber or modify that register.
+	 *
+	 * Note: This currently only works on little-endian.
+	 * To port to big-endian, reverse the direction of shifts
+	 * in the 2 misaligned fixup copy loops.
+	 */

+	/* Return if nothing to do */
+	beq a0, a1, return_from_memmove
+	beqz a2, return_from_memmove
+
+	/*
+	 * Register Uses
+	 * Forward Copy: a1 - Index counter of src
+	 * Reverse Copy: a4 - Index counter of src
+	 * Forward Copy: t3 - Index counter of dest
+	 * Reverse Copy: t4 - Index counter of dest
+	 * Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
+	 * Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
+	 * Both Copy Modes: t0 - Link / Temporary for load-store
+	 * Both Copy Modes: t1 - Temporary for load-store
+	 * Both Copy Modes: t2 - Temporary for load-store
+	 * Both Copy Modes: a5 - dest to src alignment offset
+	 * Both Copy Modes: a6 - Shift ammount
+	 * Both Copy Modes: a7 - Inverse Shift ammount
+	 * Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
+	 */
+
+	/*
+	 * Solve for some register values now.
+	 * Byte copy does not need t5 or t6.
+	 */
+	mv t3, a0
+	add t4, a0, a2
+	add a4, a1, a2
+
+	/*
+	 * Byte copy if copying less than (2 * SZREG) bytes. This can
+	 * cause problems with the bulk copy implementation and is
+	 * small enough not to bother.
+	 */
+	andi t0, a2, -(2 * SZREG)
+	beqz t0, byte_copy
+
+	/*
+	 * Now solve for t5 and t6.
+	 */
+	andi t5, t3, -SZREG
+	andi t6, t4, -SZREG
+	/*
+	 * If dest(Register t3) rounded down to the nearest naturally
+	 * aligned SZREG address, does not equal dest, then add SZREG
+	 * to find the low-bound of SZREG alignment in the dest memory
+	 * region. Note that this could overshoot the dest memory
+	 * region if n is less than SZREG. This is one reason why
+	 * we always byte copy if n is less than SZREG.
+	 * Otherwise, dest is already naturally aligned to SZREG.
+	 */
+	beq t5, t3, 1f
+	addi t5, t5, SZREG
+	1:
+
+	/*
+	 * If the dest and src are co-aligned to SZREG, then there is
+	 * no need for the full rigmarole of a full misaligned fixup copy.
+	 * Instead, do a simpler co-aligned copy.
+	 */
+	xor t0, a0, a1
+	andi t1, t0, (SZREG - 1)
+	beqz t1, coaligned_copy
+	/* Fall through to misaligned fixup copy */
+
+misaligned_fixup_copy:
+	bltu a1, a0, misaligned_fixup_copy_reverse
+
+misaligned_fixup_copy_forward:
+	jal t0, byte_copy_until_aligned_forward
+
+	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
+	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
+	sub a5, a1, t3 /* Find the difference between src and dest */
+	andi a1, a1, -SZREG /* Align the src pointer */
+	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop*/
+
+	/*
+	 * Compute The Inverse Shift
+	 * a7 = XLEN - a6 = XLEN + -a6
+	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
+	 * Add that to XLEN. XLEN = SZREG * 8.
+	 */
+	not a7, a6
+	addi a7, a7, (SZREG * 8 + 1)
+
+	/*
+	 * Fix Misalignment Copy Loop - Forward
+	 * load_val0 = load_ptr[0];
+	 * do {
+	 * 	load_val1 = load_ptr[1];
+	 * 	store_ptr += 2;
+	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
+	 *
+	 * 	if (store_ptr == {a2})
+	 * 		break;
+	 *
+	 * 	load_val0 = load_ptr[2];
+	 * 	load_ptr += 2;
+	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
+	 *
+	 * } while (store_ptr != store_ptr_end);
+	 * store_ptr = store_ptr_end;
+	 */
+
+	REG_L t0, (0 * SZREG)(a1)
+	1:
+	REG_L t1, (1 * SZREG)(a1)
+	addi t3, t3, (2 * SZREG)
+	srl t0, t0, a6
+	sll t2, t1, a7
+	or t2, t0, t2
+	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)
+
+	beq t3, a2, 2f
+
+	REG_L t0, (2 * SZREG)(a1)
+	addi a1, a1, (2 * SZREG)
+	srl t1, t1, a6
+	sll t2, t0, a7
+	or t2, t1, t2
+	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)
+
+	bne t3, t6, 1b
+	2:
+	mv t3, t6 /* Fix the dest pointer in case the loop was broken */
+
+	add a1, t3, a5 /* Restore the src pointer */
+	j byte_copy_forward /* Copy any remaining bytes */
+
+misaligned_fixup_copy_reverse:
+	jal t0, byte_copy_until_aligned_reverse
+
+	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
+	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
+	sub a5, a4, t4 /* Find the difference between src and dest */
+	andi a4, a4, -SZREG /* Align the src pointer */
+	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop*/
+
+	/*
+	 * Compute The Inverse Shift
+	 * a7 = XLEN - a6 = XLEN + -a6
+	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
+	 * Add that to XLEN. XLEN = SZREG * 8.
+	 */
+	not a7, a6
+	addi a7, a7, (SZREG * 8 + 1)
+
+	/*
+	 * Fix Misalignment Copy Loop - Reverse
+	 * load_val1 = load_ptr[0];
+	 * do {
+	 * 	load_val0 = load_ptr[-1];
+	 * 	store_ptr -= 2;
+	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
+	 *
+	 * 	if (store_ptr == {a2})
+	 * 		break;
+	 *
+	 * 	load_val1 = load_ptr[-2];
+	 * 	load_ptr -= 2;
+	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
+	 *
+	 * } while (store_ptr != store_ptr_end);
+	 * store_ptr = store_ptr_end;
+	 */
+
+	REG_L t1, ( 0 * SZREG)(a4)
+	1:
+	REG_L t0, (-1 * SZREG)(a4)
+	addi t4, t4, (-2 * SZREG)
+	sll t1, t1, a7
+	srl t2, t0, a6
+	or t2, t1, t2
+	REG_S t2, ( 1 * SZREG)(t4)
+
+	beq t4, a2, 2f
+
+	REG_L t1, (-2 * SZREG)(a4)
+	addi a4, a4, (-2 * SZREG)
+	sll t0, t0, a7
+	srl t2, t1, a6
+	or t2, t0, t2
+	REG_S t2, ( 0 * SZREG)(t4)
+
+	bne t4, t5, 1b
+	2:
+	mv t4, t5 /* Fix the dest pointer in case the loop was broken */
+
+	add a4, t4, a5 /* Restore the src pointer */
+	j byte_copy_reverse /* Copy any remaining bytes */
+
+/*
+ * Simple copy loops for SZREG co-aligned memory locations.
+ * These also make calls to do byte copies for any unaligned
+ * data at their terminations.
+ */
+coaligned_copy:
+	bltu a1, a0, coaligned_copy_reverse
+
+coaligned_copy_forward:
+	jal t0, byte_copy_until_aligned_forward
+
+	1:
+	REG_L t1, ( 0 * SZREG)(a1)
+	addi a1, a1, SZREG
+	addi t3, t3, SZREG
+	REG_S t1, (-1 * SZREG)(t3)
+	bne t3, t6, 1b
+
+	j byte_copy_forward /* Copy any remaining bytes */
+
+coaligned_copy_reverse:
+	jal t0, byte_copy_until_aligned_reverse
+
+	1:
+	REG_L t1, (-1 * SZREG)(a4)
+	addi a4, a4, -SZREG
+	addi t4, t4, -SZREG
+	REG_S t1, ( 0 * SZREG)(t4)
+	bne t4, t5, 1b
+
+	j byte_copy_reverse /* Copy any remaining bytes */
+
+/*
+ * These are basically sub-functions within the function. They
+ * are used to byte copy until the dest pointer is in alignment.
+ * At which point, a bulk copy method can be used by the
+ * calling code. These work on the same registers as the bulk
+ * copy loops. Therefore, the register values can be picked
+ * up from where they were left and we avoid code duplication
+ * without any overhead except the call in and return jumps.
+ */
+byte_copy_until_aligned_forward:
+	beq t3, t5, 2f
+	1:
+	lb t1, 0(a1)
+	addi a1, a1, 1
+	addi t3, t3, 1
+	sb t1, -1(t3)
+	bne t3, t5, 1b
+	2:
+	jalr zero, 0x0(t0) /* Return to multibyte copy loop */
+
+byte_copy_until_aligned_reverse:
+	beq t4, t6, 2f
+	1:
+	lb t1, -1(a4)
+	addi a4, a4, -1
+	addi t4, t4, -1
+	sb t1, 0(t4)
+	bne t4, t6, 1b
+	2:
+	jalr zero, 0x0(t0) /* Return to multibyte copy loop */
+
+/*
+ * Simple byte copy loops.
+ * These will byte copy until they reach the end of data to copy.
+ * At that point, they will call to return from memmove.
+ */
 byte_copy:
-	lb t3, 0(a1)
-	addi a2, a2, -1
-	sb t3, 0(a0)
-	add a1, a1, t4
-	add a0, a0, t4
-	bnez a2, byte_copy
-
-exit_memcpy:
-	move a0, t0
-	move a1, t1
-	ret
-END(__memmove)
+	bltu a1, a0, byte_copy_reverse
+
+byte_copy_forward:
+	beq t3, t4, 2f
+	1:
+	lb t1, 0(a1)
+	addi a1, a1, 1
+	addi t3, t3, 1
+	sb t1, -1(t3)
+	bne t3, t4, 1b
+	2:
+	ret
+
+byte_copy_reverse:
+	beq t4, t3, 2f
+	1:
+	lb t1, -1(a4)
+	addi a4, a4, -1
+	addi t4, t4, -1
+	sb t1, 0(t4)
+	bne t4, t3, 1b
+	2:
+
+return_from_memmove:
+	ret
+
+SYM_FUNC_END(memmove)
+SYM_FUNC_END(__memmove)
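Two details in the new assembly are easy to miss, shown below as a small standalone C sketch (the macro and helper names here are mine, not the kernel's). First, the co-alignment test works because two pointers can be word-copied in lockstep only if they have the same offset within a SZREG-sized word, which is exactly what (dest ^ src) & (SZREG - 1) == 0 checks. Second, the "inverse shift" a7 = XLEN - a6 is computed without a subtract by using the two's-complement identity -a6 = ~a6 + 1, i.e. not a7, a6 followed by addi a7, a7, XLEN + 1.

#include <stdint.h>
#include <stdio.h>

#define SZREG sizeof(unsigned long)   /* stand-in for the asm macro */
#define XLEN  (8 * SZREG)

/* Co-alignment check used to pick coaligned_copy over the fixup copy. */
static int coaligned(uintptr_t dest, uintptr_t src)
{
	return ((dest ^ src) & (SZREG - 1)) == 0;
}

/* Equivalent of "not" + "addi": ~a6 + (XLEN + 1) == XLEN - a6. */
static unsigned long inverse_shift(unsigned long a6)
{
	return ~a6 + (XLEN + 1);
}

int main(void)
{
	printf("%d\n", coaligned(0x1004, 0x2004));  /* 1: same offset within a word */
	printf("%d\n", coaligned(0x1004, 0x2006));  /* 0: offsets differ */
	printf("%lu\n", inverse_shift(24));         /* prints XLEN - 24 */
	return 0;
}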
