|
27 | 27 | * NOTE! The calling convention is very intentionally the same as |
28 | 28 | * for 'rep movs', so that we can rewrite the function call with |
29 | 29 | * just a plain 'rep movs' on machines that have FSRM. But to make |
30 | | - * it simpler for us, we can clobber rsi/rdi and rax freely. |
| 30 | + * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely. |
31 | 31 | */ |
32 | 32 | SYM_FUNC_START(rep_movs_alternative) |
33 | 33 | cmpq $64,%rcx |
@@ -68,24 +68,55 @@ SYM_FUNC_START(rep_movs_alternative) |
68 | 68 | _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail) |
69 | 69 |
|
70 | 70 | .Llarge: |
71 | | -0: ALTERNATIVE "jmp .Llarge_movsq", "rep movsb", X86_FEATURE_ERMS |
| 71 | +0: ALTERNATIVE "jmp .Lunrolled", "rep movsb", X86_FEATURE_ERMS |
72 | 72 | 1: RET |
73 | 73 |
|
74 | | - _ASM_EXTABLE_UA( 0b, 1b) |
| 74 | + _ASM_EXTABLE_UA( 0b, 1b) |
75 | 75 |
|
76 | | -.Llarge_movsq: |
77 | | - movq %rcx,%rax |
78 | | - shrq $3,%rcx |
79 | | - andl $7,%eax |
80 | | -0: rep movsq |
81 | | - movl %eax,%ecx |
| 76 | + .p2align 4 |
| 77 | +.Lunrolled: |
| 78 | +10: movq (%rsi),%r8 |
| 79 | +11: movq 8(%rsi),%r9 |
| 80 | +12: movq 16(%rsi),%r10 |
| 81 | +13: movq 24(%rsi),%r11 |
| 82 | +14: movq %r8,(%rdi) |
| 83 | +15: movq %r9,8(%rdi) |
| 84 | +16: movq %r10,16(%rdi) |
| 85 | +17: movq %r11,24(%rdi) |
| 86 | +20: movq 32(%rsi),%r8 |
| 87 | +21: movq 40(%rsi),%r9 |
| 88 | +22: movq 48(%rsi),%r10 |
| 89 | +23: movq 56(%rsi),%r11 |
| 90 | +24: movq %r8,32(%rdi) |
| 91 | +25: movq %r9,40(%rdi) |
| 92 | +26: movq %r10,48(%rdi) |
| 93 | +27: movq %r11,56(%rdi) |
| 94 | + addq $64,%rsi |
| 95 | + addq $64,%rdi |
| 96 | + subq $64,%rcx |
| 97 | + cmpq $64,%rcx |
| 98 | + jae .Lunrolled |
| 99 | + cmpl $8,%ecx |
| 100 | + jae .Lword |
82 | 101 | testl %ecx,%ecx |
83 | 102 | jne .Lcopy_user_tail |
84 | 103 | RET |
85 | 104 |
|
86 | | -1: leaq (%rax,%rcx,8),%rcx |
87 | | - jmp .Lcopy_user_tail |
88 | | - |
89 | | - _ASM_EXTABLE_UA( 0b, 1b) |
| 105 | + _ASM_EXTABLE_UA(10b, .Lcopy_user_tail) |
| 106 | + _ASM_EXTABLE_UA(11b, .Lcopy_user_tail) |
| 107 | + _ASM_EXTABLE_UA(12b, .Lcopy_user_tail) |
| 108 | + _ASM_EXTABLE_UA(13b, .Lcopy_user_tail) |
| 109 | + _ASM_EXTABLE_UA(14b, .Lcopy_user_tail) |
| 110 | + _ASM_EXTABLE_UA(15b, .Lcopy_user_tail) |
| 111 | + _ASM_EXTABLE_UA(16b, .Lcopy_user_tail) |
| 112 | + _ASM_EXTABLE_UA(17b, .Lcopy_user_tail) |
| 113 | + _ASM_EXTABLE_UA(20b, .Lcopy_user_tail) |
| 114 | + _ASM_EXTABLE_UA(21b, .Lcopy_user_tail) |
| 115 | + _ASM_EXTABLE_UA(22b, .Lcopy_user_tail) |
| 116 | + _ASM_EXTABLE_UA(23b, .Lcopy_user_tail) |
| 117 | + _ASM_EXTABLE_UA(24b, .Lcopy_user_tail) |
| 118 | + _ASM_EXTABLE_UA(25b, .Lcopy_user_tail) |
| 119 | + _ASM_EXTABLE_UA(26b, .Lcopy_user_tail) |
| 120 | + _ASM_EXTABLE_UA(27b, .Lcopy_user_tail) |
90 | 121 | SYM_FUNC_END(rep_movs_alternative) |
91 | 122 | EXPORT_SYMBOL(rep_movs_alternative) |
0 commit comments