1616// feature is present at compile-time. We don't bother detecting other features.
1717// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
1818
19+ use core:: arch:: asm;
1920use core:: intrinsics;
2021use core:: mem;
2122
@@ -34,16 +35,26 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
3435
3536#[ inline( always) ]
3637#[ cfg( not( target_feature = "ermsb" ) ) ]
37- pub unsafe fn copy_forward ( dest : * mut u8 , src : * const u8 , count : usize ) {
38- let qword_count = count >> 3 ;
39- let byte_count = count & 0b111 ;
40- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
41- core:: arch:: asm!(
42- "repe movsq (%rsi), (%rdi)" ,
43- "mov {byte_count:e}, %ecx" ,
44- "repe movsb (%rsi), (%rdi)" ,
45- byte_count = in( reg) byte_count,
38+ pub unsafe fn copy_forward ( mut dest : * mut u8 , mut src : * const u8 , count : usize ) {
39+ let ( pre_byte_count, qword_count, byte_count) = rep_param ( dest, count) ;
40+ // Separating the blocks gives the compiler more freedom to reorder instructions.
41+ asm ! (
42+ "rep movsb" ,
43+ inout( "ecx" ) pre_byte_count => _,
44+ inout( "rdi" ) dest => dest,
45+ inout( "rsi" ) src => src,
46+ options( att_syntax, nostack, preserves_flags)
47+ ) ;
48+ asm ! (
49+ "rep movsq" ,
4650 inout( "rcx" ) qword_count => _,
51+ inout( "rdi" ) dest => dest,
52+ inout( "rsi" ) src => src,
53+ options( att_syntax, nostack, preserves_flags)
54+ ) ;
55+ asm ! (
56+ "rep movsb" ,
57+ inout( "ecx" ) byte_count => _,
4758 inout( "rdi" ) dest => _,
4859 inout( "rsi" ) src => _,
4960 options( att_syntax, nostack, preserves_flags)
@@ -52,22 +63,28 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
5263
5364#[ inline( always) ]
5465pub unsafe fn copy_backward ( dest : * mut u8 , src : * const u8 , count : usize ) {
55- let qword_count = count >> 3 ;
56- let byte_count = count & 0b111 ;
57- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
58- core:: arch:: asm!(
66+ let ( pre_byte_count, qword_count, byte_count) = rep_param ( dest, count) ;
67+ // We can't separate this block due to std/cld
68+ asm ! (
5969 "std" ,
60- "repe movsq (%rsi), (%rdi)" ,
61- "movl {byte_count:e}, %ecx" ,
62- "addq $7, %rdi" ,
63- "addq $7, %rsi" ,
64- "repe movsb (%rsi), (%rdi)" ,
70+ "rep movsb" ,
71+ "sub $7, %rsi" ,
72+ "sub $7, %rdi" ,
73+ "mov {qword_count}, %rcx" ,
74+ "rep movsq" ,
75+ "test {pre_byte_count:e}, {pre_byte_count:e}" ,
76+ "add $7, %rsi" ,
77+ "add $7, %rdi" ,
78+ "mov {pre_byte_count:e}, %ecx" ,
79+ "rep movsb" ,
6580 "cld" ,
66- byte_count = in( reg) byte_count,
67- inout( "rcx" ) qword_count => _,
68- inout( "rdi" ) dest. add( count) . wrapping_sub( 8 ) => _,
69- inout( "rsi" ) src. add( count) . wrapping_sub( 8 ) => _,
70- options( att_syntax, nostack)
81+ pre_byte_count = in( reg) pre_byte_count,
82+ qword_count = in( reg) qword_count,
83+ inout( "ecx" ) byte_count => _,
84+ inout( "rdi" ) dest. add( count - 1 ) => _,
85+ inout( "rsi" ) src. add( count - 1 ) => _,
86+ // We modify flags, but we restore it afterwards
87+ options( att_syntax, nostack, preserves_flags)
7188 ) ;
7289}
7390
@@ -86,18 +103,29 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
86103
87104#[ inline( always) ]
88105#[ cfg( not( target_feature = "ermsb" ) ) ]
89- pub unsafe fn set_bytes ( dest : * mut u8 , c : u8 , count : usize ) {
90- let qword_count = count >> 3 ;
91- let byte_count = count & 0b111 ;
92- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
93- core:: arch:: asm!(
94- "repe stosq %rax, (%rdi)" ,
95- "mov {byte_count:e}, %ecx" ,
96- "repe stosb %al, (%rdi)" ,
97- byte_count = in( reg) byte_count,
106+ pub unsafe fn set_bytes ( mut dest : * mut u8 , c : u8 , count : usize ) {
107+ let c = c as u64 * 0x0101_0101_0101_0101 ;
108+ let ( pre_byte_count, qword_count, byte_count) = rep_param ( dest, count) ;
109+ // Separating the blocks gives the compiler more freedom to reorder instructions.
110+ asm ! (
111+ "rep stosb" ,
112+ inout( "ecx" ) pre_byte_count => _,
113+ inout( "rdi" ) dest => dest,
114+ in( "rax" ) c,
115+ options( att_syntax, nostack, preserves_flags)
116+ ) ;
117+ asm ! (
118+ "rep stosq" ,
98119 inout( "rcx" ) qword_count => _,
120+ inout( "rdi" ) dest => dest,
121+ in( "rax" ) c,
122+ options( att_syntax, nostack, preserves_flags)
123+ ) ;
124+ asm ! (
125+ "rep stosb" ,
126+ inout( "ecx" ) byte_count => _,
99127 inout( "rdi" ) dest => _,
100- in( "rax" ) ( c as u64 ) * 0x0101010101010101 ,
128+ in( "rax" ) c ,
101129 options( att_syntax, nostack, preserves_flags)
102130 ) ;
103131}
@@ -156,3 +184,13 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
156184 c16 ( a. cast ( ) , b. cast ( ) , n)
157185 }
158186}
187+
/// Determine optimal parameters for a `rep` instruction.
///
/// Splits a `count`-byte operation on `dest` into three consecutive parts and returns
/// `(pre_byte_count, qword_count, byte_count)`:
///
/// * `pre_byte_count` - bytes needed to bring `dest` up to an 8-byte boundary, capped at
///   `count`;
/// * `qword_count` - number of whole 8-byte words in what remains;
/// * `byte_count` - bytes left over after the words (always less than 8).
///
/// The three parts always add up to `count`:
/// `pre_byte_count + 8 * qword_count + byte_count == count`.
fn rep_param(dest: *mut u8, count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let misalignment = dest as usize & 0b111;
    // The outer mask maps an already-aligned address (8 - 0 = 8) back to 0.
    let pre_byte_count = ((8 - misalignment) & 0b111).min(count);
    let remaining = count - pre_byte_count;
    let qword_count = remaining >> 3;
    let byte_count = remaining & 0b111;
    (pre_byte_count, qword_count, byte_count)
}
0 commit comments