 // feature is present at compile-time. We don't bother detecting other features.
 // Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

+use core::arch::asm;
 use core::intrinsics;
 use core::mem;

@@ -34,40 +35,61 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {

 #[inline(always)]
 #[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe movsq (%rsi), (%rdi)",
-        "mov {byte_count:e}, %ecx",
-        "repe movsb (%rsi), (%rdi)",
-        byte_count = in(reg) byte_count,
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    // It also allows us to trivially skip the rep movsb, which is faster when memcpying
+    // aligned data.
+    if pre_byte_count > 0 {
+        asm!(
+            "rep movsb",
+            inout("ecx") pre_byte_count => _,
+            inout("rdi") dest => dest,
+            inout("rsi") src => src,
+            options(nostack, preserves_flags)
+        );
+    }
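+    // dest is now 8-byte aligned (whenever count allowed it), so the bulk of the copy
+    // moves whole qwords.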
+    asm!(
+        "rep movsq",
         inout("rcx") qword_count => _,
-        inout("rdi") dest => _,
-        inout("rsi") src => _,
-        options(att_syntax, nostack, preserves_flags)
+        inout("rdi") dest => dest,
+        inout("rsi") src => src,
+        options(nostack, preserves_flags)
     );
+    if byte_count > 0 {
+        asm!(
+            "rep movsb",
+            inout("ecx") byte_count => _,
+            inout("rdi") dest => _,
+            inout("rsi") src => _,
+            options(nostack, preserves_flags)
+        );
+    }
 }

 #[inline(always)]
 pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
+    let (pre_byte_count, qword_count, byte_count) = rep_param_rev(dest, count);
+    // We can't separate this block due to std/cld
+    asm!(
         "std",
-        "repe movsq (%rsi), (%rdi)",
-        "movl {byte_count:e}, %ecx",
-        "addq $7, %rdi",
-        "addq $7, %rsi",
-        "repe movsb (%rsi), (%rdi)",
+        "rep movsb",
77+ "sub rsi, 7" ,
78+ "sub rdi, 7" ,
79+ "mov rcx, {qword_count}" ,
80+ "rep movsq" ,
81+ "add rsi, 7" ,
82+ "add rdi, 7" ,
83+ "mov ecx, {byte_count:e}" ,
84+ "rep movsb" ,
6585 "cld" ,
6686 byte_count = in( reg) byte_count,
67- inout( "rcx" ) qword_count => _,
68- inout( "rdi" ) dest. add( count) . wrapping_sub( 8 ) => _,
69- inout( "rsi" ) src. add( count) . wrapping_sub( 8 ) => _,
70- options( att_syntax, nostack)
87+ qword_count = in( reg) qword_count,
88+ inout( "ecx" ) pre_byte_count => _,
89+ inout( "rdi" ) dest. add( count - 1 ) => _,
90+ inout( "rsi" ) src. add( count - 1 ) => _,
91+ // We modify flags, but we restore it afterwards
92+ options( nostack, preserves_flags)
7193 ) ;
7294}
7395
@@ -86,20 +108,36 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {

 #[inline(always)]
 #[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe stosq %rax, (%rdi)",
-        "mov {byte_count:e}, %ecx",
-        "repe stosb %al, (%rdi)",
-        byte_count = in(reg) byte_count,
+pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    // It also allows us to trivially skip the rep stosb, which is faster when memsetting
+    // aligned data.
+    if pre_byte_count > 0 {
+        asm!(
+            "rep stosb",
+            inout("ecx") pre_byte_count => _,
+            inout("rdi") dest => dest,
+            in("al") c,
+            options(nostack, preserves_flags)
+        );
+    }
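+    // dest is now 8-byte aligned (whenever count allowed it), so the bulk of the fill
+    // stores whole qwords.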
+    asm!(
+        "rep stosq",
         inout("rcx") qword_count => _,
-        inout("rdi") dest => _,
+        inout("rdi") dest => dest,
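+        // Multiplying by 0x0101_0101_0101_0101 broadcasts the fill byte into every
+        // byte of rax for the qword stores.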
         in("rax") (c as u64) * 0x0101010101010101,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
+    if byte_count > 0 {
+        asm!(
+            "rep stosb",
+            inout("ecx") byte_count => _,
+            inout("rdi") dest => _,
+            in("al") c,
+            options(nostack, preserves_flags)
+        );
+    }
 }

 #[inline(always)]
@@ -156,3 +194,23 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
         c16(a.cast(), b.cast(), n)
     }
 }
+
+/// Determine optimal parameters for a `rep` instruction.
+fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+    // Unaligned writes are still slow on modern processors, so align the destination address.
+    let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
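+    // e.g. a destination ending in 0b101 with count == 20 yields (3, 2, 1):
+    // 3 bytes to reach 8-byte alignment, then 2 qwords, then 1 trailing byte.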
+    count -= pre_byte_count;
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    (pre_byte_count, qword_count, byte_count)
+}
+
+/// Determine optimal parameters for a reverse `rep` instruction (i.e. direction bit is set).
+fn rep_param_rev(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+    // Unaligned writes are still slow on modern processors, so align the destination address.
+    let pre_byte_count = ((dest as usize + count) & 0b111).min(count);
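+    // The copy runs backwards, so it is the end of the region (dest + count) that is
+    // brought to 8-byte alignment, e.g. dest + count ending in 0b101 with count == 20
+    // yields (5, 1, 7).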
+    count -= pre_byte_count;
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    (pre_byte_count, qword_count, byte_count)
+}