@@ -173,6 +173,136 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
     c16(a.cast(), b.cast(), n)
 }
 
+// In order to process more than one byte simultaneously when executing strlen,
+// two things must be considered:
+// * An n-byte read from an n-byte aligned address will never cross
+//   a page boundary and will always succeed. Any smaller alignment
+//   may result in a read that will cross a page boundary, which may
+//   trigger an access violation.
+// * Surface Rust considers any kind of out-of-bounds read to be undefined
+//   behaviour. To dodge this, memory access operations are written
+//   using inline assembly.
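+// As an example of the first point: on x86-64, pages are at least 4 KiB and
+// always a multiple of 16 bytes, so a 16-byte read from a 16-byte-aligned
+// address can never straddle a page boundary.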
+
+#[cfg(target_feature = "sse2")]
+#[inline(always)]
+pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
+    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};
+
+    let mut n = 0;
+
+    // The use of _mm_movemask_epi8 and company allows for speedups,
+    // but they aren't cheap by themselves. Thus, possibly small strings
+    // are handled in simple loops.
+
+    for _ in 0..4 {
+        if *s == 0 {
+            return n;
+        }
+
+        n += 1;
+        s = s.add(1);
+    }
+
+    // Shave off the least significant bits to align the address to a 16
+    // byte boundary. The shaved-off bits are used to correct the first iteration.
+
+    let align = s as usize & 15;
+    let mut s = ((s as usize) - align) as *const __m128i;
+    let zero = _mm_set1_epi8(0);
+
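+    // Aligned 16-byte load through inline asm. Because `s` was just rounded
+    // down to a 16-byte boundary, the `movdqa` read stays within one page, and
+    // performing it in asm keeps the deliberate out-of-bounds access outside
+    // the Rust abstract machine (see the note above about out-of-bounds reads).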
+    let x = {
+        let r;
+        asm!(
+            "movdqa ({addr}), {dest}",
+            addr = in(reg) s,
+            dest = out(xmm_reg) r,
+            options(att_syntax, nostack),
+        );
+        r
+    };
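+    // `_mm_cmpeq_epi8` sets a lane to 0xFF where a byte equals zero and
+    // `_mm_movemask_epi8` packs the lane high bits into a 16-bit mask; the
+    // shift by `align` drops the lanes that precede the actual string data.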
+    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;
+
+    if cmp != 0 {
+        return n + cmp.trailing_zeros() as usize;
+    }
+
+    n += 16 - align;
+    s = s.add(1);
+
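+    // `s` is now 16-byte aligned, so every further iteration can use an
+    // aligned load to scan the next 16 bytes at a time.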
+    loop {
+        let x = {
+            let r;
+            asm!(
+                "movdqa ({addr}), {dest}",
+                addr = in(reg) s,
+                dest = out(xmm_reg) r,
+                options(att_syntax, nostack),
+            );
+            r
+        };
+        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
+        if cmp == 0 {
+            n += 16;
+            s = s.add(1);
+        } else {
+            return n + cmp.trailing_zeros() as usize;
+        }
+    }
+}
+
+// Provided for scenarios like kernel development, where SSE might not
+// be available.
+#[cfg(not(target_feature = "sse2"))]
+#[inline(always)]
+pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
+    let mut n = 0;
+
+    // Check bytes in steps of one until
+    // either a zero byte is discovered or
+    // the pointer is aligned to an eight-byte boundary.
+
+    while s as usize & 7 != 0 {
+        if *s == 0 {
+            return n;
+        }
+        n += 1;
+        s = s.add(1);
+    }
+
+    // Check bytes in steps of eight until a zero
+    // byte is discovered.
+
+    let mut s = s as *const u64;
+
+    loop {
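+        // Aligned 8-byte load through inline asm; as with the SSE2 path, an
+        // aligned read of this size cannot cross a page boundary.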
+        let mut cs = {
+            let r: u64;
+            asm!(
+                "mov ({addr}), {dest}",
+                addr = in(reg) s,
+                dest = out(reg) r,
+                options(att_syntax, nostack),
+            );
+            r
+        };
+        // Detect if a word has a zero byte, taken from
+        // https://graphics.stanford.edu/~seander/bithacks.html
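+        // Per byte, subtracting 0x01 sets the high bit only when the byte was
+        // zero or already had its high bit set; `& !cs` removes the second
+        // case, so (ignoring borrows, which only occur when a zero byte is
+        // present anyway) the expression is non-zero iff `cs` has a zero byte.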
+        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
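+            // At least one of these eight bytes is zero; since x86-64 is
+            // little-endian, walk from the least significant byte upward to
+            // find the offset of the first zero.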
+            loop {
+                if cs & 255 == 0 {
+                    return n;
+                } else {
+                    cs >>= 8;
+                    n += 1;
+                }
+            }
+        } else {
+            n += 8;
+            s = s.add(1);
+        }
+    }
+}
+
 /// Determine optimal parameters for a `rep` instruction.
 fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
     // Unaligned writes are still slow on modern processors, so align the destination address.