@@ -227,6 +227,22 @@ fn analyze_source_file_dispatch(
227227 }
228228}
229229
/// AArch64 entry point for source analysis.
///
/// Performs a runtime check for NEON support. When NEON is available the
/// work is handed to `analyze_source_file_neon`; otherwise all of `src` is
/// scanned by `analyze_source_file_generic`, starting at offset zero.
#[cfg(target_arch = "aarch64")]
fn analyze_source_file_dispatch(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    if !std::arch::is_aarch64_feature_detected!("neon") {
        // No NEON on this CPU: fall back to the generic scan of the whole input.
        analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);
        return;
    }

    // SAFETY: the runtime feature check above confirmed NEON support.
    unsafe {
        analyze_source_file_neon(src, lines, multi_byte_chars);
    }
}
245+
230246/// Checks 16 byte chunks of text at a time. If the chunk contains
231247/// something other than printable ASCII characters and newlines, the
232248/// function falls back to the generic implementation. Otherwise it uses
@@ -322,7 +338,102 @@ unsafe fn analyze_source_file_sse2(
322338 }
323339}
324340
325- #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" ) ) ) ]
/// Collapses a 16-lane byte vector into a 64-bit "nibble mask".
///
/// See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
///
/// The mask is a 64-bit integer in which each group of 4 bits corresponds to
/// one `u8` lane of the input vector. The least significant 4 bits correspond
/// to the first byte in the vector. For a comparison result (lanes that are
/// either `0x00` or `0xFF`) every lane therefore maps to `0x0` or `0xF`.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports NEON.
#[target_feature(enable = "neon")]
#[cfg(target_arch = "aarch64")]
#[inline]
unsafe fn move_mask(v: std::arch::aarch64::uint8x16_t) -> u64 {
    use std::arch::aarch64::*;

    // View the vector as eight 16-bit lanes, shift each right by 4 and narrow
    // it to 8 bits: this keeps 4 bits from each of the two bytes in the lane,
    // packing the 128-bit input into 64 bits.
    let nibble_mask = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
    // Extract the packed 64 bits as a scalar.
    vget_lane_u64(vreinterpret_u64_u8(nibble_mask), 0)
}
355+
/// NEON implementation of source analysis.
///
/// Processes `src` in 16-byte chunks. A chunk that contains only ASCII bytes
/// has its newlines located with vector comparisons, and the offset just past
/// each newline is pushed onto `lines`. A chunk that contains any byte with
/// the high bit set is handed to `analyze_source_file_generic`, as are the
/// trailing bytes that do not fill a whole chunk.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports NEON.
#[target_feature(enable = "neon")]
#[cfg(target_arch = "aarch64")]
unsafe fn analyze_source_file_neon(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    use std::arch::aarch64::*;

    const CHUNK_SIZE: usize = 16;

    let src_bytes = src.as_bytes();
    // Loop-invariant base pointer for the signed-byte vector loads below.
    let base_ptr = src_bytes.as_ptr().cast::<i8>();

    let chunk_count = src.len() / CHUNK_SIZE;

    let newline = vdupq_n_s8(b'\n' as i8);

    // This variable keeps track of where we should start decoding a
    // chunk. If a multi-byte character spans across chunk boundaries,
    // we need to skip that part in the next chunk because we already
    // handled it.
    let mut intra_chunk_offset = 0;

    for chunk_index in 0..chunk_count {
        let chunk_start = chunk_index * CHUNK_SIZE;

        // In bounds: `chunk_index < src.len() / CHUNK_SIZE`, hence
        // `chunk_start + CHUNK_SIZE <= src.len()`.
        let chunk = vld1q_s8(base_ptr.add(chunk_start));

        // For each byte in the chunk, see if its value is < 0 when read as
        // `i8` (high bit set), which indicates that it is part of a
        // multi-byte UTF-8 character.
        let multibyte_test = vcltzq_s8(chunk);
        // Create a bit mask from the comparison results.
        let multibyte_mask = move_mask(multibyte_test);

        // If the bit mask is all zero, we only have ASCII chars here:
        if multibyte_mask == 0 {
            // A pending offset would mean this chunk starts inside a
            // multi-byte character, which an all-ASCII chunk cannot do.
            assert!(intra_chunk_offset == 0);

            // Check for newlines in the chunk
            let newlines_test = vceqq_s8(chunk, newline);
            let mut newlines_mask = move_mask(newlines_test);

            // If the bit mask is not all zero, there are newlines in this chunk.
            if newlines_mask != 0 {
                // `+ 1` so that the recorded offset is the position right
                // after the newline byte.
                // NOTE(review): `as u32` truncates for offsets >= 4 GiB;
                // presumably sources are guaranteed to fit in `TextSize` -- confirm.
                let output_offset = TextSize::from((chunk_start + 1) as u32);

                while newlines_mask != 0 {
                    // Every matching byte contributes 4 set bits, so the
                    // byte index is the bit position divided by 4.
                    let trailing_zeros = newlines_mask.trailing_zeros();
                    let index = trailing_zeros / 4;

                    lines.push(TextSize::from(index) + output_offset);

                    // Clear the current 4-bit group, so we can find the next one.
                    newlines_mask &= (!0xF) << trailing_zeros;
                }
            }
            continue;
        }

        // The chunk contains non-ASCII bytes: let the generic implementation
        // handle the rest of it, skipping bytes already consumed as part of a
        // character that started in the previous chunk.
        let scan_start = chunk_start + intra_chunk_offset;
        // NOTE(review): same `as u32` truncation caveat as above.
        intra_chunk_offset = analyze_source_file_generic(
            &src[scan_start..],
            CHUNK_SIZE - intra_chunk_offset,
            TextSize::from(scan_start as u32),
            lines,
            multi_byte_chars,
        );
    }

    // Whatever is left after the last whole chunk goes through the generic
    // implementation.
    let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
    if tail_start < src.len() {
        analyze_source_file_generic(
            &src[tail_start..],
            src.len() - tail_start,
            TextSize::from(tail_start as u32),
            lines,
            multi_byte_chars,
        );
    }
}
435+
436+ #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" , target_arch = "aarch64" ) ) ) ]
326437// The target (or compiler version) does not support SSE2 ...
327438fn analyze_source_file_dispatch (
328439 src : & str ,
0 commit comments