@@ -227,6 +227,22 @@ fn analyze_source_file_dispatch(
227227 }
228228}
229229
230+ #[ cfg( target_arch = "aarch64" ) ]
231+ fn analyze_source_file_dispatch (
232+ src : & str ,
233+ lines : & mut Vec < TextSize > ,
234+ multi_byte_chars : & mut IntMap < u32 , Vec < WideChar > > ,
235+ ) {
236+ if std:: arch:: is_aarch64_feature_detected!( "neon" ) {
237+ // SAFETY: NEON support was checked
238+ unsafe {
239+ analyze_source_file_neon ( src, lines, multi_byte_chars) ;
240+ }
241+ } else {
242+ analyze_source_file_generic ( src, src. len ( ) , TextSize :: from ( 0 ) , lines, multi_byte_chars) ;
243+ }
244+ }
245+
230246/// Checks 16 byte chunks of text at a time. If the chunk contains
231247/// something other than printable ASCII characters and newlines, the
232248/// function falls back to the generic implementation. Otherwise it uses
@@ -322,7 +338,101 @@ unsafe fn analyze_source_file_sse2(
322338 }
323339}
324340
325- #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" ) ) ) ]
341+ #[ target_feature( enable = "neon" ) ]
342+ #[ cfg( any( target_arch = "aarch64" ) ) ]
343+ // See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
344+ //
345+ // The mask is a 64-bit integer, where each 4-bit corresponds to a u8 in the
346+ // input vector. The least significant 4 bits correspond to the first byte in
347+ // the vector.
348+ unsafe fn move_mask ( v : std:: arch:: aarch64:: uint8x16_t ) -> u64 {
349+ use std:: arch:: aarch64:: * ;
350+
351+ let nibble_mask = vshrn_n_u16 ( vreinterpretq_u16_u8 ( v) , 4 ) ;
352+ vget_lane_u64 ( vreinterpret_u64_u8 ( nibble_mask) , 0 )
353+ }
354+
355+ #[ target_feature( enable = "neon" ) ]
356+ #[ cfg( any( target_arch = "aarch64" ) ) ]
357+ unsafe fn analyze_source_file_neon (
358+ src : & str ,
359+ lines : & mut Vec < TextSize > ,
360+ multi_byte_chars : & mut IntMap < u32 , Vec < WideChar > > ,
361+ ) {
362+ use std:: arch:: aarch64:: * ;
363+
364+ const CHUNK_SIZE : usize = 16 ;
365+
366+ let src_bytes = src. as_bytes ( ) ;
367+
368+ let chunk_count = src. len ( ) / CHUNK_SIZE ;
369+
370+ let newline = vdupq_n_s8 ( b'\n' as i8 ) ;
371+
372+ // This variable keeps track of where we should start decoding a
373+ // chunk. If a multi-byte character spans across chunk boundaries,
374+ // we need to skip that part in the next chunk because we already
375+ // handled it.
376+ let mut intra_chunk_offset = 0 ;
377+
378+ for chunk_index in 0 ..chunk_count {
379+ let ptr = src_bytes. as_ptr ( ) as * const i8 ;
380+ let chunk = vld1q_s8 ( ptr. add ( chunk_index * CHUNK_SIZE ) ) ;
381+
382+ // For character in the chunk, see if its byte value is < 0, which
383+ // indicates that it's part of a UTF-8 char.
384+ let multibyte_test = vcltzq_s8 ( chunk) ;
385+ // Create a bit mask from the comparison results.
386+ let multibyte_mask = move_mask ( multibyte_test) ;
387+
388+ // If the bit mask is all zero, we only have ASCII chars here:
389+ if multibyte_mask == 0 {
390+ assert ! ( intra_chunk_offset == 0 ) ;
391+
392+ // Check for newlines in the chunk
393+ let newlines_test = vceqq_s8 ( chunk, newline) ;
394+ let mut newlines_mask = move_mask ( newlines_test) ;
395+
396+ // If the bit mask is all zero, there are no newlines in this chunk.
397+ if newlines_mask != 0 {
398+ let output_offset = TextSize :: from ( ( chunk_index * CHUNK_SIZE + 1 ) as u32 ) ;
399+
400+ while newlines_mask != 0 {
401+ let trailing_zeros = newlines_mask. trailing_zeros ( ) ;
402+ let index = trailing_zeros / 4 ;
403+
404+ lines. push ( TextSize :: from ( index) + output_offset) ;
405+
406+ // Clear the current 4-bit, so we can find the next one.
407+ newlines_mask &= ( !0xF ) << trailing_zeros;
408+ }
409+ }
410+ continue ;
411+ }
412+
413+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
414+ intra_chunk_offset = analyze_source_file_generic (
415+ & src[ scan_start..] ,
416+ CHUNK_SIZE - intra_chunk_offset,
417+ TextSize :: from ( scan_start as u32 ) ,
418+ lines,
419+ multi_byte_chars,
420+ ) ;
421+ }
422+
423+ let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
424+ if tail_start < src. len ( ) {
425+ analyze_source_file_generic (
426+ & src[ tail_start..] ,
427+ src. len ( ) - tail_start,
428+ TextSize :: from ( tail_start as u32 ) ,
429+ lines,
430+ multi_byte_chars,
431+ ) ;
432+ }
433+ }
434+
435+ #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" , target_arch = "aarch64" ) ) ) ]
326436// The target (or compiler version) does not support SSE2 ...
327437fn analyze_source_file_dispatch (
328438 src : & str ,
0 commit comments