@@ -706,11 +706,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
706706 states:: Data => loop {
707707 let set = small_char_set ! ( '\r' '\0' '&' '<' '\n' ) ;
708708
709- #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
709+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" , target_arch = "aarch64" ) ) ]
710710 let set_result = if !( self . opts . exact_errors
711711 || self . reconsume . get ( )
712712 || self . ignore_lf . get ( ) )
713- && is_x86_feature_detected ! ( "sse2" )
713+ && Self :: is_supported_simd_feature_detected ( )
714714 {
715715 let front_buffer = input. peek_front_chunk_mut ( ) ;
716716 let Some ( mut front_buffer) = front_buffer else {
@@ -729,8 +729,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
729729 self . pop_except_from ( input, set)
730730 } else {
731731 // SAFETY:
732- // This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
733- let result = unsafe { self . data_state_sse2_fast_path ( & mut front_buffer) } ;
732+ // This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above
733+ let result = unsafe { self . data_state_simd_fast_path ( & mut front_buffer) } ;
734734
735735 if front_buffer. is_empty ( ) {
736736 drop ( front_buffer) ;
@@ -743,7 +743,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
743743 self . pop_except_from ( input, set)
744744 } ;
745745
746- #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" ) ) ) ]
746+ #[ cfg( not( any(
747+ target_arch = "x86" ,
748+ target_arch = "x86_64" ,
749+ target_arch = "aarch64"
750+ ) ) ) ]
747751 let set_result = self . pop_except_from ( input, set) ;
748752
749753 let Some ( set_result) = set_result else {
@@ -1885,18 +1889,90 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
18851889 }
18861890 }
18871891
1888- #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
1889- #[ target_feature( enable = "sse2" ) ]
1892+ /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64.
1893+ fn is_supported_simd_feature_detected ( ) -> bool {
1894+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
1895+ {
1896+ is_x86_feature_detected ! ( "sse2" )
1897+ }
1898+
1899+ #[ cfg( target_arch = "aarch64" ) ]
1900+ {
1901+ std:: arch:: is_aarch64_feature_detected!( "neon" )
1902+ }
1903+
1904+ #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" , target_arch = "aarch64" ) ) ) ]
1905+ false
1906+ }
1907+
1908+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" , target_arch = "aarch64" ) ) ]
18901909 /// Implements the [data state] with SIMD instructions.
1910+ /// Calls SSE2- or NEON-specific function for chunks and processes any remaining bytes.
18911911 ///
18921912 /// The algorithm implemented is the naive SIMD approach described [here].
18931913 ///
18941914 /// ### SAFETY:
1895- /// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
1915+ /// Calling this function on a CPU that supports neither SSE2 nor NEON causes undefined behaviour.
18961916 ///
18971917 /// [data state]: https://html.spec.whatwg.org/#data-state
18981918 /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1899- unsafe fn data_state_sse2_fast_path ( & self , input : & mut StrTendril ) -> Option < SetResult > {
1919+ unsafe fn data_state_simd_fast_path ( & self , input : & mut StrTendril ) -> Option < SetResult > {
1920+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
1921+ let ( mut i, mut n_newlines) = self . data_state_sse2_fast_path ( input) ;
1922+
1923+ #[ cfg( target_arch = "aarch64" ) ]
1924+ let ( mut i, mut n_newlines) = self . data_state_neon_fast_path ( input) ;
1925+
1926+ // Process any remaining bytes (less than STRIDE)
1927+ while let Some ( c) = input. as_bytes ( ) . get ( i) {
1928+ if matches ! ( * c, b'<' | b'&' | b'\r' | b'\0' ) {
1929+ break ;
1930+ }
1931+ if * c == b'\n' {
1932+ n_newlines += 1 ;
1933+ }
1934+
1935+ i += 1 ;
1936+ }
1937+
1938+ let set_result = if i == 0 {
1939+ let first_char = input. pop_front_char ( ) . unwrap ( ) ;
1940+ debug_assert ! ( matches!( first_char, '<' | '&' | '\r' | '\0' ) ) ;
1941+
1942+ // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1943+ // Still, it would be nice to not have to do that.
1944+ // The same is true for the unwrap call.
1945+ let preprocessed_char = self
1946+ . get_preprocessed_char ( first_char, & BufferQueue :: default ( ) )
1947+ . unwrap ( ) ;
1948+ SetResult :: FromSet ( preprocessed_char)
1949+ } else {
1950+ debug_assert ! (
1951+ input. len( ) >= i,
1952+ "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long" ,
1953+ i,
1954+ input. len( )
1955+ ) ;
1956+ let consumed_chunk = input. unsafe_subtendril ( 0 , i as u32 ) ;
1957+ input. unsafe_pop_front ( i as u32 ) ;
1958+ SetResult :: NotFromSet ( consumed_chunk)
1959+ } ;
1960+
1961+ self . current_line . set ( self . current_line . get ( ) + n_newlines) ;
1962+
1963+ Some ( set_result)
1964+ }
1965+
1966+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
1967+ #[ target_feature( enable = "sse2" ) ]
1968+ /// Implements the [data state] with SSE2 instructions for x86/x86_64.
1969+ /// Returns a pair of the number of bytes processed and the number of newlines found.
1970+ ///
1971+ /// ### SAFETY:
1972+ /// Calling this function on a CPU that does not support NEON causes undefined behaviour.
1973+ ///
1974+ /// [data state]: https://html.spec.whatwg.org/#data-state
1975+ unsafe fn data_state_sse2_fast_path ( & self , input : & mut StrTendril ) -> ( usize , u64 ) {
19001976 #[ cfg( target_arch = "x86" ) ]
19011977 use std:: arch:: x86:: {
19021978 __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
@@ -1960,44 +2036,78 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
19602036 i += STRIDE ;
19612037 }
19622038
1963- // Process any remaining bytes (less than STRIDE)
1964- while let Some ( c) = raw_bytes. get ( i) {
1965- if matches ! ( * c, b'<' | b'&' | b'\r' | b'\0' ) {
1966- break ;
1967- }
1968- if * c == b'\n' {
1969- n_newlines += 1 ;
1970- }
2039+ ( i, n_newlines)
2040+ }
19712041
1972- i += 1 ;
1973- }
2042+ #[ cfg( target_arch = "aarch64" ) ]
2043+ #[ target_feature( enable = "neon" ) ]
2044+ /// Implements the [data state] with NEON SIMD instructions for AArch64.
2045+ /// Returns a pair of the number of bytes processed and the number of newlines found.
2046+ ///
2047+ /// ### SAFETY:
2048+ /// Calling this function on a CPU that does not support NEON causes undefined behaviour.
2049+ ///
2050+ /// [data state]: https://html.spec.whatwg.org/#data-state
2051+ unsafe fn data_state_neon_fast_path ( & self , input : & mut StrTendril ) -> ( usize , u64 ) {
2052+ use std:: arch:: aarch64:: { vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8} ;
19742053
1975- let set_result = if i == 0 {
1976- let first_char = input. pop_front_char ( ) . unwrap ( ) ;
1977- debug_assert ! ( matches!( first_char, '<' | '&' | '\r' | '\0' ) ) ;
2054+ debug_assert ! ( !input. is_empty( ) ) ;
19782055
1979- // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1980- // Still, it would be nice to not have to do that.
1981- // The same is true for the unwrap call.
1982- let preprocessed_char = self
1983- . get_preprocessed_char ( first_char, & BufferQueue :: default ( ) )
1984- . unwrap ( ) ;
1985- SetResult :: FromSet ( preprocessed_char)
1986- } else {
1987- debug_assert ! (
1988- input. len( ) >= i,
1989- "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long" ,
1990- i,
1991- input. len( )
1992- ) ;
1993- let consumed_chunk = input. unsafe_subtendril ( 0 , i as u32 ) ;
1994- input. unsafe_pop_front ( i as u32 ) ;
1995- SetResult :: NotFromSet ( consumed_chunk)
1996- } ;
2056+ let quote_mask = vdupq_n_u8 ( b'<' ) ;
2057+ let escape_mask = vdupq_n_u8 ( b'&' ) ;
2058+ let carriage_return_mask = vdupq_n_u8 ( b'\r' ) ;
2059+ let zero_mask = vdupq_n_u8 ( b'\0' ) ;
2060+ let newline_mask = vdupq_n_u8 ( b'\n' ) ;
19972061
1998- self . current_line . set ( self . current_line . get ( ) + n_newlines) ;
2062+ let raw_bytes: & [ u8 ] = input. as_bytes ( ) ;
2063+ let start = raw_bytes. as_ptr ( ) ;
19992064
2000- Some ( set_result)
2065+ const STRIDE : usize = 16 ;
2066+ let mut i = 0 ;
2067+ let mut n_newlines = 0 ;
2068+ while i + STRIDE <= raw_bytes. len ( ) {
2069+ // Load a 16 byte chunk from the input
2070+ let data = vld1q_u8 ( start. add ( i) ) ;
2071+
2072+ // Compare the chunk against each mask
2073+ let quotes = vceqq_u8 ( data, quote_mask) ;
2074+ let escapes = vceqq_u8 ( data, escape_mask) ;
2075+ let carriage_returns = vceqq_u8 ( data, carriage_return_mask) ;
2076+ let zeros = vceqq_u8 ( data, zero_mask) ;
2077+ let newlines = vceqq_u8 ( data, newline_mask) ;
2078+
2079+ // Combine all test results and create a bitmask from them.
2080+ // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
2081+ let test_result =
2082+ vorrq_u8 ( vorrq_u8 ( quotes, zeros) , vorrq_u8 ( escapes, carriage_returns) ) ;
2083+ let bitmask = vmaxvq_u8 ( test_result) ;
2084+ let newline_mask = vmaxvq_u8 ( newlines) ;
2085+ if bitmask != 0 {
2086+ // We have reached one of the characters that cause the state machine to transition
2087+ let chunk_bytes = std:: slice:: from_raw_parts ( start. add ( i) , STRIDE ) ;
2088+ let position = chunk_bytes
2089+ . iter ( )
2090+ . position ( |& b| matches ! ( b, b'<' | b'&' | b'\r' | b'\0' ) )
2091+ . unwrap ( ) ;
2092+
2093+ n_newlines += chunk_bytes[ ..position]
2094+ . iter ( )
2095+ . filter ( |& & b| b == b'\n' )
2096+ . count ( ) as u64 ;
2097+
2098+ i += position;
2099+ break ;
2100+ } else {
2101+ if newline_mask != 0 {
2102+ let chunk_bytes = std:: slice:: from_raw_parts ( start. add ( i) , STRIDE ) ;
2103+ n_newlines += chunk_bytes. iter ( ) . filter ( |& & b| b == b'\n' ) . count ( ) as u64 ;
2104+ }
2105+ }
2106+
2107+ i += STRIDE ;
2108+ }
2109+
2110+ ( i, n_newlines)
20012111 }
20022112}
20032113
0 commit comments