@@ -704,7 +704,52 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
704704 match self . state . get ( ) {
705705 //§ data-state
706706 states:: Data => loop {
707- match pop_except_from ! ( self , input, small_char_set!( '\r' '\0' '&' '<' '\n' ) ) {
707+ let set = small_char_set ! ( '\r' '\0' '&' '<' '\n' ) ;
708+
709+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
710+ let set_result = if !( self . opts . exact_errors
711+ || self . reconsume . get ( )
712+ || self . ignore_lf . get ( ) )
713+ && is_x86_feature_detected ! ( "sse2" )
714+ {
715+ let front_buffer = input. peek_front_chunk_mut ( ) ;
716+ let Some ( mut front_buffer) = front_buffer else {
717+ return ProcessResult :: Suspend ;
718+ } ;
719+
720+ // Special case: The fast path is not worth taking if the first character is already in the set,
721+ // which is fairly common
722+ let first_char = front_buffer
723+ . chars ( )
724+ . next ( )
725+ . expect ( "Input buffers are never empty" ) ;
726+
727+ if matches ! ( first_char, '\r' | '\0' | '&' | '<' | '\n' ) {
728+ drop ( front_buffer) ;
729+ self . pop_except_from ( input, set)
730+ } else {
731+ // SAFETY:
732+ // This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
733+ let result = unsafe { self . data_state_sse2_fast_path ( & mut front_buffer) } ;
734+
735+ if front_buffer. is_empty ( ) {
736+ drop ( front_buffer) ;
737+ input. pop_front ( ) ;
738+ }
739+
740+ result
741+ }
742+ } else {
743+ self . pop_except_from ( input, set)
744+ } ;
745+
746+ #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" ) ) ) ]
747+ let set_result = self . pop_except_from ( input, set) ;
748+
749+ let Some ( set_result) = set_result else {
750+ return ProcessResult :: Suspend ;
751+ } ;
752+ match set_result {
708753 FromSet ( '\0' ) => {
709754 self . bad_char_error ( ) ;
710755 self . emit_char ( '\0' ) ;
@@ -1839,6 +1884,121 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
18391884 states:: CdataSectionEnd => go ! ( self : push_temp ']' ; push_temp ']' ; to CdataSection ) ,
18401885 }
18411886 }
1887+
1888+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
1889+ #[ target_feature( enable = "sse2" ) ]
1890+ /// Implements the [data state] with SIMD instructions.
1891+ ///
1892+ /// The algorithm implemented is the naive SIMD approach described [here].
1893+ ///
1894+ /// ### SAFETY:
1895+ /// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
1896+ ///
1897+ /// [data state]: https://html.spec.whatwg.org/#data-state
1898+ /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1899+ unsafe fn data_state_sse2_fast_path ( & self , input : & mut StrTendril ) -> Option < SetResult > {
1900+ #[ cfg( target_arch = "x86" ) ]
1901+ use std:: arch:: x86:: {
1902+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1903+ _mm_set1_epi8,
1904+ } ;
1905+ #[ cfg( target_arch = "x86_64" ) ]
1906+ use std:: arch:: x86_64:: {
1907+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1908+ _mm_set1_epi8,
1909+ } ;
1910+
1911+ debug_assert ! ( !input. is_empty( ) ) ;
1912+
1913+ let quote_mask = _mm_set1_epi8 ( '<' as i8 ) ;
1914+ let escape_mask = _mm_set1_epi8 ( '&' as i8 ) ;
1915+ let carriage_return_mask = _mm_set1_epi8 ( '\r' as i8 ) ;
1916+ let zero_mask = _mm_set1_epi8 ( '\0' as i8 ) ;
1917+ let newline_mask = _mm_set1_epi8 ( '\n' as i8 ) ;
1918+
1919+ let raw_bytes: & [ u8 ] = input. as_bytes ( ) ;
1920+ let start = raw_bytes. as_ptr ( ) ;
1921+
1922+ const STRIDE : usize = 16 ;
1923+ let mut i = 0 ;
1924+ let mut n_newlines = 0 ;
1925+ while i + STRIDE <= raw_bytes. len ( ) {
1926+ // Load a 16 byte chunk from the input
1927+ let data = _mm_loadu_si128 ( start. add ( i) as * const __m128i ) ;
1928+
1929+ // Compare the chunk against each mask
1930+ let quotes = _mm_cmpeq_epi8 ( data, quote_mask) ;
1931+ let escapes = _mm_cmpeq_epi8 ( data, escape_mask) ;
1932+ let carriage_returns = _mm_cmpeq_epi8 ( data, carriage_return_mask) ;
1933+ let zeros = _mm_cmpeq_epi8 ( data, zero_mask) ;
1934+ let newlines = _mm_cmpeq_epi8 ( data, newline_mask) ;
1935+
1936+ // Combine all test results and create a bitmask from them.
1937+ // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
1938+ let test_result = _mm_or_si128 (
1939+ _mm_or_si128 ( quotes, zeros) ,
1940+ _mm_or_si128 ( escapes, carriage_returns) ,
1941+ ) ;
1942+ let bitmask = _mm_movemask_epi8 ( test_result) ;
1943+ let newline_mask = _mm_movemask_epi8 ( newlines) ;
1944+
1945+ if ( bitmask != 0 ) {
1946+ // We have reached one of the characters that cause the state machine to transition
1947+ let position = if cfg ! ( target_endian = "little" ) {
1948+ bitmask. trailing_zeros ( ) as usize
1949+ } else {
1950+ bitmask. leading_zeros ( ) as usize
1951+ } ;
1952+
1953+ n_newlines += ( newline_mask & ( ( 1 << position) - 1 ) ) . count_ones ( ) as u64 ;
1954+ i += position;
1955+ break ;
1956+ } else {
1957+ n_newlines += newline_mask. count_ones ( ) as u64 ;
1958+ }
1959+
1960+ i += STRIDE ;
1961+ }
1962+
1963+ // Process any remaining bytes (less than STRIDE)
1964+ while let Some ( c) = raw_bytes. get ( i) {
1965+ if matches ! ( * c, b'<' | b'&' | b'\r' | b'\0' ) {
1966+ break ;
1967+ }
1968+ if * c == b'\n' {
1969+ n_newlines += 1 ;
1970+ }
1971+
1972+ i += 1 ;
1973+ }
1974+
1975+ let set_result = if i == 0 {
1976+ let first_char = input. pop_front_char ( ) . unwrap ( ) ;
1977+ debug_assert ! ( matches!( first_char, '<' | '&' | '\r' | '\0' ) ) ;
1978+
1979+ // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1980+ // Still, it would be nice to not have to do that.
1981+ // The same is true for the unwrap call.
1982+ let preprocessed_char = self
1983+ . get_preprocessed_char ( first_char, & BufferQueue :: default ( ) )
1984+ . unwrap ( ) ;
1985+ SetResult :: FromSet ( preprocessed_char)
1986+ } else {
1987+ debug_assert ! (
1988+ input. len( ) >= i,
1989+ "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long" ,
1990+ i,
1991+ input. len( )
1992+ ) ;
1993+ let consumed_chunk = input. unsafe_subtendril ( 0 , i as u32 ) ;
1994+ input. unsafe_pop_front ( i as u32 ) ;
1995+ SetResult :: NotFromSet ( consumed_chunk)
1996+ } ;
1997+
1998+ self . current_line . set ( self . current_line . get ( ) + n_newlines) ;
1999+
2000+ Some ( set_result)
2001+ }
18422002}
18432003
18442004#[ cfg( test) ]
0 commit comments