@@ -48,6 +48,17 @@ constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
4848#if ADA_NEON
4949ada_really_inline bool has_tabs_or_newline (
5050 std::string_view user_input) noexcept {
51+ // first check for short strings in which case we do it naively.
52+ if (user_input.size () < 16 ) { // slow path
53+ for (size_t i = 0 ; i < user_input.size (); i++) {
54+ if (user_input[i] == ' \r ' || user_input[i] == ' \n ' ||
55+ user_input[i] == ' \t ' ) {
56+ return true ;
57+ }
58+ }
59+ return false ;
60+ }
61+ // fast path for long strings (expected to be common)
5162 size_t i = 0 ;
5263 const uint8x16_t mask1 = vmovq_n_u8 (' \r ' );
5364 const uint8x16_t mask2 = vmovq_n_u8 (' \n ' );
@@ -60,9 +71,8 @@ ada_really_inline bool has_tabs_or_newline(
6071 vceqq_u8 (word, mask3));
6172 }
6273 if (i < user_input.size ()) {
63- uint8_t buffer[16 ]{};
64- memcpy (buffer, user_input.data () + i, user_input.size () - i);
65- uint8x16_t word = vld1q_u8 ((const uint8_t *)user_input.data () + i);
74+ uint8x16_t word =
75+ vld1q_u8 ((const uint8_t *)user_input.data () + user_input.length () - 16 );
6676 running = vorrq_u8 (vorrq_u8 (running, vorrq_u8 (vceqq_u8 (word, mask1),
6777 vceqq_u8 (word, mask2))),
6878 vceqq_u8 (word, mask3));
@@ -72,6 +82,17 @@ ada_really_inline bool has_tabs_or_newline(
7282#elif ADA_SSE2
7383ada_really_inline bool has_tabs_or_newline (
7484 std::string_view user_input) noexcept {
85+ // first check for short strings in which case we do it naively.
86+ if (user_input.size () < 16 ) { // slow path
87+ for (size_t i = 0 ; i < user_input.size (); i++) {
88+ if (user_input[i] == ' \r ' || user_input[i] == ' \n ' ||
89+ user_input[i] == ' \t ' ) {
90+ return true ;
91+ }
92+ }
93+ return false ;
94+ }
95+ // fast path for long strings (expected to be common)
7596 size_t i = 0 ;
7697 const __m128i mask1 = _mm_set1_epi8 (' \r ' );
7798 const __m128i mask2 = _mm_set1_epi8 (' \n ' );
@@ -85,9 +106,8 @@ ada_really_inline bool has_tabs_or_newline(
85106 _mm_cmpeq_epi8 (word, mask3));
86107 }
87108 if (i < user_input.size ()) {
88- alignas (16 ) uint8_t buffer[16 ]{};
89- memcpy (buffer, user_input.data () + i, user_input.size () - i);
90- __m128i word = _mm_load_si128 ((const __m128i*)buffer);
109+ __m128i word = _mm_loadu_si128 (
110+ (const __m128i*)(user_input.data () + user_input.length () - 16 ));
91111 running = _mm_or_si128 (
92112 _mm_or_si128 (running, _mm_or_si128 (_mm_cmpeq_epi8 (word, mask1),
93113 _mm_cmpeq_epi8 (word, mask2))),
0 commit comments