@@ -89,110 +89,101 @@ xor 80 then << 1 then |
898911 -> 01 -> 1
9090=#
9191
92- @inline _widen_mask (msk:: UInt ) = ((msk% BigChunk) << (8 * sizeof (UInt))) | msk
93-
# Mask with the top bit of every byte set, sized to match a UInt chunk:
# the 32-bit literal is used when CHUNKSZ == 4, the 64-bit literal otherwise.
const hi_mask = (CHUNKSZ == 4) ? 0x8080_8080 : 0x8080_8080_8080_8080
# The same per-byte top-bit mask, produced by passing hi_mask through _widen_mask
# (used with BigChunk values below).
const big_hi_mask = _widen_mask(hi_mask)
9694
97- @inline _count_cont (v, msk) = (v = xor (v, msk); count_ones (xor (((v << 1 ) | v), msk) & msk))
# Select the top-bit-per-byte mask that matches the chunk type of the argument:
# hi_mask for UInt values, big_hi_mask for BigChunk values.
@inline function get_high_mask(::UInt)
    hi_mask
end
@inline function get_high_mask(::BigChunk)
    big_hi_mask
end
97+
# Flag every byte of `v` that is NOT a UTF-8 continuation byte (0b10xxxxxx).
# `msk` has the top bit of each byte set.  In the result, the top bit of a byte is set
# unless that byte of `v` has bit 7 set and bit 6 clear; all other result bits are zero.
@inline function msk_lead(v, msk)
    flipped = xor(v, msk)
    # After the flip, bit 7 | bit 6 is zero only for bytes of the form 0b10xxxxxx
    merged = (flipped << 1) | flipped
    xor(xor(merged, msk) & msk, msk)
end

# Convenience form: pick the mask from the chunk type of `v`
@inline function msk_lead(v)
    msk_lead(v, get_high_mask(v))
end
102101
103- @inline _count_cont (v:: BigChunk ) = _count_cont (v, big_hi_mask)
104- @inline _msk_lead (v:: BigChunk ) = _msk_lead (v, big_hi_mask)
# Load one chunk of type `T` from `ptr` and return its non-continuation-byte flags
# (see msk_lead).
@inline function get_lead(T, ptr)
    chunk = unsafe_load(reinterpret(Ptr{T}, ptr))
    msk_lead(chunk)
end
105103
106- @inline function _align_len_utf8 (pnt, cnt, v)
107- len = 0
108- fin = pnt + cnt
109- v = msk_lead (v)
110- while (pnt += CHUNKSZ) < fin
# Count the set bits of `v` after it has been passed through _mask_bytes(v, cnt)
@inline function count_masked(v, cnt)
    kept = _mask_bytes(v, cnt)
    count_ones(kept)
end
105+
# Count the bytes flagged by msk_lead (i.e. the non-continuation bytes) in `cnt` bytes of
# UTF-8 data starting at `beg`.
function _length_al(::MultiCU, ::Type{UTF8CSE}, beg::Ptr{UInt8}, cnt::Int)
    # Handle the very frequent short-string cases first
    # (on 64-bit machines: 1-8 bytes, 9-16 bytes, and 17-24 bytes).
    # This relies on how String types are stored in Julia: a UInt length immediately
    # followed by the string data, aligned on sizeof(UInt)*2.
    if cnt <= CHUNKSZ
        return count_masked(get_lead(UInt, beg), cnt)
    elseif cnt <= BIGCHUNKSZ
        return count_masked(get_lead(BigChunk, beg), cnt)
    end
    # Count the first UInt-sized chunk on its own, then continue in BigChunk steps
    total = count_ones(get_lead(UInt, beg))
    cnt -= CHUNKSZ
    cur = _pntbigchunk(beg + CHUNKSZ)
    flags = get_lead(BigChunk, cur)
    if cnt > BIGCHUNKSZ
        stop = _pntbigchunk(beg + CHUNKSZ + cnt)
        cur += BIGCHUNKSZ
        while cur < stop
            total += count_ones(flags)
            flags = get_lead(BigChunk, cur)
            cur += BIGCHUNKSZ
        end
    end
    # The last chunk may be partial, so it goes through _mask_bytes before counting
    total + count_masked(flags, cnt)
end
116128
117- _length_al (:: MultiCU , :: Type{UTF8CSE} , beg:: Ptr{UInt8} , cnt:: Int ) =
118- (pnt = reinterpret (Ptr{UInt}, beg); _align_len_utf8 (pnt, cnt, unsafe_load (pnt)))
119-
120- function _length (:: MultiCU , :: Type{UTF8CSE} , beg:: Ptr{UInt8} , cnt:: Int )
# Variant of the UTF-8 length count for data that need not start on a BigChunk boundary.
# The start address is rounded down to a BigChunk boundary and the extra leading bytes
# are neutralised before counting.
function _length_ul(::MultiCU, ::Type{UTF8CSE}, beg::Ptr{UInt8}, cnt::Int)
    addr = reinterpret(UInt, beg)
    cur = reinterpret(Ptr{BigChunk}, addr & ~BIGCHUNKMSK)
    chunk = unsafe_load(cur)
    skip = addr & BIGCHUNKMSK
    if skip != 0
        # presumably _big_mask_bytes(skip) covers the `skip` bytes that precede the
        # data -- TODO confirm.  Those bytes are forced to the big_hi_mask pattern
        # (0x80 per byte), which msk_lead does not flag, so they are never counted.
        lowmsk = _big_mask_bytes(skip)
        chunk = (chunk & ~lowmsk) | (lowmsk & big_hi_mask)
        cnt += skip
    end
    stop = cur + cnt
    flags = msk_lead(chunk)
    total = 0
    cur += BIGCHUNKSZ
    while cur < stop
        total += count_ones(flags)
        flags = msk_lead(unsafe_load(cur))
        cur += BIGCHUNKSZ
    end
    # The last chunk may be partial, so it goes through _mask_bytes before counting
    total + count_masked(flags, cnt)
end
152147
153- @inline _mask_bytes (v:: T , cnt) where {T} =
154- ifelse ((cnt & (sizeof (T)- 1 )% UInt) == 0 ,
155- v, T (v & (one (T) << ((cnt & (sizeof (T)- 1 )% UInt) << 3 )) - 1 ))
156-
157- @inline chk_chunk (ptr, msk:: T , cnt) where {T} =
158- iszero (_mask_bytes (unsafe_load (reinterpret (Ptr{T}, ptr)) & msk, cnt))
# Load one chunk of the same type as `msk` from `ptr`, keep only the bits selected by
# `msk`, and pass the result through _mask_bytes(…, cnt).
@inline function get_chunk(ptr, msk::T, cnt) where {T}
    loaded = unsafe_load(reinterpret(Ptr{T}, ptr))
    _mask_bytes(loaded & msk, cnt)
end
159150
160- @inline function _check_block_al (ptr, cnt, msk)
# Return true when none of the bits selected by `msk` is set in the `cnt` bytes at `ptr`.
# `msk` is a UInt-sized per-chunk mask; it is widened with _widen_mask for BigChunk loads.
@inline function _check_mask_al(ptr, cnt, msk)
    # First check very frequent cases of short strings
    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
    # taking advantage of the knowledge of how String types are stored in Julia,
    # i.e. UInt length, immediately followed by the string data, aligned on sizeof(UInt)*2
    cnt <= CHUNKSZ && return get_chunk(ptr, msk, cnt) == 0
    bigmsk = _widen_mask(msk)
    cnt <= BIGCHUNKSZ && return get_chunk(ptr, bigmsk, cnt) == 0
    # Check the first UInt-sized chunk on its own
    (unsafe_load(_pntchunk(ptr)) & msk) == 0 || return false
    cnt -= CHUNKSZ
    # The first chunk has been consumed, so the remaining `cnt` bytes start at
    # ptr + CHUNKSZ.  Loading from `ptr` here would re-check the first chunk and
    # leave the final CHUNKSZ bytes of the data unchecked.
    cnt <= BIGCHUNKSZ && return get_chunk(ptr + CHUNKSZ, bigmsk, cnt) == 0
    pnt = _pntbigchunk(ptr + CHUNKSZ)
    fin = _pntbigchunk(ptr + CHUNKSZ + cnt)
    v = unsafe_load(pnt)
    while (pnt += BIGCHUNKSZ) < fin
        (v & bigmsk) == 0 || return false
        v = unsafe_load(pnt)
    end
    # The last chunk may be partial, so it goes through _mask_bytes before the test
    _mask_bytes(v & bigmsk, cnt) == 0
end
180171
181- @inline function _check_block_ul (beg, cnt, msk)
# Return true when none of the bits selected by `msk` is set in the `cnt` bytes at `beg`.
# Unlike _check_mask_al, the start address is first rounded down to a BigChunk boundary.
@inline function _check_mask_ul(beg, cnt, msk)
    widemsk = _widen_mask(msk)
    addr = reinterpret(UInt, beg)
    cur = _pntbigchunk(addr & ~BIGCHUNKMSK)
    chunk = unsafe_load(cur)
    skip = addr & BIGCHUNKMSK
    if skip != 0
        # presumably clears the `skip` bytes that precede the data -- TODO confirm
        # against _big_mask_bytes
        chunk &= ~_big_mask_bytes(skip)
        cnt += skip
    end
    stop = cur + cnt
    cur += BIGCHUNKSZ
    while cur < stop
        (chunk & widemsk) == 0 || return false
        chunk = unsafe_load(cur)
        cur += BIGCHUNKSZ
    end
    # The last chunk may be partial, so it goes through _mask_bytes before the test
    _mask_bytes(chunk & widemsk, cnt) == 0
end
197188
# For UInt8 code units the ASCII test mask is the top bit of every byte
function _ascii_mask(::Type{UInt8})
    hi_mask
end
@@ -226,88 +217,99 @@ is_ascii(vec::Vector{T}) where {T<:CodeUnitTypes} =
226217
# An empty string is ASCII; otherwise scan its bytes with the ASCII mask for its
# code unit type.
function is_ascii(str::Str{C}) where {C<:ASCII_Union}
    cnt = sizeof(str)
    cnt == 0 && return true
    @preserve str _check_mask_al(pointer(str), cnt, _ascii_mask(codeunit(C)))
end
230221
# Return true when no byte of the chunk `val` marks a non-Latin1 character.
# A byte of the form 0b11yyyyxx with at least one y bit set indicates a non-Latin1
# character (this rule was recorded as a to-do note in the original source).
function _all_latin(val)
    # Bit 7 of each byte: set when both of the top two bits of that byte are set
    both_top = val & (val << 1)
    # Bit 7 of each byte: set when any of the four y bits of that byte is set
    any_y = (val << 2) | (val << 3) | (val << 4) | (val << 5)
    (both_top & any_y & get_high_mask(val)) == 0
end
234226
235- @inline function _check_latin_utf8_al (pnt, cnt, v)
# Scan `cnt` bytes of UTF-8 data at `beg` in UInt-sized chunks and return true when
# _all_latin holds for every chunk.
@inline function _check_latin_utf8_al(beg, cnt)
    cur = reinterpret(Ptr{UInt}, beg)
    stop = cur + cnt
    word = unsafe_load(cur)
    cur += CHUNKSZ
    while cur < stop
        _all_latin(word) || return false
        word = unsafe_load(cur)
        cur += CHUNKSZ
    end
    # The last chunk may be partial, so it goes through _mask_bytes first
    _all_latin(_mask_bytes(word, cnt))
end
243- @inline _check_latin_utf8_al (pnt, cnt) = _check_latin_utf8_al (pnt, cnt, unsafe_load (pnt))
244237
# Latin-1 scan for UTF-8 data that need not start on a BigChunk boundary: the start
# address is rounded down and the data is walked in BigChunk-sized steps.
@inline function _check_latin_utf8_ul(beg, cnt)
    addr = reinterpret(UInt, beg)
    cur = reinterpret(Ptr{BigChunk}, addr & ~BIGCHUNKMSK)
    chunk = unsafe_load(cur)
    skip = addr & BIGCHUNKMSK
    if skip != 0
        # presumably clears the `skip` bytes that precede the data -- TODO confirm
        # against _big_mask_bytes
        chunk &= ~_big_mask_bytes(skip)
        cnt += skip
    end
    stop = cur + cnt
    cur += BIGCHUNKSZ
    while cur < stop
        _all_latin(chunk) || return false
        chunk = unsafe_load(cur)
        cur += BIGCHUNKSZ
    end
    # The last chunk may be partial, so it goes through _mask_bytes first
    _all_latin(_mask_bytes(chunk, cnt))
end
255253
# is_latin: an empty input is always Latin; otherwise the bytes are scanned while the
# owning object is kept alive with @preserve.

# UTF-8 string: scan from its own pointer with the UInt-chunk scanner
function is_latin(str::Str{UTF8CSE})
    siz = sizeof(str)
    siz == 0 && return true
    @preserve str _check_latin_utf8_al(pointer(str), siz)
end

# Substring of a UTF-8 string: use the scanner that rounds the start address down
function is_latin(str::SubString{<:Str{UTF8CSE}})
    cnt = sizeof(str)
    cnt == 0 && return true
    @preserve str _check_latin_utf8_ul(pointer(str), cnt)
end

# Vector of 16- or 32-bit code units: mask test with the Latin mask for the element type
function is_latin(vec::Vector{T}) where {T<:Union{UInt16,UInt32}}
    cnt = sizeof(vec)
    cnt == 0 && return true
    @preserve vec _check_mask_ul(pointer(vec), cnt, _latin_mask(T))
end

# Substring of a 16- or 32-bit encoded string
function is_latin(str::SubString{<:Str{C}}) where {C<:Union{Word_CSEs,Quad_CSEs}}
    cnt = sizeof(str)
    cnt == 0 && return true
    @preserve str _check_mask_ul(pointer(str), cnt, _latin_mask(codeunit(C)))
end

# Whole 16- or 32-bit encoded string
function is_latin(str::Str{C}) where {C<:Union{Word_CSEs,Quad_CSEs}}
    cnt = sizeof(str)
    cnt == 0 && return true
    @preserve str _check_mask_al(pointer(str), cnt, _latin_mask(codeunit(C)))
end
274271
# Return true when no byte of the chunk `val` marks a non-BMP character.
# All 4 top bits of a byte must be 1 (i.e. 0xfx) for it to be non-BMP, so the shifted
# copies are ANDed: bit 7 of the combination is set only when bits 7, 6, 5 and 4 of that
# byte are all set.  (ORing them would flag any byte with a single one of those bits
# set, which makes ordinary ASCII text look non-BMP.)
function _all_bmp(val)
    top_four = val & (val << 1) & (val << 2) & (val << 3)
    (top_four & get_high_mask(val)) == 0
end
277274
278- @inline function _check_bmp_utf8_al (pnt, cnt, v)
# Scan `cnt` bytes of UTF-8 data at `beg` in UInt-sized chunks and return true when
# _all_bmp holds for every chunk.
@inline function _check_bmp_utf8_al(beg, cnt)
    cur = reinterpret(Ptr{UInt}, beg)
    stop = cur + cnt
    word = unsafe_load(cur)
    cur += CHUNKSZ
    while cur < stop
        _all_bmp(word) || return false
        word = unsafe_load(cur)
        cur += CHUNKSZ
    end
    # The last chunk may be partial, so it goes through _mask_bytes first
    _all_bmp(_mask_bytes(word, cnt))
end
286- @inline _check_bmp_utf8_al (pnt, cnt) = _check_bmp_utf8_al (pnt, cnt, unsafe_load (pnt))
287285
# BMP scan for UTF-8 data that need not start on a BigChunk boundary: the start address
# is rounded down and the data is walked in BigChunk-sized steps.
@inline function _check_bmp_utf8_ul(beg, cnt)
    addr = reinterpret(UInt, beg)
    cur = reinterpret(Ptr{BigChunk}, addr & ~BIGCHUNKMSK)
    chunk = unsafe_load(cur)
    skip = addr & BIGCHUNKMSK
    if skip != 0
        # presumably clears the `skip` bytes that precede the data -- TODO confirm
        # against _big_mask_bytes
        chunk &= ~_big_mask_bytes(skip)
        cnt += skip
    end
    stop = cur + cnt
    cur += BIGCHUNKSZ
    while cur < stop
        _all_bmp(chunk) || return false
        chunk = unsafe_load(cur)
        cur += BIGCHUNKSZ
    end
    # The last chunk may be partial, so it goes through _mask_bytes first
    _all_bmp(_mask_bytes(chunk, cnt))
end
298301
# is_bmp: an empty input is always BMP; otherwise the bytes are scanned while the owning
# object is kept alive with @preserve.

# UTF-8 string: scan from its own pointer with the UInt-chunk scanner
function is_bmp(str::Str{UTF8CSE})
    cnt = sizeof(str)
    cnt == 0 && return true
    @preserve str _check_bmp_utf8_al(pointer(str), cnt)
end

# Substring of a UTF-8 string: use the scanner that rounds the start address down
function is_bmp(str::SubString{<:Str{UTF8CSE}})
    cnt = sizeof(str)
    cnt == 0 && return true
    @preserve str _check_bmp_utf8_ul(pointer(str), cnt)
end

# Substring of a Text4CSE / UTF32CSE string: mask test with _bmp_mask_32
function is_bmp(str::SubString{<:Str{<:Union{Text4CSE,UTF32CSE}}})
    cnt = sizeof(str)
    cnt == 0 && return true
    @preserve str _check_mask_ul(pointer(str), cnt, _bmp_mask_32)
end

# Whole Text4CSE / UTF32CSE string: mask test with _bmp_mask_32
function is_bmp(str::Str{<:Union{Text4CSE,UTF32CSE}})
    cnt = sizeof(str)
    cnt == 0 && return true
    @preserve str _check_mask_al(pointer(str), cnt, _bmp_mask_32)
end
311313
# Any value of type MS_UTF8 is reported as Unicode without inspecting its contents
function is_unicode(str::MS_UTF8)
    true
end
313315
0 commit comments