@@ -10,34 +10,59 @@ Based in (small) part on code for UTF16String that used to be in Julia
# Surrogate masks for one UInt-sized chunk of UTF-16 data.
# The 32-bit literal is selected when CHUNKSZ == 4, the 64-bit literal otherwise.

# 0xdc00 (the first code unit of the UTF-16 trail surrogate range) in every 16-bit lane
const _trail_mask = CHUNKSZ == 4 ? 0xdc00_dc00 : 0xdc00_dc00_dc00_dc00
# Most significant bit (0x8000) of every 16-bit lane
const _hi_bit_16 = CHUNKSZ == 4 ? 0x8000_8000 : 0x8000_8000_8000_8000
1212
13- @inline _mask_surr (v) = xor ((v | v<< 1 | v<< 2 | v<< 3 | v<< 4 | v<< 5 ) & _hi_bit_16, _hi_bit_16)
14- @inline _get_masked (v:: UInt ) = _mask_surr (xor (v, _trail_mask))
15- @inline _get_masked (qpnt:: Ptr{UInt} ) = _get_masked (unsafe_load (qpnt))
16- @inline _get_lead (qpnt) = xor (_get_masked (qpnt), _hi_bit_16)
# BigChunk-wide counterparts of `_trail_mask` and `_hi_bit_16`.
# NOTE(review): `_widen_mask` is defined outside this section; presumably it
# replicates a UInt-sized lane mask across a BigChunk -- confirm.
const _big_trail_mask = _widen_mask(_trail_mask)
# Must be built from `_hi_bit_16`: initializing it from `_big_hi_bit_16` itself
# references a binding that does not exist yet when this line is evaluated.
const _big_hi_bit_16 = _widen_mask(_hi_bit_16)
1715
18- @inline function _align_len_utf16 (pnt, cnt, v)
19- len = 0
# Fold the top six bits of every lane onto the bit positions selected by `msk`
# (shifts 0 through 5 are OR-ed together), keep only the `msk` bits, and invert them.
# A bit of `msk` is therefore set in the result exactly when none of the six
# bits at or below that position (down to five places lower) is set in `v`.
@inline function _mask_surr(v, msk)
    folded = v | (v << 1) | (v << 2) | (v << 3) | (v << 4) | (v << 5)
    return xor(folded & msk, msk)
end
17+
# Return a chunk in which the high bit of a 16-bit lane is set when that lane of
# the input holds a trail surrogate (its top six bits match 0xdc00) and is clear
# otherwise; all other bits are zero.
# `_mask_surr` takes the lane mask as its second argument, so each method passes
# the high-bit mask that matches its chunk width.
@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask), _hi_bit_16)
@inline _get_masked(v::BigChunk) = _mask_surr(xor(v, _big_trail_mask), _big_hi_bit_16)
# Pointer form: load one chunk and dispatch on the loaded value's type.
@inline _get_masked(qpnt::Ptr) = _get_masked(unsafe_load(qpnt))
21+
# Load one chunk through `qpnt` and return the complement (within the per-lane
# high bits) of `_get_masked`: the high bit of a lane is set when `_get_masked`
# left it clear, and vice versa.
@inline function _get_lead(qpnt::Ptr{UInt})
    trail_bits = _get_masked(qpnt)
    return xor(trail_bits, _hi_bit_16)
end

# Same as above for a BigChunk-sized load.
@inline function _get_lead(qpnt::Ptr{BigChunk})
    trail_bits = _get_masked(qpnt)
    return xor(trail_bits, _big_hi_bit_16)
end
24+
# Count characters in the `cnt` UTF-16 code units starting at `beg` by summing
# the bits of the `_get_lead` mask of each chunk.
# NOTE(review): `_pntchunk`, `_pntbigchunk` and `_mask_bytes` are defined outside
# this section; the final `_mask_bytes(v, cnt)` calls assume a method that accepts
# a BigChunk value -- confirm.
@inline function _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
    # First check very frequent cases of short strings
    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
    # taking advantage of the knowledge of how String types are stored in Julia,
    # i.e. UInt length, immediately followed by the string data, aligned on sizeof(UInt)*2
    cnt <<= 1   # from here on `cnt` is a byte count
    if cnt <= BIGCHUNKSZ
        # The byte count is the second argument of `_mask_bytes`, not of `_get_lead`
        return cnt <= CHUNKSZ ?
            count_ones(_mask_bytes(_get_lead(_pntchunk(beg)), cnt)) :
            count_ones(_mask_bytes(_get_lead(_pntbigchunk(beg)), cnt))
    end
    # Handle the first UInt-sized chunk on its own, then continue in BigChunk steps
    len = count_ones(_get_lead(_pntchunk(beg)))
    cnt -= CHUNKSZ
    pnt = _pntbigchunk(beg + CHUNKSZ)
    v = _get_lead(pnt)
    cnt <= BIGCHUNKSZ && return len + count_ones(_mask_bytes(v, cnt))
    fin = pnt + cnt
    # `v` always holds the mask of the chunk *before* `pnt`; it is only added once
    # another full chunk is known to follow
    while (pnt += BIGCHUNKSZ) < fin
        len += count_ones(v)
        v = _get_lead(pnt)
    end
    # Last (possibly partial) chunk
    len + count_ones(_mask_bytes(v, cnt))
end
2748
28- _length_al (:: MultiCU , :: Type{UTF16CSE} , beg:: Ptr{UInt16} , cnt:: Int ) =
29- (pnt = reinterpret (Ptr{UInt}, beg); _align_len_utf16 (pnt, cnt<< 1 , _get_lead (pnt)))
30-
31- function _length (:: MultiCU , :: Type{UTF16CSE} , beg:: Ptr{UInt16} , cnt:: Int )
# Count characters in the `cnt` UTF-16 code units starting at `beg`, where `beg`
# need not be BigChunk-aligned: loads start at the enclosing aligned address and
# the bytes that precede `beg` in the first chunk are excluded from the count.
# NOTE(review): `_big_mask_bytes` and `_mask_bytes` are defined outside this
# section; the final `_mask_bytes(v, cnt)` assumes a method that accepts a
# BigChunk value -- confirm.
function _length_ul(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
    align = reinterpret(UInt, beg)
    pnt = reinterpret(Ptr{BigChunk}, align & ~BIGCHUNKMSK)
    cnt <<= 1   # from here on `cnt` is a byte count
    v = _get_lead(pnt)
    if (align &= BIGCHUNKMSK) != 0
        # `v` is already a lead mask (only per-lane high bits can be set), so the
        # lanes in front of `beg` are simply cleared.  OR-ing the 0xdc00 pattern
        # into them would add set bits that `count_ones` then counts.
        v &= ~_big_mask_bytes(align)
        # The skipped prefix bytes are part of the span walked from `pnt`
        cnt += align
    end
    len = 0
    fin = pnt + cnt
    # `v` always holds the mask of the chunk *before* `pnt`; it is only added once
    # another full chunk is known to follow
    while (pnt += BIGCHUNKSZ) < fin
        len += count_ones(v)
        v = _get_lead(pnt)
    end
    # Last (possibly partial) chunk
    len + count_ones(_mask_bytes(v, cnt))
end
4267
4368function _nextind (:: MultiCU , str:: MS_UTF16 , pos:: Int , nchar:: Int )
@@ -93,31 +118,41 @@ function is_bmp(str::MS_UTF16)
93118 end
94119end
95120
96- @inline function _check_bmp_utf16_al (pnt, cnt, v)
# Return true when none of the `cnt` bytes of UTF-16 data starting at `beg`
# produces a set bit in `_get_masked`, i.e. no trail surrogate is present.
# NOTE(review): `_pntchunk`, `_pntbigchunk` and `_mask_bytes` are defined outside
# this section; `_mask_bytes(v, cnt)` is assumed to accept a BigChunk value -- confirm.
@inline function _check_bmp_utf16_al(beg, cnt)
    # Short inputs: a single (masked) load decides the answer
    cnt <= CHUNKSZ && return _mask_bytes(_get_masked(_pntchunk(beg)), cnt) == 0
    cnt <= BIGCHUNKSZ && return _mask_bytes(_get_masked(_pntbigchunk(beg)), cnt) == 0
    # Check the first UInt-sized chunk on its own, then continue in BigChunk steps
    _get_masked(_pntchunk(beg)) == 0 || return false
    cnt -= CHUNKSZ
    # The remaining data starts CHUNKSZ bytes after `beg`; loading from `beg`
    # again here would re-check the first chunk and skip the trailing bytes
    pnt = _pntbigchunk(beg + CHUNKSZ)
    v = _get_masked(pnt)
    cnt <= BIGCHUNKSZ && return _mask_bytes(v, cnt) == 0
    fin = pnt + cnt
    # `v` always holds the mask of the chunk *before* `pnt`
    while (pnt += BIGCHUNKSZ) < fin
        v == 0 || return false
        v = _get_masked(pnt)
    end
    # Last (possibly partial) chunk
    _mask_bytes(v, cnt) == 0
end
105- @inline _check_bmp_utf16_al (pnt, cnt) = _check_bmp_utf16_al (pnt, cnt, unsafe_load (pnt))
106136
# Unaligned variant of the BMP check: loads start at the BigChunk-aligned address
# at or below `beg`, the bytes in front of `beg` are zeroed before the first chunk
# is examined, and `cnt` is extended by that offset.
# Returns true when no chunk yields a set bit from `_get_masked`.
@inline function _check_bmp_utf16_ul(beg, cnt)
    addr = reinterpret(UInt, beg)
    pnt = reinterpret(Ptr{BigChunk}, addr & ~BIGCHUNKMSK)
    raw = unsafe_load(pnt)
    offset = addr & BIGCHUNKMSK
    if offset != 0
        # Blank out the bytes that precede the start of the data
        raw &= ~_big_mask_bytes(offset)
        cnt += offset
    end
    v = _get_masked(raw)
    fin = pnt + cnt
    while true
        pnt += BIGCHUNKSZ
        pnt < fin || break
        # `v` belongs to the chunk before `pnt`; a full chunk must be all clear
        v == 0 || return false
        v = _get_masked(pnt)
    end
    # Last (possibly partial) chunk
    return _mask_bytes(v, cnt) == 0
end
117153
# An empty string is trivially BMP-only; otherwise run the aligned check over
# the string's bytes while keeping `str` rooted.
function is_bmp(str::Str{UTF16CSE})
    cnt = sizeof(str)
    cnt == 0 && return true
    return @preserve str _check_bmp_utf16_al(pointer(str), cnt)
end
121156
# Substring form: an empty substring is trivially BMP-only; otherwise use the
# unaligned check, since the data pointer is passed to `_check_bmp_utf16_ul`.
function is_bmp(str::SubString{<:Str{UTF16CSE}})
    cnt = sizeof(str)
    cnt == 0 && return true
    return @preserve str _check_bmp_utf16_ul(pointer(str), cnt)
end
0 commit comments