@@ -5,31 +5,16 @@ Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones
55Licensed under MIT License, see LICENSE.md
66=#
77
8- _wide_lower_l (c) = ifelse (c > (V6_COMPAT ? 0xdf : 0xde ), c != 0xf7 , c == 0xb5 )
9-
10- @inline _wide_lower_ch (ch) =
11- ch <= 0x7f ? _islower_a (ch) : (ch > 0xff ? _islower_u (ch) : _wide_lower_l (ch))
12-
13- @inline _isupper_ch (ch) =
14- ch <= 0x7f ? _isupper_a (ch) : (ch > 0xff ? _isupper_u (ch) : _isupper_l (ch))
15-
16- _wide_lower_latin (ch) = (ch == 0xb5 ) | (ch == 0xff ) | (! V6_COMPAT && (ch == 0xdf ))
17-
18- _wide_out_upper (ch) =
19- ifelse (ch == 0xb5 , 0x39c ,
20- ifelse (ch == 0xff , 0x178 , ifelse (! V6_COMPAT && ch == 0xdf , 0x1e9e , ch% UInt16)))
21-
22-
238function uppercase_first (str:: MaybeSub{S} ) where {C<: ASCIICSE ,S<: Str{C} }  # ASCII strings: return str with its first code unit uppercased
249 (len = ncodeunits (str)) == 0 && return str  # empty input is returned as-is
2510 @preserve str begin
2611 pnt = pointer (str)
2712 ch = get_codeunit (pnt)
2813 _islower_a (ch) || return str  # first code unit fails _islower_a: return the original string, no copy made
29- out = _allocate (len)
14+ buf, out = _allocate (UInt8, len)  # two results: buf is what Str wraps at the end, out is the pointer written through below
3015 unsafe_copyto! (out, pnt, len)  # copy all len code units, then patch the first one
3116 set_codeunit! (out, ch - 0x20 )  # 0x20 is the distance between an ASCII lowercase letter and its uppercase form
32- Str (C, out )
17+ Str (C, buf )
3318 end
3419end
3520
@@ -39,10 +24,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
3924 pnt = pointer (str)
4025 ch = get_codeunit (pnt)
4126 _isupper_a (ch) || return str
42- out = _allocate (len)
27+ buf, out = _allocate (UInt8, len)
4328 unsafe_copyto! (out, pnt, len)
4429 set_codeunit! (out, ch + 0x20 )
45- Str (C, out )
30+ Str (C, buf )
4631 end
4732end
4833
@@ -119,7 +104,7 @@ function uppercase_first(str::MaybeSub{S}) where {C<:LatinCSE,S<:Str{C}}
119104 _can_upper (ch) || return str
120105 buf, out = _allocate (UInt8, len)
121106 set_codeunit! (out, ch - 0x20 )
122- len > 1 && unsafe_copyto! (out, pnt+ 1 , len- 1 )
107+ len > 1 && unsafe_copyto! (out + 1 , pnt+ 1 , len- 1 )
123108 Str (C, buf)
124109 end
125110end
@@ -130,19 +115,16 @@ function uppercase_first(str::MaybeSub{S}) where {C<:_LatinCSE,S<:Str{C}}
130115 @preserve str begin
131116 pnt = pointer (str)
132117 ch = get_codeunit (pnt)
133- if _can_upper (ch)
134- buf, out8 = _allocate (UInt8, len)
135- set_codeunit! (out8, ch - 0x20 )
136- len > 1 && unsafe_copyto! (out8, pnt+ 1 , len- 1 )
137- Str (C, buf)
138- elseif _wide_lower_latin (ch)
118+ if _wide_lower_latin (ch)
139119 buf, out = _allocate (UInt16, len)
120+ _widen! (out, pnt, pnt + len)
140121 set_codeunit! (out, _wide_out_upper (ch))
141- # Perform the widen operation on the rest (should be done via SIMD)
142- @inbounds for i = 2 : len
143- set_codeunit! (out += 2 , get_codeunit (pnt += 2 )% UInt16)
144- end
145122 Str (_UCS2CSE, buf)
123+ elseif _can_upper (ch)
124+ buf8, out8 = _allocate (UInt8, len)
125+ len > 1 && unsafe_copyto! (out8, pnt, len)
126+ set_codeunit! (out8, ch - 0x20 )
127+ Str (_LatinCSE, buf8)
146128 else
147129 str
148130 end
@@ -154,10 +136,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
154136 @preserve str begin
155137 pnt = pointer (str)
156138 ch = get_codeunit (pnt)
157- _isupper (ch) || return str
139+ _isupper_al (ch) || return str
158140 buf, out = _allocate (UInt8, len)
159141 set_codeunit! (out, ch + 0x20 )
160- len > 1 && unsafe_copyto! (out, pnt+ 1 , len- 1 )
142+ len > 1 && unsafe_copyto! (out+ 1 , pnt+ 1 , len- 1 )
161143 Str (C, buf)
162144 end
163145end
@@ -261,14 +243,17 @@ function lowercase(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
261243 str
262244end
263245
246+ _is_latin_ucs2 (len, pnt) = _check_mask_ul (pnt, len, _latin_mask (UInt16))  # note argument order (len, pnt); the caller in _lower passes lenw = len*sizeof(CU), i.e. a length in bytes
247+
264248# result must have at least one character > 0xff, so if the only character(s)
265249# > 0xff became <= 0xff, then the result may need to be narrowed and returned as _LatinStr
266250
267251function _lower (:: Type{C} , beg, off, len) where {C<: _UCS2CSE }
268252 CU = codeunit (C)
269253 buf, out = _allocate (CU, len)
270254 unsafe_copyto! (out, beg, len)
271- fin = out + (len* sizeof (CU))
255+ lenw = len* sizeof (CU)
256+ fin = out + lenw
272257 out += off
273258 flg = false
274259 while out < fin
@@ -277,18 +262,19 @@ function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
277262 _isupper_a (ch) && set_codeunit! (out, ch += 0x20 )
278263 elseif ch <= 0xff
279264 _isupper_l (ch) && set_codeunit! (out, ch += 0x20 )
280- elseif _isupper_u (ch)
281- ch = _lowercase_u (ch)
282- flg = ch <= 0xff
283- set_codeunit! (out, ch)
265+ elseif ch <= 0xffff
266+ if _can_lower_bmp (ch)
267+ ch = _lower_bmp (ch)
268+ flg = ch <= 0xff
269+ set_codeunit! (out, ch)
270+ end
284271 end
285272 out += sizeof (CU)
286273 end
287- if flg && is_latin (buf)
288- out = pointer (buf)
289- buf = _allocate (len)
290- _narrow! (pointer (buf), out, out + len)
291- Str (_LatinCSE, buf)
274+ if flg && (src = reinterpret (Ptr{UInt16}, pointer (buf)); _is_latin_ucs2 (lenw, src))
275+ buf8 = _allocate (len)
276+ _narrow! (pointer (buf8), src, src + lenw)
277+ Str (_LatinCSE, buf8)
292278 else
293279 Str (C, buf)
294280 end
@@ -302,25 +288,75 @@ function _lower(::Type{C}, beg, off, len) where {C<:Union{UCS2CSE,UTF32_CSEs}}
302288 out += off
303289 while out < fin
304290 ch = get_codeunit (out)
305- if ch <= 0x7f
306- _isupper_a (ch) && set_codeunit! (out, ch += 0x20 )
307- elseif ch <= 0xff
308- _isupper_l (ch) && set_codeunit! (out, ch += 0x20 )
309- elseif _isupper_u (ch)
310- set_codeunit! (out, _lowercase_u (ch))
291+ if ch <= 0xff
292+ _isupper_al (ch) && set_codeunit! (out, ch += 0x20 )
293+ elseif ch <= 0xffff
294+ _can_lower_bmp (ch) && set_codeunit! (out, _lower_bmp (ch) )
295+ elseif ch <= 0x1ffff
296+ _can_lower_slp (ch) && set_codeunit! (out, _lower_slp (ch))
311297 end
312298 out += sizeof (CU)
313299 end
314300 Str (C, buf)
315301end
316302
303+ function lowercase_first (str:: MaybeSub{S} ) where {C<: _UCS2CSE ,S<: Str{C} }  # Lowercase the first code unit; returns a _LatinCSE string when the result passes the Latin mask check, else a C string
304+ (len = ncodeunits (str)) == 0 && return str  # empty input is returned as-is
305+ @preserve str begin
306+ pnt = pointer (str)
307+ ch = get_codeunit (pnt)
308+ (ch <= 0xff ? _isupper_al (ch) : ch <= 0xffff ? _can_lower_bmp (ch) :  # pick the predicate by code point range
309+ ch <= 0x1ffff && _can_lower_slp (ch)) ||  # predicate fails: nothing to change
310+ return str
311+ cl = _lower_ch (ch)
312+ if ch > 0xff && cl <= 0xff && (len == 1 || _check_mask_ul (pnt + sizeof (UInt16), (len - 1 ) * sizeof (UInt16), _latin_mask (UInt16)))  # skip the first code unit; pointer offset and length are in bytes, as in _is_latin_ucs2's caller
313+ buf8, out8 = _allocate (UInt8, len)
314+ len > 1 && _narrow! (out8, pnt, pnt + len * sizeof (UInt16))  # end pointer is a byte offset, so scale by the code unit size to cover all len code units
315+ set_codeunit! (out8, cl)  # overwrite the first (narrowed) code unit with its lowercase form
316+ Str (_LatinCSE, buf8)
317+ else
318+ buf, out = _allocate (codeunit (C), len)
319+ len > 1 && unsafe_copyto! (out, pnt, len)  # typed-pointer copy: len counts code units here, not bytes
320+ set_codeunit! (out, cl)
321+ Str (C, buf)
322+ end
323+ end
324+ end
325+
326+ function uppercase_first (str:: MaybeSub{S} ) where {C<: Union{UCS2_CSEs,UTF32_CSEs} ,S<: Str{C} }  # UCS2/UTF32 strings: replace the first code unit with _title_ch of it
327+ (len = ncodeunits (str)) == 0 && return str  # empty input is returned as-is
328+ @preserve str begin
329+ pnt = pointer (str)
330+ ch = get_codeunit (pnt)
331+ cp = _title_ch (ch)
332+ ch == cp && return str  # mapping leaves the first code unit unchanged: return the input itself, no copy made
333+ buf, out = _allocate (codeunit (C), len)
334+ len > 1 && unsafe_copyto! (out, pnt, len)  # for len == 1 the copy is skipped; the only code unit is written on the next line
335+ set_codeunit! (out, cp)  # overwrite the first code unit with the mapped value
336+ Str (C, buf)  # result keeps the input's encoding C
337+ end
338+ end
339+
340+ function lowercase_first (str:: MaybeSub{S} ) where {C<: Union{UCS2CSE,UTF32_CSEs} ,S<: Str{C} }  # UCS2/UTF32 strings: replace the first code unit with _lower_ch of it
341+ (len = ncodeunits (str)) == 0 && return str  # empty input is returned as-is
342+ @preserve str begin
343+ pnt = pointer (str)
344+ ch = get_codeunit (pnt)
345+ _can_lower_ch (ch) || return str  # predicate fails for the first code unit: return the input itself, no copy made
346+ buf, out = _allocate (codeunit (C), len)
347+ len > 1 && unsafe_copyto! (out, pnt, len)  # for len == 1 the copy is skipped; the only code unit is written on the next line
348+ set_codeunit! (out, _lower_ch (ch))  # overwrite the first code unit with the mapped value
349+ Str (C, buf)  # result keeps the input's encoding C (no narrowing in this method)
350+ end
351+ end
352+
317353function lowercase (str:: MaybeSub{S} ) where {C<: Union{UCS2_CSEs,UTF32_CSEs} ,S<: Str{C} }
318354 @preserve str begin
319355 CU = codeunit (C)
320356 pnt = beg = pointer (str)
321357 fin = beg + sizeof (str)
322358 while pnt < fin
323- _isupper_ch (get_codeunit (pnt)) && return _lower (C, beg, pnt- beg, ncodeunits (str))
359+ _can_lower_ch (get_codeunit (pnt)) && return _lower (C, beg, pnt- beg, ncodeunits (str))
324360 pnt += sizeof (CU)
325361 end
326362 end
@@ -337,16 +373,12 @@ function _upper(::Type{C}, beg, off, len) where {C<:Union{UCS2_CSEs,UTF32_CSEs}}
337373 ch = get_codeunit (out)
338374 if ch <= 0x7f
339375 _islower_a (ch) && set_codeunit! (out, ch -= 0x20 )
340- elseif ch > 0xff
341- _islower_u (ch) && set_codeunit! (out, _uppercase_u (ch))
342- elseif _can_upper (ch)
343- set_codeunit! (out, ch -= 0x20 )
344- elseif ch == 0xb5
345- set_codeunit! (out, 0x39c )
346- elseif ch == 0xff
347- set_codeunit! (out, 0x178 )
348- elseif ! V6_COMPAT && ch == 0xdf
349- set_codeunit! (out, 0x1e9e )
376+ elseif ch <= 0xff
377+ set_codeunit! (out, _uppercase_l (ch))
378+ elseif ch <= 0xffff
379+ _can_upper_bmp (ch) && set_codeunit! (out, _upper_bmp (ch))
380+ elseif ch <= 0x1ffff
381+ _can_upper_slp (ch) && set_codeunit! (out, _upper_slp (ch))
350382 end
351383 out += sizeof (CU)
352384 end
@@ -359,7 +391,7 @@ function uppercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:St
359391 pnt = beg = pointer (str)
360392 fin = beg + sizeof (str)
361393 while pnt < fin
362- _wide_lower_ch (get_codeunit (pnt)) && return _upper (C, beg, pnt- beg, ncodeunits (str))
394+ _can_upper_ch (get_codeunit (pnt)) && return _upper (C, beg, pnt- beg, ncodeunits (str))
363395 pnt += sizeof (CU)
364396 end
365397 str
0 commit comments