Skip to content

Commit e671907

Browse files
Merge pull request #6771 from n8sh/issue-19405
Fix Issue 19405 - Speed up backwards UTF-8 decoding in stripRight & make nogc nothrow for strings
2 parents: 4c02055 + f22d322; commit: e671907

File tree

1 file changed

+41
-43
lines changed

1 file changed

+41
-43
lines changed

std/string.d

Lines changed: 41 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -3138,25 +3138,24 @@ if (isSomeString!Range ||
31383138
import std.uni : isWhite;
31393139
alias C = Unqual!(ElementEncodingType!(typeof(str)));
31403140

3141-
static if (isSomeString!(typeof(str)))
3141+
static if (isSomeString!(typeof(str)) && C.sizeof >= 2)
31423142
{
3143-
static if (C.sizeof >= 2)
3143+
// No whitespace takes multiple wchars to encode and due to
3144+
// the design of UTF-16 those wchars will not occur as part
3145+
// of the encoding of multi-wchar codepoints.
3146+
foreach_reverse (i, C c; str)
31443147
{
3145-
// No whitespace takes multiple wchars to encode and due to
3146-
// the design of UTF-16 those wchars will not occur as part
3147-
// of the encoding of multi-wchar codepoints.
3148-
foreach_reverse (i, C c; str)
3149-
{
3150-
if (!isWhite(c))
3151-
return str[0 .. i + 1];
3152-
}
3153-
return str[0 .. 0];
3148+
if (!isWhite(c))
3149+
return str[0 .. i + 1];
31543150
}
3155-
else
3151+
return str[0 .. 0];
3152+
}
3153+
else
3154+
{
3155+
// ASCII optimization for dynamic arrays.
3156+
static if (isDynamicArray!(typeof(str)))
31563157
{
31573158
static import std.ascii;
3158-
import std.utf : codeLength;
3159-
// ASCII optimization.
31603159
foreach_reverse (i, C c; str)
31613160
{
31623161
if (c >= 0x80)
@@ -3170,18 +3169,10 @@ if (isSomeString!Range ||
31703169
}
31713170
}
31723171
return str[0 .. 0];
3173-
3174-
NonAsciiPath:
3175-
foreach_reverse (i, dchar c; str)
3176-
{
3177-
if (!isWhite(c))
3178-
return str[0 .. i + codeLength!C(c)];
3179-
}
3180-
return str[0 .. 0];
31813172
}
3182-
}
3183-
else
3184-
{
3173+
3174+
NonAsciiPath:
3175+
31853176
size_t i = str.length;
31863177
while (i--)
31873178
{
@@ -3196,9 +3187,7 @@ if (isSomeString!Range ||
31963187
}
31973188
else static if (C.sizeof == 1)
31983189
{
3199-
import std.utf : byDchar;
3200-
3201-
char cx = str[i];
3190+
const cx = str[i];
32023191
if (cx <= 0x7F)
32033192
{
32043193
if (isWhite(cx))
@@ -3207,21 +3196,30 @@ if (isSomeString!Range ||
32073196
}
32083197
else
32093198
{
3210-
size_t stride = 0;
3211-
3212-
while (1)
3199+
if (i == 0 || (0b1100_0000 & cx) != 0b1000_0000)
3200+
break;
3201+
const uint d = 0b0011_1111 & cx;
3202+
const c2 = str[i - 1];
3203+
if ((c2 & 0b1110_0000) == 0b1100_0000) // 2 byte encoding.
32133204
{
3214-
++stride;
3215-
if (!i || (cx & 0xC0) == 0xC0 || stride == 4)
3216-
break;
3217-
cx = str[i - 1];
3218-
if (!(cx & 0x80))
3219-
break;
3220-
--i;
3205+
if (isWhite(d + (uint(c2 & 0b0001_1111) << 6)))
3206+
{
3207+
i--;
3208+
continue;
3209+
}
3210+
break;
32213211
}
3222-
3223-
if (!str[i .. i + stride].byDchar.front.isWhite)
3224-
return str[0 .. i + stride];
3212+
if (i == 1 || (c2 & 0b1100_0000) != 0b1000_0000)
3213+
break;
3214+
const c3 = str[i - 2];
3215+
// In UTF-8 all whitespace is encoded in 3 bytes or fewer.
3216+
if ((c3 & 0b1111_0000) == 0b1110_0000 &&
3217+
isWhite(d + (uint(c2 & 0b0011_1111) << 6) + (uint(c3 & 0b0000_1111) << 12)))
3218+
{
3219+
i -= 2;
3220+
continue;
3221+
}
3222+
break;
32253223
}
32263224
}
32273225
else
@@ -3233,7 +3231,7 @@ if (isSomeString!Range ||
32333231
}
32343232

32353233
///
3236-
@safe pure
3234+
nothrow @safe pure
32373235
unittest
32383236
{
32393237
import std.uni : lineSep, paraSep;
@@ -3255,7 +3253,7 @@ if (isConvertibleToString!Range)
32553253
return stripRight!(StringTypeOf!Range)(str);
32563254
}
32573255

3258-
@safe pure unittest
3256+
@nogc nothrow @safe pure unittest
32593257
{
32603258
assert(testAliasedString!stripRight("hello "));
32613259
}

0 commit comments

Comments
 (0)