Skip to content

Commit 3b11f31

Browse files
authored
Improve non-greedy repeat support (#641)
1 parent 3efd1be commit 3b11f31

File tree

5 files changed

+198
-68
lines changed

5 files changed

+198
-68
lines changed

src/pcre2_jit_compile.c

Lines changed: 140 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -10589,49 +10589,43 @@ else
1058910589
tmp_offset = LOCAL2;
1059010590
}
1059110591

10592-
/* Handle fixed part first. */
10593-
if (opcode != OP_UPTO && opcode != OP_POSUPTO)
10592+
if (opcode == OP_EXACT)
1059410593
{
10595-
if (exact > 1)
10596-
{
10597-
SLJIT_ASSERT(early_fail_ptr == 0);
10594+
SLJIT_ASSERT(early_fail_ptr == 0 && exact >= 2);
1059810595

10599-
if (common->mode == PCRE2_JIT_COMPLETE
10596+
if (common->mode == PCRE2_JIT_COMPLETE
1060010597
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
10601-
&& !common->utf
10598+
&& !common->utf
1060210599
#endif
10603-
&& type != OP_ANYNL && type != OP_EXTUNI)
10604-
{
10605-
OP2(SLJIT_SUB, TMP1, 0, STR_END, 0, STR_PTR, 0);
10606-
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, IN_UCHARS(exact)));
10600+
&& type != OP_ANYNL && type != OP_EXTUNI)
10601+
{
10602+
OP2(SLJIT_SUB, TMP1, 0, STR_END, 0, STR_PTR, 0);
10603+
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, IN_UCHARS(exact)));
1060710604

1060810605
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32
10609-
if (type == OP_ALLANY && !common->invalid_utf)
10606+
if (type == OP_ALLANY && !common->invalid_utf)
1061010607
#else
10611-
if (type == OP_ALLANY)
10608+
if (type == OP_ALLANY)
1061210609
#endif
10613-
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(exact));
10614-
else
10615-
{
10616-
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
10617-
label = LABEL();
10618-
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, FALSE);
10619-
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
10620-
JUMPTO(SLJIT_NOT_ZERO, label);
10621-
}
10622-
}
10610+
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(exact));
1062310611
else
1062410612
{
10625-
SLJIT_ASSERT(tmp_base == TMP3 || common->locals_size >= 3 * SSIZE_OF(sw));
1062610613
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
1062710614
label = LABEL();
10628-
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, TRUE);
10615+
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, FALSE);
1062910616
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
1063010617
JUMPTO(SLJIT_NOT_ZERO, label);
1063110618
}
1063210619
}
10633-
else if (exact == 1 && opcode != OP_STAR && opcode != OP_MINSTAR && opcode != OP_POSSTAR)
10620+
else
10621+
{
10622+
SLJIT_ASSERT(tmp_base == TMP3 || common->locals_size >= 3 * SSIZE_OF(sw));
10623+
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
10624+
label = LABEL();
1063410625
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, TRUE);
10626+
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
10627+
JUMPTO(SLJIT_NOT_ZERO, label);
10628+
}
1063510629
}
1063610630

1063710631
if (early_fail_type == type_fail_range)
@@ -10649,8 +10643,8 @@ if (early_fail_type == type_fail_range)
1064910643

1065010644
switch(opcode)
1065110645
{
10652-
case OP_UPTO:
1065310646
case OP_STAR:
10647+
case OP_UPTO:
1065410648
SLJIT_ASSERT(early_fail_ptr == 0 || opcode == OP_STAR);
1065510649
max += exact;
1065610650

@@ -11006,21 +11000,55 @@ switch(opcode)
1100611000
BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL();
1100711001
break;
1100811002

11003+
case OP_QUERY:
11004+
SLJIT_ASSERT(early_fail_ptr == 0);
11005+
if (private_data_ptr == 0)
11006+
allocate_stack(common, 1);
11007+
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
11008+
compile_char1_matchingpath(common, type, cc, &BACKTRACK_AS(char_iterator_backtrack)->u.backtracks, TRUE);
11009+
BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL();
11010+
break;
11011+
1100911012
case OP_MINSTAR:
11013+
case OP_MINQUERY:
11014+
SLJIT_ASSERT(opcode == OP_MINSTAR || early_fail_ptr == 0);
1101011015
if (private_data_ptr == 0)
1101111016
allocate_stack(common, 1);
1101211017

11013-
if (exact == 1)
11018+
if (exact >= 1)
1101411019
{
11015-
BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL();
11020+
if (exact >= 2)
11021+
{
11022+
/* Extuni has a separate exact opcode. */
11023+
SLJIT_ASSERT(tmp_base == TMP3 && early_fail_ptr == 0);
11024+
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, exact);
11025+
}
11026+
11027+
if (opcode == OP_MINQUERY)
11028+
OP1(SLJIT_MOV, base, offset0, SLJIT_IMM, -1);
11029+
11030+
label = LABEL();
11031+
BACKTRACK_AS(char_iterator_backtrack)->matchingpath = label;
11032+
1101611033
compile_char1_matchingpath(common, type, cc, &BACKTRACK_AS(char_iterator_backtrack)->u.backtracks, TRUE);
11017-
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
11034+
11035+
if (exact >= 2)
11036+
{
11037+
OP2(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
11038+
JUMPTO(SLJIT_NOT_ZERO, label);
11039+
}
11040+
11041+
if (opcode == OP_MINQUERY)
11042+
OP2(SLJIT_AND, base, offset0, base, offset0, STR_PTR, 0);
11043+
else
11044+
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1101811045
}
1101911046
else
1102011047
{
1102111048
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1102211049
BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL();
1102311050
}
11051+
1102411052
if (early_fail_ptr != 0)
1102511053
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0);
1102611054
break;
@@ -11029,20 +11057,35 @@ switch(opcode)
1102911057
SLJIT_ASSERT(early_fail_ptr == 0);
1103011058
if (private_data_ptr == 0)
1103111059
allocate_stack(common, 2);
11032-
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
11060+
1103311061
OP1(SLJIT_MOV, base, offset1, SLJIT_IMM, max + 1);
11034-
BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL();
11035-
break;
1103611062

11037-
case OP_QUERY:
11038-
case OP_MINQUERY:
11039-
SLJIT_ASSERT(early_fail_ptr == 0);
11040-
if (private_data_ptr == 0)
11041-
allocate_stack(common, 1);
11063+
if (exact == 0)
11064+
{
11065+
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
11066+
BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL();
11067+
break;
11068+
}
11069+
11070+
if (exact >= 2)
11071+
{
11072+
/* Extuni has a separate exact opcode. */
11073+
SLJIT_ASSERT(tmp_base == TMP3);
11074+
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, exact);
11075+
}
11076+
11077+
label = LABEL();
11078+
BACKTRACK_AS(char_iterator_backtrack)->matchingpath = label;
11079+
11080+
compile_char1_matchingpath(common, type, cc, &BACKTRACK_AS(char_iterator_backtrack)->u.backtracks, TRUE);
11081+
11082+
if (exact >= 2)
11083+
{
11084+
OP2(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
11085+
JUMPTO(SLJIT_NOT_ZERO, label);
11086+
}
11087+
1104211088
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
11043-
if (opcode == OP_QUERY)
11044-
compile_char1_matchingpath(common, type, cc, &BACKTRACK_AS(char_iterator_backtrack)->u.backtracks, TRUE);
11045-
BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL();
1104611089
break;
1104711090

1104811091
case OP_EXACT:
@@ -11820,15 +11863,32 @@ switch(opcode)
1182011863
}
1182111864
break;
1182211865

11866+
case OP_QUERY:
11867+
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
11868+
OP1(SLJIT_MOV, base, offset0, SLJIT_IMM, 0);
11869+
CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11870+
jump = JUMP(SLJIT_JUMP);
11871+
set_jumps(CURRENT_AS(char_iterator_backtrack)->u.backtracks, LABEL());
11872+
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
11873+
OP1(SLJIT_MOV, base, offset0, SLJIT_IMM, 0);
11874+
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11875+
JUMPHERE(jump);
11876+
if (private_data_ptr == 0)
11877+
free_stack(common, 1);
11878+
break;
11879+
1182311880
case OP_MINSTAR:
1182411881
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
11825-
if (exact != 1)
11882+
if (exact == 0)
1182611883
{
1182711884
compile_char1_matchingpath(common, type, cc, &jumplist, TRUE);
1182811885
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1182911886
}
11887+
else if (exact > 1)
11888+
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 1);
11889+
1183011890
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11831-
set_jumps(exact == 1 ? CURRENT_AS(char_iterator_backtrack)->u.backtracks : jumplist, LABEL());
11891+
set_jumps(exact > 0 ? CURRENT_AS(char_iterator_backtrack)->u.backtracks : jumplist, LABEL());
1183211892
if (private_data_ptr == 0)
1183311893
free_stack(common, 1);
1183411894
break;
@@ -11837,40 +11897,52 @@ switch(opcode)
1183711897
OP1(SLJIT_MOV, TMP1, 0, base, offset1);
1183811898
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
1183911899
OP2(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
11840-
add_jump(compiler, &jumplist, JUMP(SLJIT_ZERO));
1184111900

11842-
OP1(SLJIT_MOV, base, offset1, TMP1, 0);
11843-
compile_char1_matchingpath(common, type, cc, &jumplist, TRUE);
11844-
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
11845-
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11901+
if (exact == 0)
11902+
{
11903+
add_jump(compiler, &jumplist, JUMP(SLJIT_ZERO));
1184611904

11847-
set_jumps(jumplist, LABEL());
11848-
if (private_data_ptr == 0)
11849-
free_stack(common, 2);
11850-
break;
11905+
OP1(SLJIT_MOV, base, offset1, TMP1, 0);
11906+
compile_char1_matchingpath(common, type, cc, &jumplist, TRUE);
11907+
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
11908+
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11909+
11910+
set_jumps(jumplist, LABEL());
11911+
}
11912+
else
11913+
{
11914+
if (exact > 1)
11915+
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 1);
11916+
OP1(SLJIT_MOV, base, offset1, TMP1, 0);
11917+
JUMPTO(SLJIT_NOT_ZERO, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11918+
11919+
set_jumps(CURRENT_AS(char_iterator_backtrack)->u.backtracks, LABEL());
11920+
}
1185111921

11852-
case OP_QUERY:
11853-
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
11854-
OP1(SLJIT_MOV, base, offset0, SLJIT_IMM, 0);
11855-
CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11856-
jump = JUMP(SLJIT_JUMP);
11857-
set_jumps(CURRENT_AS(char_iterator_backtrack)->u.backtracks, LABEL());
11858-
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
11859-
OP1(SLJIT_MOV, base, offset0, SLJIT_IMM, 0);
11860-
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11861-
JUMPHERE(jump);
1186211922
if (private_data_ptr == 0)
11863-
free_stack(common, 1);
11923+
free_stack(common, 2);
1186411924
break;
1186511925

1186611926
case OP_MINQUERY:
1186711927
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
1186811928
OP1(SLJIT_MOV, base, offset0, SLJIT_IMM, 0);
11869-
jump = CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0);
11870-
compile_char1_matchingpath(common, type, cc, &jumplist, TRUE);
11871-
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11872-
set_jumps(jumplist, LABEL());
11873-
JUMPHERE(jump);
11929+
11930+
if (exact >= 1)
11931+
{
11932+
if (exact >= 2)
11933+
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 1);
11934+
CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11935+
set_jumps(CURRENT_AS(char_iterator_backtrack)->u.backtracks, LABEL());
11936+
}
11937+
else
11938+
{
11939+
jump = CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0);
11940+
compile_char1_matchingpath(common, type, cc, &jumplist, TRUE);
11941+
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11942+
set_jumps(jumplist, LABEL());
11943+
JUMPHERE(jump);
11944+
}
11945+
1187411946
if (private_data_ptr == 0)
1187511947
free_stack(common, 1);
1187611948
break;

testdata/testinput1

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7056,10 +7056,21 @@ $/x
70567056
abcdefghbijb
70577057
abcdefghbij
70587058
abcdeb
7059+
abcdefghijx
70597060
\= Expect no match
70607061
abcdb
70617062
abcdefghijk
70627063

7064+
/[a-z]{1,6}?s|x/
7065+
asbs
7066+
abcdefs
7067+
abcdefghijkss
7068+
abcdefghijkx
7069+
ss
7070+
\= Expect no match
7071+
s
7072+
aaa
7073+
70637074
# --------------
70647075

70657076
# End of testinput1

testdata/testinput4

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1869,6 +1869,15 @@
18691869
ab\x{600}\x{700}ijklh
18701870
ab\x{600}h\x{700}ijklmh
18711871

1872+
/([a-z\x{1000}\x{2000}]{1,2}?u)+$/utf
1873+
\x{1000}uu\x{2000}u
1874+
\x{1001}uuuu
1875+
\x{2001}uuuuu
1876+
uuuu\x{1fff}#u#\x{2000}\x{1000}u\x{2000}u
1877+
\= Expect no match
1878+
abuabuabuabu!
1879+
uuuuuuuuuuuu#
1880+
18721881
# --------------------------------------
18731882

18741883
/(ΣΆΜΟΣ) \1/i,utf

testdata/testoutput1

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11095,12 +11095,31 @@ No match
1109511095
0: abcdefghb
1109611096
abcdeb
1109711097
0: abcdeb
11098+
abcdefghijx
11099+
0: x
1109811100
\= Expect no match
1109911101
abcdb
1110011102
No match
1110111103
abcdefghijk
1110211104
No match
1110311105

11106+
/[a-z]{1,6}?s|x/
11107+
asbs
11108+
0: as
11109+
abcdefs
11110+
0: abcdefs
11111+
abcdefghijkss
11112+
0: fghijks
11113+
abcdefghijkx
11114+
0: x
11115+
ss
11116+
0: ss
11117+
\= Expect no match
11118+
s
11119+
No match
11120+
aaa
11121+
No match
11122+
1110411123
# --------------
1110511124

1110611125
# End of testinput1

testdata/testoutput4

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3055,6 +3055,25 @@ No match
30553055
ab\x{600}h\x{700}ijklmh
30563056
No match
30573057

3058+
/([a-z\x{1000}\x{2000}]{1,2}?u)+$/utf
3059+
\x{1000}uu\x{2000}u
3060+
0: \x{1000}uu\x{2000}u
3061+
1: u\x{2000}u
3062+
\x{1001}uuuu
3063+
0: uuuu
3064+
1: uu
3065+
\x{2001}uuuuu
3066+
0: uuuuu
3067+
1: uuu
3068+
uuuu\x{1fff}#u#\x{2000}\x{1000}u\x{2000}u
3069+
0: \x{2000}\x{1000}u\x{2000}u
3070+
1: \x{2000}u
3071+
\= Expect no match
3072+
abuabuabuabu!
3073+
No match
3074+
uuuuuuuuuuuu#
3075+
No match
3076+
30583077
# --------------------------------------
30593078

30603079
/(ΣΆΜΟΣ) \1/i,utf

0 commit comments

Comments
 (0)