Skip to content

Commit d0559a0

Browse files
committed
fixes #47 change RE2 option to match the same characters for \s \w and \d as RE2
1 parent a2a8dda commit d0559a0

File tree

5 files changed

+104
-10
lines changed

5 files changed

+104
-10
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ The default behavior of `regexp2` is to match the .NET regexp engine, however th
8080
* add support for named ascii character classes (e.g. `[[:foo:]]`)
8181
* add support for python-style capture groups (e.g. `(?P<name>re)`)
8282
* change singleline behavior for `$` to only match end of string (like RE2) (see [#24](https://github.com/dlclark/regexp2/issues/24))
83+
* change the character classes `\d` `\s` and `\w` to match the same characters as RE2. NOTE: if you also use the `ECMAScript` option then this will change the `\s` character class to match ECMAScript instead of RE2. ECMAScript allows more whitespace characters in `\s` than RE2 (but still fewer than the default behavior).
8384

8485
```go
8586
re := regexp2.MustCompile(`Your RE2-compatible pattern`, regexp2.RE2)

go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module github.com/dlclark/regexp2
2+
3+
go 1.13

regexp_re2_test.go

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package regexp2
22

3-
import "testing"
3+
import (
4+
"testing"
5+
)
46

57
func TestRE2CompatCapture(t *testing.T) {
68
r := MustCompile(`re(?P<a>2)`, RE2)
@@ -119,3 +121,76 @@ func TestRE2Dollar_Multiline(t *testing.T) {
119121
t.Fatal("Expected match")
120122
}
121123
}
124+
125+
func TestRE2ExtendedZero(t *testing.T) {
126+
notZero := "߀" // \u07c0
127+
r := MustCompile(`^\d$`, RE2)
128+
if m, _ := r.MatchString(notZero); m {
129+
t.Fatal("Expected no match")
130+
}
131+
132+
r = MustCompile(`^\D$`, RE2)
133+
if m, _ := r.MatchString(notZero); !m {
134+
t.Fatal("Expected match")
135+
}
136+
}
137+
138+
func TestRegularExtendedZero(t *testing.T) {
139+
notZero := "߀" // \u07c0
140+
141+
r := MustCompile(`^\d$`, 0)
142+
if m, _ := r.MatchString(notZero); !m {
143+
t.Fatal("Expected match")
144+
}
145+
146+
r = MustCompile(`^\D$`, 0)
147+
if m, _ := r.MatchString(notZero); m {
148+
t.Fatal("Expected no match")
149+
}
150+
}
151+
152+
func TestRE2Word(t *testing.T) {
153+
r := MustCompile(`\w`, RE2)
154+
if m, _ := r.MatchString("å"); m {
155+
t.Fatal("Expected no match")
156+
}
157+
158+
r = MustCompile(`\W`, RE2)
159+
if m, _ := r.MatchString("å"); !m {
160+
t.Fatal("Expected match")
161+
}
162+
163+
}
164+
165+
func TestRegularWord(t *testing.T) {
166+
r := MustCompile(`\w`, 0)
167+
if m, _ := r.MatchString("å"); !m {
168+
t.Fatal("Expected match")
169+
}
170+
r = MustCompile(`\W`, 0)
171+
if m, _ := r.MatchString("å"); m {
172+
t.Fatal("Expected no match")
173+
}
174+
}
175+
176+
func TestRE2Space(t *testing.T) {
177+
r := MustCompile(`\s`, RE2)
178+
if m, _ := r.MatchString("\x0b"); m {
179+
t.Fatal("Expected no match")
180+
}
181+
r = MustCompile(`\S`, RE2)
182+
if m, _ := r.MatchString("\x0b"); !m {
183+
t.Fatal("Expected match")
184+
}
185+
}
186+
187+
func TestRegularSpace(t *testing.T) {
188+
r := MustCompile(`\s`, 0)
189+
if m, _ := r.MatchString("\x0b"); !m {
190+
t.Fatal("Expected match")
191+
}
192+
r = MustCompile(`\S`, 0)
193+
if m, _ := r.MatchString("\x0b"); m {
194+
t.Fatal("Expected no match")
195+
}
196+
}

syntax/charclass.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ var (
3737
ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
3838
ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
3939
ecmaDigit = []rune{0x0030, 0x003a}
40+
41+
re2Space = []rune{0x0009, 0x000b, 0x000c, 0x000e, 0x0020, 0x0021}
4042
)
4143

4244
var (
@@ -56,6 +58,9 @@ var (
5658
NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
5759
DigitClass = getCharSetFromCategoryString(false, false, "Nd")
5860
NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
61+
62+
RE2SpaceClass = getCharSetFromOldString(re2Space, false)
63+
NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)
5964
)
6065

6166
var unicodeCategories = func() map[string]*unicode.RangeTable {
@@ -401,13 +406,19 @@ func (c *CharSet) addChar(ch rune) {
401406
c.addRange(ch, ch)
402407
}
403408

404-
func (c *CharSet) addSpace(ecma, negate bool) {
409+
func (c *CharSet) addSpace(ecma, re2, negate bool) {
405410
if ecma {
406411
if negate {
407412
c.addRanges(NotECMASpaceClass().ranges)
408413
} else {
409414
c.addRanges(ECMASpaceClass().ranges)
410415
}
416+
} else if re2 {
417+
if negate {
418+
c.addRanges(NotRE2SpaceClass().ranges)
419+
} else {
420+
c.addRanges(RE2SpaceClass().ranges)
421+
}
411422
} else {
412423
c.addCategories(category{cat: spaceCategoryText, negate: negate})
413424
}
@@ -563,7 +574,7 @@ func (c *CharSet) addNamedASCII(name string, negate bool) bool {
563574
case "punct": //[!-/:-@[-`{-~]
564575
rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
565576
case "space":
566-
c.addSpace(true, negate)
577+
c.addSpace(true, false, negate)
567578
case "upper":
568579
rs = []singleRange{singleRange{'A', 'Z'}}
569580
case "word":

syntax/parser.go

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1121,14 +1121,14 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
11211121

11221122
case 'w':
11231123
p.moveRight(1)
1124-
if p.useOptionE() {
1124+
if p.useOptionE() || p.useRE2() {
11251125
return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
11261126
}
11271127
return newRegexNodeSet(ntSet, p.options, WordClass()), nil
11281128

11291129
case 'W':
11301130
p.moveRight(1)
1131-
if p.useOptionE() {
1131+
if p.useOptionE() || p.useRE2() {
11321132
return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
11331133
}
11341134
return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
@@ -1137,26 +1137,30 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
11371137
p.moveRight(1)
11381138
if p.useOptionE() {
11391139
return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1140+
} else if p.useRE2() {
1141+
return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
11401142
}
11411143
return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
11421144

11431145
case 'S':
11441146
p.moveRight(1)
11451147
if p.useOptionE() {
11461148
return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1149+
} else if p.useRE2() {
1150+
return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
11471151
}
11481152
return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
11491153

11501154
case 'd':
11511155
p.moveRight(1)
1152-
if p.useOptionE() {
1156+
if p.useOptionE() || p.useRE2() {
11531157
return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
11541158
}
11551159
return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
11561160

11571161
case 'D':
11581162
p.moveRight(1)
1159-
if p.useOptionE() {
1163+
if p.useOptionE() || p.useRE2() {
11601164
return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
11611165
}
11621166
return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
@@ -1462,7 +1466,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
14621466
if inRange {
14631467
return nil, p.getErr(ErrBadClassInCharRange, ch)
14641468
}
1465-
cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
1469+
cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
14661470
}
14671471
continue
14681472

@@ -1471,7 +1475,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
14711475
if inRange {
14721476
return nil, p.getErr(ErrBadClassInCharRange, ch)
14731477
}
1474-
cc.addSpace(p.useOptionE(), ch == 'S')
1478+
cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
14751479
}
14761480
continue
14771481

@@ -1481,7 +1485,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
14811485
return nil, p.getErr(ErrBadClassInCharRange, ch)
14821486
}
14831487

1484-
cc.addWord(p.useOptionE(), ch == 'W')
1488+
cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
14851489
}
14861490
continue
14871491

0 commit comments

Comments
 (0)