fixes #47 change RE2 option to match the same characters for \s \w and \d as RE2

dlclark · dlclark · commit d0559a0de6e3 · 2022-03-29T18:32:51.000-05:00
diff --git a/README.md b/README.md
@@ -80,6 +80,7 @@ The default behavior of `regexp2` is to match the .NET regexp engine, however th
 * add support for named ascii character classes (e.g. `[[:foo:]]`)
 * add support for python-style capture groups (e.g. `(P<name>re)`)
 * change singleline behavior for `$` to only match end of string (like RE2) (see [#24](https://github.com/dlclark/regexp2/issues/24))
+* change the character classes `\d` `\s` and `\w` to match the same characters as RE2. NOTE: if you also use the `ECMAScript` option then this will change the `\s` character class to match ECMAScript instead of RE2.  ECMAScript allows more whitespace characters in `\s` than RE2 (but still fewer than the default behavior).
  
 ```go
 re := regexp2.MustCompile(`Your RE2-compatible pattern`, regexp2.RE2)
diff --git a/go.mod b/go.mod
@@ -0,0 +1,3 @@
+module github.com/dlclark/regexp2
+
+go 1.13
diff --git a/regexp_re2_test.go b/regexp_re2_test.go
@@ -1,6 +1,8 @@
 package regexp2
 
-import "testing"
+import (
+	"testing"
+)
 
 func TestRE2CompatCapture(t *testing.T) {
 	r := MustCompile(`re(?P<a>2)`, RE2)
@@ -119,3 +121,76 @@ func TestRE2Dollar_Multiline(t *testing.T) {
 		t.Fatal("Expected match")
 	}
 }
+
+func TestRE2ExtendedZero(t *testing.T) {
+	notZero := "߀" // \u07c0
+	r := MustCompile(`^\d$`, RE2)
+	if m, _ := r.MatchString(notZero); m {
+		t.Fatal("Expected no match")
+	}
+
+	r = MustCompile(`^\D$`, RE2)
+	if m, _ := r.MatchString(notZero); !m {
+		t.Fatal("Expected match")
+	}
+}
+
+func TestRegularExtendedZero(t *testing.T) {
+	notZero := "߀" // \u07c0
+
+	r := MustCompile(`^\d$`, 0)
+	if m, _ := r.MatchString(notZero); !m {
+		t.Fatal("Expected match")
+	}
+
+	r = MustCompile(`^\D$`, 0)
+	if m, _ := r.MatchString(notZero); m {
+		t.Fatal("Expected no match")
+	}
+}
+
+func TestRE2Word(t *testing.T) {
+	r := MustCompile(`\w`, RE2)
+	if m, _ := r.MatchString("å"); m {
+		t.Fatal("Expected no match")
+	}
+
+	r = MustCompile(`\W`, RE2)
+	if m, _ := r.MatchString("å"); !m {
+		t.Fatal("Expected match")
+	}
+
+}
+
+func TestRegularWord(t *testing.T) {
+	r := MustCompile(`\w`, 0)
+	if m, _ := r.MatchString("å"); !m {
+		t.Fatal("Expected match")
+	}
+	r = MustCompile(`\W`, 0)
+	if m, _ := r.MatchString("å"); m {
+		t.Fatal("Expected no match")
+	}
+}
+
+func TestRE2Space(t *testing.T) {
+	r := MustCompile(`\s`, RE2)
+	if m, _ := r.MatchString("\x0b"); m {
+		t.Fatal("Expected no match")
+	}
+	r = MustCompile(`\S`, RE2)
+	if m, _ := r.MatchString("\x0b"); !m {
+		t.Fatal("Expected match")
+	}
+}
+
+func TestRegularSpace(t *testing.T) {
+	r := MustCompile(`\s`, 0)
+	if m, _ := r.MatchString("\x0b"); !m {
+		t.Fatal("Expected match")
+	}
+	r = MustCompile(`\S`, 0)
+	if m, _ := r.MatchString("\x0b"); m {
+		t.Fatal("Expected no match")
+	}
+}
diff --git a/syntax/charclass.go b/syntax/charclass.go
@@ -37,6 +37,8 @@ var (
 	ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
 	ecmaWord  = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
 	ecmaDigit = []rune{0x0030, 0x003a}
+
+	re2Space = []rune{0x0009, 0x000b, 0x000c, 0x000e, 0x0020, 0x0021}
 )
 
 var (
@@ -56,6 +58,9 @@ var (
 	NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
 	DigitClass    = getCharSetFromCategoryString(false, false, "Nd")
 	NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
+
+	RE2SpaceClass    = getCharSetFromOldString(re2Space, false)
+	NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)
 )
 
 var unicodeCategories = func() map[string]*unicode.RangeTable {
@@ -401,13 +406,19 @@ func (c *CharSet) addChar(ch rune) {
 	c.addRange(ch, ch)
 }
 
-func (c *CharSet) addSpace(ecma, negate bool) {
+func (c *CharSet) addSpace(ecma, re2, negate bool) {
 	if ecma {
 		if negate {
 			c.addRanges(NotECMASpaceClass().ranges)
 		} else {
 			c.addRanges(ECMASpaceClass().ranges)
 		}
+	} else if re2 {
+		if negate {
+			c.addRanges(NotRE2SpaceClass().ranges)
+		} else {
+			c.addRanges(RE2SpaceClass().ranges)
+		}
 	} else {
 		c.addCategories(category{cat: spaceCategoryText, negate: negate})
 	}
@@ -563,7 +574,7 @@ func (c *CharSet) addNamedASCII(name string, negate bool) bool {
 	case "punct": //[!-/:-@[-`{-~]
 		rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
 	case "space":
-		c.addSpace(true, negate)
+		c.addSpace(true, false, negate)
 	case "upper":
 		rs = []singleRange{singleRange{'A', 'Z'}}
 	case "word":
diff --git a/syntax/parser.go b/syntax/parser.go
@@ -1121,14 +1121,14 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
 
 	case 'w':
 		p.moveRight(1)
-		if p.useOptionE() {
+		if p.useOptionE() || p.useRE2() {
 			return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, WordClass()), nil
 
 	case 'W':
 		p.moveRight(1)
-		if p.useOptionE() {
+		if p.useOptionE() || p.useRE2() {
 			return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
@@ -1137,26 +1137,30 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
 		p.moveRight(1)
 		if p.useOptionE() {
 			return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
+		} else if p.useRE2() {
+			return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
 
 	case 'S':
 		p.moveRight(1)
 		if p.useOptionE() {
 			return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
+		} else if p.useRE2() {
+			return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
 
 	case 'd':
 		p.moveRight(1)
-		if p.useOptionE() {
+		if p.useOptionE() || p.useRE2() {
 			return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
 
 	case 'D':
 		p.moveRight(1)
-		if p.useOptionE() {
+		if p.useOptionE() || p.useRE2() {
 			return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
@@ -1462,7 +1466,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 					if inRange {
 						return nil, p.getErr(ErrBadClassInCharRange, ch)
 					}
-					cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
+					cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
 				}
 				continue
 
@@ -1471,7 +1475,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 					if inRange {
 						return nil, p.getErr(ErrBadClassInCharRange, ch)
 					}
-					cc.addSpace(p.useOptionE(), ch == 'S')
+					cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
 				}
 				continue
 
@@ -1481,7 +1485,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 						return nil, p.getErr(ErrBadClassInCharRange, ch)
 					}
 
-					cc.addWord(p.useOptionE(), ch == 'W')
+					cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
 				}
 				continue
 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+module github.com/dlclark/regexp2`
	`2`	`+`
	`3`	`+go 1.13`
Original file line number	Diff line number	Diff line change
`@@ -1121,14 +1121,14 @@ func (p parser) scanBackslash(scanOnly bool) (regexNode, error) {`
`1121`	`1121`
`1122`	`1122`	`case 'w':`
`1123`	`1123`	`p.moveRight(1)`
`1124`		`- if p.useOptionE() {`
	`1124`	`+ if p.useOptionE() \|\| p.useRE2() {`
`1125`	`1125`	`return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil`
`1126`	`1126`	`}`
`1127`	`1127`	`return newRegexNodeSet(ntSet, p.options, WordClass()), nil`
`1128`	`1128`
`1129`	`1129`	`case 'W':`
`1130`	`1130`	`p.moveRight(1)`
`1131`		`- if p.useOptionE() {`
	`1131`	`+ if p.useOptionE() \|\| p.useRE2() {`
`1132`	`1132`	`return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil`
`1133`	`1133`	`}`
`1134`	`1134`	`return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil`
`@@ -1137,26 +1137,30 @@ func (p parser) scanBackslash(scanOnly bool) (regexNode, error) {`
`1137`	`1137`	`p.moveRight(1)`
`1138`	`1138`	`if p.useOptionE() {`
`1139`	`1139`	`return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil`
	`1140`	`+ } else if p.useRE2() {`
	`1141`	`+ return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil`
`1140`	`1142`	`}`
`1141`	`1143`	`return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil`
`1142`	`1144`
`1143`	`1145`	`case 'S':`
`1144`	`1146`	`p.moveRight(1)`
`1145`	`1147`	`if p.useOptionE() {`
`1146`	`1148`	`return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil`
	`1149`	`+ } else if p.useRE2() {`
	`1150`	`+ return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil`
`1147`	`1151`	`}`
`1148`	`1152`	`return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil`
`1149`	`1153`
`1150`	`1154`	`case 'd':`
`1151`	`1155`	`p.moveRight(1)`
`1152`		`- if p.useOptionE() {`
	`1156`	`+ if p.useOptionE() \|\| p.useRE2() {`
`1153`	`1157`	`return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil`
`1154`	`1158`	`}`
`1155`	`1159`	`return newRegexNodeSet(ntSet, p.options, DigitClass()), nil`
`1156`	`1160`
`1157`	`1161`	`case 'D':`
`1158`	`1162`	`p.moveRight(1)`
`1159`		`- if p.useOptionE() {`
	`1163`	`+ if p.useOptionE() \|\| p.useRE2() {`
`1160`	`1164`	`return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil`
`1161`	`1165`	`}`
`1162`	`1166`	`return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil`
`@@ -1462,7 +1466,7 @@ func (p parser) scanCharSet(caseInsensitive, scanOnly bool) (CharSet, error) {`
`1462`	`1466`	`if inRange {`
`1463`	`1467`	`return nil, p.getErr(ErrBadClassInCharRange, ch)`
`1464`	`1468`	`}`
`1465`		`- cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)`
	`1469`	`+ cc.addDigit(p.useOptionE() \|\| p.useRE2(), ch == 'D', p.patternRaw)`
`1466`	`1470`	`}`
`1467`	`1471`	`continue`
`1468`	`1472`
`@@ -1471,7 +1475,7 @@ func (p parser) scanCharSet(caseInsensitive, scanOnly bool) (CharSet, error) {`
`1471`	`1475`	`if inRange {`
`1472`	`1476`	`return nil, p.getErr(ErrBadClassInCharRange, ch)`
`1473`	`1477`	`}`
`1474`		`- cc.addSpace(p.useOptionE(), ch == 'S')`
	`1478`	`+ cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')`
`1475`	`1479`	`}`
`1476`	`1480`	`continue`
`1477`	`1481`
`@@ -1481,7 +1485,7 @@ func (p parser) scanCharSet(caseInsensitive, scanOnly bool) (CharSet, error) {`
`1481`	`1485`	`return nil, p.getErr(ErrBadClassInCharRange, ch)`
`1482`	`1486`	`}`
`1483`	`1487`
`1484`		`- cc.addWord(p.useOptionE(), ch == 'W')`
	`1488`	`+ cc.addWord(p.useOptionE() \|\| p.useRE2(), ch == 'W')`
`1485`	`1489`	`}`
`1486`	`1490`	`continue`
`1487`	`1491`