diff --git a/moo.js b/moo.js index cf3a27e..56bb729 100644 --- a/moo.js +++ b/moo.js @@ -53,6 +53,13 @@ } } + function zeropad(string, length) { + if (string.length < length) { + return new Array(length - string.length + 1).join("0") + string + } + return string + } + function objectToRules(object) { var keys = Object.getOwnPropertyNames(object) var result = [] @@ -227,13 +234,60 @@ // convert to RegExp var pat = reUnion(match.map(regexpOrLiteral)) + // Add backreference support + var groupCount = reGroups(pat) + let numberOfGroupsPreviousToBackreferences = groups.length + for (let g = 0; g < groupCount; g++) { + /* + * Stub group for this capture group, this should never be referenced + * later in the code since the capture group will only be non-null if + * the parent capture group (with lower index) is non-null, in which + * case the parent will win. + */ + groups.push(null) + } + /* + * Replace backreferences like \1 with backreferences to the correct + * placeholder in the built regexp, being careful to avoid + * false-positives due to escaped backslashes. + */ + var hasBackreference = false + if (groupCount > 0) { + /* + * WARNING: we require your regexp to contain a capture group to opt + * into this because you cannot use this with certain regexps, e.g. + * `/()[\1]/` should match SOH (U+0001) but we see the \1 as a + * backreference and rewrite it. + * + * To solve this, avoid octal escapes, use `\u0001` instead. + */ + + pat = pat.replace(/((?:^|[^\\])(?:\\\\)*\\)([1-9][0-9]*)(?=[^0-9])/g, (match, front, backreferenceGroupNumber) => { + const number = parseInt(backreferenceGroupNumber, 10) + const couldBeOctal = !!backreferenceGroupNumber.match(/^[0-7]+$/) + const octalNumber = couldBeOctal && parseInt(backreferenceGroupNumber, 8) + if (number < 1 || number > groupCount) { + throw new Error( + "Backreference \\" + backreferenceGroupNumber + " out of range in regexp " + pat + + ( + couldBeOctal + ? 
" (if you meant to use an octal escape, instead use \\u" + zeropad(octalNumber.toString(16), 4) + ")" + : "" + ) + ) + } + hasBackreference = true + // Account for all the previous capture groups + return front + String(numberOfGroupsPreviousToBackreferences + number) + }) + } + // validate var regexp = new RegExp(pat) if (regexp.test("")) { throw new Error("RegExp matches empty string: " + regexp) } - var groupCount = reGroups(pat) - if (groupCount > 0) { + if (groupCount > 0 && !hasBackreference) { throw new Error("RegExp has capture groups: " + regexp + "\nUse (?: … ) instead") } diff --git a/test/test.js b/test/test.js index 2e20d18..8b7e55b 100644 --- a/test/test.js +++ b/test/test.js @@ -474,6 +474,56 @@ describe('value transforms', () => { }) +describe('backreferences', () => { + test('does not get processed if no capture groups', () => { + expect(() => moo.compile({ + tok: /foo\1/, + tok2: /[\1]/ + })).not.toThrow() + }) + + test('throws error on invalid backreference when capture groups present', () => { + expect(() => moo.compile({ + tok: /(f)(o)\13/ + })).toThrow('use \\u000b') + }) + + test('enable back-references', () => { + let lexer = moo.compile({ + // https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING + // The tag, if any, of a dollar-quoted string follows the same rules as an unquoted identifier, except that it cannot contain a dollar sign. + // SQL identifiers and key words must begin with a letter (a-z, but also letters with diacritical marks and non-Latin letters) or an underscore (_). Subsequent characters in an identifier or key word can be letters, underscores, digits (0-9), or dollar signs ($). 
+ dollarStringConstant: { + match: /\$([\w_][\w\d_]*)?\$[^]*?\$\1\$/, + lineBreaks: true, + }, + + fubar: 'fubar', + }); + const dollarString = '$outer$ outer $middle$ middle $inner$\n!inner!\n$inner$ /middle $middle$ /outer $outer$' + const fullString = 'fubar' + dollarString + 'fubar' + lexer.reset(fullString) + let tokens = lexAll(lexer).filter(t => t.type !== 'space') + expect(tokens.shift()).toMatchObject({ type: 'fubar', text: 'fubar', value: 'fubar' }) + expect(tokens.shift()).toMatchObject({ type: 'dollarStringConstant', text: dollarString, value: dollarString }) + expect(tokens.shift()).toMatchObject({ type: 'fubar', text: 'fubar', value: 'fubar' }) + }) + + test('works with multi-digit backreferences', () => { + let lexer = moo.compile({ + test: /(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\11/, + fubar: 'fubar', + }); + const alpha = 'abcdefghijklk' + const fullString = 'fubar' + alpha + 'fubar' + lexer.reset(fullString) + let tokens = lexAll(lexer).filter(t => t.type !== 'space') + expect(tokens.shift()).toMatchObject({ type: 'fubar', text: 'fubar', value: 'fubar' }) + expect(tokens.shift()).toMatchObject({ type: 'test', text: alpha, value: alpha }) + expect(tokens.shift()).toMatchObject({ type: 'fubar', text: 'fubar', value: 'fubar' }) + }) +}); + describe('lexer', () => { var simpleLexer = compile({