@@ -157,6 +157,19 @@ extension Source {
157157 return . init( start ..< currentPosition)
158158 }
159159
160+ /// Attempt to eat a prefix whose characters satisfy the predicate `f` (limited
161+ /// by `maxLength` when one is given), with the source location recorded.
162+ mutating func tryEatLocatedPrefix(
163+ maxLength: Int ? = nil ,
164+ _ f: ( Char ) -> Bool
165+ ) -> Located < String > ? {
166+ let result = recordLoc { src in
167+ src. tryEatPrefix ( maxLength: maxLength, f)
168+ }
169+ guard let result = result else { return nil } // No located result to report.
170+ return result. map ( \. string) // Expose the located payload as a `String`.
171+ }
172+
160173 /// Throws an expected ASCII character error if not matched
161174 mutating func expectASCII( ) throws -> Located < Character > {
162175 try recordLoc { src in
@@ -217,13 +230,13 @@ extension Source {
217230 /// return the scalar value, or throw an error if the string is malformed or
218231 /// would overflow the scalar.
219232 private static func validateUnicodeScalar(
220- _ str: String , _ kind: RadixKind
221- ) throws -> Unicode . Scalar {
222- let num = try validateNumber ( str, UInt32 . self, kind)
233+ _ str: Source . Located < String > , _ kind: RadixKind
234+ ) throws -> AST . Atom . Scalar {
235+ let num = try validateNumber ( str. value , UInt32 . self, kind)
223236 guard let scalar = Unicode . Scalar ( num) else { // `Unicode.Scalar(_:)` is failable for a `UInt32`.
224237 throw ParseError . misc ( " Invalid scalar value U+ \( num. hexStr) " )
225238 }
226- return scalar
239+ return . init ( scalar, str . location ) // Pair the scalar with the location of the digits it came from.
227240 }
228241
229242 /// Try to eat a number of a particular type and radix off the front.
@@ -266,20 +279,65 @@ extension Source {
266279 /// Eat exactly `numDigits` characters off the front and validate them as a hexadecimal scalar value
267280 private mutating func expectUnicodeScalar(
268281 numDigits: Int
269- ) throws -> Located < Unicode . Scalar > {
270- try recordLoc { src in
282+ ) throws -> AST . Atom . Scalar {
283+ let str = try recordLoc { src -> String in
271284 let str = src. eat ( upToCount: numDigits) . string
272285 guard str. count == numDigits else { // The eaten string is not `numDigits` characters long.
273286 throw ParseError . expectedNumDigits ( str, numDigits)
274287 }
275- return try Source . validateUnicodeScalar ( str, . hex )
288+ return str
276289 }
290+ return try Source . validateUnicodeScalar ( str, . hex) // Validation runs on the result of `recordLoc`, outside its closure.
291+ }
292+
293+ /// Try to lex a sequence of hexadecimal Unicode scalars.
294+ ///
295+ /// UniScalarSequence    -> Whitespace? UniScalarSequenceElt+
296+ /// UniScalarSequenceElt -> HexDigit{1...} Whitespace?
297+ /// Throws if no scalar is present; `ending` is expected after the last scalar.
298+ mutating func expectUnicodeScalarSequence(
299+ eating ending: Character
300+ ) throws -> AST . Atom . Kind {
301+ try recordLoc { src in
302+ var scalars = [ AST . Atom. Scalar] ( )
303+ var trivia = [ AST . Trivia] ( )
304+
305+ // Eat up any leading whitespace.
306+ if let t = src. lexWhitespace ( ) { trivia. append ( t) }
307+
308+ while true {
309+ let str = src. lexUntil { src in
310+ // Hit the ending, stop lexing.
311+ if src. isEmpty || src. peek ( ) == ending {
312+ return true
313+ }
314+ // Eat up trailing whitespace, and stop lexing to record the scalar.
315+ if let t = src. lexWhitespace ( ) {
316+ trivia. append ( t)
317+ return true
318+ }
319+ // Not the ending or trivia, must be a digit of the scalar.
320+ return false
321+ }
322+ guard !str. value. isEmpty else { break } // Nothing was lexed, so there are no more scalars to read.
323+ scalars. append ( try Source . validateUnicodeScalar ( str, . hex) )
324+ }
325+ guard !scalars. isEmpty else {
326+ throw ParseError . expectedNumber ( " " , kind: . hex)
327+ }
328+ try src. expect ( ending)
329+
330+ if scalars. count == 1 {
331+ return . scalar( scalars [ 0 ] ) // Note: collected trivia is not attached in the single-scalar case.
332+ }
333+ return . scalarSequence( . init( scalars, trivia: trivia) )
334+ } . value
277335 }
278336
279337 /// Eat a scalar off the front, starting from after the
280338 /// backslash and base character (e.g. `\u` or `\x`).
281339 ///
282- /// UniScalar -> 'u{' HexDigit{1...} '}'
340+ /// UniScalar -> 'u{' UniScalarSequence '}'
283341 /// | 'u' HexDigit{4}
284342 /// | 'x{' HexDigit{1...} '}'
285343 /// | 'x' HexDigit{0...2}
@@ -289,49 +347,60 @@ extension Source {
289347 ///
290348 mutating func expectUnicodeScalar(
291349 escapedCharacter base: Character
292- ) throws -> Located < Unicode . Scalar > {
350+ ) throws -> AST . Atom . Kind {
293351 try recordLoc { src in
352+
353+ func nullScalar( ) -> AST . Atom . Kind {
354+ let pos = src. currentPosition
355+ return . scalar( . init( UnicodeScalar ( 0 ) , SourceLocation ( pos ..< pos) ) ) // U+0 with an empty location at the current position.
356+ }
357+
294358 // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
295359 switch base {
296360 // Hex numbers.
297- case " u " where src. tryEat ( " { " ) , " x " where src. tryEat ( " { " ) :
298- let str = try src. lexUntil ( eating: " } " ) . value
299- return try Source . validateUnicodeScalar ( str, . hex)
361+ case " u " where src. tryEat ( " { " ) : // The braced `u` form is lexed as a scalar sequence.
362+ return try src. expectUnicodeScalarSequence ( eating: " } " )
363+
364+ case " x " where src. tryEat ( " { " ) :
365+ let str = try src. lexUntil ( eating: " } " )
366+ return . scalar( try Source . validateUnicodeScalar ( str, . hex) )
300367
301368 case " x " :
302369 // \x expects *up to* 2 digits.
303- guard let digits = src. tryEatPrefix ( maxLength: 2 , \. isHexDigit) else {
370+ guard let digits = src. tryEatLocatedPrefix ( maxLength: 2 , \. isHexDigit)
371+ else {
304372 // In PCRE, \x without any valid hex digits is \u{0}.
305373 // TODO: This doesn't appear to be followed by ICU or Oniguruma, so
306374 // could be changed to throw an error if we had a parsing mode for
307375 // them.
308- return Unicode . Scalar ( 0 )
376+ return nullScalar ( )
309377 }
310- return try Source . validateUnicodeScalar ( digits. string , . hex)
378+ return . scalar ( try Source . validateUnicodeScalar ( digits, . hex) )
311379
312380 case " u " :
313- return try src. expectUnicodeScalar ( numDigits: 4 ) . value
381+ return . scalar ( try src. expectUnicodeScalar ( numDigits: 4 ) )
314382 case " U " :
315- return try src. expectUnicodeScalar ( numDigits: 8 ) . value
383+ return . scalar ( try src. expectUnicodeScalar ( numDigits: 8 ) )
316384
317385 // Octal numbers.
318386 case " o " where src. tryEat ( " { " ) :
319- let str = try src. lexUntil ( eating: " } " ) . value
320- return try Source . validateUnicodeScalar ( str, . octal)
387+ let str = try src. lexUntil ( eating: " } " )
388+ return . scalar ( try Source . validateUnicodeScalar ( str, . octal) )
321389
322390 case " 0 " :
323391 // We can read *up to* 3 more octal digits.
324392 // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
325393 // PCRE mode, we should limit it here.
326- guard let digits = src. tryEatPrefix ( maxLength: 3 , \. isOctalDigit) else {
327- return Unicode . Scalar ( 0 )
394+ guard let digits = src. tryEatLocatedPrefix ( maxLength: 3 , \. isOctalDigit)
395+ else {
396+ return nullScalar ( )
328397 }
329- return try Source . validateUnicodeScalar ( digits. string , . octal)
398+ return . scalar ( try Source . validateUnicodeScalar ( digits, . octal) )
330399
331400 default :
332401 fatalError ( " Unexpected scalar start " )
333402 }
334- }
403+ } . value // Only the atom kind is returned; this outer `recordLoc` location is dropped.
335404 }
336405
337406 /// Try to consume a quantifier
@@ -434,13 +503,22 @@ extension Source {
434503 private mutating func lexUntil(
435504 _ predicate: ( inout Source ) throws -> Bool
436505 ) rethrows -> Located < String > {
506+ // We track locations outside of recordLoc, as the predicate may advance the
507+ // input when we hit the end, and we don't want that to affect the location
508+ // of what was lexed in the `result`. We still want the recordLoc call to
509+ // attach locations to any thrown errors though.
510+ // TODO: We should find a better way of doing this, `lexUntil` seems full
511+ // of footguns.
512+ let start = currentPosition
513+ var end = currentPosition
514+ var result = " "
437515 try recordLoc { src in
438- var result = " "
439516 while try ! predicate( & src) {
440517 result. append ( src. eat ( ) )
518+ end = src. currentPosition // `end` only moves after a character has been appended.
441519 }
442- return result
443520 }
521+ return . init( result, start ..< end) // Spans from the starting position to just after the last appended character.
444522 }
445523
446524 private mutating func lexUntil( eating end: String ) throws -> Located < String > {
@@ -576,6 +654,16 @@ extension Source {
576654 // inside a custom character class (and only treats whitespace as
577655 // non-semantic there for the extra-extended `(?xx)` mode). If we get a
578656 // strict-PCRE mode, we'll need to add a case for that.
657+ return lexWhitespace ( )
658+ }
659+
660+ /// Try to consume whitespace as trivia
661+ ///
662+ /// Whitespace -> WhitespaceChar+
663+ ///
664+ /// Unlike `lexNonSemanticWhitespace`, this will always attempt to lex
665+ /// whitespace.
666+ mutating func lexWhitespace( ) -> AST . Trivia ? {
579667 let trivia : Located < String > ? = recordLoc { src in
580668 src. tryEatPrefix ( \. isPatternWhitespace) ? . string
581669 }
@@ -1153,7 +1241,7 @@ extension Source {
11531241
11541242 // We should either have a unicode scalar.
11551243 if src. tryEat ( sequence: " U+ " ) {
1156- let str = try src. lexUntil ( eating: " } " ) . value
1244+ let str = try src. lexUntil ( eating: " } " )
11571245 return . scalar( try Source . validateUnicodeScalar ( str, . hex) )
11581246 }
11591247
@@ -1581,8 +1669,7 @@ extension Source {
15811669 switch char {
15821670 // Hexadecimal and octal unicode scalars.
15831671 case " u " , " x " , " U " , " o " , " 0 " :
1584- return try . scalar(
1585- src. expectUnicodeScalar ( escapedCharacter: char) . value)
1672+ return try src. expectUnicodeScalar ( escapedCharacter: char)
15861673 default :
15871674 break
15881675 }
0 commit comments