@@ -1064,11 +1064,16 @@ extension Source {
10641064 }
10651065
10661066 mutating func lexCustomCCStart(
1067+ context: ParsingContext
10671068 ) throws -> Located < CustomCC . Start > ? {
10681069 recordLoc { src in
1069- // POSIX named sets are atoms.
1070- guard !src. starts ( with: " [: " ) else { return nil }
1071-
1070+ // Make sure we don't have a POSIX character property. This may require
1071+ // walking to its ending to make sure we have a closing ':]', as otherwise
1072+ // we have a custom character class.
1073+ // TODO: This behavior seems subtle, could we warn?
1074+ guard !src. canLexPOSIXCharacterProperty ( context: context) else {
1075+ return nil
1076+ }
10721077 if src. tryEat ( " [ " ) {
10731078 return src. tryEat ( " ^ " ) ? . inverted : . normal
10741079 }
@@ -1099,12 +1104,38 @@ extension Source {
10991104 }
11001105
11011106 private mutating func lexPOSIXCharacterProperty(
1107+ context: ParsingContext
11021108 ) throws -> Located < AST . Atom . CharacterProperty > ? {
1103- try recordLoc { src in
1104- guard src. tryEat ( sequence: " [: " ) else { return nil }
1105- let inverted = src. tryEat ( " ^ " )
1106- let prop = try src. lexCharacterPropertyContents ( end: " :] " ) . value
1107- return . init( prop, isInverted: inverted, isPOSIX: true )
1109+ // Only allowed in a custom character class.
1110+ guard context. isInCustomCharacterClass else { return nil }
1111+ return try recordLoc { src in
1112+ try src. tryEating { src in
1113+ guard src. tryEat ( sequence: " [: " ) else { return nil }
1114+ let inverted = src. tryEat ( " ^ " )
1115+
1116+ // Note we lex the contents and ending *before* classifying, because we
1117+ // want to bail with nil if we don't have the right ending. This allows
1118+ // the lexing of a custom character class if we don't have a ':]'
1119+ // ending.
1120+ let ( key, value) = src. lexCharacterPropertyKeyValue ( )
1121+ guard src. tryEat ( sequence: " :] " ) else { return nil }
1122+
1123+ let prop = try Source . classifyCharacterPropertyContents ( key: key,
1124+ value: value)
1125+ return . init( prop, isInverted: inverted, isPOSIX: true )
1126+ }
1127+ }
1128+ }
1129+
1130+ private func canLexPOSIXCharacterProperty( context: ParsingContext ) -> Bool {
1131+ do {
1132+ var src = self
1133+ return try src. lexPOSIXCharacterProperty ( context: context) != nil
1134+ } catch {
1135+ // We want to err on the side of lexing a POSIX character property, so
1136+ // even if it is invalid in some way (e.g. invalid property names), still
1137+ // try to lex it.
1138+ return true
11081139 }
11091140 }
11101141
@@ -1129,26 +1160,52 @@ extension Source {
11291160 }
11301161 }
11311162
1132- private mutating func lexCharacterPropertyContents(
1133- end: String
1134- ) throws -> Located < AST . Atom . CharacterProperty . Kind > {
1135- try recordLoc { src in
1136- // We should either have:
1137- // - 'x=y' where 'x' is a property key, and 'y' is a value.
1138- // - 'y' where 'y' is a value (or a bool key with an inferred value
1139- // of true), and its key is inferred.
1140- // TODO: We could have better recovery here if we only ate the characters
1141- // that property keys and values can use.
1142- let lhs = src. lexUntil {
1143- $0. isEmpty || $0. peek ( ) == " = " || $0. starts ( with: end)
1144- } . value
1145- if src. tryEat ( " = " ) {
1146- let rhs = try src. lexUntil ( eating: end) . value
1147- return try Source . classifyCharacterProperty ( key: lhs, value: rhs)
1163+ private mutating func lexCharacterPropertyKeyValue(
1164+ ) -> ( key: String ? , value: String ) {
1165+ func atPossibleEnding( _ src: inout Source ) -> Bool {
1166+ guard let next = src. peek ( ) else { return true }
1167+ switch next {
1168+ case " = " :
1169+ // End of a key.
1170+ return true
1171+ case " : " , " [ " , " ] " :
1172+ // POSIX character property endings to cover ':]', ']', and '[' as the
1173+ // start of a nested character class.
1174+ return true
1175+ case " } " :
1176+ // Ending of '\p{'. We cover this for POSIX too as '}' is not valid in a
1177+ // character property name anyway, and it's nice not to have diverging
1178+ // logic for these cases.
1179+ return true
1180+ default :
1181+ // We may want to handle other metacharacters here, e.g. '{', '(', ')',
1182+ // as they're not valid character property names. However, for now
1183+ // let's err on the side of forming an unknown property name in case
1184+ // these characters are ever used in future character property names
1185+ // (though it's very unlikely). Users can always escape e.g. the ':'
1186+ // in '[:' if they definitely want a custom character class.
1187+ return false
11481188 }
1149- try src. expect ( sequence: end)
1150- return try Source . classifyCharacterPropertyValueOnly ( lhs)
11511189 }
1190+ // We should either have:
1191+ // - 'x=y' where 'x' is a property key, and 'y' is a value.
1192+ // - 'y' where 'y' is a value (or a bool key with an inferred value of true)
1193+ // and its key is inferred.
1194+ let lhs = lexUntil ( atPossibleEnding) . value
1195+ if tryEat ( " = " ) {
1196+ let rhs = lexUntil ( atPossibleEnding) . value
1197+ return ( lhs, rhs)
1198+ }
1199+ return ( nil , lhs)
1200+ }
1201+
1202+ private static func classifyCharacterPropertyContents(
1203+ key: String ? , value: String
1204+ ) throws -> AST . Atom . CharacterProperty . Kind {
1205+ if let key = key {
1206+ return try classifyCharacterProperty ( key: key, value: value)
1207+ }
1208+ return try classifyCharacterPropertyValueOnly ( value)
11521209 }
11531210
11541211 /// Try to consume a character property.
@@ -1164,7 +1221,10 @@ extension Source {
11641221 let isInverted = src. peek ( ) == " P "
11651222 src. advance ( 2 )
11661223
1167- let prop = try src. lexCharacterPropertyContents ( end: " } " ) . value
1224+ let ( key, value) = src. lexCharacterPropertyKeyValue ( )
1225+ let prop = try Source . classifyCharacterPropertyContents ( key: key,
1226+ value: value)
1227+ try src. expect ( " } " )
11681228 return . init( prop, isInverted: isInverted, isPOSIX: false )
11691229 }
11701230 }
@@ -1758,11 +1818,8 @@ extension Source {
17581818 if !customCC && ( src. peek ( ) == " ) " || src. peek ( ) == " | " ) { return nil }
17591819 // TODO: Store customCC in the atom, if that's useful
17601820
1761- // POSIX character property. This is only allowed in a custom character
1762- // class.
1763- // TODO: Can we try and recover and diagnose these outside character
1764- // classes?
1765- if customCC, let prop = try src. lexPOSIXCharacterProperty ( ) ? . value {
1821+ // POSIX character property.
1822+ if let prop = try src. lexPOSIXCharacterProperty ( context: context) ? . value {
17661823 return . property( prop)
17671824 }
17681825
0 commit comments