@@ -256,9 +256,11 @@ fileprivate extension Compiler.ByteCodeGen {
256256 }
257257 }
258258
259- mutating func emitAlternation(
260- _ children: [ DSLTree . Node ]
261- ) throws {
259+ mutating func emitAlternationGen< C: BidirectionalCollection > (
260+ _ elements: C ,
261+ withBacktracking: Bool ,
262+ _ body: ( inout Compiler . ByteCodeGen , C . Element ) throws -> Void
263+ ) rethrows {
262264 // Alternation: p0 | p1 | ... | pn
263265 // save next_p1
264266 // <code for p0>
@@ -276,16 +278,27 @@ fileprivate extension Compiler.ByteCodeGen {
276278 // <code for pn>
277279 // done:
278280 let done = builder. makeAddress ( )
279- for component in children . dropLast ( ) {
281+ for element in elements . dropLast ( ) {
280282 let next = builder. makeAddress ( )
281283 builder. buildSave ( next)
282- try emitNode ( component)
284+ try body ( & self , element)
285+ if !withBacktracking {
286+ builder. buildClear ( )
287+ }
283288 builder. buildBranch ( to: done)
284289 builder. label ( next)
285290 }
286- try emitNode ( children . last!)
291+ try body ( & self , elements . last!)
287292 builder. label ( done)
288293 }
294+
295+ mutating func emitAlternation(
296+ _ children: [ DSLTree . Node ]
297+ ) throws {
298+ try emitAlternationGen ( children, withBacktracking: true ) {
299+ try $0. emitNode ( $1)
300+ }
301+ }
289302
290303 mutating func emitConcatenationComponent(
291304 _ node: DSLTree . Node
@@ -872,19 +885,187 @@ fileprivate extension Compiler.ByteCodeGen {
872885 }
873886 }
874887
888+ /// Flatten quoted strings into individual character atoms, so that the standard
889+ /// CCC codegen will handle them. Characters and scalars are de-duplicated and re-appended after all other members.
890+ func flatteningCustomCharacterClassMembers(
891+ _ members: [ DSLTree . CustomCharacterClass . Member ]
892+ ) -> [ DSLTree . CustomCharacterClass . Member ] {
893+ var characters : Set < Character > = [ ]
894+ var scalars : Set < UnicodeScalar > = [ ]
895+ var result : [ DSLTree . CustomCharacterClass . Member ] = [ ]
896+ for member in members {
897+ switch member {
898+ case . atom( let atom) :
899+ switch atom {
900+ case let . char( char) :
901+ characters. insert ( char)
902+ case let . scalar( scalar) :
903+ scalars. insert ( scalar)
904+ default :
905+ result. append ( member)
906+ }
907+ case let . quotedLiteral( str) :
908+ characters. formUnion ( str)
909+ default :
910+ result. append ( member)
911+ }
912+ }
913+ result. append ( contentsOf: characters. map { . atom( . char( $0) ) } )
914+ result. append ( contentsOf: scalars. map { . atom( . scalar( $0) ) } )
915+ return result
916+ }
917+
875918 func coalescingCustomCharacterClass(
876919 _ ccc: DSLTree . CustomCharacterClass
877920 ) -> DSLTree . CustomCharacterClass {
878921 // This only needs to be done in grapheme semantic mode. In scalar semantic
879922 // mode, we don't want to coalesce any scalars into a grapheme. This
880923 // means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and
881924 // U+302.
882- guard options. semanticLevel == . graphemeCluster else { return ccc }
883-
884- let members = coalescingCustomCharacterClassMembers ( ccc. members)
885- return . init( members: members, isInverted: ccc. isInverted)
925+ let members = options. semanticLevel == . graphemeCluster
926+ ? coalescingCustomCharacterClassMembers ( ccc. members)
927+ : ccc. members
928+ return . init(
929+ members: flatteningCustomCharacterClassMembers ( members) ,
930+ isInverted: ccc. isInverted)
886931 }
887932
933+ mutating func emitCharacterInCCC( _ c: Character ) {
934+ switch options. semanticLevel {
935+ case . graphemeCluster:
936+ emitCharacter ( c)
937+ case . unicodeScalar:
938+ // When in scalar mode, act like an alternation of the individual scalars
939+ // that comprise a character.
940+ emitAlternationGen ( c. unicodeScalars, withBacktracking: false ) {
941+ $0. emitMatchScalar ( $1)
942+ }
943+ }
944+ }
945+
946+ mutating func emitCCCMember(
947+ _ member: DSLTree . CustomCharacterClass . Member
948+ ) throws {
949+ switch member {
950+ case . atom( let atom) :
951+ switch atom {
952+ case . char( let c) :
953+ emitCharacterInCCC ( c)
954+ case . scalar( let s) :
955+ emitCharacterInCCC ( Character ( s) )
956+ default :
957+ try emitAtom ( atom)
958+ }
959+ case . custom( let ccc) :
960+ try emitCustomCharacterClass ( ccc)
961+ case . quotedLiteral:
962+ fatalError ( " Removed in 'flatteningCustomCharacterClassMembers' " )
963+ case . range:
964+ let consumer = try member. generateConsumer ( options)
965+ builder. buildConsume ( by: consumer)
966+ case . trivia:
967+ return
968+
969+ // TODO: Can we decide when it's better to try `rhs` first?
970+ // Intersection is trivial, since failure on either side propagates:
971+ // - store current position
972+ // - lhs
973+ // - restore current position
974+ // - rhs
975+ case let . intersection( lhs, rhs) :
976+ let r = builder. makePositionRegister ( )
977+ builder. buildMoveCurrentPosition ( into: r)
978+ try emitCustomCharacterClass ( lhs)
979+ builder. buildRestorePosition ( from: r)
980+ try emitCustomCharacterClass ( rhs)
981+
982+ // TODO: Can we decide when it's better to try `rhs` first?
983+ // For subtraction, failure in `lhs` propagates, while failure in `rhs` is
984+ // swallowed/reversed:
985+ // - store current position
986+ // - lhs
987+ // - save to end
988+ // - restore current position
989+ // - rhs
990+ // - clear, fail (since both succeeded)
991+ // - end: ...
992+ case let . subtraction( lhs, rhs) :
993+ let r = builder. makePositionRegister ( )
994+ let end = builder. makeAddress ( )
995+ builder. buildMoveCurrentPosition ( into: r)
996+ try emitCustomCharacterClass ( lhs) // no match here = failure, propagates
997+ builder. buildSave ( end)
998+ builder. buildRestorePosition ( from: r)
999+ try emitCustomCharacterClass ( rhs) // no match here = success, resumes at 'end'
1000+ builder. buildClear ( ) // clears 'end'
1001+ builder. buildFail ( ) // this failure propagates outward
1002+ builder. label ( end)
1003+
1004+ // Symmetric difference always requires executing both `rhs` and `lhs`.
1005+ // Execute each, ignoring failure and storing the resulting position in a
1006+ // register. If those results are equal, fail. If they're different, use
1007+ // the position that is different from the starting position:
1008+ // - store current position as r0
1009+ // - save to lhsFail
1010+ // - lhs
1011+ // - clear lhsFail (and continue)
1012+ // - lhsFail: save position as r1
1013+ //
1014+ // - restore current position
1015+ // - save to rhsFail
1016+ // - rhs
1017+ // - clear rhsFail (and continue)
1018+ // - rhsFail: save position as r2
1019+ //
1020+ // - restore to resulting position from lhs (r1)
1021+ // - if equal to r2, goto fail (both sides had same result)
1022+ // - if equal to r0, goto advance (lhs failed)
1023+ // - goto end
1024+ // - advance: restore to resulting position from rhs (r2)
1025+ // - goto end
1026+ // - fail: fail
1027+ // - end: ...
1028+ case let . symmetricDifference( lhs, rhs) :
1029+ let r0 = builder. makePositionRegister ( )
1030+ let r1 = builder. makePositionRegister ( )
1031+ let r2 = builder. makePositionRegister ( )
1032+ let lhsFail = builder. makeAddress ( )
1033+ let rhsFail = builder. makeAddress ( )
1034+ let advance = builder. makeAddress ( )
1035+ let fail = builder. makeAddress ( )
1036+ let end = builder. makeAddress ( )
1037+
1038+ builder. buildMoveCurrentPosition ( into: r0)
1039+ builder. buildSave ( lhsFail)
1040+ try emitCustomCharacterClass ( lhs)
1041+ builder. buildClear ( )
1042+ builder. label ( lhsFail)
1043+ builder. buildMoveCurrentPosition ( into: r1)
1044+
1045+ builder. buildRestorePosition ( from: r0)
1046+ builder. buildSave ( rhsFail)
1047+ try emitCustomCharacterClass ( rhs)
1048+ builder. buildClear ( )
1049+ builder. label ( rhsFail)
1050+ builder. buildMoveCurrentPosition ( into: r2)
1051+
1052+ // If r1 == r2, then fail
1053+ builder. buildRestorePosition ( from: r1)
1054+ builder. buildCondBranch ( to: fail, ifSamePositionAs: r2)
1055+
1056+ // If r1 == r0, then move to r2 before ending
1057+ builder. buildCondBranch ( to: advance, ifSamePositionAs: r0)
1058+ builder. buildBranch ( to: end)
1059+ builder. label ( advance)
1060+ builder. buildRestorePosition ( from: r2)
1061+ builder. buildBranch ( to: end)
1062+
1063+ builder. label ( fail)
1064+ builder. buildFail ( )
1065+ builder. label ( end)
1066+ }
1067+ }
1068+
8881069 mutating func emitCustomCharacterClass(
8891070 _ ccc: DSLTree . CustomCharacterClass
8901071 ) throws {
@@ -902,8 +1083,67 @@ fileprivate extension Compiler.ByteCodeGen {
9021083 }
9031084 return
9041085 }
905- let consumer = try ccc. generateConsumer ( options)
906- builder. buildConsume ( by: consumer)
1086+
1087+ let updatedCCC : DSLTree . CustomCharacterClass
1088+ if optimizationsEnabled {
1089+ updatedCCC = ccc. coalescingASCIIMembers ( options)
1090+ } else {
1091+ updatedCCC = ccc
1092+ }
1093+ let filteredMembers = updatedCCC. members. filter ( { !$0. isOnlyTrivia} )
1094+
1095+ if updatedCCC. isInverted {
1096+ // inverted
1097+ // custom character class: p0 | p1 | ... | pn
1098+ // Try each member to make sure they all fail
1099+ // save next_p1
1100+ // <code for p0>
1101+ // clear, fail
1102+ // next_p1:
1103+ // save next_p2
1104+ // <code for p1>
1105+ // clear, fail
1106+ // next_p2:
1107+ // save next_p...
1108+ // <code for p2>
1109+ // clear, fail
1110+ // ...
1111+ // next_pn:
1112+ // save done
1113+ // <code for pn>
1114+ // clear, fail
1115+ // done:
1116+ // step forward by 1
1117+ let done = builder. makeAddress ( )
1118+ for member in filteredMembers. dropLast ( ) {
1119+ let next = builder. makeAddress ( )
1120+ builder. buildSave ( next)
1121+ try emitCCCMember ( member)
1122+ builder. buildClear ( )
1123+ builder. buildFail ( )
1124+ builder. label ( next)
1125+ }
1126+ builder. buildSave ( done)
1127+ try emitCCCMember ( filteredMembers. last!)
1128+ builder. buildClear ( )
1129+ builder. buildFail ( )
1130+ builder. label ( done)
1131+
1132+ // Consume a single unit for the inverted ccc
1133+ switch options. semanticLevel {
1134+ case . graphemeCluster:
1135+ builder. buildAdvance ( 1 )
1136+ case . unicodeScalar:
1137+ builder. buildAdvanceUnicodeScalar ( 1 )
1138+ }
1139+ return
1140+ }
1141+ // Non-inverted CCC
1142+ // Custom character class: p0 | p1 | ... | pn
1143+ // Very similar to alternation, but we don't keep backtracking save points
1144+ try emitAlternationGen ( filteredMembers, withBacktracking: false ) {
1145+ try $0. emitCCCMember ( $1)
1146+ }
9071147 }
9081148
9091149 mutating func emitConcatenation( _ children: [ DSLTree . Node ] ) throws {
@@ -1040,6 +1280,12 @@ fileprivate extension Compiler.ByteCodeGen {
10401280}
10411281
10421282extension DSLTree . Node {
1283+ /// A Boolean value indicating whether this node advances the match position
1284+ /// on a successful match.
1285+ ///
1286+ /// For example, an alternation like `(a|b|c)` always advances the position
1287+ /// by a character, but `(a|b|)` has an empty branch, which matches without
1288+ /// advancing.
10431289 var guaranteesForwardProgress : Bool {
10441290 switch self {
10451291 case . orderedChoice( let children) :
@@ -1070,12 +1316,34 @@ extension DSLTree.Node {
10701316 case . consumer, . matcher:
10711317 // Allow zero width consumers and matchers
10721318 return false
1073- case . customCharacterClass:
1074- return true
1319+ case . customCharacterClass( let ccc ) :
1320+ return ccc . guaranteesForwardProgress
10751321 case . quantification( let amount, _, let child) :
10761322 let ( atLeast, _) = amount. ast. bounds
10771323 return atLeast ?? 0 > 0 && child. guaranteesForwardProgress
10781324 default : return false
10791325 }
10801326 }
10811327}
1328+
1329+ extension DSLTree . CustomCharacterClass {
1330+ /// We allow trivia inside a CustomCharacterClass, which could result in a CCC
1331+ /// that matches nothing, e.g. `(?x)[ ]`.
1332+ var guaranteesForwardProgress : Bool {
1333+ for m in members {
1334+ switch m {
1335+ case . trivia:
1336+ continue
1337+ case let . intersection( lhs, rhs) :
1338+ return lhs. guaranteesForwardProgress && rhs. guaranteesForwardProgress
1339+ case let . subtraction( lhs, _) :
1340+ return lhs. guaranteesForwardProgress
1341+ case let . symmetricDifference( lhs, rhs) :
1342+ return lhs. guaranteesForwardProgress && rhs. guaranteesForwardProgress
1343+ default :
1344+ return true
1345+ }
1346+ }
1347+ return false
1348+ }
1349+ }
0 commit comments