@@ -775,9 +775,131 @@ fileprivate extension Compiler.ByteCodeGen {
775775 builder. label ( exit)
776776 }
777777
/// Coalesce any adjacent scalar members in a custom character class together.
/// This is required in order to produce correct grapheme matching behavior.
///
/// Nested character classes, including the operands of set operations, are
/// first rewritten via `coalescingCustomCharacterClass`. Then adjacent
/// literal atoms, quoted literals and literal-bounded ranges are merged so
/// that their contents are segmented as whole `Character`s.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The members with coalescable runs replaced by quoted literals
///   and ranges built from the merged text; other members are passed through.
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  struct Accumulator {
    /// A series of range operands. For example, in `[ab-cde-fg]`, this will
    /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
    /// ranges will be created.
    ///
    /// This must start out as a single *empty* operand: `tryAccumulate`
    /// relies on `current.isEmpty` to tell whether anything has been
    /// coalesced yet, and `finish` emits the operand text verbatim.
    private var rangeOperands: [String] = [""]

    /// The current range operand, i.e. the last element of `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Try to accumulate a character class member, returning `true` if
    /// successful, `false` otherwise.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        current.append(c)
        return true
      case .quotedLiteral(let str):
        current += str
        return true
      case let .range(lhs, rhs):
        guard let lhs = lhs.literalCharacterValue,
              let rhs = rhs.literalCharacterValue
        else { return false }
        // The lower bound terminates the current operand; the upper bound
        // begins a new one.
        current.append(lhs)
        rangeOperands.append(String(rhs))
        return true
      case .trivia:
        // Trivia can be completely ignored if we've already coalesced
        // something.
        return !current.isEmpty
      case .custom, .intersection, .subtraction, .symmetricDifference:
        // Nested classes and set operations are never merged into a run.
        return false
      }
    }

    /// Produce the members described by the accumulated range operands.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      if rangeOperands.count == 1 {
        // If we didn't have any additional range operands, this isn't a
        // range, we can just form a standard quoted literal.
        return [.quotedLiteral(current)]
      }
      var members = [DSLTree.CustomCharacterClass.Member]()

      // We have other range operands, splice them together. For N operands
      // we have N - 1 ranges.
      let operandPairs = zip(rangeOperands, rangeOperands.dropFirst())
      for (i, (lhs, rhs)) in operandPairs.enumerated() {
        // If this is the first operand we only need to drop the last
        // character for its quoted members, otherwise this is both an LHS
        // and RHS of a range, and as such needs both sides trimmed.
        let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
        if !leading.isEmpty {
          members.append(.quotedLiteral(String(leading)))
        }
        // Both unwraps are safe: `tryAccumulate` appends the lower bound to
        // `lhs` and seeds `rhs` with the upper bound whenever it adds an
        // operand, so neither string is empty here.
        members.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }
      // We've handled everything except the quoted portion of the last
      // operand (which is `current`), add it now.
      let trailing = current.dropFirst()
      if !trailing.isEmpty {
        members.append(.quotedLiteral(String(trailing)))
      }
      return members
    }
  }
  return members
    .map { m -> DSLTree.CustomCharacterClass.Member in
      // First we need to recursively coalesce any child character classes.
      switch m {
      case .custom(let ccc):
        return .custom(coalescingCustomCharacterClass(ccc))
      case .intersection(let lhs, let rhs):
        return .intersection(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .subtraction(let lhs, let rhs):
        return .subtraction(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .symmetricDifference(let lhs, let rhs):
        return .symmetricDifference(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .atom, .range, .quotedLiteral, .trivia:
        return m
      }
    }
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
}
881+
/// Returns `ccc` with its members coalesced for grapheme-cluster matching.
///
/// - Parameter ccc: The custom character class to rewrite.
/// - Returns: `ccc` unchanged unless the current semantic level is
///   `.graphemeCluster`; otherwise a class with the coalesced members and the
///   same inversion flag.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing is only wanted under grapheme semantics. Under scalar
  // semantics scalars must not be merged into a grapheme, so that e.g.
  // `[e\u{301}-\u{302}]` stays a range between U+301 and U+302.
  if options.semanticLevel != .graphemeCluster {
    return ccc
  }
  return DSLTree.CustomCharacterClass(
    members: coalescingCustomCharacterClassMembers(ccc.members),
    isInverted: ccc.isInverted)
}
894+
778895 mutating func emitCustomCharacterClass(
779896 _ ccc: DSLTree . CustomCharacterClass
780897 ) throws {
898+ // Before emitting a custom character class in grapheme semantic mode, we
899+ // need to coalesce together any adjacent characters and scalars, over which
900+ // we can perform grapheme breaking. This includes e.g range bounds for
901+ // `[e\u{301}-\u{302}]`.
902+ let ccc = coalescingCustomCharacterClass ( ccc)
781903 if let asciiBitset = ccc. asAsciiBitset ( options) ,
782904 optimizationsEnabled {
783905 if options. semanticLevel == . unicodeScalar {
@@ -791,6 +913,45 @@ fileprivate extension Compiler.ByteCodeGen {
791913 }
792914 }
793915
/// Emits a concatenation of nodes.
///
/// Before emitting, nested concatenations are flattened and adjacent literal
/// atoms and quoted literals are coalesced into single quoted literals, over
/// which grapheme breaking can be performed.
///
/// - Parameter children: The child nodes of the concatenation.
/// - Throws: Whatever `emitConcatenationComponent` throws for a child.
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
  /// Expands nested concatenations (looking through converted regex
  /// literals) into a flat list of nodes.
  func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] {
    switch node {
    case .concatenation(let ch):
      return ch.flatMap(flatten)
    case .convertedRegexLiteral(let n, _):
      return flatten(n)
    default:
      return [node]
    }
  }
  // The accumulator must start out empty: the trivia case below relies on
  // `str.isEmpty` to tell whether anything has been coalesced yet, and the
  // accumulated text becomes the quoted literal verbatim.
  let coalesced = children
    .flatMap(flatten)
    .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in
      switch node {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        str.append(c)
        return true
      case .quotedLiteral(let q):
        str += q
        return true
      case .trivia:
        // Trivia can be completely ignored if we've already coalesced
        // something.
        return !str.isEmpty
      default:
        return false
      }
    }
  for child in coalesced {
    try emitConcatenationComponent(child)
  }
}
954+
794955 @discardableResult
795956 mutating func emitNode( _ node: DSLTree . Node ) throws -> ValueRegister ? {
796957 switch node {
@@ -799,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen {
799960 try emitAlternation ( children)
800961
801962 case let . concatenation( children) :
802- for child in children {
803- try emitConcatenationComponent ( child)
804- }
963+ try emitConcatenation ( children)
805964
806965 case let . capture( name, refId, child, transform) :
807966 options. beginScope ( )
0 commit comments