@@ -420,11 +420,16 @@ extension _StringGuts {
420420}
421421
422422extension Unicode . Scalar {
423- fileprivate var _isLinkingConsonant : Bool {
424- _swift_stdlib_isLinkingConsonant ( value)
423+ fileprivate var _isInCBConsonant : Bool {
 // Delegates to `_swift_stdlib_isInCB_Consonant`, passing this scalar's
 // numeric `value`, and returns its answer unchanged.
424+ _swift_stdlib_isInCB_Consonant ( value)
425425 }
426426
427- fileprivate var _isVirama : Bool {
427+ fileprivate var _isInCBExtend : Bool {
428+ // Assuming that we're already an Extend or ZWJ...
 // ...the scalar counts as InCB=Extend unless it is one of the exclusions
 // below: an InCB=Consonant, an InCB=Linker, or U+200C (ZERO WIDTH
 // NON-JOINER). This property does not verify the Extend/ZWJ precondition
 // itself, so callers must establish it before trusting a `true` result.
429+ !( _isInCBConsonant || _isInCBLinker || value == 0x200C )
430+ }
431+
432+ fileprivate var _isInCBLinker : Bool {
428433 switch value {
429434 // Devanagari
430435 case 0x94D :
@@ -453,10 +458,10 @@ extension Unicode.Scalar {
453458
454459internal struct _GraphemeBreakingState : Sendable , Equatable {
455460 // When we're looking through an indic sequence, one of the requirements is
456- // that there is at LEAST 1 Virama present between two linking consonants .
461+ // that there is at LEAST 1 InCB=Linker present between two InCB=Consonant scalars.
457462 // This value helps ensure that when we ultimately need to decide whether or
458463 // not to break that we've at least seen 1 when walking.
459- var hasSeenVirama = false
464+ var hasSeenInCBLinker = false
460465
461466 // When walking forwards in a string, we need to know whether or not we've
462467 // entered an emoji sequence to be able to eventually break after all of the
@@ -483,7 +488,7 @@ internal struct _GraphemeBreakingState: Sendable, Equatable {
483488extension _GraphemeBreakingState : CustomStringConvertible {
484489 var description : String {
485490 var r = " [ "
486- if hasSeenVirama { r += " V " }
491+ if hasSeenInCBLinker { r += " L " }
487492 if isInEmojiSequence { r += " E " }
488493 if isInIndicSequence { r += " I " }
489494 if shouldBreakRI { r += " R " }
@@ -729,8 +734,8 @@ extension _GraphemeBreakingState {
729734 var enterIndicSequence = false
730735
731736 defer {
732- self . isInEmojiSequence = enterEmojiSequence
733- self . isInIndicSequence = enterIndicSequence
737+ isInEmojiSequence = enterEmojiSequence
738+ isInIndicSequence = enterIndicSequence
734739 }
735740
736741 let y = Unicode . _GraphemeBreakProperty ( from: scalar2)
@@ -767,7 +772,7 @@ extension _GraphemeBreakingState {
767772 ( . t, . t) :
768773 return false
769774
770- // GB9 (partial GB11)
775+ // GB9 (partial GB9c and partial GB11)
771776 case ( _, . extend) ,
772777 ( _, . zwj) :
773778
@@ -780,29 +785,52 @@ extension _GraphemeBreakingState {
780785 // sequence; the sequence continues through subsequent extend/extend and
781786 // extend/zwj pairs.
782787 if (
783- x == . extendedPictographic || ( self . isInEmojiSequence && x == . extend)
788+ x == . extendedPictographic || ( isInEmojiSequence && x == . extend)
784789 ) {
785790 enterEmojiSequence = true
786791 }
787792
788- // If we're currently in an indic sequence (or if our lhs is a linking
789- // consonant), then this check and everything underneath ensures that
790- // we continue being in one and may check if this extend is a Virama.
791- if self . isInIndicSequence || scalar1. _isLinkingConsonant {
792- if y == . extend {
793- let extendNormData = Unicode . _NormData ( scalar2, fastUpperbound: 0x300 )
794-
795- // If our extend's CCC is 0, then this rule does not apply.
796- guard extendNormData. ccc != 0 else {
797- return false
798- }
793+ // GB9c: InCB=Consonant [InCB=Extend InCB=Linker]* InCB=Linker [InCB=Extend InCB=Linker]* × InCB=Consonant
794+ //
795+ // If our lhs is an InCB=Consonant and our rhs is either an InCB=Extend or
796+ // an InCB=Linker, then enter into an indic sequence and mark if scalar 2
797+ // is a linker and that we've seen a linker.
798+ //
799+ // If the lhs is not an InCB=Consonant, then check if we're currently in
800+ // an indic sequence to properly propagate that back to the state.
801+ // Otherwise, we're not in an indic sequence, but our rhs is still an
802+ // extension scalar so don't break regardless right here. If we are in an
803+ // indic sequence, tell the state that we've seen a linker if our rhs is
804+ // one.
805+ switch ( scalar1. _isInCBConsonant, scalar2. _isInCBExtend, scalar2. _isInCBLinker) {
806+ // (InCB=Consonant, InCB=Extend)
807+ case ( true , true , false ) :
808+ enterIndicSequence = true
809+
810+ // (InCB=Consonant, InCB=Linker)
811+ case ( true , false , true ) :
812+ enterIndicSequence = true
813+ hasSeenInCBLinker = true
814+
815+ // (_, InCB=Extend)
816+ case ( false , true , false ) :
817+ guard isInIndicSequence else {
818+ break
799819 }
800820
801821 enterIndicSequence = true
802822
803- if scalar2. _isVirama {
804- self . hasSeenVirama = true
823+ // (_, InCB=Linker)
824+ case ( false , false , true ) :
825+ guard isInIndicSequence else {
826+ break
805827 }
828+
829+ enterIndicSequence = true
830+ hasSeenInCBLinker = true
831+
832+ default :
833+ break
806834 }
807835
808836 return false
@@ -817,25 +845,21 @@ extension _GraphemeBreakingState {
817845
818846 // GB11
819847 case ( . zwj, . extendedPictographic) :
820- return !self . isInEmojiSequence
848+ return !isInEmojiSequence
821849
822850 // GB12 & GB13
823851 case ( . regionalIndicator, . regionalIndicator) :
824852 defer {
825- self . shouldBreakRI. toggle ( )
853+ shouldBreakRI. toggle ( )
826854 }
827855
828- return self . shouldBreakRI
856+ return shouldBreakRI
829857
830858 // GB999
831859 default :
832860 // GB9c
833- if
834- self . isInIndicSequence,
835- self . hasSeenVirama,
836- scalar2. _isLinkingConsonant
837- {
838- self . hasSeenVirama = false
861+ if isInIndicSequence, hasSeenInCBLinker, scalar2. _isInCBConsonant {
862+ hasSeenInCBLinker = false
839863 return false
840864 }
841865
@@ -905,7 +929,7 @@ extension _StringGuts {
905929 ( . t, . t) :
906930 return false
907931
908- // GB9 (partial GB11)
932+ // GB9
909933 case ( _, . extend) ,
910934 ( _, . zwj) :
911935 return false
@@ -929,22 +953,19 @@ extension _StringGuts {
929953 // GB999
930954 default :
931955 // GB9c
932- switch ( x, scalar2. _isLinkingConsonant) {
933- case ( . extend, true ) :
934- let extendNormData = Unicode . _NormData ( scalar1, fastUpperbound: 0x300 )
935-
936- guard extendNormData. ccc != 0 else {
937- return true
938- }
939-
940- return !checkIfInIndicSequence( at: index, with: previousScalar)
941-
942- case ( . zwj, true ) :
956+ //
957+ // Check whether our rhs is an InCB=Consonant first, because in most cases
958+ // it is not and we can exit this branch early. If it is a consonant, then
959+ // check that the lhs is an InCB=Extend or InCB=Linker (we have to check
960+ // that it's an .extend or .zwj first because _isInCBExtend assumes that
961+ // this is already true).
962+ if scalar2. _isInCBConsonant,
963+ ( x == . extend || x == . zwj) ,
964+ ( scalar1. _isInCBExtend || scalar1. _isInCBLinker) {
943965 return !checkIfInIndicSequence( at: index, with: previousScalar)
944-
945- default :
946- return true
947966 }
967+
968+ return true
948969 }
949970 }
950971
@@ -1013,69 +1034,73 @@ extension _StringGuts {
10131034 }
10141035
10151036 // When walking backwards, it's impossible to know whether we break when we
1016- // see our first ((.extend|.zwj), .linkingConsonant) without walking
1017- // further backwards. This walks the string backwards enough until we figure
1018- // out whether or not to break this indic sequence. For example:
1037+ // see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant)
1038+ // without walking further backwards. This walks the string backwards enough
1039+ // until we figure out whether or not to break this indic sequence. For example:
10191040 //
10201041 // Scalar view #1:
10211042 //
1022- // [.virama, .extend, .linkingConsonant]
1023- // ^
1024- // | = To be able to know whether or not to break these
1025- // two, we need to walk backwards to determine if
1026- // this is a legitimate indic sequence.
1043+ // [InCB=Linker, InCB=Extend, InCB=Consonant]
1044+ // ^
1045+ // | = To be able to know whether or not to
1046+ // break these two, we need to walk
1047+ // backwards to determine if this is a
1048+ // legitimate indic sequence.
10271049 // ^
1028- // | = The scalar sequence ends without a starting linking consonant ,
1050+ // | = The scalar sequence ends without a starting InCB=Consonant ,
10291051 // so this is in fact not an indic sequence, so we can break the two.
10301052 //
10311053 // Scalar view #2:
10321054 //
1033- // [.linkingConsonant, .virama, .extend, .linkingConsonant ]
1034- // ^
1035- // | = Same as above
1055+ // [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant ]
1056+ // ^
1057+ // | = Same as above
10361058 // ^
1037- // | = This is a virama , so we at least have seen
1059+ // | = This is a Linker , so we at least have seen
10381060 // 1 to be able to return true if we see a
1039- // linking consonant later.
1061+ // consonant later.
10401062 // ^
1041- // | = Is a linking consonant and we've seen a virama , so this is a
1063+ // | = Is a consonant and we've seen a linker , so this is a
10421064 // legitimate indic sequence, so do NOT break the initial question.
10431065 internal func checkIfInIndicSequence(
10441066 at index: Int ,
10451067 with previousScalar: ( Int ) -> ( scalar: Unicode . Scalar , start: Int ) ?
10461068 ) -> Bool {
 // `previousScalar` hands back a scalar together with its start position,
 // or nil when there is nothing further back. With no scalar before
 // `index` there is no sequence to be part of.
10471069 guard let p = previousScalar ( index) else { return false }
10481070
1049- var hasSeenVirama = p. scalar. _isVirama
 // Seed the flag from the first scalar fetched, since it may itself be a
 // linker.
1071+ var hasSeenInCBLinker = p. scalar. _isInCBLinker
10501072 var i = p. start
10511073
 // Keep stepping backwards from the start of the scalar fetched last.
10521074 while let ( scalar, prev) = previousScalar ( i) {
10531075 i = prev
1076+
 // Reaching a consonant ends the walk: the answer is whether a linker was
 // seen on the way here.
1077+ if scalar. _isInCBConsonant {
1078+ return hasSeenInCBLinker
1079+ }
1080+
10541081 let gbp = Unicode . _GraphemeBreakProperty ( from: scalar)
10551082
1056- switch ( gbp, scalar . _isLinkingConsonant ) {
1057- case ( . extend , false ) :
1058- let extendNormData = Unicode . _NormData ( scalar , fastUpperbound : 0x300 )
 // Anything that is neither Extend nor ZWJ interrupts the run, so the walk
 // gives up here. Passing this guard also establishes the precondition that
 // `_isInCBExtend` assumes.
1083+ guard gbp == . extend || gbp == . zwj else {
1084+ return false
1085+ }
10591086
1060- guard extendNormData . ccc != 0 else {
1061- return false
1062- }
1087+ switch ( scalar . _isInCBExtend , scalar . _isInCBLinker ) {
 // Neither InCB=Extend nor InCB=Linker. Consonants were already handled
 // above, so the only remaining exclusion in `_isInCBExtend` is U+200C.
1088+ case ( false , false ) :
1089+ return false
10631090
1064- if scalar. _isVirama {
1065- hasSeenVirama = true
1066- }
 // A linker: remember it and keep walking.
1091+ case ( false , true ) :
1092+ hasSeenInCBLinker = true
10671093
1068- case ( . zwj , false ) :
 // A plain InCB=Extend: nothing to record, keep walking.
1094+ case ( true , false ) :
10691095 continue
10701096
1071- // LinkingConsonant
1072- case ( _, true ) :
1073- return hasSeenVirama
1074-
1075- default :
1097+ case ( true , true ) :
1098+ // This case should never happen, but if it does then just be cautious
1099+ // and say this is invalid.
10761100 return false
10771101 }
10781102 }
1103+
 // Ran out of scalars without ever finding a starting consonant.
10791104 return false
10801105 }
10811106
0 commit comments