@@ -1097,5 +1097,197 @@ extension RegexTests {
10971097 ( " -123e1.2 " , nil )
10981098 )
10991099 }
1100+
1101+ // MARK: Character Semantics
1102+
/// "é" written in its precomposed spelling (per the property name, a single
/// scalar, U+00E9). No surrounding whitespace: the tests compare whole
/// matches against this value.
var eComposed: String { "é" }

/// "é" written as the base letter `e` (U+0065) immediately followed by
/// U+0301 COMBINING ACUTE ACCENT. The accent must directly follow `e`; any
/// character in between would attach the accent to that character instead.
var eDecomposed: String { "e\u{301}" }
1105+
/// Checks how `\u{...}` scalar escapes in a pattern match against the
/// composed and decomposed spellings of "é".
///
/// Calls passing `xfail: true` are the cases the adjacent FIXME notes
/// describe as not working yet.
func testIndividualScalars() {
  // Expectation: A standalone Unicode scalar value in a regex literal
  // can match either that specific scalar value or participate in matching
  // as a character.

  // Both scalars spelled out in the pattern, identical spelling in the input.
  firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed)
  // FIXME: Decomposed character in regex literal doesn't match an equivalent character
  firstMatchTest(
    #"\u{65}\u{301}$"#, input: eComposed, match: eComposed,
    xfail: true)

  // Only the base scalar in the pattern: the expected match is the bare "e",
  // and anchoring at the end must fail because U+0301 still follows.
  firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e")
  firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil)
  // FIXME: \y is unsupported
  firstMatchTest(
    #"\u{65}\y"#, input: eDecomposed, match: nil,
    xfail: true)

  // Only the combining scalar in the pattern: the expected match is the bare
  // U+0301, with nothing around it.
  // FIXME: Unicode scalars are only matched at the start of a grapheme cluster
  firstMatchTest(
    #"\u{301}"#, input: eDecomposed, match: "\u{301}",
    xfail: true)
  // FIXME: \y is unsupported
  firstMatchTest(
    #"\y\u{301}"#, input: eDecomposed, match: nil,
    xfail: true)
}
1129+
/// Checks that a character written in the pattern matches the same character
/// in the input regardless of which equivalent spelling either side uses.
func testCanonicalEquivalence() throws {
  // Expectation: character comparison during matching follows canonical
  // equivalence, so any equivalent spelling may be used in the regex literal
  // and in the string being matched.
  let composed = eComposed
  let decomposed = eDecomposed

  // "é" written directly in the pattern matches both input spellings.
  matchTest(#"é$"#, (composed, true), (decomposed, true))

  // FIXME: Decomposed character in regex literal doesn't match an equivalent character
  matchTest(#"e\u{301}$"#, (composed, true), (decomposed, true), xfail: true)

  // A bare "e" anchored at the end matches neither spelling.
  matchTest(#"e$"#, (composed, false), (decomposed, false))
}
1152+
/// Checks that built-in character classes (`\w`, `\d`, `\s`) and Unicode
/// property classes (`\p{...}`) treat equivalent spellings of a character
/// alike, and that a base character combined with a scalar outside the class
/// is not counted as a member.
///
/// Calls passing `xfail: true` are the cases the adjacent FIXME notes
/// describe as not working yet.
func testCanonicalEquivalenceCharacterClass() throws {
  // Expectation: Character classes should match equivalent characters to the
  // same degree, regardless of how they are spelled. Unicode "property
  // classes" should match characters when all the code points that comprise
  // the character are members of the property class.

  // \w
  matchTest(
    #"^\w$"#,
    (eComposed, true),
    (eDecomposed, true))
  // \p{Letter}
  firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed)
  // FIXME: \p{Letter} doesn't match a decomposed character
  firstMatchTest(
    #"\p{Letter}$"#, input: eDecomposed, match: eDecomposed,
    xfail: true)

  // \d
  firstMatchTest(#"\d"#, input: "5", match: "5")
  // The combining mark (U+0305) directly follows the digit so the two form
  // one character.
  // FIXME: \d shouldn't match a digit composed with a non-digit character
  firstMatchTest(
    #"\d"#, input: "5\u{305}", match: nil,
    xfail: true)
  // \p{Number}
  firstMatchTest(#"\p{Number}"#, input: "5", match: "5")
  // FIXME: \p{Number} shouldn't match a number composed with a non-number character
  firstMatchTest(
    #"\p{Number}"#, input: "5\u{305}", match: nil,
    xfail: true)

  // Should this match the '5' but not the ZWJ, or should it treat '5'+ZWJ
  // as one entity and fail to match altogether?
  firstMatchTest(
    #"^\d"#, input: "5\u{200d}0", match: "5",
    xfail: true)

  // \s
  firstMatchTest(#"\s"#, input: " ", match: " ")
  // As with \d above, the combining mark belongs in the input, not the
  // pattern: the question is whether the class accepts the composed character.
  // FIXME: \s shouldn't match whitespace composed with a non-whitespace character
  firstMatchTest(
    #"\s"#, input: " \u{305}", match: nil,
    xfail: true)
  // \p{Whitespace}
  firstMatchTest(#"\p{Whitespace}"#, input: " ", match: " ")
  // FIXME: \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character
  firstMatchTest(
    #"\p{Whitespace}"#, input: " \u{305}", match: nil,
    xfail: true)
}
1197+
/// Checks custom character classes (`[...]`, abbreviated "CCC" below) against
/// characters made of several scalars: composed/decomposed "é" and a flag
/// emoji built from two regional-indicator scalars.
///
/// Calls passing `xfail: true` are the cases the adjacent FIXME notes
/// describe as not working yet.
func testCanonicalEquivalenceCustomCharacterClass() throws {
  // Expectation: Concatenations with custom character classes should be able
  // to match within a grapheme cluster. That is, a regex should be able to
  // match the scalar values that comprise a grapheme cluster in separate,
  // or repeated, custom character classes.

  matchTest(
    #"[áéíóú]$"#,
    (eComposed, true),
    (eDecomposed, true))

  // FIXME: Custom char classes don't use canonical equivalence with composed characters
  firstMatchTest(
    #"e[\u{301}]$"#, input: eComposed, match: eComposed,
    xfail: true)
  firstMatchTest(
    #"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
    xfail: true)
  firstMatchTest(
    #"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
    xfail: true)

  // FIXME: Custom char classes don't match decomposed characters
  firstMatchTest(
    #"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed,
    xfail: true)
  firstMatchTest(
    #"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
    xfail: true)
  firstMatchTest(
    #"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
    xfail: true)

  // The flag is exactly the two regional-indicator scalars U+1F1F0 U+1F1F7,
  // with nothing around them, so a whole-flag match can equal `flag`.
  let flag = "🇰🇷"
  firstMatchTest(#"🇰🇷"#, input: flag, match: flag)
  firstMatchTest(#"[🇰🇷]"#, input: flag, match: flag)
  firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag)

  // First Unicode scalar followed by CCC of regional indicators
  firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag)

  // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character
  // A CCC of regional indicators x 2
  firstMatchTest(
    #"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag,
    xfail: true)

  // FIXME: A single CCC of regional indicators matches the whole flag character
  // A CCC of regional indicators followed by the second Unicode scalar
  firstMatchTest(
    #"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag,
    xfail: true)
  // A single CCC of regional indicators
  firstMatchTest(
    #"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil,
    xfail: true)

  // A single CCC of actual flag emojis / combined regional indicators
  firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag)
  // This succeeds (correctly) because \u{1F1F0} is lexicographically
  // within the CCC range. The lone indicator is followed directly by "abc",
  // and the expected match is the indicator by itself.
  firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}")
}
1252+
/// Checks the "any character" constructs `.`, `\X` and `\O` against the
/// composed and decomposed spellings of "é".
///
/// Calls passing `xfail: true` are the cases the adjacent FIXME notes
/// describe as not working yet.
func testAnyChar() throws {
  // Expectation: \X and, in grapheme cluster mode, `.` should consume an
  // entire character, regardless of how it's spelled. \O should consume only
  // a single Unicode scalar value, leaving any other grapheme scalar
  // components to be matched.

  // Expected match is the bare base scalar "e".
  // NOTE(review): this xfail has no FIXME of its own; presumably it is the
  // same gap as the scalar-semantics FIXME at the end of this test. Confirm.
  firstMatchTest(
    #"(?u:.)"#, input: eDecomposed, match: "e",
    xfail: true)

  // `.` and \X each consume the whole character, so no trailing U+0301 is
  // left over for the rest of the pattern.
  matchTest(
    #".\u{301}"#,
    (eComposed, false),
    (eDecomposed, false))
  matchTest(
    #"\X\u{301}"#,
    (eComposed, false),
    (eDecomposed, false))

  // FIXME: \O is unsupported
  firstMatchTest(
    #"\O\u{301}"#, input: eDecomposed, match: eDecomposed,
    xfail: true)
  firstMatchTest(
    #"e\O"#, input: eDecomposed, match: eDecomposed,
    xfail: true)
  firstMatchTest(
    #"\O\u{301}"#, input: eComposed, match: nil,
    xfail: true)
  firstMatchTest(
    #"e\O"#, input: eComposed, match: nil,
    xfail: true)

  // FIXME: Unicode scalar semantic flag (?U) doesn't change behavior of `.`
  // NOTE(review): the first case in this test spells the flag lowercase,
  // `(?u:...)`, while this one uses uppercase `(?U)`. Confirm which spelling
  // selects Unicode scalar semantics before removing the xfail.
  matchTest(
    #"(?U).\u{301}"#,
    (eComposed, true),
    (eDecomposed, true),
    xfail: true)
}
1288+
1289+ // TODO: Add test for implied grapheme cluster requirement at group boundaries
1290+
1291+ // TODO: Add test for grapheme boundaries at start/end of match
11001292}
11011293
0 commit comments