Skip to content

Commit 9d81c71

Browse files
Do not mis-parse certain wide-character emojis as integers
When ch_to_digit() is called with a UTF-16 or UTF-32 code unit, it simply truncates away any data stored in the non-low byte(s) of the code unit. It then uses a lookup table to determine whether the low byte corresponds to an ASCII digit. This is incorrect because as soon as any bit outside the low byte is set, the code unit can no longer correspond to an ASCII digit. To fix this, we produce a mask that is all zeroes if any bit outside the low byte is set in the code unit, and all ones otherwise. ANDing this mask with the original code unit forces the table lookup to return the sentinel value stored at index zero if any high bit was set, which causes the code unit not to be parsed as an integer. This bug was discovered when loading Mastodon posts inside the Ladybird browser, where some of Mastodon's JavaScript would trigger the code path that erroneously parsed the emoji as an integer. It had the visible effect that some digits inside the posts would get rendered as one of the emojis that parsed to that digit. For more details see this issue: LadybirdBrowser/ladybird#6205 The emojis in the test case are simply all the emojis used on Mastodon that caused the bug. They can be found here: https://github.com/mastodon/mastodon/blob/06803422da3794538cd9cd5c7ccd61a0694ef921/app/javascript/mastodon/features/emoji/emoji_map.json
1 parent fec4082 commit 9d81c71

File tree

2 files changed

+277
-2
lines changed

2 files changed

+277
-2
lines changed

include/fast_float/float_common.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1132,7 +1132,13 @@ template <typename T> constexpr uint64_t int_luts<T>::min_safe_u64[];
11321132

11331133
template <typename UC>
11341134
fastfloat_really_inline constexpr uint8_t ch_to_digit(UC c) {
1135-
return int_luts<>::chdigit[static_cast<unsigned char>(c)];
1135+
using UnsignedUC = typename std::make_unsigned<UC>::type;
1136+
auto uc = static_cast<UnsignedUC>(c);
1137+
// For types larger than one byte, we need to force an index with sentinel
1138+
// value (using index zero because that is easiest) if any byte other than
1139+
// the low byte is non-zero.
1140+
auto mask = static_cast<UnsignedUC>(-((uc & ~0xFFull) == 0));
1141+
return int_luts<>::chdigit[static_cast<unsigned char>(uc & mask)];
11361142
}
11371143

11381144
fastfloat_really_inline constexpr size_t max_digits_u64(int base) {

tests/fast_int.cpp

Lines changed: 270 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -831,6 +831,275 @@ int main() {
831831
return EXIT_FAILURE;
832832
}
833833
}
834+
// don't parse UTF-16 code units of emojis as int if the low byte is an ASCII digit
835+
{
836+
const std::u16string emojis[] = {
837+
u"ℹ", u"ℹ️", u"☸", u"☸️", u"☹", u"☹️", u"✳", u"✳️",
838+
u"✴", u"✴️", u"⤴", u"⤴️", u"⤵", u"⤵️", u"〰", u"〰️",
839+
};
840+
bool failed = false;
841+
auto array_size = sizeof(emojis) / sizeof(emojis[0]);
842+
for (size_t i = 0; i < array_size; i++) {
843+
auto e = emojis[i];
844+
int foo;
845+
auto answer = fast_float::from_chars(e.data(), e.data() + e.size(), foo);
846+
if (answer.ec == std::errc()) {
847+
failed = true;
848+
std::cerr << "Incorrectly parsed emoji #" << i << " as integer " << foo
849+
<< "." << std::endl;
850+
}
851+
}
852+
853+
if (failed) {
854+
return EXIT_FAILURE;
855+
}
856+
}
857+
// don't parse UTF-32 code points of emojis as int if the low byte is an ASCII digit
858+
{
859+
const std::u32string emojis[] = {
860+
U"ℹ",
861+
U"ℹ️",
862+
U"☸",
863+
U"☸️",
864+
U"☹",
865+
U"☹️",
866+
U"✳",
867+
U"✳️",
868+
U"✴",
869+
U"✴️",
870+
U"⤴",
871+
U"⤴️",
872+
U"⤵",
873+
U"⤵️",
874+
U"〰",
875+
U"〰️",
876+
U"🈲",
877+
U"🈳",
878+
U"🈴",
879+
U"🈵",
880+
U"🈶",
881+
U"🈷",
882+
U"🈷️",
883+
U"🈸",
884+
U"🈹",
885+
U"🌰",
886+
U"🌱",
887+
U"🌲",
888+
U"🌳",
889+
U"🌴",
890+
U"🌵",
891+
U"🌶",
892+
U"🌶️",
893+
U"🌷",
894+
U"🌸",
895+
U"🌹",
896+
U"🐰",
897+
U"🐱",
898+
U"🐲",
899+
U"🐳",
900+
U"🐴",
901+
U"🐵",
902+
U"🐶",
903+
U"🐷",
904+
U"🐸",
905+
U"🐹",
906+
U"🔰",
907+
U"🔱",
908+
U"🔲",
909+
U"🔳",
910+
U"🔴",
911+
U"🔵",
912+
U"🔶",
913+
U"🔷",
914+
U"🔸",
915+
U"🔹",
916+
U"😰",
917+
U"😱",
918+
U"😲",
919+
U"😳",
920+
U"😴",
921+
U"😵",
922+
U"😵‍💫",
923+
U"😶",
924+
U"😶‍🌫",
925+
U"😶‍🌫️",
926+
U"😷",
927+
U"😸",
928+
U"😹",
929+
U"🤰",
930+
U"🤰🏻",
931+
U"🤰🏼",
932+
U"🤰🏽",
933+
U"🤰🏾",
934+
U"🤰🏿",
935+
U"🤱",
936+
U"🤱🏻",
937+
U"🤱🏼",
938+
U"🤱🏽",
939+
U"🤱🏾",
940+
U"🤱🏿",
941+
U"🤲",
942+
U"🤲🏻",
943+
U"🤲🏼",
944+
U"🤲🏽",
945+
U"🤲🏾",
946+
U"🤲🏿",
947+
U"🤳",
948+
U"🤳🏻",
949+
U"🤳🏼",
950+
U"🤳🏽",
951+
U"🤳🏾",
952+
U"🤳🏿",
953+
U"🤴",
954+
U"🤴🏻",
955+
U"🤴🏼",
956+
U"🤴🏽",
957+
U"🤴🏾",
958+
U"🤴🏿",
959+
U"🤵",
960+
U"🤵‍♀",
961+
U"🤵‍♀️",
962+
U"🤵‍♂",
963+
U"🤵‍♂️",
964+
U"🤵🏻",
965+
U"🤵🏻‍♀",
966+
U"🤵🏻‍♀️",
967+
U"🤵🏻‍♂",
968+
U"🤵🏻‍♂️",
969+
U"🤵🏼",
970+
U"🤵🏼‍♀",
971+
U"🤵🏼‍♀️",
972+
U"🤵🏼‍♂",
973+
U"🤵🏼‍♂️",
974+
U"🤵🏽",
975+
U"🤵🏽‍♀",
976+
U"🤵🏽‍♀️",
977+
U"🤵🏽‍♂",
978+
U"🤵🏽‍♂️",
979+
U"🤵🏾",
980+
U"🤵🏾‍♀",
981+
U"🤵🏾‍♀️",
982+
U"🤵🏾‍♂",
983+
U"🤵🏾‍♂️",
984+
U"🤵🏿",
985+
U"🤵🏿‍♀",
986+
U"🤵🏿‍♀️",
987+
U"🤵🏿‍♂",
988+
U"🤵🏿‍♂️",
989+
U"🤶",
990+
U"🤶🏻",
991+
U"🤶🏼",
992+
U"🤶🏽",
993+
U"🤶🏾",
994+
U"🤶🏿",
995+
U"🤷",
996+
U"🤷‍♀",
997+
U"🤷‍♀️",
998+
U"🤷‍♂",
999+
U"🤷‍♂️",
1000+
U"🤷🏻",
1001+
U"🤷🏻‍♀",
1002+
U"🤷🏻‍♀️",
1003+
U"🤷🏻‍♂",
1004+
U"🤷🏻‍♂️",
1005+
U"🤷🏼",
1006+
U"🤷🏼‍♀",
1007+
U"🤷🏼‍♀️",
1008+
U"🤷🏼‍♂",
1009+
U"🤷🏼‍♂️",
1010+
U"🤷🏽",
1011+
U"🤷🏽‍♀",
1012+
U"🤷🏽‍♀️",
1013+
U"🤷🏽‍♂",
1014+
U"🤷🏽‍♂️",
1015+
U"🤷🏾",
1016+
U"🤷🏾‍♀",
1017+
U"🤷🏾‍♀️",
1018+
U"🤷🏾‍♂",
1019+
U"🤷🏾‍♂️",
1020+
U"🤷🏿",
1021+
U"🤷🏿‍♀",
1022+
U"🤷🏿‍♀️",
1023+
U"🤷🏿‍♂",
1024+
U"🤷🏿‍♂️",
1025+
U"🤸",
1026+
U"🤸‍♀",
1027+
U"🤸‍♀️",
1028+
U"🤸‍♂",
1029+
U"🤸‍♂️",
1030+
U"🤸🏻",
1031+
U"🤸🏻‍♀",
1032+
U"🤸🏻‍♀️",
1033+
U"🤸🏻‍♂",
1034+
U"🤸🏻‍♂️",
1035+
U"🤸🏼",
1036+
U"🤸🏼‍♀",
1037+
U"🤸🏼‍♀️",
1038+
U"🤸🏼‍♂",
1039+
U"🤸🏼‍♂️",
1040+
U"🤸🏽",
1041+
U"🤸🏽‍♀",
1042+
U"🤸🏽‍♀️",
1043+
U"🤸🏽‍♂",
1044+
U"🤸🏽‍♂️",
1045+
U"🤸🏾",
1046+
U"🤸🏾‍♀",
1047+
U"🤸🏾‍♀️",
1048+
U"🤸🏾‍♂",
1049+
U"🤸🏾‍♂️",
1050+
U"🤸🏿",
1051+
U"🤸🏿‍♀",
1052+
U"🤸🏿‍♀️",
1053+
U"🤸🏿‍♂",
1054+
U"🤸🏿‍♂️",
1055+
U"🤹",
1056+
U"🤹‍♀",
1057+
U"🤹‍♀️",
1058+
U"🤹‍♂",
1059+
U"🤹‍♂️",
1060+
U"🤹🏻",
1061+
U"🤹🏻‍♀",
1062+
U"🤹🏻‍♀️",
1063+
U"🤹🏻‍♂",
1064+
U"🤹🏻‍♂️",
1065+
U"🤹🏼",
1066+
U"🤹🏼‍♀",
1067+
U"🤹🏼‍♀️",
1068+
U"🤹🏼‍♂",
1069+
U"🤹🏼‍♂️",
1070+
U"🤹🏽",
1071+
U"🤹🏽‍♀",
1072+
U"🤹🏽‍♀️",
1073+
U"🤹🏽‍♂",
1074+
U"🤹🏽‍♂️",
1075+
U"🤹🏾",
1076+
U"🤹🏾‍♀",
1077+
U"🤹🏾‍♀️",
1078+
U"🤹🏾‍♂",
1079+
U"🤹🏾‍♂️",
1080+
U"🤹🏿",
1081+
U"🤹🏿‍♀",
1082+
U"🤹🏿‍♀️",
1083+
U"🤹🏿‍♂",
1084+
U"🤹🏿‍♂️",
1085+
};
1086+
bool failed = false;
1087+
auto array_size = sizeof(emojis) / sizeof(emojis[0]);
1088+
for (size_t i = 0; i < array_size; i++) {
1089+
auto e = emojis[i];
1090+
int foo;
1091+
auto answer = fast_float::from_chars(e.data(), e.data() + e.size(), foo);
1092+
if (answer.ec == std::errc()) {
1093+
failed = true;
1094+
std::cerr << "Incorrectly parsed emoji #" << i << " as integer " << foo
1095+
<< "." << std::endl;
1096+
}
1097+
}
1098+
1099+
if (failed) {
1100+
return EXIT_FAILURE;
1101+
}
1102+
}
8341103

8351104
return EXIT_SUCCESS;
8361105
}
@@ -842,4 +1111,4 @@ int main() {
8421111
std::cerr << "The test requires C++17." << std::endl;
8431112
return EXIT_SUCCESS;
8441113
}
845-
#endif
1114+
#endif

0 commit comments

Comments
 (0)