2 files changed: +312 -265 lines changed
Columns: original file line number, diff line number, diff line change
@@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm {
89478947 std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
89488948 std::vector<uint32_t> nfd_codepoints;
89498949 for (uint32_t code : codepoints) {
8950- auto it = nfd_map.find (code);
8951- if (it != nfd_map.end() ) {
8952- for (uint32_t c : it-> second) {
8953- nfd_codepoints.push_back(c );
8950+ auto it = nfd_map.equal_range (code);
8951+ if (it.first != it.second ) {
8952+ for (auto jt = it.first; jt != it. second; jt++ ) {
8953+ nfd_codepoints.push_back(jt->second );
89548954 }
89558955 } else {
89568956 nfd_codepoints.push_back(code);
@@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm {
90019001 }
90029002
// Lower-cases a single code point through the ctype<wchar_t> facet of a
// locale that is constructed once and reused on every call.
//
// @param code  code point to convert
// @return      the lower-cased code point; on Windows builds any value above
//              0xFFFF is returned unchanged
uint32_t to_lower(uint32_t code) {
    // Construct the locale only once: building a named std::locale on every
    // call is expensive. std::locale's constructor throws std::runtime_error
    // when the named locale is not available, and a function-local static
    // whose initialiser throws is re-initialised on the next call, so without
    // the fallback every call would throw on such systems.
    static const std::locale utf8_locale = []() -> std::locale {
        try {
            return std::locale("en_US.UTF-8");
        } catch (const std::runtime_error &) {
            // Fall back to the always-available "C" locale rather than failing.
            return std::locale::classic();
        }
    }();
#if defined(_WIN32)
    // Values above 0xFFFF are not passed through the wchar_t cast on this
    // platform; they are returned as-is.
    if (code > 0xFFFF) {
        return code;
    }
#endif
    return static_cast<uint32_t>(std::tolower(static_cast<wchar_t>(code), utf8_locale));
}
90119012
90129013 bool is_ascii_punct(uint32_t code) {
You can’t perform that action at this time.
0 commit comments