@@ -4561,35 +4561,6 @@ static void llm_load_vocab(
45614561 vocab.special_cls_id = -1;
45624562 vocab.special_mask_id = -1;
45634563
4564- // For Fill-In-the-Middle (FIM)/infill models which where converted
4565- // prior to support of FIM special tokens in GGUF, the following
4566- // will allow those models to continue to work. The general names
4567- // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4568- // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4569- // new versions of these models have been published.
4570- std::string gen_name;
4571- ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4572-
4573- std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4574- [](unsigned char c){ return std::tolower(c); });
4575-
4576- if (gen_name.find("code") != std::string::npos) {
4577- if (model.arch == LLM_ARCH_LLAMA) {
4578- vocab.special_prefix_id = 32007;
4579- vocab.special_suffix_id = 32008;
4580- vocab.special_middle_id = 32009;
4581- vocab.special_eot_id = 32010;
4582- } else if (model.arch == LLM_ARCH_GEMMA) {
4583- vocab.special_prefix_id = 67;
4584- vocab.special_suffix_id = 69;
4585- vocab.special_middle_id = 68;
4586- // TODO: this is not EOT, it is "file separator" token, needs fix
4587- // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
4588- //vocab.special_eot_id = 70;
4589- vocab.special_eot_id = 107;
4590- }
4591- }
4592-
45934564 const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
45944565 if (add_space_prefix_keyidx != -1) {
45954566 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4773,6 +4744,45 @@ static void llm_load_vocab(
47734744
47744745 // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
47754746 if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
4747+        // For Fill-In-the-Middle (FIM)/infill models which were converted
4748+ // prior to support of FIM special tokens in GGUF, the following
4749+ // will allow those models to continue to work. The general names
4750+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4751+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4752+ // new versions of these models have been published.
4753+ std::string gen_name;
4754+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4755+
4756+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4757+ [](unsigned char c){ return std::tolower(c); });
4758+
4759+ if (gen_name.find("code") != std::string::npos) {
4760+ if (model.arch == LLM_ARCH_LLAMA
4761+ && 32010 < vocab.id_to_token.size()
4762+ && vocab.id_to_token[32007].text == "<PRE>"
4763+ && vocab.id_to_token[32008].text == "<SUF>"
4764+ && vocab.id_to_token[32009].text == "<MID>"
4765+ && vocab.id_to_token[32010].text == "<EOT>") {
4766+ vocab.special_prefix_id = 32007;
4767+ vocab.special_suffix_id = 32008;
4768+ vocab.special_middle_id = 32009;
4769+ vocab.special_eot_id = 32010;
4770+ } else if (model.arch == LLM_ARCH_GEMMA
4771+ && 107 < vocab.id_to_token.size()
4772+ && vocab.id_to_token[67].text == "<|fim_prefix|>"
4773+ && vocab.id_to_token[69].text == "<|fim_suffix|>"
4774+ && vocab.id_to_token[68].text == "<|fim_middle|>"
4775+ && vocab.id_to_token[107].text == "<end_of_turn>") {
4776+ vocab.special_prefix_id = 67;
4777+ vocab.special_suffix_id = 69;
4778+ vocab.special_middle_id = 68;
4779+ // TODO: this is not EOT, it is "file separator" token, needs fix
4780+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
4781+ //vocab.special_eot_id = 70;
4782+ vocab.special_eot_id = 107;
4783+ }
4784+ }
4785+
47764786 try {
47774787 vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
47784788 } catch (const std::exception & e) {
0 commit comments