Commit 438a2ec

Author: liyang
Parent: f5b8651

Refactor JinaCLIP vision mmproj mapping to use tensor_mapping table

3 files changed: +46 −74 lines

convert_hf_to_gguf.py

Lines changed: 9 additions & 74 deletions
@@ -1521,7 +1521,9 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]

-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+    # Prefer explicit "layers" (e.g. JinaCLIP),
+    # keep legacy keys for other models.
+    n_block_keys = ["layers", "n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]

     has_vision_encoder: bool = True  # by default
     has_audio_encoder: bool = False

@@ -6804,6 +6806,11 @@ def __init__(self, *args, **kwargs):
         with open(config_path, encoding="utf-8") as f:
             self.vision_config = json.load(f)

+    def get_vision_config(self) -> dict[str, Any] | None:
+        # For JinaCLIPVisionModel, the top-level AutoConfig dict is already
+        # the vision-only configuration.
+        return self.global_config
+
     def set_vocab(self):
         # Vision encoder doesn't need vocabulary
         pass

@@ -6861,73 +6868,10 @@ def set_gguf_parameters(self):
     def _strip_vm_prefix(self, name: str) -> str:
         return name[len('vision_model.'):] if name.startswith('vision_model.') else name

-    def _map_block_tensor(self, layer: int, rest: str, data_torch: Tensor, name: str) -> list[tuple[str, Tensor]] | None:
-        parts = rest.split('.')
-        # layer norms
-        if rest.startswith('norm1.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ln1.{suffix}', data_torch)]
-        if rest.startswith('norm2.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ln2.{suffix}', data_torch)]
-        if rest.startswith('attn.inner_attn_ln.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.attn_ln.{suffix}', data_torch)]
-
-        if rest == 'attn.q_bias':
-            return [(f'v.blk.{layer}.attn_q.bias', data_torch)]
-        if rest == 'attn.v_bias':
-            return [(f'v.blk.{layer}.attn_v.bias', data_torch)]
-
-        if rest.startswith('attn.q_proj.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.attn_q.{suffix}', data_torch)]
-        if rest.startswith('attn.k_proj.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.attn_k.{suffix}', data_torch)]
-        if rest.startswith('attn.v_proj.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.attn_v.{suffix}', data_torch)]
-        if rest.startswith('attn.proj.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.attn_out.{suffix}', data_torch)]
-
-        # MLP
-        if rest.startswith('mlp.w1.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_gate.{suffix}', data_torch)]
-        if rest.startswith('mlp.w2.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_up.{suffix}', data_torch)]
-        if rest.startswith('mlp.w3.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_down.{suffix}', data_torch)]
-        if rest.startswith('mlp.ffn_ln.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_norm.{suffix}', data_torch)]
-        if rest.startswith('mlp.fc1.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_up.{suffix}', data_torch)]
-        if rest.startswith('mlp.fc2.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_down.{suffix}', data_torch)]
-        return None
-
     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
-        """Prefer base table-driven mapping; keep Jina-specific targets if already mapped; fallback to legacy mapper."""
-        # Already a GGUF target name (e.g., "v.*" or "mm.*"): return as-is
         if name.startswith('v.') or name.startswith('mm.'):
             return name
-        # Try the base mapping first
-        try:
-            return super().map_tensor_name(name, try_suffixes=try_suffixes)
-        except Exception:
-            # Fallback to legacy Jina-specific mapper for any remaining edge keys
-            if hasattr(self, "_map_jinaclip_tensor_name"):
-                mapped = self._map_jinaclip_tensor_name(name)  # type: ignore[attr-defined]
-                if mapped:
-                    return mapped
-            return name
+        return super().map_tensor_name(name, try_suffixes=try_suffixes)

     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         yielded_any = False

@@ -6991,15 +6935,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_POST_NORM]
             return [(f'{base}.{suffix}', data_torch)]

-        if src_no_vm.startswith('blocks.'):
-            parts = src_no_vm.split('.')
-            if len(parts) >= 3 and parts[1].isdigit():
-                layer = int(parts[1])
-                rest = '.'.join(parts[2:])
-                mapped = self._map_block_tensor(layer, rest, data_torch, name)
-                if mapped is not None:
-                    return mapped
-
         try:
             return [(self.map_tensor_name(name), data_torch)]
         except Exception:
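
For context, a minimal sketch of the table-driven lookup that replaces the hand-written _map_block_tensor above. The real logic lives in ModelBase.map_tensor_name plus gguf-py's TensorNameMap; the table subset and helper below are illustrative stand-ins, with source/target names copied from this diff. The widened n_block_keys list serves the same goal: JinaCLIP stores its block count under "layers", so the generic lookup now checks that key first. The core idea: strip a known suffix, canonicalize the block index to a {bid} placeholder, and resolve the stem through one table instead of a chain of if-statements.

import re

# Illustrative subset of the JinaCLIP entries added to tensor_mapping.py.
TENSOR_TABLE = {
    "blocks.{bid}.attn.q_proj":        "v.blk.{bid}.attn_q",
    "blocks.{bid}.attn.inner_attn_ln": "v.blk.{bid}.attn_ln",
    "blocks.{bid}.mlp.w1":             "v.blk.{bid}.ffn_gate",
    "blocks.{bid}.mlp.w2":             "v.blk.{bid}.ffn_up",
    "blocks.{bid}.mlp.w3":             "v.blk.{bid}.ffn_down",
}

def map_tensor_name(name: str, try_suffixes=(".weight", ".bias")) -> str:
    """Strip a known suffix, template the block index, look up the stem."""
    for suffix in try_suffixes:
        if not name.endswith(suffix):
            continue
        stem = name.removesuffix(suffix)
        # Replace the numeric block index so the stem matches a table key.
        m = re.match(r"blocks\.(\d+)\.(.+)", stem)
        if m is None:
            continue
        key = "blocks.{bid}." + m.group(2)
        if key in TENSOR_TABLE:
            return TENSOR_TABLE[key].format(bid=m.group(1)) + suffix
    raise ValueError(f"cannot map tensor name: {name}")

assert map_tensor_name("blocks.3.mlp.w2.weight") == "v.blk.3.ffn_up.weight"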

gguf-py/gguf/constants.py

Lines changed: 12 additions & 0 deletions
@@ -634,9 +634,13 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_ATTN_O = auto()
     V_ENC_ATTN_O_NORM = auto()
     V_ENC_POST_ATTN_NORM = auto()
+    V_ENC_ATTN_LN = auto()
     V_ENC_FFN_UP = auto()
     V_ENC_FFN_GATE = auto()
     V_ENC_FFN_DOWN = auto()
+    V_ENC_FFN_NORM = auto()
+    V_ENC_ATTN_Q_BIAS = auto()
+    V_ENC_ATTN_V_BIAS = auto()
     V_LAYER_SCALE_1 = auto()
     V_LAYER_SCALE_2 = auto()
     V_PRE_NORM = auto()

@@ -1002,9 +1006,13 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_ENC_ATTN_O: "v.blk.{bid}.attn_out",
     MODEL_TENSOR.V_ENC_ATTN_O_NORM: "v.blk.{bid}.attn_out_norm",
     MODEL_TENSOR.V_ENC_POST_ATTN_NORM: "v.blk.{bid}.ln2",
+    MODEL_TENSOR.V_ENC_ATTN_LN: "v.blk.{bid}.attn_ln",
     MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
     MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
     MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
+    MODEL_TENSOR.V_ENC_FFN_NORM: "v.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.V_ENC_ATTN_Q_BIAS: "v.blk.{bid}.attn_q.bias",
+    MODEL_TENSOR.V_ENC_ATTN_V_BIAS: "v.blk.{bid}.attn_v.bias",
     MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
     MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
     MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",

@@ -1080,9 +1088,13 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_ENC_ATTN_O,
         MODEL_TENSOR.V_ENC_ATTN_O_NORM,
         MODEL_TENSOR.V_ENC_POST_ATTN_NORM,
+        MODEL_TENSOR.V_ENC_ATTN_LN,
         MODEL_TENSOR.V_ENC_FFN_UP,
         MODEL_TENSOR.V_ENC_FFN_GATE,
         MODEL_TENSOR.V_ENC_FFN_DOWN,
+        MODEL_TENSOR.V_ENC_FFN_NORM,
+        MODEL_TENSOR.V_ENC_ATTN_Q_BIAS,
+        MODEL_TENSOR.V_ENC_ATTN_V_BIAS,
         MODEL_TENSOR.V_LAYER_SCALE_1,
         MODEL_TENSOR.V_LAYER_SCALE_2,
         MODEL_TENSOR.V_PRE_NORM,
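
A minimal sketch of the enum/name-table pattern these additions extend, trimmed to the members introduced by this commit (the real MODEL_TENSOR enum and TENSOR_NAMES table are far larger). The {bid} placeholder is filled with a concrete block index when tensor names are emitted.

from enum import IntEnum, auto

class MODEL_TENSOR(IntEnum):
    # Only the four members added by this commit.
    V_ENC_ATTN_LN = auto()
    V_ENC_FFN_NORM = auto()
    V_ENC_ATTN_Q_BIAS = auto()
    V_ENC_ATTN_V_BIAS = auto()

TENSOR_NAMES = {
    MODEL_TENSOR.V_ENC_ATTN_LN: "v.blk.{bid}.attn_ln",
    MODEL_TENSOR.V_ENC_FFN_NORM: "v.blk.{bid}.ffn_norm",
    MODEL_TENSOR.V_ENC_ATTN_Q_BIAS: "v.blk.{bid}.attn_q.bias",
    MODEL_TENSOR.V_ENC_ATTN_V_BIAS: "v.blk.{bid}.attn_v.bias",
}

# e.g. the FFN norm of block 7:
print(TENSOR_NAMES[MODEL_TENSOR.V_ENC_FFN_NORM].format(bid=7))  # v.blk.7.ffn_norm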

gguf-py/gguf/tensor_mapping.py

Lines changed: 25 additions & 0 deletions
@@ -1243,6 +1243,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wq",  # pixtral
             "visual.blocks.{bid}.attn.q",  # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wq",  # kimi-vl, generated
+            "blocks.{bid}.attn.q_proj",  # JinaCLIP v2 vision
         ),

         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (

@@ -1260,6 +1261,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wk",  # pixtral
             "visual.blocks.{bid}.attn.k",  # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wk",  # kimi-vl, generated
+            "blocks.{bid}.attn.k_proj",  # JinaCLIP v2 vision
         ),

         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (

@@ -1277,6 +1279,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wv",  # pixtral
             "visual.blocks.{bid}.attn.v",  # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wv",  # kimi-vl, generated
+            "blocks.{bid}.attn.v_proj",  # JinaCLIP v2 vision
         ),

         MODEL_TENSOR.V_ENC_INPUT_NORM: (

@@ -1291,6 +1294,7 @@ class TensorNameMap:
             "visual.blocks.{bid}.norm1",  # qwen2vl
             "vision_tower.encoder.blocks.{bid}.norm0",  # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.input_layernorm",  # cogvlm
+            "blocks.{bid}.norm1",  # JinaCLIP v2 vision
         ),

         MODEL_TENSOR.V_ENC_ATTN_O: (

@@ -1306,6 +1310,7 @@ class TensorNameMap:
             "visual.blocks.{bid}.attn.proj",  # qwen2vl
             "vision_tower.encoder.blocks.{bid}.wo",  # kimi-vl
             "model.vision.transformer.layers.{bid}.attention.dense",  # cogvlm
+            "blocks.{bid}.attn.proj",  # JinaCLIP v2 vision
         ),

         MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (

@@ -1320,6 +1325,11 @@ class TensorNameMap:
             "visual.blocks.{bid}.norm2",  # qwen2vl
             "vision_tower.encoder.blocks.{bid}.norm1",  # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.post_attention_layernorm",  # cogvlm
+            "blocks.{bid}.norm2",  # JinaCLIP v2 vision
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_LN: (
+            "blocks.{bid}.attn.inner_attn_ln",  # JinaCLIP v2 vision
         ),

         MODEL_TENSOR.V_ENC_FFN_UP: (

@@ -1335,12 +1345,14 @@ class TensorNameMap:
             "visual.blocks.{bid}.mlp.linear_fc1",  # qwen3vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc0",  # kimi-vl (fc0/fc1)
             "model.vision.transformer.layers.{bid}.mlp.fc1",  # cogvlm
+            "blocks.{bid}.mlp.w2",  # JinaCLIP v2 vision (up)
         ),

         MODEL_TENSOR.V_ENC_FFN_GATE: (
             "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj",  # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.feed_forward.w1",  # pixtral
             "visual.blocks.{bid}.mlp.gate_proj",  # qwen2.5vl
+            "blocks.{bid}.mlp.w1",  # JinaCLIP v2 vision
         ),

         MODEL_TENSOR.V_ENC_FFN_DOWN: (

@@ -1356,6 +1368,11 @@ class TensorNameMap:
             "visual.blocks.{bid}.mlp.linear_fc2",  # qwen3vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc1",  # kimi-vl (fc0/fc1)
             "model.vision.transformer.layers.{bid}.mlp.fc2",  # cogvlm
+            "blocks.{bid}.mlp.w3",  # JinaCLIP v2 vision (down)
+        ),
+
+        MODEL_TENSOR.V_ENC_FFN_NORM: (
+            "blocks.{bid}.mlp.ffn_ln",  # JinaCLIP v2 vision
         ),

         MODEL_TENSOR.V_LAYER_SCALE_1: (

@@ -1368,6 +1385,14 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.lambda_2",  # Intern-S1
         ),

+        MODEL_TENSOR.V_ENC_ATTN_Q_BIAS: (
+            "blocks.{bid}.attn.q_bias",  # JinaCLIP v2 vision
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_V_BIAS: (
+            "blocks.{bid}.attn.v_bias",  # JinaCLIP v2 vision
+        ),
+
         MODEL_TENSOR.V_PRE_NORM: (
             "vision_tower.vision_model.pre_layrnorm",
             "vision_tower.ln_pre",  # pixtral-hf
