@@ -1015,6 +1015,8 @@ def checkpoint_filter_fn(
         return _convert_openai_clip(state_dict, model)
     elif 'module.visual.class_embedding' in state_dict:
         return _convert_openai_clip(state_dict, model, prefix='module.visual.')
+    elif '_image_encoder.module.visual.class_embedding' in state_dict:
+        return _convert_openai_clip(state_dict, model, prefix='_image_encoder.module.visual.')
 
     if "mask_token" in state_dict:
         state_dict = _convert_dinov2(state_dict, model)
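The new branch covers checkpoints where the CLIP image tower is nested one level deeper (keys saved as '_image_encoder.module.visual.*', as in the TinyCLIP releases), and reuses _convert_openai_clip with the longer prefix. As a rough, illustrative sketch of the prefix handling this relies on (not the actual _convert_openai_clip body):

    # Illustrative only: keep keys under the image-tower prefix and strip it,
    # so the usual OpenAI-CLIP key renaming can then proceed unchanged.
    def _strip_prefix(state_dict, prefix='_image_encoder.module.visual.'):
        return {k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)}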
@@ -1735,6 +1737,10 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 384, 384),
         num_classes=0),
 
+    'vit_8m_patch16_tinyclip_224.yfcc15m': _cfg(
+        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M.pt',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+
     'vit_medium_patch16_reg4_256': _cfg(
         input_size=(3, 256, 256)),
     'vit_medium_patch16_reg4_gap_256': _cfg(
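The new pretrained tag points at the TinyCLIP-ViT-8M-16 checkpoint trained on YFCC-15M (per the release filename); it uses the OpenAI CLIP normalization constants, and num_classes=512 is the 512-dim CLIP projection rather than a classifier head. Assuming the tag is registered as in this diff, the matching preprocessing can be resolved with timm's standard helpers, e.g.:

    # Sketch: resolve input preprocessing for the new pretrained tag.
    import timm
    from timm.data import resolve_data_config, create_transform

    model = timm.create_model('vit_8m_patch16_tinyclip_224.yfcc15m', pretrained=False)
    cfg = resolve_data_config({}, model=model)  # picks up OPENAI_CLIP_MEAN/STD and 224x224 input
    transform = create_transform(**cfg)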
@@ -2621,6 +2627,14 @@ def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionT
     return model
 
 
+@register_model
+def vit_8m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    model_args = dict(embed_dim=256, depth=10, num_heads=4, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_8m_patch16_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_medium_patch16_reg4_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
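Together with the checkpoint-filter branch above, the new entry point builds the 8M-parameter TinyCLIP image tower (embed_dim=256, depth=10, 4 heads, pre-norm with LayerNorm) so the released weights can be loaded directly. A minimal usage sketch (downloads the checkpoint from the URL in the cfg above):

    # Sketch: load the TinyCLIP image tower and extract image embeddings.
    import torch, timm

    model = timm.create_model('vit_8m_patch16_tinyclip_224.yfcc15m', pretrained=True).eval()
    with torch.no_grad():
        feats = model(torch.randn(1, 3, 224, 224))
    print(feats.shape)  # expected (1, 512): the CLIP-projected image features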