@@ -964,11 +964,13 @@ def _convert_openai_clip(
             v = v.unsqueeze(0)
             if v.shape[1] != model.pos_embed.shape[1]:
                 # To resize pos embedding when using model at different size from pretrained weights
-                v = resize_pos_embed(
+                num_prefix_tokens = 0 if getattr(model, 'no_embed_class', False) \
+                    else getattr(model, 'num_prefix_tokens', 1)
+                v = resample_abs_pos_embed(
                     v,
-                    model.pos_embed,
-                    0 if getattr(model, 'no_embed_class') else getattr(model, 'num_prefix_tokens', 1),
-                    model.patch_embed.grid_size
+                    new_size=model.patch_embed.grid_size,
+                    num_prefix_tokens=num_prefix_tokens,
+                    verbose=True,
                 )
         out_dict[k] = v
     return out_dict
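Note (not part of the commit): a minimal sketch of what the swapped-in resample_abs_pos_embed call does, assuming timm.layers exports it with the keyword arguments used in the patch (new_size, num_prefix_tokens, verbose); the grid sizes and tensor shapes below are illustrative only.

import torch
from timm.layers import resample_abs_pos_embed

# Hypothetical ViT-B/16 pos embed pretrained at 224x224: 14x14 patch grid + 1 class token.
pos_embed = torch.randn(1, 14 * 14 + 1, 768)

# Resample to a 24x24 grid (e.g. 384x384 input); only the spatial tokens are
# interpolated, the single prefix (class) token is carried over unchanged.
pos_embed_384 = resample_abs_pos_embed(
    pos_embed,
    new_size=(24, 24),
    num_prefix_tokens=1,
    verbose=True,
)
print(pos_embed_384.shape)  # expected: torch.Size([1, 577, 768]) = 24*24 + 1 tokens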
@@ -1015,8 +1017,6 @@ def checkpoint_filter_fn(
         return _convert_openai_clip(state_dict, model)
     elif 'module.visual.class_embedding' in state_dict:
         return _convert_openai_clip(state_dict, model, prefix='module.visual.')
-    elif '_image_encoder.module.visual.class_embedding' in state_dict:
-        return _convert_openai_clip(state_dict, model, prefix='_image_encoder.module.visual.')
 
     if "mask_token" in state_dict:
         state_dict = _convert_dinov2(state_dict, model)
@@ -1737,20 +1737,24 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 384, 384),
         num_classes=0),
 
-    'vit_8m_patch16_tinyclip_224.yfcc15m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M.pt',
+    'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_39m_patch16_tinyclip_224.yfcc15m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-39M-16-Text-19M-YFCC15M.pt',
+    'vit_medium_patch32_clip_224.tinyclip_laion400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_40m_patch32_tinyclip_224.laion400m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-40M-32-Text-19M-LAION400M.pt',
+    'vit_medium_patch16_clip_224.tinyclip_yfcc15m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_61m_patch32_tinyclip_224.laion400m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-61M-32-Text-29M-LAION400M.pt',
+    'vit_betwixt_patch32_clip_224.tinyclip_laion400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
 
@@ -2092,6 +2096,44 @@ def vit_giant_patch16_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_xsmall_patch16_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 8M
+    model_args = dict(embed_dim=256, depth=10, num_heads=4, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_xsmall_patch16_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_medium_patch32_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 40M
+    model_args = dict(
+        patch_size=32, embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_medium_patch32_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_medium_patch16_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 39M
+    model_args = dict(embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_medium_patch16_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_betwixt_patch32_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 61M
+    model_args = dict(
+        patch_size=32, embed_dim=640, depth=12, num_heads=10, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_betwixt_patch32_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_base_patch32_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ ViT-B/32 CLIP image tower @ 224x224
@@ -2640,40 +2682,6 @@ def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
-@register_model
-def vit_8m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(embed_dim=256, depth=10, num_heads=4, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_8m_patch16_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
-@register_model
-def vit_39m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_39m_patch16_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
-@register_model
-def vit_40m_patch32_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(
-        patch_size=32, embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_40m_patch32_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
-@register_model
-def vit_61m_patch32_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(
-        patch_size=32, embed_dim=640, depth=12, num_heads=10, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_61m_patch32_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
 @register_model
 def vit_medium_patch16_reg4_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
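Note (not part of the commit): a minimal usage sketch of the renamed TinyCLIP registrations, assuming the remapped open_clip weights are published on the HF Hub under the new pretrained tags and that the cfg's num_classes=512 projection is kept as the default head.

import torch
import timm

# Build the renamed TinyCLIP-8M image tower (formerly vit_8m_patch16_tinyclip_224).
model = timm.create_model('vit_xsmall_patch16_clip_224.tinyclip_yfcc15m', pretrained=True)
model.eval()

with torch.no_grad():
    features = model(torch.randn(1, 3, 224, 224))
print(features.shape)  # expected torch.Size([1, 512]) given num_classes=512 in the cfg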