@@ -1739,6 +1739,19 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
 
     'vit_8m_patch16_tinyclip_224.yfcc15m': _cfg(
         url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M.pt',
+        license='mit',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_39m_patch16_tinyclip_224.yfcc15m': _cfg(
+        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-39M-16-Text-19M-YFCC15M.pt',
+        license='mit',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_40m_patch32_tinyclip_224.laion400m': _cfg(
+        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-40M-32-Text-19M-LAION400M.pt',
+        license='mit',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_61m_patch32_tinyclip_224.laion400m': _cfg(
+        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-61M-32-Text-29M-LAION400M.pt',
+        license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
 
     'vit_medium_patch16_reg4_256': _cfg(
@@ -2635,6 +2648,32 @@ def vit_8m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_39m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    model_args = dict(embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_39m_patch16_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_40m_patch32_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    model_args = dict(
+        patch_size=32, embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_40m_patch32_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_61m_patch32_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    model_args = dict(
+        patch_size=32, embed_dim=640, depth=12, num_heads=10, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_61m_patch32_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_medium_patch16_reg4_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
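
Not part of the diff: a minimal usage sketch, assuming a timm build that includes the registrations above. It creates one of the newly added TinyCLIP image towers via timm.create_model; the model name and the '.yfcc15m' pretrained tag come from the default_cfgs entries in the first hunk, and the 512-dim output follows from num_classes=512 in that config.

    import torch
    import timm

    # Hypothetical example; set pretrained=True to pull the TinyCLIP checkpoint
    # from the URL listed in the config entry above.
    model = timm.create_model('vit_39m_patch16_tinyclip_224.yfcc15m', pretrained=False)
    model.eval()

    with torch.no_grad():
        x = torch.randn(1, 3, 224, 224)   # 224x224 input, per the model name
        out = model(x)

    print(out.shape)  # torch.Size([1, 512]) -- num_classes=512 in the config entry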