@@ -1473,18 +1473,22 @@ def _cfg(url='', **kwargs):
     'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
         hf_hub_id='facebook/metaclip-b32-fullcc2.5b',
         hf_hub_filename='metaclip_b32_fullcc2.5b.bin',
+        license='cc-by-nc-4.0',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_clip_224.metaclip_2pt5b': _cfg(
         hf_hub_id='facebook/metaclip-b16-fullcc2.5b',
         hf_hub_filename='metaclip_b16_fullcc2.5b.bin',
+        license='cc-by-nc-4.0',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.metaclip_2pt5b': _cfg(
         hf_hub_id='facebook/metaclip-l14-fullcc2.5b',
         hf_hub_filename='metaclip_l14_fullcc2.5b.bin',
+        license='cc-by-nc-4.0',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.metaclip_2pt5b': _cfg(
         hf_hub_id='facebook/metaclip-h14-fullcc2.5b',
         hf_hub_filename='metaclip_h14_fullcc2.5b.bin',
+        license='cc-by-nc-4.0',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),

     'vit_base_patch32_clip_224.openai': _cfg(
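For reference, the new `license` field travels with every other `_cfg` key into the model's resolved pretrained config, so it can be inspected at runtime. A minimal sketch, assuming a timm release (≥ 0.9) that includes these MetaCLIP entries and exposes the resolved cfg as a plain dict on the model:

```python
import timm

# pretrained=False still attaches the resolved pretrained_cfg; no weight download.
model = timm.create_model('vit_base_patch16_clip_224.metaclip_2pt5b', pretrained=False)
cfg = model.pretrained_cfg  # assumed dict form of the _cfg(...) entry above
print(cfg.get('license'))    # expected: 'cc-by-nc-4.0'
print(cfg.get('hf_hub_id'))  # expected: 'facebook/metaclip-b16-fullcc2.5b'
```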
@@ -2129,7 +2133,8 @@ def vit_base_patch32_clip_quickgelu_224(pretrained=False, **kwargs) -> VisionTransformer:
         patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True,
         norm_layer=nn.LayerNorm, act_layer='quick_gelu')
     model = _create_vision_transformer(
-        'vit_base_patch32_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+        'vit_base_patch32_clip_224',  # map to non-quickgelu pretrained_cfg intentionally
+        pretrained=pretrained, **dict(model_args, **kwargs))
     return model


@@ -2141,7 +2146,8 @@ def vit_base_patch16_clip_quickgelu_224(pretrained=False, **kwargs) -> VisionTransformer:
         patch_size=16, embed_dim=768, depth=12, num_heads=12, pre_norm=True,
         norm_layer=nn.LayerNorm, act_layer='quick_gelu')
     model = _create_vision_transformer(
-        'vit_base_patch16_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+        'vit_base_patch16_clip_224',  # map to non-quickgelu pretrained_cfg intentionally
+        pretrained=pretrained, **dict(model_args, **kwargs))
     return model


@@ -2154,7 +2160,8 @@ def vit_large_patch14_clip_quickgelu_224(pretrained=False, **kwargs) -> VisionTransformer:
         patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True,
         norm_layer=nn.LayerNorm, act_layer='quick_gelu')
     model = _create_vision_transformer(
-        'vit_large_patch14_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+        'vit_large_patch14_clip_224',  # map to non-quickgelu pretrained_cfg intentionally
+        pretrained=pretrained, **dict(model_args, **kwargs))
     return model


@@ -2166,7 +2173,8 @@ def vit_large_patch14_clip_quickgelu_336(pretrained=False, **kwargs) -> VisionTransformer:
         patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True,
         norm_layer=nn.LayerNorm, act_layer='quick_gelu')
     model = _create_vision_transformer(
-        'vit_large_patch14_clip_336', pretrained=pretrained, **dict(model_args, **kwargs))
+        'vit_large_patch14_clip_336',  # map to non-quickgelu pretrained_cfg intentionally
+        pretrained=pretrained, **dict(model_args, **kwargs))
     return model


@@ -2178,7 +2186,8 @@ def vit_huge_patch14_clip_quickgelu_224(pretrained=False, **kwargs) -> VisionTransformer:
         patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True,
         norm_layer=nn.LayerNorm, act_layer='quick_gelu')
     model = _create_vision_transformer(
-        'vit_huge_patch14_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+        'vit_huge_patch14_clip_224',  # map to non-quickgelu pretrained_cfg intentionally
+        pretrained=pretrained, **dict(model_args, **kwargs))
     return model


@@ -2190,7 +2199,8 @@ def vit_huge_patch14_clip_quickgelu_378(pretrained=False, **kwargs) -> VisionTransformer:
         patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True,
         norm_layer=nn.LayerNorm, act_layer='quick_gelu')
     model = _create_vision_transformer(
-        'vit_huge_patch14_clip_378', pretrained=pretrained, **dict(model_args, **kwargs))
+        'vit_huge_patch14_clip_378',  # map to non-quickgelu pretrained_cfg intentionally
+        pretrained=pretrained, **dict(model_args, **kwargs))
     return model


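All six entrypoints in the hunks above share one pattern: each `*_quickgelu_*` function sets `act_layer='quick_gelu'` in `model_args` but passes the plain (non-quickgelu) variant name to `_create_vision_transformer`, so pretrained tags such as `.openai` and `.metaclip_2pt5b` resolve against the shared pretrained cfgs defined earlier in the file while the built tower uses QuickGELU. A hedged sketch of the effect, assuming timm's standard `VisionTransformer`/`Mlp` attribute layout:

```python
import timm

# Same variant-name family, two activations: the quickgelu entrypoint only
# swaps the MLP act layer; weight/config lookup happens under the base name.
for name in ('vit_base_patch16_clip_224', 'vit_base_patch16_clip_quickgelu_224'):
    model = timm.create_model(name, pretrained=False)
    print(name, type(model.blocks[0].mlp.act).__name__)  # GELU vs. QuickGELU
```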