@@ -3499,18 +3499,6 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> Vis
     return model
 
 
-@register_model
-def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
-    model_args = dict(
-        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
-        class_token=False, global_pool='avg', fc_norm=False,
-    )
-    model = _create_vision_transformer(
-        'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
 @register_model
 def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
@@ -3561,9 +3549,10 @@ def vit_so400m_patch14_siglip_gap_896(pretrained: bool = False, **kwargs) -> Vis
 
 @register_model
 def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
     model_args = dict(
-        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False,
-        global_pool='avg', fc_norm=False, act_layer='gelu_tanh'
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False, act_layer='gelu_tanh',
     )
     model = _create_vision_transformer(
         'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
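
A minimal usage sketch (not part of the diff), assuming the deduplicated `vit_so400m_patch16_siglip_gap_256` registration above is importable from a recent timm checkout; it just instantiates the registered name through timm's factory and runs a dummy forward pass:

```python
import timm
import torch

# Build the registered model with random weights; pass pretrained=True only
# if pretrained weights are actually published for this model tag.
model = timm.create_model('vit_so400m_patch16_siglip_gap_256', pretrained=False)
model.eval()

# The _256 suffix indicates a 256x256 input resolution for this variant.
with torch.no_grad():
    out = model(torch.randn(1, 3, 256, 256))
print(out.shape)  # (1, num_classes) from the default classifier head
```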