@@ -1778,6 +1778,35 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 384, 384),
         num_classes=0),

+    'vit_base_patch16_siglip_gap_224.webli': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0),
+    'vit_base_patch16_siglip_gap_256.webli': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_base_patch16_siglip_gap_384.webli': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 384, 384),
+        num_classes=0),
+    'vit_base_patch16_siglip_gap_512.webli': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-512',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 512, 512),
+        num_classes=0),
+    'vit_large_patch16_siglip_gap_256.webli': _cfg(
+        hf_hub_id='timm/ViT-L-16-SigLIP-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_large_patch16_siglip_gap_384.webli': _cfg(
+        hf_hub_id='timm/ViT-L-16-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 384, 384),
+        num_classes=0),
     'vit_so400m_patch14_siglip_gap_224.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -2803,8 +2832,75 @@ def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionT
     return model


+@register_model
+def vit_base_patch16_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_base_patch16_siglip_gap_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_base_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_base_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_base_patch16_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_base_patch16_siglip_gap_384', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_base_patch16_siglip_gap_512(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_base_patch16_siglip_gap_512', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_large_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_large_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_large_patch16_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_large_patch16_siglip_gap_384', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
     model_args = dict(
         patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
         class_token=False, global_pool='avg', fc_norm=False,
@@ -2816,6 +2912,7 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> Vis

 @register_model
 def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
     model_args = dict(
         patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
         class_token=False, global_pool='avg', fc_norm=False,
@@ -2827,6 +2924,7 @@ def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> Vis

 @register_model
 def vit_so400m_patch14_siglip_gap_448(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
     model_args = dict(
         patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
         class_token=False, global_pool='avg', fc_norm=False,
@@ -2838,6 +2936,7 @@ def vit_so400m_patch14_siglip_gap_448(pretrained: bool = False, **kwargs) -> Vis

 @register_model
 def vit_so400m_patch14_siglip_gap_896(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
     model_args = dict(
         patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
         class_token=False, global_pool='avg', fc_norm=False,
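For context, a minimal usage sketch (not part of the commit) of how one of the newly registered GAP variants could be instantiated via timm.create_model. The model name, input size, and embedding width are taken from the configs added above; the rest (random input, shape check) is illustrative only, and assumes a timm version that includes this commit.

import torch
import timm

# Instantiate a newly registered GAP variant by its timm name; the '.webli'
# pretrained tag maps to the corresponding SigLIP weights on the HF Hub.
model = timm.create_model('vit_base_patch16_siglip_gap_256.webli', pretrained=True)
model.eval()

# The pretrained cfg sets num_classes=0, so the forward pass returns pooled
# image features (global average over patch tokens) rather than class logits.
with torch.no_grad():
    x = torch.randn(1, 3, 256, 256)  # input_size=(3, 256, 256) per the 256 config
    feats = model(x)
print(feats.shape)  # expected: torch.Size([1, 768]) for the base (embed_dim=768) model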