@@ -638,12 +638,10 @@ def __init__(
         self.norm = norm_layer(embed_dim) if activate_post_norm else nn.Identity()

         if global_pool == 'map':
-            attn_pool_num_heads = attn_pool_num_heads or num_heads
-            attn_pool_mlp_ratio = attn_pool_mlp_ratio or mlp_ratio
             self.attn_pool = AttentionPoolLatent(
                 self.embed_dim,
-                num_heads=attn_pool_num_heads,
-                mlp_ratio=attn_pool_mlp_ratio,
+                num_heads=attn_pool_num_heads or num_heads,
+                mlp_ratio=attn_pool_mlp_ratio or mlp_ratio,
                 norm_layer=norm_layer,
                 act_layer=nn.GELU,
             )
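A note on the fallback idiom this hunk inlines: Python's `a or b` falls back whenever `a` is falsy, so the usual unset default `None` picks up the transformer-wide value, but an explicit `0` or `0.0` would fall back too. A minimal standalone sketch of the semantics (names are illustrative, not from the source):

num_heads = None or 12   # unset -> falls back to the model-wide num_heads (12)
mlp_ratio = 3.0 or 4.0   # truthy explicit value wins -> 3.0
assert (num_heads, mlp_ratio) == (12, 3.0)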
@@ -1366,6 +1364,20 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
     ),

     # Perception Encoder weights
+    'vit_pe_core_tiny_patch16_384.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Core-T16-384',
+        #hf_hub_filename='PE-Core-T16-384.pt',
+        input_size=(3, 384, 384),
+        num_classes=512,  # output proj dim
+    ),
+    'vit_pe_core_small_patch16_384.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Core-S16-384',
+        #hf_hub_filename='PE-Core-S16-384.pt',
+        input_size=(3, 384, 384),
+        num_classes=512,  # output proj dim
+    ),
     'vit_pe_core_base_patch16_224.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Core-B16-224',
@@ -1387,20 +1399,64 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 448, 448),
         num_classes=1280,  # output proj dim
     ),
+
     'vit_pe_lang_large_patch14_448.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Lang-L14-448',
         #hf_hub_filename='PE-Lang-L14-448.pt',
         input_size=(3, 448, 448),
         num_classes=0,
     ),
+    'vit_pe_lang_large_patch14_448.fb_tiling': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Lang-L14-448-Tiling',
+        #hf_hub_filename='PE-Lang-L14-448-Tiling.pt',
+        input_size=(3, 448, 448),
+        num_classes=0,
+    ),
     'vit_pe_lang_gigantic_patch14_448.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Lang-G14-448',
         #hf_hub_filename='PE-Lang-G14-448.pt',
         input_size=(3, 448, 448),
         num_classes=0,
     ),
+    'vit_pe_lang_gigantic_patch14_448.fb_tiling': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Lang-G14-448-Tiling',
+        #hf_hub_filename='PE-Lang-G14-448-Tiling.pt',
+        input_size=(3, 448, 448),
+        num_classes=0,
+    ),
+
+    'vit_pe_spatial_tiny_patch16_512.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-T16-512',
+        #hf_hub_filename='PE-Spatial-T16-512.pt',
+        input_size=(3, 512, 512),
+        num_classes=0,
+    ),
+    'vit_pe_spatial_small_patch16_512.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-S16-512',
+        #hf_hub_filename='PE-Spatial-S16-512.pt',
+        input_size=(3, 512, 512),
+        num_classes=0,
+    ),
+    'vit_pe_spatial_base_patch16_512.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-B16-512',
+        #hf_hub_filename='PE-Spatial-B16-512.pt',
+        input_size=(3, 512, 512),
+        num_classes=0,
+    ),
+    'vit_pe_spatial_large_patch14_448.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-L14-448',
+        #hf_hub_filename='PE-Spatial-L14-448.pt',
+        input_size=(3, 448, 448),
+        num_classes=0,
+    ),
     'vit_pe_spatial_gigantic_patch14_448.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Spatial-G14-448',
@@ -1842,6 +1898,55 @@ def vit_base_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:
     return model


+@register_model
+def vit_pe_core_tiny_patch16_384(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=192,
+        depth=12,
+        num_heads=3,
+        mlp_ratio=4.0,
+        global_pool='map',
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_rot_pos_emb=True,
+        ref_feat_shape=(24, 24),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        attn_pool_num_heads=8,
+        attn_pool_mlp_ratio=4.,
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_core_tiny_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+
+@register_model
+def vit_pe_core_small_patch16_384(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4.0,
+        global_pool='map',
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_rot_pos_emb=True,
+        ref_feat_shape=(24, 24),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        attn_pool_num_heads=8,
+        attn_pool_mlp_ratio=4.,
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_core_small_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
 @register_model
 def vit_pe_core_base_patch16_224(pretrained: bool = False, **kwargs) -> Eva:
     """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
@@ -1963,6 +2068,98 @@ def vit_pe_lang_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
     return _create_eva('vit_pe_lang_gigantic_patch14_448', pretrained=pretrained, **dict(model_args, **kwargs))


+@register_model
+def vit_pe_spatial_tiny_patch16_512(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=192,
+        depth=12,
+        num_heads=3,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_spatial_tiny_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+@register_model
+def vit_pe_spatial_small_patch16_512(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_spatial_small_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+@register_model
+def vit_pe_spatial_base_patch16_512(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_spatial_base_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+@register_model
+def vit_pe_spatial_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=14,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True,
+    )
+    return _create_eva('vit_pe_spatial_large_patch14_448', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
 @register_model
 def vit_pe_spatial_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
     """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
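With these registrations in place, the new variants go through timm's regular factory. A minimal usage sketch (the model and tag names come from the diff above; the printed shape assumes the default `.fb` pretrained cfg, whose `num_classes` is the CLIP output projection dim):

import timm
import torch

# Instantiate one of the newly registered PE-Core variants; pretrained=True
# would fetch the converted weights from the 'timm/' HF Hub cfg shown above.
model = timm.create_model('vit_pe_core_tiny_patch16_384', pretrained=False).eval()

x = torch.randn(1, 3, 384, 384)  # matches input_size=(3, 384, 384) in the cfg
with torch.no_grad():
    out = model(x)
print(out.shape)  # expected torch.Size([1, 512]) - the output proj dim

# Tagged checkpoints (e.g. the tiling Lang weights) are selected by suffix:
# timm.create_model('vit_pe_lang_large_patch14_448.fb_tiling', pretrained=True)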