
Commit 2bf71f5

Merge remote-tracking branch 'origin/main' into naflex
2 parents: fe2867c + a22366e

File tree: 11 files changed, +2818 -91 lines

README.md

Lines changed: 15 additions & 0 deletions

```diff
@@ -12,6 +12,21 @@
 
 ## What's New
 
+## May 28, 2025
+* Add a number of small/fast models thanks to https://github.com/brianhou0208
+  * SwiftFormer - [(ICCV2023) SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://github.com/Amshaker/SwiftFormer)
+  * FasterNet - [(CVPR2023) Run, Don’t Walk: Chasing Higher FLOPS for Faster Neural Networks](https://github.com/JierunChen/FasterNet)
+  * SHViT - [(CVPR2024) SHViT: Single-Head Vision Transformer with Memory Efficient Macro Design](https://github.com/ysj9909/SHViT)
+  * StarNet - [(CVPR2024) Rewrite the Stars](https://github.com/ma-xu/Rewrite-the-Stars)
+  * GhostNet-V3 - [GhostNetV3: Exploring the Training Strategies for Compact Models](https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv3_pytorch)
+* Update EVA ViT (closest match) to support Perception Encoder models (https://arxiv.org/abs/2504.13181) from Meta; Hub weights load, but dedicated `timm` weights still need to be pushed
+  * Add some flexibility to the ROPE impl
+* Big increase in the number of models supporting `forward_intermediates()`, plus some additional fixes, thanks to https://github.com/brianhou0208
+  * DaViT, EdgeNeXt, EfficientFormerV2, EfficientViT(MIT), EfficientViT(MSRA), FocalNet, GCViT, HGNet/V2, InceptionNeXt, Inception-V4, MambaOut, MetaFormer, NesT, Next-ViT, PiT, PVT V2, RepGhostNet, RepViT, ResNetV2, ReXNet, TinyViT, TResNet, VoVNet
+* TNT model updated w/ new weights and `forward_intermediates()` support, thanks to https://github.com/brianhou0208
+* Add `local-dir:` pretrained schema: use `local-dir:/path/to/model/folder` as the model name to source the model / pretrained cfg & weights of a Hugging Face Hub format model (config.json + weights file) from a local folder
+* Fixes and improvements for ONNX export
+
 ## Feb 21, 2025
 * SigLIP 2 ViT image encoders added (https://huggingface.co/collections/timm/siglip-2-67b8e72ba08b09dd97aecaf9)
   * Variable resolution / aspect NaFlex versions are a WIP
```
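Since `forward_intermediates()` now spans many more model families, here is a minimal sketch of the typical call pattern; the model choice (`davit_tiny`, one of the families listed above) and the `indices` argument are illustrative, not part of this commit:

```python
import torch
import timm

# DaViT is in the newly supported list above; the other families should behave the same.
model = timm.create_model('davit_tiny', pretrained=False).eval()
x = torch.randn(1, 3, 224, 224)

# Returns the final features plus a list of intermediate feature maps;
# an int `indices` keeps just the last N stages.
final, intermediates = model.forward_intermediates(x, indices=2)
for feat in intermediates:
    print(feat.shape)
```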
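Likewise, a hedged sketch of the new `local-dir:` schema described above, assuming the folder holds a Hub-format `config.json` plus a weights file (the path here is a placeholder):

```python
import timm

# Assumed layout: /path/to/model/folder contains config.json and a weights
# file (e.g. model.safetensors) in Hugging Face Hub format.
model = timm.create_model(
    'local-dir:/path/to/model/folder',  # schema prefix + local folder path
    pretrained=True,  # cfg & weights are sourced from the folder, not downloaded
)
```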

tests/test_models.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -56,13 +56,13 @@
     'regnet', 'byobnet', 'byoanet', 'mlp_mixer', 'hiera', 'fastvit', 'hieradet_sam2', 'aimv2*', 'tnt',
     'tiny_vit', 'vovnet', 'tresnet', 'rexnet', 'resnetv2', 'repghost', 'repvit', 'pvt_v2', 'nextvit', 'nest',
     'mambaout', 'inception_next', 'inception_v4', 'hgnet', 'gcvit', 'focalnet', 'efficientformer_v2', 'edgenext',
-    'davit', 'rdnet', 'convnext', 'pit'
+    'davit', 'rdnet', 'convnext', 'pit', 'starnet', 'shvit', 'fasternet', 'swiftformer', 'ghostnet',
 ]
 
 # transformer / hybrid models don't support full set of spatial / feature APIs and/or have spatial output.
 NON_STD_FILTERS = [
     'vit_*', 'tnt_*', 'pit_*', 'coat_*', 'cait_*', '*mixer_*', 'gmlp_*', 'resmlp_*', 'twins_*',
-    'convit_*', 'levit*', 'visformer*', 'deit*', 'xcit_*', 'crossvit_*', 'beit*', 'aimv2*',
+    'convit_*', 'levit*', 'visformer*', 'deit*', 'xcit_*', 'crossvit_*', 'beit*', 'aimv2*', 'swiftformer_*',
     'poolformer_*', 'volo_*', 'sequencer2d_*', 'mvitv2*', 'gcvit*', 'efficientformer*', 'sam_hiera*',
     'eva_*', 'flexivit*', 'eva02*', 'samvit_*', 'efficientvit_m*', 'tiny_vit_*', 'hiera_*', 'vitamin*', 'test_vit*',
 ]
@@ -221,6 +221,7 @@ def test_model_backward(model_name, batch_size):
 EARLY_POOL_MODELS = (
     timm.models.EfficientVit,
     timm.models.EfficientVitLarge,
+    timm.models.FasterNet,
     timm.models.HighPerfGpuNet,
     timm.models.GhostNet,
     timm.models.MetaNeXt,  # InceptionNeXt
```
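FasterNet's addition to `EARLY_POOL_MODELS` marks it as a model whose classifier head pools before its final linear layer, which these tests exercise through the split forward path. A minimal sketch, assuming a `fasternet_t0` variant name:

```python
import torch
import timm

model = timm.create_model('fasternet_t0', pretrained=False)  # variant name is an assumption
x = torch.randn(1, 3, 224, 224)

feats = model.forward_features(x)   # unpooled feature map
logits = model.forward_head(feats)  # head pools early, then classifies
print(feats.shape, logits.shape)
```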

timm/layers/attention.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -28,10 +28,11 @@ def __init__(
             num_heads: int = 8,
             qkv_bias: bool = False,
             qk_norm: bool = False,
+            scale_norm: bool = False,
             proj_bias: bool = True,
             attn_drop: float = 0.,
             proj_drop: float = 0.,
-            norm_layer: Type[nn.Module] = nn.LayerNorm,
+            norm_layer: Optional[Type[nn.Module]] = None,
     ) -> None:
         """Initialize the Attention module.
 
@@ -47,6 +48,8 @@
         """
         super().__init__()
         assert dim % num_heads == 0, 'dim should be divisible by num_heads'
+        if qk_norm or scale_norm:
+            assert norm_layer is not None, 'norm_layer must be provided if qk_norm or scale_norm is True'
         self.num_heads = num_heads
         self.head_dim = dim // num_heads
         self.scale = self.head_dim ** -0.5
@@ -56,6 +59,7 @@
         self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
         self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
         self.attn_drop = nn.Dropout(attn_drop)
+        self.norm = norm_layer(dim) if scale_norm else nn.Identity()
         self.proj = nn.Linear(dim, dim, bias=proj_bias)
         self.proj_drop = nn.Dropout(proj_drop)
 
@@ -84,6 +88,7 @@ def forward(
         x = attn @ v
 
         x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.norm(x)
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
```
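Taken together, these changes mean `norm_layer` must now be passed explicitly whenever `qk_norm` or `scale_norm` is enabled, and `scale_norm=True` normalizes the re-assembled attention output just before the output projection. A minimal sketch of constructing the updated module (import path inferred from the file location):

```python
import torch
import torch.nn as nn
from timm.layers.attention import Attention  # path inferred from this file

attn = Attention(
    dim=768,
    num_heads=12,
    qk_norm=True,             # per-head norm on q and k
    scale_norm=True,          # norm on attention output, before projection
    norm_layer=nn.LayerNorm,  # now required when qk_norm or scale_norm is set
)
x = torch.randn(2, 196, 768)
print(attn(x).shape)  # torch.Size([2, 196, 768])
```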

timm/models/__init__.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -20,6 +20,7 @@
 from .efficientvit_mit import *
 from .efficientvit_msra import *
 from .eva import *
+from .fasternet import *
 from .fastvit import *
 from .focalnet import *
 from .gcvit import *
@@ -61,7 +62,10 @@
 from .selecsls import *
 from .senet import *
 from .sequencer import *
+from .shvit import *
 from .sknet import *
+from .starnet import *
+from .swiftformer import *
 from .swin_transformer import *
 from .swin_transformer_v2 import *
 from .swin_transformer_v2_cr import *
```
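These imports are what register the new families with `timm`'s model factory; a quick sketch of discovering and instantiating them (the wildcard patterns mirror the module names and are assumptions about the registered prefixes):

```python
import timm

# Registration happens as a side effect of the imports above.
for pattern in ('fasternet*', 'shvit*', 'starnet*', 'swiftformer*'):
    print(pattern, timm.list_models(pattern))

# Any discovered name can be instantiated as usual.
names = timm.list_models('starnet*')
if names:
    model = timm.create_model(names[0], pretrained=False)
```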
