@@ -160,6 +160,7 @@ def __init__(
         self.dim_head = dim_head
         self.head_first = head_first
         self.scale = dim_head ** -0.5
+        self.fast_attn = hasattr(torch.nn.functional, 'scaled_dot_product_attention')  # FIXME
 
         self.qkv = nn.Conv2d(dim, dim_attn * 3, 1, bias=bias)
         self.rel_pos = rel_pos_cls(num_heads=self.num_heads) if rel_pos_cls else None
@@ -175,15 +176,31 @@ def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None):
         else:
             q, k, v = self.qkv(x).reshape(B, 3, self.num_heads, self.dim_head, -1).unbind(1)
 
-        attn = (q.transpose(-2, -1) @ k) * self.scale
-        if self.rel_pos is not None:
-            attn = self.rel_pos(attn)
-        elif shared_rel_pos is not None:
-            attn = attn + shared_rel_pos
-        attn = attn.softmax(dim=-1)
-        attn = self.attn_drop(attn)
+        if self.fast_attn:
+            if self.rel_pos is not None:
+                attn_bias = self.rel_pos.get_bias()
+            elif shared_rel_pos is not None:
+                attn_bias = shared_rel_pos
+            else:
+                attn_bias = None
+            x = torch.nn.functional.scaled_dot_product_attention(
+                q.transpose(-1, -2),
+                k.transpose(-1, -2),
+                v.transpose(-1, -2),
+                attn_mask=attn_bias,
+                dropout_p=self.attn_drop.p,
+            ).transpose(-1, -2).reshape(B, -1, H, W)
+        else:
+            q = q * self.scale
+            attn = q.transpose(-2, -1) @ k
+            if self.rel_pos is not None:
+                attn = self.rel_pos(attn)
+            elif shared_rel_pos is not None:
+                attn = attn + shared_rel_pos
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W)
 
-        x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W)
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
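
In this NCHW path, q, k and v come out of the conv QKV as (B, num_heads, dim_head, N), while scaled_dot_product_attention expects (..., seq_len, dim_head), hence the transposes going in and back out. Below is a minimal sketch of that equivalence with assumed toy shapes and a random stand-in for the relative-position bias (dropout disabled); it is not the library code itself.

import torch
import torch.nn.functional as F

B, num_heads, dim_head, H, W = 2, 4, 32, 7, 7  # illustrative sizes only
N = H * W
q, k, v = (torch.randn(B, num_heads, dim_head, N) for _ in range(3))
attn_bias = torch.randn(1, num_heads, N, N)  # stand-in for rel_pos.get_bias()

# Fused path: move the sequence axis next to last, let SDPA apply the
# 1/sqrt(dim_head) scaling and the additive bias, then restore NCHW layout.
fused = torch.nn.functional.scaled_dot_product_attention(
    q.transpose(-1, -2), k.transpose(-1, -2), v.transpose(-1, -2),
    attn_mask=attn_bias,
).transpose(-1, -2).reshape(B, -1, H, W)

# Explicit path, mirroring the else branch of the diff.
scale = dim_head ** -0.5
attn = ((q * scale).transpose(-2, -1) @ k + attn_bias).softmax(dim=-1)
manual = (v @ attn.transpose(-2, -1)).reshape(B, -1, H, W)

print(torch.allclose(fused, manual, atol=1e-4))  # expected to print True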
@@ -211,6 +228,7 @@ def __init__(
         self.dim_head = dim_head
         self.head_first = head_first
         self.scale = dim_head ** -0.5
+        self.fast_attn = hasattr(torch.nn.functional, 'scaled_dot_product_attention')  # FIXME
 
         self.qkv = nn.Linear(dim, dim_attn * 3, bias=bias)
         self.rel_pos = rel_pos_cls(num_heads=self.num_heads) if rel_pos_cls else None
@@ -227,15 +245,30 @@ def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None):
         else:
             q, k, v = self.qkv(x).reshape(B, -1, 3, self.num_heads, self.dim_head).transpose(1, 3).unbind(2)
 
-        attn = (q @ k.transpose(-2, -1)) * self.scale
-        if self.rel_pos is not None:
-            attn = self.rel_pos(attn, shared_rel_pos=shared_rel_pos)
-        elif shared_rel_pos is not None:
-            attn = attn + shared_rel_pos
-        attn = attn.softmax(dim=-1)
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(restore_shape + (-1,))
+        if self.fast_attn:
+            if self.rel_pos is not None:
+                attn_bias = self.rel_pos.get_bias()
+            elif shared_rel_pos is not None:
+                attn_bias = shared_rel_pos
+            else:
+                attn_bias = None
+            x = torch.nn.functional.scaled_dot_product_attention(
+                q, k, v,
+                attn_mask=attn_bias,
+                dropout_p=self.attn_drop.p,
+            )
+        else:
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)
+            if self.rel_pos is not None:
+                attn = self.rel_pos(attn, shared_rel_pos=shared_rel_pos)
+            elif shared_rel_pos is not None:
+                attn = attn + shared_rel_pos
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+
+        x = x.transpose(1, 2).reshape(restore_shape + (-1,))
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
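
In this channel-last variant, the linear QKV already yields q, k and v in the (B, num_heads, N, dim_head) layout that scaled_dot_product_attention expects, so no transposes are needed around the fused call. A minimal sketch with assumed shapes, again not the library code itself:

import torch
import torch.nn as nn
import torch.nn.functional as F

B, N, num_heads, dim_head = 2, 49, 4, 32  # illustrative sizes only
dim = num_heads * dim_head
x = torch.randn(B, N, dim)
qkv = nn.Linear(dim, dim * 3)(x)  # (B, N, 3 * dim)
q, k, v = qkv.reshape(B, -1, 3, num_heads, dim_head).transpose(1, 3).unbind(2)

if hasattr(F, 'scaled_dot_product_attention'):
    # Fused kernel (PyTorch 2.0+ / recent nightlies); the 1/sqrt(dim_head)
    # scaling is applied internally.
    out = F.scaled_dot_product_attention(q, k, v)
else:
    # Explicit fallback matching the else branch of the diff.
    attn = ((q * dim_head ** -0.5) @ k.transpose(-2, -1)).softmax(dim=-1)
    out = attn @ v

out = out.transpose(1, 2).reshape(B, N, -1)  # back to (B, N, dim)

One caveat worth noting: unlike nn.Dropout, the dropout_p argument of scaled_dot_product_attention is applied regardless of the module's train/eval mode, so in practice it is usually gated on self.training (passing the dropout probability only during training and 0. otherwise).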