From 251b323582c8d70b8ca7a31caf9229ec7a8708c1 Mon Sep 17 00:00:00 2001
From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com>
Date: Thu, 25 Aug 2022 08:37:01 +0530
Subject: [PATCH 1/8] More uniform mid level API

1. Expose `pretrain` option for all models
2. Make it easier to initialise models with config options at the mid level
   by providing an additional dispatch
3. Some cleanup + documentation
---
 src/convnets/alexnet.jl                      |  9 ++++---
 src/convnets/convnext.jl                     | 22 ++++++++++++++---
 src/convnets/efficientnets/core.jl           | 10 ++++----
 src/convnets/efficientnets/efficientnet.jl   |  4 +--
 src/convnets/efficientnets/efficientnetv2.jl |  4 +--
 src/convnets/inceptions/googlenet.jl         | 22 ++++++++---------
 src/convnets/mobilenets/mnasnet.jl           | 13 +++++++---
 src/convnets/mobilenets/mobilenetv1.jl       |  6 +++++
 src/convnets/mobilenets/mobilenetv2.jl       |  8 ++++++
 src/convnets/mobilenets/mobilenetv3.jl       | 11 ++++++---
 src/convnets/resnets/core.jl                 | 21 ++++++++--------
 src/layers/conv.jl                           | 26 +++++++++++++++-----
 src/mixers/gmlp.jl                           |  5 +++-
 src/mixers/mlpmixer.jl                       |  5 +++-
 src/mixers/resmlp.jl                         |  5 +++-
 src/vit-based/vit.jl                         |  5 +++-
 16 files changed, 123 insertions(+), 53 deletions(-)

diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl
index 3c713839e..9f9bcacf0 100644
--- a/src/convnets/alexnet.jl
+++ b/src/convnets/alexnet.jl
@@ -1,15 +1,16 @@
 """
-    alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000)
+    alexnet(; dropout_rate = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000)
 
 Create an AlexNet model
 ([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)).
 
 # Arguments
 
+  - `dropout_rate`: dropout rate for the classifier
   - `inchannels`: The number of input channels.
   - `nclasses`: the number of output classes
 """
-function alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000)
+function alexnet(; dropout_rate = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000)
     backbone = Chain(Conv((11, 11), inchannels => 64, relu; stride = 4, pad = 2),
                      MaxPool((3, 3); stride = 2),
                      Conv((5, 5), 64 => 192, relu; pad = 2),
@@ -19,9 +20,9 @@ function alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000)
                      Conv((3, 3), 256 => 256, relu; pad = 1),
                      MaxPool((3, 3); stride = 2))
     classifier = Chain(AdaptiveMeanPool((6, 6)), MLUtils.flatten,
-                       Dropout(0.5),
+                       Dropout(dropout_rate),
                        Dense(256 * 6 * 6, 4096, relu),
-                       Dropout(0.5),
+                       Dropout(dropout_rate),
                        Dense(4096, 4096, relu),
                        Dense(4096, nclasses))
     return Chain(backbone, classifier)
diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl
index 15271cfed..0e74f4a8d 100644
--- a/src/convnets/convnext.jl
+++ b/src/convnets/convnext.jl
@@ -68,6 +68,12 @@ function convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:In
     return Chain(Chain(backbone...), classifier)
 end
 
+function convnext(config::Symbol; drop_path_rate = 0.0, layerscale_init = 1.0f-6,
+                  inchannels::Integer = 3, nclasses::Integer = 1000)
+    return convnext(CONVNEXT_CONFIGS[config]...; drop_path_rate, layerscale_init,
+                    inchannels, nclasses)
+end
+
 # Configurations for ConvNeXt models
 const CONVNEXT_CONFIGS = Dict(:tiny => ([3, 3, 9, 3], [96, 192, 384, 768]),
                               :small => ([3, 3, 27, 3], [96, 192, 384, 768]),
                               :base => ([3, 3, 27, 3], [128, 256, 512, 1024]),
                               :large => ([3, 3, 27, 3], [192, 384, 768, 1536]),
                               :xlarge => ([3, 3, 27, 3], [256, 512, 1024, 2048]))
 
 """
-    ConvNeXt(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000)
+    ConvNeXt(config::Symbol; pretrain::Bool = false, inchannels::Integer = 3,
+             nclasses::Integer = 1000)
 
 Creates a ConvNeXt model.
 ([reference](https://arxiv.org/abs/2201.03545))
 
 # Arguments
 
   - `config`: The size of the model, one of `tiny`, `small`, `base`, `large` or `xlarge`.
+  - `pretrain`: set to `true` to load pre-trained weights for ImageNet
   - `inchannels`: number of input channels
   - `nclasses`: number of output classes
 
+!!! warning
+
+    `ConvNeXt` does not currently support pretrained weights.
+
 See also [`Metalhead.convnext`](#).
 """
 struct ConvNeXt
     layers::Any
 end
 @functor ConvNeXt
 
-function ConvNeXt(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000)
+function ConvNeXt(config::Symbol; pretrain::Bool = false, inchannels::Integer = 3,
+                  nclasses::Integer = 1000)
     _checkconfig(config, keys(CONVNEXT_CONFIGS))
-    layers = convnext(CONVNEXT_CONFIGS[config]...; inchannels, nclasses)
+    layers = convnext(config; inchannels, nclasses)
+    if pretrain
+        layers = load_pretrained(layers, "convnext_$config")
+    end
     return ConvNeXt(layers)
 end
diff --git a/src/convnets/efficientnets/core.jl b/src/convnets/efficientnets/core.jl
index d853e0c1a..8c10ae0fd 100644
--- a/src/convnets/efficientnets/core.jl
+++ b/src/convnets/efficientnets/core.jl
@@ -1,8 +1,8 @@
-function efficientnet(block_configs::AbstractVector{<:Tuple}; inplanes::Integer,
-                      scalings::NTuple{2, Real} = (1, 1),
-                      headplanes::Integer = block_configs[end][3] * 4,
-                      norm_layer = BatchNorm, dropout_rate = nothing,
-                      inchannels::Integer = 3, nclasses::Integer = 1000)
+function efficientnet_core(block_configs::AbstractVector{<:Tuple}; inplanes::Integer,
+                           scalings::NTuple{2, Real} = (1, 1),
+                           headplanes::Integer = block_configs[end][3] * 4,
+                           norm_layer = BatchNorm, dropout_rate = nothing,
+                           inchannels::Integer = 3, nclasses::Integer = 1000)
     layers = []
     # stem of the model
     inplanes = _round_channels(inplanes * scalings[1])
diff --git a/src/convnets/efficientnets/efficientnet.jl b/src/convnets/efficientnets/efficientnet.jl
index 5eb81b21d..ab53bdd58 100644
--- a/src/convnets/efficientnets/efficientnet.jl
+++ b/src/convnets/efficientnets/efficientnet.jl
@@ -52,8 +52,8 @@ function EfficientNet(config::Symbol; pretrain::Bool = false, inchannels::Intege
                       nclasses::Integer = 1000)
     _checkconfig(config, keys(EFFICIENTNET_GLOBAL_CONFIGS))
     scalings = EFFICIENTNET_GLOBAL_CONFIGS[config][2]
-    layers = efficientnet(EFFICIENTNET_BLOCK_CONFIGS; inplanes = 32, scalings,
-                          inchannels, nclasses)
+    layers = efficientnet_core(EFFICIENTNET_BLOCK_CONFIGS; inplanes = 32, scalings,
+                               inchannels, nclasses)
     if pretrain
         loadpretrain!(layers, string("efficientnet-", config))
     end
diff --git a/src/convnets/efficientnets/efficientnetv2.jl b/src/convnets/efficientnets/efficientnetv2.jl
index ff64eea23..516208bf7 100644
--- a/src/convnets/efficientnets/efficientnetv2.jl
+++ b/src/convnets/efficientnets/efficientnetv2.jl
@@ -59,8 +59,8 @@ function EfficientNetv2(config::Symbol; pretrain::Bool = false,
                         inchannels::Integer = 3, nclasses::Integer = 1000)
     _checkconfig(config, sort(collect(keys(EFFNETV2_CONFIGS))))
     block_configs = EFFNETV2_CONFIGS[config]
-    layers = efficientnet(block_configs; inplanes = block_configs[1][3],
-                          headplanes = 1280, inchannels, nclasses)
+    layers = efficientnet_core(block_configs; inplanes = block_configs[1][3],
+                               headplanes = 1280, inchannels, nclasses)
     if pretrain
         loadpretrain!(layers, string("efficientnetv2-", config))
     end
diff --git a/src/convnets/inceptions/googlenet.jl b/src/convnets/inceptions/googlenet.jl
index 54f814479..1018ad99a 100644
--- a/src/convnets/inceptions/googlenet.jl
+++ b/src/convnets/inceptions/googlenet.jl
@@ -1,5 +1,5 @@
 """
-    _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_3x3, pool_proj)
+    inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, pool_proj)
 
 Create an inception module for use in GoogLeNet
 ([reference](https://arxiv.org/abs/1409.4842v1)).
@@ -14,7 +14,7 @@ Create an inception module for use in GoogLeNet
   - `out_5x5`: the number of output feature maps for the 5x5 convolution (branch 3)
   - `pool_proj`: the number of output feature maps for the pooling projection (branch 4)
 """
-function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, pool_proj)
+function inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, pool_proj)
     branch1 = Chain(Conv((1, 1), inplanes => out_1x1))
     branch2 = Chain(Conv((1, 1), inplanes => red_3x3),
                     Conv((3, 3), red_3x3 => out_3x3; pad = 1))
@@ -42,17 +42,17 @@ function googlenet(; dropout_rate = 0.4, inchannels::Integer = 3, nclasses::Inte
                      Conv((1, 1), 64 => 64),
                      Conv((3, 3), 64 => 192; pad = 1),
                      MaxPool((3, 3); stride = 2, pad = 1),
-                     _inceptionblock(192, 64, 96, 128, 16, 32, 32),
-                     _inceptionblock(256, 128, 128, 192, 32, 96, 64),
+                     inceptionblock(192, 64, 96, 128, 16, 32, 32),
+                     inceptionblock(256, 128, 128, 192, 32, 96, 64),
                      MaxPool((3, 3); stride = 2, pad = 1),
-                     _inceptionblock(480, 192, 96, 208, 16, 48, 64),
-                     _inceptionblock(512, 160, 112, 224, 24, 64, 64),
-                     _inceptionblock(512, 128, 128, 256, 24, 64, 64),
-                     _inceptionblock(512, 112, 144, 288, 32, 64, 64),
-                     _inceptionblock(528, 256, 160, 320, 32, 128, 128),
+                     inceptionblock(480, 192, 96, 208, 16, 48, 64),
+                     inceptionblock(512, 160, 112, 224, 24, 64, 64),
+                     inceptionblock(512, 128, 128, 256, 24, 64, 64),
+                     inceptionblock(512, 112, 144, 288, 32, 64, 64),
+                     inceptionblock(528, 256, 160, 320, 32, 128, 128),
                      MaxPool((3, 3); stride = 2, pad = 1),
-                     _inceptionblock(832, 256, 160, 320, 32, 128, 128),
-                     _inceptionblock(832, 384, 192, 384, 48, 128, 128))
+                     inceptionblock(832, 256, 160, 320, 32, 128, 128),
+                     inceptionblock(832, 384, 192, 384, 48, 128, 128))
     return Chain(backbone, create_classifier(1024, nclasses; dropout_rate))
 end
diff --git a/src/convnets/mobilenets/mnasnet.jl b/src/convnets/mobilenets/mnasnet.jl
index 2f6db2acf..fa8207ca8 100644
--- a/src/convnets/mobilenets/mnasnet.jl
+++ b/src/convnets/mobilenets/mnasnet.jl
@@ -41,6 +41,13 @@ function mnasnet(block_configs::AbstractVector{<:Tuple}; width_mult::Real = 1,
     return Chain(Chain(layers...), create_classifier(max_width, nclasses; dropout_rate))
 end
 
+function mnasnet(config::Symbol; width_mult::Real = 1, max_width::Integer = 1280,
+                 dropout_rate = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000)
+    inplanes, block_configs = MNASNET_CONFIGS[config]
+    return mnasnet(block_configs; width_mult, max_width, dropout_rate, inplanes,
+                   inchannels, nclasses)
+end
+
 # Layer configurations for MNasNet
 # f: block function - we use `dwsep_conv_bn` for the first block and `mbconv` for the rest
 # k: kernel size
@@ -79,7 +86,8 @@ const MNASNET_CONFIGS = Dict(:B1 => (32,
                                      (mbconv, 5, 32, 6, 2, 4, 4, relu),
                                      (mbconv, 3, 32, 6, 1, 3, 4, relu),
                                      (mbconv, 5, 88, 6, 2, 3, 4, relu),
-                                     (mbconv, 3, 144, 6, 1, 1, nothing, relu)]))
+                                     (mbconv, 3, 144, 6, 1, 1, nothing, relu),
+                                 ]))
 
 """
     MNASNet(width_mult = 1; inchannels::Integer = 3, pretrain::Bool = false,
@@ -111,8 +119,7 @@ end
 function MNASNet(config::Symbol; width_mult::Real = 1, pretrain::Bool = false,
                  inchannels::Integer = 3, nclasses::Integer = 1000)
     _checkconfig(config, keys(MNASNET_CONFIGS))
-    inplanes, block_configs = MNASNET_CONFIGS[config]
-    layers = mnasnet(block_configs; width_mult, inplanes, inchannels, nclasses)
+    layers = mnasnet(config; width_mult, inchannels, nclasses)
     if pretrain
         loadpretrain!(layers, "mnasnet$(width_mult)")
     end
diff --git a/src/convnets/mobilenets/mobilenetv1.jl b/src/convnets/mobilenets/mobilenetv1.jl
index 24240d0c0..fb7e3d4ce 100644
--- a/src/convnets/mobilenets/mobilenetv1.jl
+++ b/src/convnets/mobilenets/mobilenetv1.jl
@@ -37,6 +37,12 @@ function mobilenetv1(config::AbstractVector{<:Tuple}; width_mult::Real = 1,
     return Chain(Chain(layers...), create_classifier(outplanes, nclasses; dropout_rate))
 end
 
+function mobilenetv1(width_mult::Real = 1; activation = relu, dropout_rate = nothing,
+                     inchannels::Integer = 3, nclasses::Integer = 1000)
+    return mobilenetv1(MOBILENETV1_CONFIGS; width_mult, activation,
+                       dropout_rate, inchannels, nclasses)
+end
+
 # Layer configurations for MobileNetv1
 # f: block function - we use `dwsep_conv_bn` for all blocks
 # k: kernel size
diff --git a/src/convnets/mobilenets/mobilenetv2.jl b/src/convnets/mobilenets/mobilenetv2.jl
index f3e26862c..1166676ed 100644
--- a/src/convnets/mobilenets/mobilenetv2.jl
+++ b/src/convnets/mobilenets/mobilenetv2.jl
@@ -43,6 +43,14 @@ function mobilenetv2(block_configs::AbstractVector{<:Tuple}; width_mult::Real =
     return Chain(Chain(layers...), create_classifier(headplanes, nclasses; dropout_rate))
 end
 
+function mobilenetv2(width_mult::Real = 1; max_width::Integer = 1280,
+                     divisor::Integer = 8, inplanes::Integer = 32,
+                     dropout_rate = 0.2, inchannels::Integer = 3,
+                     nclasses::Integer = 1000)
+    return mobilenetv2(MOBILENETV2_CONFIGS; width_mult, max_width, divisor, inplanes,
+                       dropout_rate, inchannels, nclasses)
+end
+
 # Layer configurations for MobileNetv2
 # f: block function - we use `mbconv` for all blocks
 # k: kernel size
diff --git a/src/convnets/mobilenets/mobilenetv3.jl b/src/convnets/mobilenets/mobilenetv3.jl
index 2614c7c2f..207bc77f7 100644
--- a/src/convnets/mobilenets/mobilenetv3.jl
+++ b/src/convnets/mobilenets/mobilenetv3.jl
@@ -46,6 +46,13 @@ function mobilenetv3(configs::AbstractVector{<:Tuple}; width_mult::Real = 1,
                                       (hardswish, identity); dropout_rate))
 end
 
+function mobilenetv3(config::Symbol; width_mult::Real = 1, dropout_rate = 0.2,
+                     inchannels::Integer = 3, nclasses::Integer = 1000)
+    max_width = config === :large ? 1280 : 1024
+    return mobilenetv3(MOBILENETV3_CONFIGS[config]; width_mult, max_width,
+                       dropout_rate, inchannels, nclasses)
+end
+
 # Layer configurations for small and large models for MobileNetv3
 # f: mbconv block function - we use `mbconv` for all blocks
 # k: kernel size
@@ -110,9 +117,7 @@ end
 function MobileNetv3(config::Symbol; width_mult::Real = 1, pretrain::Bool = false,
                      inchannels::Integer = 3, nclasses::Integer = 1000)
     _checkconfig(config, [:small, :large])
-    max_width = config == :large ? 1280 : 1024
-    layers = mobilenetv3(MOBILENETV3_CONFIGS[config]; width_mult, max_width, inchannels,
-                         nclasses)
+    layers = mobilenetv3(config; width_mult, inchannels, nclasses)
     if pretrain
         loadpretrain!(layers, string("MobileNetv3", config))
     end
diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl
index dc143b57e..df58524a7 100644
--- a/src/convnets/resnets/core.jl
+++ b/src/convnets/resnets/core.jl
@@ -196,8 +196,8 @@ function resnet_planes(block_repeats::AbstractVector{<:Integer})
             for (stage_idx, stages) in enumerate(block_repeats))
 end
 
-function resnet(img_dims, stem, get_layers, block_repeats::AbstractVector{<:Integer},
-                connection, classifier_fn)
+function resnet_core(img_dims, stem, get_layers, block_repeats::AbstractVector{<:Integer},
+                     connection, classifier_fn)
     # Build stages of the ResNet
     stage_blocks = cnn_stages(get_layers, block_repeats, connection)
     backbone = Chain(stem, stage_blocks...)
@@ -209,13 +209,14 @@ end
 function resnet(block_type, block_repeats::AbstractVector{<:Integer},
                 downsample_opt::NTuple{2, Any} = (downsample_conv, downsample_identity);
                 cardinality::Integer = 1, base_width::Integer = 64,
-                inplanes::Integer = 64,
-                reduction_factor::Integer = 1, imsize::Dims{2} = (256, 256),
-                inchannels::Integer = 3, stem_fn = resnet_stem, connection = addact,
-                activation = relu, norm_layer = BatchNorm, revnorm::Bool = false,
+                inplanes::Integer = 64, reduction_factor::Integer = 1,
+                stem_fn = resnet_stem, connection = addact, activation = relu,
+                norm_layer = BatchNorm, revnorm::Bool = false,
                 attn_fn = planes -> identity, pool_layer = AdaptiveMeanPool((1, 1)),
-                use_conv::Bool = false, drop_block_rate = nothing, drop_path_rate = nothing,
-                dropout_rate = nothing, nclasses::Integer = 1000, kwargs...)
+                use_conv::Bool = false, drop_block_rate = nothing,
+                drop_path_rate = nothing, dropout_rate = nothing,
+                imsize::Dims{2} = (256, 256), inchannels::Integer = 3,
+                nclasses::Integer = 1000, kwargs...)
     # Build stem
     stem = stem_fn(; inchannels)
     # Block builder
@@ -253,8 +254,8 @@ function resnet(block_type, block_repeats::AbstractVector{<:Integer},
     end
     classifier_fn = nfeatures -> create_classifier(nfeatures, nclasses; dropout_rate,
                                                    pool_layer, use_conv)
-    return resnet((imsize..., inchannels), stem, get_layers, block_repeats,
-                  connection$activation, classifier_fn)
+    return resnet_core((imsize..., inchannels), stem, get_layers, block_repeats,
+                       connection$activation, classifier_fn)
 end
 
 function resnet(block_fn, block_repeats, downsample_opt::Symbol = :B; kwargs...)
     return resnet(block_fn, block_repeats, RESNET_SHORTCUTS[downsample_opt]; kwargs...)
 end
diff --git a/src/layers/conv.jl b/src/layers/conv.jl
index e49611280..6581197cb 100644
--- a/src/layers/conv.jl
+++ b/src/layers/conv.jl
@@ -8,7 +8,7 @@
     conv_norm(kernel_size::Dims{2}, inplanes => outplanes, activation = identity;
               kwargs...)
 
-Create a convolution + batch normalization pair with activation.
+Create a convolution + normalisation layer pair with activation.
 
 # Arguments
 
   - `kernel_size`: size of the convolution kernel (tuple)
- `inplanes`: number of input feature maps - `outplanes`: number of output feature maps - `activation`: the activation function for the final layer - - `norm_layer`: the normalization layer used + - `norm_layer`: the normalisation layer used - `revnorm`: set to `true` to place the normalisation layer before the convolution - - `preact`: set to `true` to place the activation function before the batch norm + - `preact`: set to `true` to place the activation function before the normalisation layer (only compatible with `revnorm = false`) - - `use_norm`: set to `false` to disable normalization + - `use_norm`: set to `false` to disable normalisation (only compatible with `revnorm = false` and `preact = false`) - `stride`: stride of the convolution kernel - `pad`: padding of the convolution kernel @@ -70,8 +70,22 @@ function conv_norm(kernel_size::Dims{2}, ch::Pair{<:Integer, <:Integer}, return conv_norm(kernel_size, inplanes, outplanes, activation; kwargs...) end -# conv + bn layer combination as used by the inception model family matching -# the default values used in TensorFlow +""" + basic_conv_bn(kernel_size::Dims{2}, inplanes, outplanes, activation = relu; + kwargs...) + +Returns a convolution + batch normalisation pair with activation as used by the +Inception family of models with default values matching those used in the official +TensorFlow implementation. + +# Arguments + + - `kernel_size`: size of the convolution kernel (tuple) + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps + - `activation`: the activation function for the final layer + - `kwargs`: keyword arguments passed to [`conv_norm`](@ref) +""" function basic_conv_bn(kernel_size::Dims{2}, inplanes, outplanes, activation = relu; kwargs...) return conv_norm(kernel_size, inplanes, outplanes, activation; norm_layer = BatchNorm, diff --git a/src/mixers/gmlp.jl b/src/mixers/gmlp.jl index ab89baadc..9288a0129 100644 --- a/src/mixers/gmlp.jl +++ b/src/mixers/gmlp.jl @@ -94,10 +94,13 @@ end @functor gMLP function gMLP(config::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), - inchannels::Integer = 3, nclasses::Integer = 1000) + pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(config, keys(MIXER_CONFIGS)) layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, patch_size, MIXER_CONFIGS[config]..., inchannels, nclasses) + if pretrain + loadpretrain!(layers, string("gmlp", config)) + end return gMLP(layers) end diff --git a/src/mixers/mlpmixer.jl b/src/mixers/mlpmixer.jl index 37cc271fb..5f5079827 100644 --- a/src/mixers/mlpmixer.jl +++ b/src/mixers/mlpmixer.jl @@ -57,11 +57,14 @@ end @functor MLPMixer function MLPMixer(config::Symbol; imsize::Dims{2} = (224, 224), - patch_size::Dims{2} = (16, 16), + patch_size::Dims{2} = (16, 16), pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(config, keys(MIXER_CONFIGS)) layers = mlpmixer(mixerblock, imsize; patch_size, MIXER_CONFIGS[config]..., inchannels, nclasses) + if pretrain + loadpretrain!(layers, string("mlpmixer", config)) + end return MLPMixer(layers) end diff --git a/src/mixers/resmlp.jl b/src/mixers/resmlp.jl index 21ad89d65..370ed04b2 100644 --- a/src/mixers/resmlp.jl +++ b/src/mixers/resmlp.jl @@ -56,11 +56,14 @@ end @functor ResMLP function ResMLP(config::Symbol; imsize::Dims{2} = (224, 224), - patch_size::Dims{2} = (16, 16), + patch_size::Dims{2} = (16, 16), pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer 
= 1000)
     _checkconfig(config, keys(MIXER_CONFIGS))
     layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size,
                       MIXER_CONFIGS[config]..., inchannels, nclasses)
+    if pretrain
+        loadpretrain!(layers, string("resmlp", config))
+    end
     return ResMLP(layers)
 end
diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl
index 75bfb5b07..43e74b394 100644
--- a/src/vit-based/vit.jl
+++ b/src/vit-based/vit.jl
@@ -100,9 +100,12 @@ end
 @functor ViT
 
 function ViT(config::Symbol; imsize::Dims{2} = (256, 256), patch_size::Dims{2} = (16, 16),
-             inchannels::Integer = 3, nclasses::Integer = 1000)
+             pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000)
     _checkconfig(config, keys(VIT_CONFIGS))
     layers = vit(imsize; inchannels, patch_size, nclasses, VIT_CONFIGS[config]...)
+    if pretrain
+        loadpretrain!(layers, string("vit", config))
+    end
     return ViT(layers)
 end

From 5ce0204d1a9c50c3ccf231e002825471e34d8dba Mon Sep 17 00:00:00 2001
From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com>
Date: Thu, 25 Aug 2022 22:06:10 +0530
Subject: [PATCH 2/8] The one true function

1. Unify MobileNet and EfficientNet APIs into a single lower level builder
   function
2. Further unification and consolidation of the mid level API
3. Some cleanup
---
 src/Metalhead.jl                             |   7 +-
 src/convnets/builders/irmodel.jl             |  41 +++++++
 src/convnets/builders/mbconv.jl              |  88 +++++----------
 src/convnets/builders/resnet.jl              |   9 ++
 src/convnets/builders/{core.jl => stages.jl} |   0
 src/convnets/convnext.jl                     |   6 +-
 src/convnets/efficientnets/core.jl           |  21 ----
 src/convnets/efficientnets/efficientnet.jl   |  16 ++-
 src/convnets/efficientnets/efficientnetv2.jl |  14 ++-
 src/convnets/inceptions/xception.jl          |   6 +-
 src/convnets/mobilenets/mnasnet.jl           |  68 +++--------
 src/convnets/mobilenets/mobilenetv1.jl       |  68 +++--------
 src/convnets/mobilenets/mobilenetv2.jl       |  78 +++----------
 src/convnets/mobilenets/mobilenetv3.jl       | 112 ++++++------------
 src/convnets/resnets/core.jl                 |  40 +++----
 src/convnets/resnets/res2net.jl              |   6 +-
 src/layers/Layers.jl                         |   2 +-
 src/layers/conv.jl                           |  28 +++--
 src/layers/drop.jl                           |   8 +-
 src/layers/mbconv.jl                         |  18 +--
 src/layers/selayers.jl                       |  20 ++--
 src/vit-based/vit.jl                         |   2 +-
 22 files changed, 243 insertions(+), 415 deletions(-)
 create mode 100644 src/convnets/builders/irmodel.jl
 create mode 100644 src/convnets/builders/resnet.jl
 rename src/convnets/builders/{core.jl => stages.jl} (100%)
 delete mode 100644 src/convnets/efficientnets/core.jl

diff --git a/src/Metalhead.jl b/src/Metalhead.jl
index f9be49db1..01fc149eb 100644
--- a/src/Metalhead.jl
+++ b/src/Metalhead.jl
@@ -20,18 +20,20 @@ using .Layers
 # CNN models
 ## Builders
-include("convnets/builders/core.jl")
+include("convnets/builders/irmodel.jl")
 include("convnets/builders/mbconv.jl")
 include("convnets/builders/resblocks.jl")
+include("convnets/builders/resnet.jl")
+include("convnets/builders/stages.jl")
 ## AlexNet and VGG
 include("convnets/alexnet.jl")
 include("convnets/vgg.jl")
 ## ResNets
 include("convnets/resnets/core.jl")
+include("convnets/resnets/res2net.jl")
 include("convnets/resnets/resnet.jl")
 include("convnets/resnets/resnext.jl")
 include("convnets/resnets/seresnet.jl")
-include("convnets/resnets/res2net.jl")
 ## Inceptions
 include("convnets/inceptions/googlenet.jl")
 include("convnets/inceptions/inceptionv3.jl")
 include("convnets/inceptions/inceptionv4.jl")
 include("convnets/inceptions/inceptionresnetv2.jl")
 include("convnets/inceptions/xception.jl")
 ## EfficientNets
-include("convnets/efficientnets/core.jl")
include("convnets/efficientnets/efficientnet.jl") include("convnets/efficientnets/efficientnetv2.jl") ## MobileNets diff --git a/src/convnets/builders/irmodel.jl b/src/convnets/builders/irmodel.jl new file mode 100644 index 000000000..7e604bbe0 --- /dev/null +++ b/src/convnets/builders/irmodel.jl @@ -0,0 +1,41 @@ +function irmodelbuilder(scalings::NTuple{2, Real}, block_configs::AbstractVector{<:Tuple}; + inplanes::Integer = 32, connection = +, activation = relu, + norm_layer = BatchNorm, divisor::Integer = 8, + tail_conv::Bool = true, expanded_classifier::Bool = false, + headplanes::Integer, dropout_rate = nothing, + inchannels::Integer = 3, nclasses::Integer = 1000, kwargs...) + width_mult, _ = scalings + # building first layer + inplanes = _round_channels(inplanes * width_mult, divisor) + layers = [] + append!(layers, + conv_norm((3, 3), inchannels, inplanes, activation; stride = 2, pad = 1, + norm_layer)) + # building inverted residual blocks + get_layers, block_repeats = mbconv_stage_builder(block_configs, inplanes, scalings; + norm_layer, divisor, kwargs...) + append!(layers, cnn_stages(get_layers, block_repeats, connection)) + # building last layers + outplanes = _round_channels(block_configs[end][3] * width_mult, divisor) + if tail_conv + # special case, supported fully only for MobileNetv3 + if expanded_classifier + midplanes = _round_channels(outplanes * block_configs[end][4], divisor) + append!(layers, + conv_norm((1, 1), outplanes, midplanes, activation; norm_layer)) + classifier = create_classifier(midplanes, headplanes, nclasses, + (hardswish, identity); dropout_rate) + else + append!(layers, + conv_norm((1, 1), outplanes, headplanes, activation; norm_layer)) + classifier = create_classifier(headplanes, nclasses; dropout_rate) + end + else + classifier = create_classifier(outplanes, nclasses; dropout_rate) + end + return Chain(Chain(layers...), classifier) +end + +function irmodelbuilder(width_mult::Real, block_configs::AbstractVector{<:Tuple}; kwargs...) + return irmodelbuilder((width_mult, 1), block_configs; kwargs...) +end diff --git a/src/convnets/builders/mbconv.jl b/src/convnets/builders/mbconv.jl index 31a936add..8190eb884 100644 --- a/src/convnets/builders/mbconv.jl +++ b/src/convnets/builders/mbconv.jl @@ -1,9 +1,14 @@ -function dwsepconv_builder(block_configs, inplanes::Integer, stage_idx::Integer, - width_mult::Real; norm_layer = BatchNorm, kwargs...) +# TODO - potentially make these builders more flexible to specify stuff like +# activation functions and reductions that don't change over the stages + +function dwsepconv_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer, + stage_idx::Integer, scalings::NTuple{2, Real}; + norm_layer = BatchNorm, divisor::Integer = 8, kwargs...) + width_mult, depth_mult = scalings block_fn, k, outplanes, stride, nrepeats, activation = block_configs[stage_idx] - outplanes = _round_channels(outplanes * width_mult) + outplanes = _round_channels(outplanes * width_mult, divisor) if stage_idx != 1 - inplanes = _round_channels(block_configs[stage_idx - 1][3] * width_mult) + inplanes = _round_channels(block_configs[stage_idx - 1][3] * width_mult, divisor) end function get_layers(block_idx::Integer) inplanes = block_idx == 1 ? inplanes : outplanes @@ -12,13 +17,14 @@ function dwsepconv_builder(block_configs, inplanes::Integer, stage_idx::Integer, stride, pad = SamePad(), norm_layer, kwargs...)...) 
return (block,) end - return get_layers, nrepeats + return get_layers, ceil(Int, nrepeats * depth_mult) end +_get_builder(::typeof(dwsep_conv_norm)) = dwsepconv_builder -function mbconv_builder(block_configs, inplanes::Integer, stage_idx::Integer, - scalings::NTuple{2, Real}; norm_layer = BatchNorm, - divisor::Integer = 8, se_from_explanes::Bool = false, - kwargs...) +function mbconv_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer, + stage_idx::Integer, scalings::NTuple{2, Real}; + norm_layer = BatchNorm, divisor::Integer = 8, + se_from_explanes::Bool = false, kwargs...) width_mult, depth_mult = scalings block_fn, k, outplanes, expansion, stride, nrepeats, reduction, activation = block_configs[stage_idx] # calculate number of reduced channels for squeeze-excite layer from explanes instead of inplanes @@ -39,69 +45,31 @@ function mbconv_builder(block_configs, inplanes::Integer, stage_idx::Integer, end return get_layers, ceil(Int, nrepeats * depth_mult) end +_get_builder(::typeof(mbconv)) = mbconv_builder -function mbconv_builder(block_configs, inplanes::Integer, stage_idx::Integer, - width_mult::Real; norm_layer = BatchNorm, kwargs...) - return mbconv_builder(block_configs, inplanes, stage_idx, (width_mult, 1); - norm_layer, kwargs...) -end - -function fused_mbconv_builder(block_configs, inplanes::Integer, stage_idx::Integer; - norm_layer = BatchNorm, kwargs...) +function fused_mbconv_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer, + stage_idx::Integer, scalings::NTuple{2, Real}; + norm_layer = BatchNorm, divisor::Integer = 8, kwargs...) + width_mult, depth_mult = scalings block_fn, k, outplanes, expansion, stride, nrepeats, activation = block_configs[stage_idx] inplanes = stage_idx == 1 ? inplanes : block_configs[stage_idx - 1][3] + outplanes = _round_channels(outplanes * width_mult, divisor) function get_layers(block_idx::Integer) inplanes = block_idx == 1 ? inplanes : outplanes - explanes = _round_channels(inplanes * expansion, 8) + explanes = _round_channels(inplanes * expansion, divisor) stride = block_idx == 1 ? stride : 1 block = block_fn((k, k), inplanes, explanes, outplanes, activation; norm_layer, stride, kwargs...) return stride == 1 && inplanes == outplanes ? (identity, block) : (block,) end - return get_layers, nrepeats -end - -# TODO - these builders need to be more flexible to potentially specify stuff like -# activation functions and reductions that don't change -function _get_builder(::typeof(dwsep_conv_bn), block_configs::AbstractVector{<:Tuple}, - inplanes::Integer, stage_idx::Integer; - scalings::Union{Nothing, NTuple{2, Real}} = nothing, - width_mult::Union{Nothing, Number} = nothing, norm_layer, kwargs...) - @assert isnothing(scalings) "dwsep_conv_bn does not support the `scalings` argument" - return dwsepconv_builder(block_configs, inplanes, stage_idx, width_mult; norm_layer, - kwargs...) -end - -function _get_builder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple}, - inplanes::Integer, stage_idx::Integer; - scalings::Union{Nothing, NTuple{2, Real}} = nothing, - width_mult::Union{Nothing, Number} = nothing, norm_layer, kwargs...) - if isnothing(scalings) - return mbconv_builder(block_configs, inplanes, stage_idx, width_mult; norm_layer, - kwargs...) - elseif isnothing(width_mult) - return mbconv_builder(block_configs, inplanes, stage_idx, scalings; norm_layer, - kwargs...) 
- else - throw(ArgumentError("Only one of `scalings` and `width_mult` can be specified")) - end -end - -function _get_builder(::typeof(fused_mbconv), block_configs::AbstractVector{<:Tuple}, - inplanes::Integer, stage_idx::Integer; - scalings::Union{Nothing, NTuple{2, Real}} = nothing, - width_mult::Union{Nothing, Number} = nothing, norm_layer) - @assert isnothing(width_mult) "fused_mbconv does not support the `width_mult` argument." - @assert isnothing(scalings)||scalings == (1, 1) "fused_mbconv does not support the `scalings` argument" - return fused_mbconv_builder(block_configs, inplanes, stage_idx; norm_layer) + return get_layers, ceil(Int, nrepeats * depth_mult) end +_get_builder(::typeof(fused_mbconv)) = fused_mbconv_builder -function mbconv_stack_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer; - scalings::Union{Nothing, NTuple{2, Real}} = nothing, - width_mult::Union{Nothing, Number} = nothing, - norm_layer = BatchNorm, kwargs...) - bxs = [_get_builder(block_configs[idx][1], block_configs, inplanes, idx; scalings, - width_mult, norm_layer, kwargs...) +function mbconv_stage_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer, + scalings::NTuple{2, Real}; kwargs...) + builders = _get_builder.(first.(block_configs)) + bxs = [builders[idx](block_configs, inplanes, idx, scalings; kwargs...) for idx in eachindex(block_configs)] return (stage_idx, block_idx) -> first.(bxs)[stage_idx](block_idx), last.(bxs) end diff --git a/src/convnets/builders/resnet.jl b/src/convnets/builders/resnet.jl new file mode 100644 index 000000000..a77ff4c6d --- /dev/null +++ b/src/convnets/builders/resnet.jl @@ -0,0 +1,9 @@ +function resnetbuilder(img_dims, stem, get_layers, block_repeats::AbstractVector{<:Integer}, + connection, classifier_fn) + # Build stages of the ResNet + stage_blocks = cnn_stages(get_layers, block_repeats, connection) + backbone = Chain(stem, stage_blocks...) 
+    # Add classifier to the backbone
+    nfeaturemaps = Flux.outputsize(backbone, img_dims; padbatch = true)[3]
+    return Chain(backbone, classifier_fn(nfeaturemaps))
+end
diff --git a/src/convnets/builders/core.jl b/src/convnets/builders/stages.jl
similarity index 100%
rename from src/convnets/builders/core.jl
rename to src/convnets/builders/stages.jl
diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl
index 0e74f4a8d..e11568d01 100644
--- a/src/convnets/convnext.jl
+++ b/src/convnets/convnext.jl
@@ -46,11 +46,11 @@ function convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:In
            "`planes` should have exactly one value for each block"
     downsample_layers = []
     push!(downsample_layers,
-          Chain(conv_norm((4, 4), inchannels => planes[1]; stride = 4,
+          Chain(conv_norm((4, 4), inchannels, planes[1]; stride = 4,
                           norm_layer = ChannelLayerNorm)...))
     for m in 1:(length(depths) - 1)
         push!(downsample_layers,
-              Chain(conv_norm((2, 2), planes[m] => planes[m + 1]; stride = 2,
+              Chain(conv_norm((2, 2), planes[m], planes[m + 1]; stride = 2,
                               norm_layer = ChannelLayerNorm, revnorm = true)...))
     end
     stages = []
@@ -111,7 +111,7 @@ function ConvNeXt(config::Symbol; pretrain::Bool = false, inchannels::Integer = 3
     _checkconfig(config, keys(CONVNEXT_CONFIGS))
     layers = convnext(config; inchannels, nclasses)
     if pretrain
-        layers = load_pretrained(layers, "convnext_$config")
+        layers = loadpretrain!(layers, "convnext_$config")
     end
     return ConvNeXt(layers)
 end
diff --git a/src/convnets/efficientnets/core.jl b/src/convnets/efficientnets/core.jl
deleted file mode 100644
index 8c10ae0fd..000000000
--- a/src/convnets/efficientnets/core.jl
+++ /dev/null
@@ -1,21 +0,0 @@
-function efficientnet_core(block_configs::AbstractVector{<:Tuple}; inplanes::Integer,
-                           scalings::NTuple{2, Real} = (1, 1),
-                           headplanes::Integer = block_configs[end][3] * 4,
-                           norm_layer = BatchNorm, dropout_rate = nothing,
-                           inchannels::Integer = 3, nclasses::Integer = 1000)
-    layers = []
-    # stem of the model
-    inplanes = _round_channels(inplanes * scalings[1])
-    append!(layers,
-            conv_norm((3, 3), inchannels, inplanes, swish; norm_layer, stride = 2,
-                      pad = SamePad()))
-    # building inverted residual blocks
-    get_layers, block_repeats = mbconv_stack_builder(block_configs, inplanes; scalings,
-                                                     norm_layer)
-    append!(layers, cnn_stages(get_layers, block_repeats, +))
-    # building last layers
-    append!(layers,
-            conv_norm((1, 1), _round_channels(block_configs[end][3] * scalings[1]),
-                      headplanes, swish; pad = SamePad()))
-    return Chain(Chain(layers...), create_classifier(headplanes, nclasses; dropout_rate))
-end
diff --git a/src/convnets/efficientnets/efficientnet.jl b/src/convnets/efficientnets/efficientnet.jl
index ab53bdd58..abc818bea 100644
--- a/src/convnets/efficientnets/efficientnet.jl
+++ b/src/convnets/efficientnets/efficientnet.jl
@@ -31,6 +31,17 @@ const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)),
                                          :b7 => (600, (2.0, 3.1)),
                                          :b8 => (672, (2.2, 3.6)))
 
+function efficientnet(config::Symbol; norm_layer = BatchNorm,
+                      dropout_rate = nothing, inchannels::Integer = 3,
+                      nclasses::Integer = 1000)
+    _checkconfig(config, keys(EFFICIENTNET_GLOBAL_CONFIGS))
+    scalings = EFFICIENTNET_GLOBAL_CONFIGS[config][2]
+    return irmodelbuilder(scalings, EFFICIENTNET_BLOCK_CONFIGS; inplanes = 32,
+                          norm_layer, activation = swish,
+                          headplanes = EFFICIENTNET_BLOCK_CONFIGS[end][3] * 4,
+                          dropout_rate, inchannels, nclasses)
+end
+
 """
     EfficientNet(config::Symbol; pretrain::Bool = false)
@@ -50,10 +61,7 @@ end
 
 function
EfficientNet(config::Symbol; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) - _checkconfig(config, keys(EFFICIENTNET_GLOBAL_CONFIGS)) - scalings = EFFICIENTNET_GLOBAL_CONFIGS[config][2] - layers = efficientnet_core(EFFICIENTNET_BLOCK_CONFIGS; inplanes = 32, scalings, - inchannels, nclasses) + layers = efficientnet(config; inchannels, nclasses) if pretrain loadpretrain!(layers, string("efficientnet-", config)) end diff --git a/src/convnets/efficientnets/efficientnetv2.jl b/src/convnets/efficientnets/efficientnetv2.jl index 516208bf7..6eda393ed 100644 --- a/src/convnets/efficientnets/efficientnetv2.jl +++ b/src/convnets/efficientnets/efficientnetv2.jl @@ -35,6 +35,15 @@ const EFFNETV2_CONFIGS = Dict(:small => [(fused_mbconv, 3, 24, 1, 1, 2, swish), (mbconv, 3, 512, 6, 2, 32, 4, swish), (mbconv, 3, 768, 6, 1, 8, 4, swish)]) +function efficientnetv2(config::Symbol; norm_layer = BatchNorm, dropout_rate = nothing, + inchannels::Integer = 3, nclasses::Integer = 1000) + _checkconfig(config, keys(EFFNETV2_CONFIGS)) + block_configs = EFFNETV2_CONFIGS[config] + return irmodelbuilder((1, 1), block_configs; activation = swish, norm_layer, + inplanes = block_configs[1][3], headplanes = 1280, + dropout_rate, inchannels, nclasses) +end + """ EfficientNetv2(config::Symbol; pretrain::Bool = false, width_mult::Real = 1, inchannels::Integer = 3, nclasses::Integer = 1000) @@ -57,10 +66,7 @@ end function EfficientNetv2(config::Symbol; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) - _checkconfig(config, sort(collect(keys(EFFNETV2_CONFIGS)))) - block_configs = EFFNETV2_CONFIGS[config] - layers = efficientnet_core(block_configs; inplanes = block_configs[1][3], - headplanes = 1280, inchannels, nclasses) + layers = efficientnetv2(config; inchannels, nclasses) if pretrain loadpretrain!(layers, string("efficientnetv2-", config)) end diff --git a/src/convnets/inceptions/xception.jl b/src/convnets/inceptions/xception.jl index 171bddd19..4c28644d0 100644 --- a/src/convnets/inceptions/xception.jl +++ b/src/convnets/inceptions/xception.jl @@ -34,7 +34,7 @@ function xception_block(inchannels::Integer, outchannels::Integer, nrepeats::Int end push!(layers, relu) append!(layers, - dwsep_conv_bn((3, 3), inc, outc; pad = 1, use_norm = (false, false))) + dwsep_conv_norm((3, 3), inc, outc; pad = 1, use_norm = (false, false))) push!(layers, BatchNorm(outc)) end layers = start_with_relu ? layers : layers[2:end] @@ -62,8 +62,8 @@ function xception(; dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integ xception_block(256, 728, 2; stride = 2), [xception_block(728, 728, 3) for _ in 1:8]..., xception_block(728, 1024, 2; stride = 2, grow_at_start = false), - dwsep_conv_bn((3, 3), 1024, 1536; pad = 1)..., - dwsep_conv_bn((3, 3), 1536, 2048; pad = 1)...) + dwsep_conv_norm((3, 3), 1024, 1536; pad = 1)..., + dwsep_conv_norm((3, 3), 1536, 2048; pad = 1)...) return Chain(backbone, create_classifier(2048, nclasses; dropout_rate)) end diff --git a/src/convnets/mobilenets/mnasnet.jl b/src/convnets/mobilenets/mnasnet.jl index fa8207ca8..6906d1b6d 100644 --- a/src/convnets/mobilenets/mnasnet.jl +++ b/src/convnets/mobilenets/mnasnet.jl @@ -1,55 +1,5 @@ -# momentum used for BatchNorm as per Tensorflow implementation -const _MNASNET_BN_MOMENTUM = 0.0003f0 - -""" - mnasnet(block_configs::AbstractVector{<:Tuple}; width_mult::Real = 1, - max_width = 1280, dropout_rate = 0.2, inchannels::Integer = 3, - nclasses::Integer = 1000) - -Create an MNASNet model with the specified configuration. 
-([reference](https://arxiv.org/abs/1807.11626)). - -# Arguments - - - `width_mult`: Controls the number of output feature maps in each block - (with 1 being the default in the paper) - - `max_width`: The maximum number of feature maps in any layer of the network - - `dropout_rate`: rate of dropout in the classifier head. Set to `nothing` to disable dropout. - - `inchannels`: The number of input channels. - - `nclasses`: The number of output classes -""" -function mnasnet(block_configs::AbstractVector{<:Tuple}; width_mult::Real = 1, - max_width::Integer = 1280, inplanes::Integer = 32, dropout_rate = 0.2, - inchannels::Integer = 3, nclasses::Integer = 1000) - # norm layer for MNASNet is different from other models - norm_layer = (args...; kwargs...) -> BatchNorm(args...; momentum = _MNASNET_BN_MOMENTUM, - kwargs...) - # building first layer - inplanes = _round_channels(inplanes * width_mult) - layers = [] - append!(layers, - conv_norm((3, 3), inchannels, inplanes, relu; stride = 2, pad = 1, - norm_layer)) - # building inverted residual blocks - get_layers, block_repeats = mbconv_stack_builder(block_configs, inplanes; width_mult, - norm_layer) - append!(layers, cnn_stages(get_layers, block_repeats, +)) - # building last layers - outplanes = _round_channels(block_configs[end][3] * width_mult) - append!(layers, - conv_norm((1, 1), outplanes, max_width, relu; norm_layer)) - return Chain(Chain(layers...), create_classifier(max_width, nclasses; dropout_rate)) -end - -function mnasnet(config::Symbol; width_mult::Real = 1, max_width::Integer = 1280, - dropout_rate = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) - inplanes, block_configs = MNASNET_CONFIGS[config] - return mnasnet(block_configs; width_mult, max_width, dropout_rate, inplanes, - inchannels, nclasses) -end - # Layer configurations for MNasNet -# f: block function - we use `dwsep_conv_bn` for the first block and `mbconv` for the rest +# f: block function - we use `dwsep_conv_norm` for the first block and `mbconv` for the rest # k: kernel size # c: output channels # e: expansion factor - only used for `mbconv` @@ -60,7 +10,7 @@ end # Data is organised as (f, k, c, (e,) s, n, (r,) a) const MNASNET_CONFIGS = Dict(:B1 => (32, [ - (dwsep_conv_bn, 3, 16, 1, 1, relu), + (dwsep_conv_norm, 3, 16, 1, 1, relu), (mbconv, 3, 24, 3, 2, 3, nothing, relu), (mbconv, 5, 40, 3, 2, 3, nothing, relu), (mbconv, 5, 80, 6, 2, 3, nothing, relu), @@ -70,7 +20,7 @@ const MNASNET_CONFIGS = Dict(:B1 => (32, ]), :A1 => (32, [ - (dwsep_conv_bn, 3, 16, 1, 1, relu), + (dwsep_conv_norm, 3, 16, 1, 1, relu), (mbconv, 3, 24, 6, 2, 2, nothing, relu), (mbconv, 5, 40, 3, 2, 3, 4, relu), (mbconv, 3, 80, 6, 2, 4, nothing, relu), @@ -80,7 +30,7 @@ const MNASNET_CONFIGS = Dict(:B1 => (32, ]), :small => (8, [ - (dwsep_conv_bn, 3, 8, 1, 1, relu), + (dwsep_conv_norm, 3, 8, 1, 1, relu), (mbconv, 3, 16, 3, 2, 1, nothing, relu), (mbconv, 3, 16, 6, 2, 2, nothing, relu), (mbconv, 5, 32, 6, 2, 4, 4, relu), @@ -89,6 +39,16 @@ const MNASNET_CONFIGS = Dict(:B1 => (32, (mbconv, 3, 144, 6, 1, 1, nothing, relu), ])) +function mnasnet(config::Symbol; width_mult::Real = 1, max_width::Integer = 1280, + dropout_rate = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) + _checkconfig(config, keys(MNASNET_CONFIGS)) + # momentum used for BatchNorm is as per Tensorflow implementation + norm_layer = (args...; kwargs...) -> BatchNorm(args...; momentum = 0.0003f0, kwargs...) 
+ inplanes, block_configs = MNASNET_CONFIGS[config] + return irmodelbuilder(width_mult, block_configs; inplanes, norm_layer, + headplanes = max_width, dropout_rate, inchannels, nclasses) +end + """ MNASNet(width_mult = 1; inchannels::Integer = 3, pretrain::Bool = false, nclasses::Integer = 1000) diff --git a/src/convnets/mobilenets/mobilenetv1.jl b/src/convnets/mobilenets/mobilenetv1.jl index fb7e3d4ce..092391030 100644 --- a/src/convnets/mobilenets/mobilenetv1.jl +++ b/src/convnets/mobilenets/mobilenetv1.jl @@ -1,64 +1,26 @@ -""" - mobilenetv1(width_mult::Real, config::AbstractVector{<:Tuple}; - activation = relu, dropout_rate = nothing, - inchannels::Integer = 3, nclasses::Integer = 1000) - -Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)). - -# Arguments - - - `width_mult`: Controls the number of output feature maps in each block - (with 1 being the default in the paper) - - - `configs`: A "list of tuples" configuration for each layer that details: - - + `dw`: Set true to use a depthwise separable convolution or false for regular convolution - + `o`: The number of output feature maps - + `s`: The stride of the convolutional kernel - + `r`: The number of time this configuration block is repeated - - `activate`: The activation function to use throughout the network - - `dropout_rate`: The dropout rate to use in the classifier head. Set to `nothing` to disable. - - `inchannels`: The number of input channels. The default value is 3. - - `nclasses`: The number of output classes -""" -function mobilenetv1(config::AbstractVector{<:Tuple}; width_mult::Real = 1, - activation = relu, dropout_rate = nothing, - inplanes::Integer = 32, inchannels::Integer = 3, - nclasses::Integer = 1000) - layers = [] - # stem of the model - inplanes = _round_channels(inplanes * width_mult) - append!(layers, - conv_norm((3, 3), inchannels, inplanes, activation; stride = 2, pad = 1)) - # building inverted residual blocks - get_layers, block_repeats = mbconv_stack_builder(config, inplanes; width_mult) - append!(layers, cnn_stages(get_layers, block_repeats)) - outplanes = _round_channels(config[end][3] * width_mult) - return Chain(Chain(layers...), create_classifier(outplanes, nclasses; dropout_rate)) -end - -function mobilenetv1(width_mult::Real = 1; activation = relu, dropout_rate = nothing, - inchannels::Integer = 3, nclasses::Integer = 1000) - return mobilenetv1(MOBILENETV1_CONFIGS[config]; width_mult, activation, - dropout_rate, inchannels, nclasses) -end - # Layer configurations for MobileNetv1 -# f: block function - we use `dwsep_conv_bn` for all blocks +# f: block function - we use `dwsep_conv_norm` for all blocks # k: kernel size # c: output channels # s: stride # n: number of repeats # a: activation function +# Data is organised as (f, k, c, s, n, a) const MOBILENETV1_CONFIGS = [ - # f, k, c, s, n, a - (dwsep_conv_bn, 3, 64, 1, 1, relu6), - (dwsep_conv_bn, 3, 128, 2, 2, relu6), - (dwsep_conv_bn, 3, 256, 2, 2, relu6), - (dwsep_conv_bn, 3, 512, 2, 6, relu6), - (dwsep_conv_bn, 3, 1024, 2, 2, relu6), + (dwsep_conv_norm, 3, 64, 1, 1, relu6), + (dwsep_conv_norm, 3, 128, 2, 2, relu6), + (dwsep_conv_norm, 3, 256, 2, 2, relu6), + (dwsep_conv_norm, 3, 512, 2, 6, relu6), + (dwsep_conv_norm, 3, 1024, 2, 2, relu6), ] +function mobilenetv1(width_mult::Real = 1; inplanes::Integer = 32, dropout_rate = nothing, + inchannels::Integer = 3, nclasses::Integer = 1000) + return irmodelbuilder(width_mult, MOBILENETV1_CONFIGS; inplanes, inchannels, + activation = relu6, connection = nothing, tail_conv = 
false, + headplanes = 1024, dropout_rate, nclasses) +end + """ MobileNetv1(width_mult::Real = 1; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) @@ -88,7 +50,7 @@ end function MobileNetv1(width_mult::Real = 1; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) - layers = mobilenetv1(MOBILENETV1_CONFIGS; width_mult, inchannels, nclasses) + layers = mobilenetv1(width_mult; inchannels, nclasses) if pretrain loadpretrain!(layers, string("MobileNetv1")) end diff --git a/src/convnets/mobilenets/mobilenetv2.jl b/src/convnets/mobilenets/mobilenetv2.jl index 1166676ed..8f3db168b 100644 --- a/src/convnets/mobilenets/mobilenetv2.jl +++ b/src/convnets/mobilenets/mobilenetv2.jl @@ -1,56 +1,3 @@ -""" - mobilenetv2(configs::AbstractVector{<:Tuple}; width_mult::Real = 1, - max_width::Integer = 1280, divisor::Integer = 8, dropout_rate = 0.2, - inchannels::Integer = 3, nclasses::Integer = 1000) - -Create a MobileNetv2 model. -([reference](https://arxiv.org/abs/1801.04381)). - -# Arguments - - - `configs`: A "list of tuples" configuration for each layer that details: - - + `t`: The expansion factor that controls the number of feature maps in the bottleneck layer - + `c`: The number of output feature maps - + `n`: The number of times a block is repeated - + `s`: The stride of the convolutional kernel - - - `width_mult`: Controls the number of output feature maps in each block - (with 1 being the default in the paper) - - `max_width`: The maximum number of feature maps in any layer of the network - - `divisor`: The divisor used to round the number of feature maps in each block - - `dropout_rate`: rate of dropout in the classifier head. Set to `nothing` to disable dropout. - - `inchannels`: The number of input channels. 
- - `nclasses`: The number of output classes -""" -function mobilenetv2(block_configs::AbstractVector{<:Tuple}; width_mult::Real = 1, - max_width::Integer = 1280, divisor::Integer = 8, - inplanes::Integer = 32, dropout_rate = 0.2, - inchannels::Integer = 3, nclasses::Integer = 1000) - # building first layer - inplanes = _round_channels(inplanes * width_mult, divisor) - layers = [] - append!(layers, - conv_norm((3, 3), inchannels, inplanes; pad = 1, stride = 2)) - # building inverted residual blocks - get_layers, block_repeats = mbconv_stack_builder(block_configs, inplanes; width_mult, - divisor) - append!(layers, cnn_stages(get_layers, block_repeats, +)) - # building last layers - outplanes = _round_channels(block_configs[end][3] * width_mult, divisor) - headplanes = _round_channels(max_width * max(1, width_mult), divisor) - append!(layers, conv_norm((1, 1), outplanes, headplanes, relu6)) - return Chain(Chain(layers...), create_classifier(headplanes, nclasses; dropout_rate)) -end - -function mobilenetv2(width_mult::Real = 1; max_width::Integer = 1280, - divisor::Integer = 8, inplanes::Integer = 32, - dropout_rate = 0.2, inchannels::Integer = 3, - nclasses::Integer = 1000) - return mobilenetv2(MOBILENETV2_CONFIGS; width_mult, max_width, divisor, inplanes, - dropout_rate, inchannels, nclasses) -end - # Layer configurations for MobileNetv2 # f: block function - we use `mbconv` for all blocks # k: kernel size @@ -60,17 +7,24 @@ end # n: number of repeats # r: reduction factor # a: activation function +# Data is organised as (f, k, c, e, s, n, r, a) const MOBILENETV2_CONFIGS = [ - # f, k, c, e, s, n, r, a - (mbconv, 3, 16, 1, 1, 1, nothing, swish), - (mbconv, 3, 24, 6, 2, 2, nothing, swish), - (mbconv, 3, 32, 6, 2, 3, nothing, swish), - (mbconv, 3, 64, 6, 2, 4, nothing, swish), - (mbconv, 3, 96, 6, 1, 3, nothing, swish), - (mbconv, 3, 160, 6, 2, 3, nothing, swish), - (mbconv, 3, 320, 6, 1, 1, nothing, swish), + (mbconv, 3, 16, 1, 1, 1, nothing, relu6), + (mbconv, 3, 24, 6, 2, 2, nothing, relu6), + (mbconv, 3, 32, 6, 2, 3, nothing, relu6), + (mbconv, 3, 64, 6, 2, 4, nothing, relu6), + (mbconv, 3, 96, 6, 1, 3, nothing, relu6), + (mbconv, 3, 160, 6, 2, 3, nothing, relu6), + (mbconv, 3, 320, 6, 1, 1, nothing, relu6), ] +function mobilenetv2(width_mult::Real = 1; max_width::Integer = 1280, + inplanes::Integer = 32, dropout_rate = 0.2, + inchannels::Integer = 3, nclasses::Integer = 1000) + return irmodelbuilder(width_mult, MOBILENETV2_CONFIGS; activation = relu6, inplanes, + headplanes = max_width, dropout_rate, inchannels, nclasses) +end + """ MobileNetv2(width_mult = 1.0; inchannels::Integer = 3, pretrain::Bool = false, nclasses::Integer = 1000) @@ -100,7 +54,7 @@ end function MobileNetv2(width_mult::Real = 1; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) - layers = mobilenetv2(MOBILENETV2_CONFIGS; width_mult, inchannels, nclasses) + layers = mobilenetv2(width_mult; inchannels, nclasses) if pretrain loadpretrain!(layers, string("MobileNetv2")) end diff --git a/src/convnets/mobilenets/mobilenetv3.jl b/src/convnets/mobilenets/mobilenetv3.jl index 207bc77f7..2ad394b61 100644 --- a/src/convnets/mobilenets/mobilenetv3.jl +++ b/src/convnets/mobilenets/mobilenetv3.jl @@ -1,58 +1,3 @@ -""" - mobilenetv3(configs::AbstractVector{<:Tuple}; width_mult::Real = 1, - max_width::Integer = 1024, dropout_rate = 0.2, - inchannels::Integer = 3, nclasses::Integer = 1000) - -Create a MobileNetv3 model. -([reference](https://arxiv.org/abs/1905.02244)). 
- -# Arguments - - - `configs`: a "list of tuples" configuration for each layer that details: - - + `k::Integer` - The size of the convolutional kernel - + `c::Float` - The multiplier factor for deciding the number of feature maps in the hidden layer - + `t::Integer` - The number of output feature maps for a given block - + `r::Integer` - The reduction factor (`>= 1` or `nothing` to skip) for squeeze and excite layers - + `s::Integer` - The stride of the convolutional kernel - + `a` - The activation function used in the bottleneck (typically `hardswish` or `relu`) - - - `width_mult`: Controls the number of output feature maps in each block - (with 1 being the default in the paper; this is usually a value between 0.1 and 1.4.) - - `max_width`: The maximum number of feature maps in any layer of the network - - `dropout_rate`: The dropout rate to use in the classifier head. Set to `nothing` to disable. - - `inchannels`: The number of input channels. - - `nclasses`: the number of output classes -""" -function mobilenetv3(configs::AbstractVector{<:Tuple}; width_mult::Real = 1, - max_width::Integer = 1024, dropout_rate = 0.2, - inchannels::Integer = 3, nclasses::Integer = 1000) - # building first layer - inplanes = _round_channels(16 * width_mult) - layers = [] - append!(layers, - conv_norm((3, 3), inchannels, inplanes, hardswish; stride = 2, pad = 1)) - # building inverted residual blocks - get_layers, block_repeats = mbconv_stack_builder(configs, inplanes; width_mult, - se_from_explanes = true, - se_round_fn = _round_channels) - append!(layers, cnn_stages(get_layers, block_repeats, +)) - # building last layers - explanes = _round_channels(configs[end][3] * width_mult) - midplanes = _round_channels(explanes * configs[end][4]) - append!(layers, conv_norm((1, 1), explanes, midplanes, hardswish)) - return Chain(Chain(layers...), - create_classifier(midplanes, max_width, nclasses, - (hardswish, identity); dropout_rate)) -end - -function mobilenetv3(config::Symbol; width_mult::Real = 1, dropout_rate = 0.2, - inchannels::Integer = 3, nclasses::Integer = 1000) - max_width = config === :large ? 
1280 : 1024
-    return mobilenetv3(MOBILENETV3_CONFIGS[config]; width_mult, max_width,
-                       dropout_rate, inchannels, nclasses)
-end
-
 # Layer configurations for small and large models for MobileNetv3
 # f: mbconv block function - we use `mbconv` for all blocks
 # k: kernel size
@@ -63,27 +8,41 @@ end
 # r: squeeze and excite reduction factor
 # a: activation function
 # Data is organised as (f, k, c, e, s, n, r, a)
-const MOBILENETV3_CONFIGS = Dict(:small => [
-                                     (mbconv, 3, 16, 1, 2, 1, 4, relu),
-                                     (mbconv, 3, 24, 4.5, 2, 1, nothing, relu),
-                                     (mbconv, 3, 24, 3.67, 1, 1, nothing, relu),
-                                     (mbconv, 5, 40, 4, 2, 1, 4, hardswish),
-                                     (mbconv, 5, 40, 6, 1, 2, 4, hardswish),
-                                     (mbconv, 5, 48, 3, 1, 2, 4, hardswish),
-                                     (mbconv, 5, 96, 6, 1, 3, 4, hardswish),
-                                 ],
-                                 :large => [
-                                     (mbconv, 3, 16, 1, 1, 1, nothing, relu),
-                                     (mbconv, 3, 24, 4, 2, 1, nothing, relu),
-                                     (mbconv, 3, 24, 3, 1, 1, nothing, relu),
-                                     (mbconv, 5, 40, 3, 2, 1, 4, relu),
-                                     (mbconv, 5, 40, 3, 1, 2, 4, relu),
-                                     (mbconv, 3, 80, 6, 2, 1, nothing, hardswish),
-                                     (mbconv, 3, 80, 2.5, 1, 1, nothing, hardswish),
-                                     (mbconv, 3, 80, 2.3, 1, 2, nothing, hardswish),
-                                     (mbconv, 3, 112, 6, 1, 2, 4, hardswish),
-                                     (mbconv, 5, 160, 6, 1, 3, 4, hardswish),
-                                 ])
+const MOBILENETV3_CONFIGS = Dict(:small => (1024,
+                                            [
+                                                (mbconv, 3, 16, 1, 2, 1, 4, relu),
+                                                (mbconv, 3, 24, 4.5, 2, 1, nothing, relu),
+                                                (mbconv, 3, 24, 3.67, 1, 1, nothing, relu),
+                                                (mbconv, 5, 40, 4, 2, 1, 4, hardswish),
+                                                (mbconv, 5, 40, 6, 1, 2, 4, hardswish),
+                                                (mbconv, 5, 48, 3, 1, 2, 4, hardswish),
+                                                (mbconv, 5, 96, 6, 1, 3, 4, hardswish),
+                                            ]),
+                                 :large => (1280,
+                                            [
+                                                (mbconv, 3, 16, 1, 1, 1, nothing, relu),
+                                                (mbconv, 3, 24, 4, 2, 1, nothing, relu),
+                                                (mbconv, 3, 24, 3, 1, 1, nothing, relu),
+                                                (mbconv, 5, 40, 3, 2, 3, 4, relu),
+                                                (mbconv, 3, 80, 6, 2, 1, nothing,
+                                                 hardswish),
+                                                (mbconv, 3, 80, 2.5, 1, 1, nothing,
+                                                 hardswish),
+                                                (mbconv, 3, 80, 2.3, 1, 2, nothing,
+                                                 hardswish),
+                                                (mbconv, 3, 112, 6, 1, 2, 4, hardswish),
+                                                (mbconv, 5, 160, 6, 1, 3, 4, hardswish),
+                                            ]))
+
+function mobilenetv3(config::Symbol; width_mult::Real = 1, dropout_rate = 0.2,
+                     inchannels::Integer = 3, nclasses::Integer = 1000)
+    _checkconfig(config, [:small, :large])
+    max_width, block_configs = MOBILENETV3_CONFIGS[config]
+    return irmodelbuilder(width_mult, block_configs; inplanes = 16,
+                          headplanes = max_width, activation = hardswish,
+                          se_from_explanes = true, se_round_fn = _round_channels,
+                          expanded_classifier = true, dropout_rate, inchannels, nclasses)
+end
 
 """
     MobileNetv3(config::Symbol; width_mult::Real = 1, pretrain::Bool = false,
@@ -116,7 +75,6 @@ end
 function MobileNetv3(config::Symbol; width_mult::Real = 1, pretrain::Bool = false,
                      inchannels::Integer = 3, nclasses::Integer = 1000)
-    _checkconfig(config, [:small, :large])
     layers = mobilenetv3(config; width_mult, inchannels, nclasses)
     if pretrain
         loadpretrain!(layers, string("MobileNetv3", config))
     end
diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl
index df58524a7..c9c33392b 100644
--- a/src/convnets/resnets/core.jl
+++ b/src/convnets/resnets/core.jl
@@ -27,9 +27,9 @@ function basicblock(inplanes::Integer, planes::Integer; stride::Integer = 1,
                     drop_block = identity, drop_path = identity,
                     attn_fn = planes -> identity)
     first_planes = planes ÷ reduction_factor
-    conv_bn1 = conv_norm((3, 3), inplanes => first_planes, identity; norm_layer, revnorm,
+    conv_bn1 = conv_norm((3, 3), inplanes, first_planes, identity; norm_layer, revnorm,
                          stride, pad = 1)
-    conv_bn2 = conv_norm((3, 3), first_planes => planes, identity; norm_layer, revnorm,
+
conv_bn2 = conv_norm((3, 3), first_planes, planes, identity; norm_layer, revnorm, pad = 1) layers = [conv_bn1..., drop_block, activation, conv_bn2..., attn_fn(planes), drop_path] @@ -71,11 +71,11 @@ function bottleneck(inplanes::Integer, planes::Integer; stride::Integer, width = fld(planes * base_width, 64) * cardinality first_planes = width ÷ reduction_factor outplanes = planes * 4 - conv_bn1 = conv_norm((1, 1), inplanes => first_planes, activation; norm_layer, + conv_bn1 = conv_norm((1, 1), inplanes, first_planes, activation; norm_layer, revnorm) - conv_bn2 = conv_norm((3, 3), first_planes => width, identity; norm_layer, revnorm, + conv_bn2 = conv_norm((3, 3), first_planes, width, identity; norm_layer, revnorm, stride, pad = 1, groups = cardinality) - conv_bn3 = conv_norm((1, 1), width => outplanes, identity; norm_layer, revnorm) + conv_bn3 = conv_norm((1, 1), width, outplanes, identity; norm_layer, revnorm) layers = [conv_bn1..., conv_bn2..., drop_block, activation, conv_bn3..., attn_fn(outplanes), drop_path] return Chain(filter!(!=(identity), layers)...) @@ -84,7 +84,7 @@ end # Downsample layer using convolutions. function downsample_conv(inplanes::Integer, outplanes::Integer; stride::Integer = 1, norm_layer = BatchNorm, revnorm::Bool = false) - return Chain(conv_norm((1, 1), inplanes => outplanes, identity; norm_layer, revnorm, + return Chain(conv_norm((1, 1), inplanes, outplanes, identity; norm_layer, revnorm, pad = SamePad(), stride)...) end @@ -93,7 +93,7 @@ function downsample_pool(inplanes::Integer, outplanes::Integer; stride::Integer norm_layer = BatchNorm, revnorm::Bool = false) pool = stride == 1 ? identity : MeanPool((2, 2); stride, pad = SamePad()) return Chain(pool, - conv_norm((1, 1), inplanes => outplanes, identity; norm_layer, + conv_norm((1, 1), inplanes, outplanes, identity; norm_layer, revnorm)...) end @@ -165,18 +165,18 @@ function resnet_stem(stem_type::Symbol = :default; inchannels::Integer = 3, norm_layer = BatchNorm, revnorm::Bool = false) _checkconfig(stem_type, [:default, :deep, :deep_tiered]) # Main stem - deep_stem = stem_type == :deep || stem_type == :deep_tiered + deep_stem = stem_type === :deep || stem_type === :deep_tiered inplanes = deep_stem ? stem_width * 2 : 64 # Deep stem that uses three successive 3x3 convolutions instead of a single 7x7 convolution if deep_stem - if stem_type == :deep + if stem_type === :deep stem_channels = (stem_width, stem_width) - elseif stem_type == :deep_tiered + elseif stem_type === :deep_tiered stem_channels = (3 * (stem_width ÷ 4), stem_width) end - conv1 = Chain(conv_norm((3, 3), inchannels => stem_channels[1], activation; + conv1 = Chain(conv_norm((3, 3), inchannels, stem_channels[1], activation; norm_layer, revnorm, stride = 2, pad = 1)..., - conv_norm((3, 3), stem_channels[1] => stem_channels[2], activation; + conv_norm((3, 3), stem_channels[1], stem_channels[2], activation; norm_layer, pad = 1)..., Conv((3, 3), stem_channels[2] => inplanes; pad = 1, bias = false)) else @@ -185,7 +185,7 @@ function resnet_stem(stem_type::Symbol = :default; inchannels::Integer = 3, bn1 = norm_layer(inplanes, activation) # Stem pooling stempool = replace_pool ? - Chain(conv_norm((3, 3), inplanes => inplanes, activation; norm_layer, + Chain(conv_norm((3, 3), inplanes, inplanes, activation; norm_layer, revnorm, stride = 2, pad = 1)...) 
: MaxPool((3, 3); stride = 2, pad = 1) return Chain(conv1, bn1, stempool) @@ -196,16 +196,6 @@ function resnet_planes(block_repeats::AbstractVector{<:Integer}) for (stage_idx, stages) in enumerate(block_repeats)) end -function resnetcore(img_dims, stem, get_layers, block_repeats::AbstractVector{<:Integer}, - connection, classifier_fn) - # Build stages of the ResNet - stage_blocks = cnn_stages(get_layers, block_repeats, connection) - backbone = Chain(stem, stage_blocks...) - # Add classifier to the backbone - nfeaturemaps = Flux.outputsize(backbone, img_dims; padbatch = true)[3] - return Chain(backbone, classifier_fn(nfeaturemaps)) -end - function resnet(block_type, block_repeats::AbstractVector{<:Integer}, downsample_opt::NTuple{2, Any} = (downsample_conv, downsample_identity); cardinality::Integer = 1, base_width::Integer = 64, @@ -254,8 +244,8 @@ function resnet(block_type, block_repeats::AbstractVector{<:Integer}, end classifier_fn = nfeatures -> create_classifier(nfeatures, nclasses; dropout_rate, pool_layer, use_conv) - return resnet_core((imsize..., inchannels), stem, get_layers, block_repeats, - connection$activation, classifier_fn) + return resnetbuilder((imsize..., inchannels), stem, get_layers, block_repeats, + connection$activation, classifier_fn) end function resnet(block_fn, block_repeats, downsample_opt::Symbol = :B; kwargs...) return resnet(block_fn, block_repeats, RESNET_SHORTCUTS[downsample_opt]; kwargs...) diff --git a/src/convnets/resnets/res2net.jl b/src/convnets/resnets/res2net.jl index e308e1125..5ef4987ad 100644 --- a/src/convnets/resnets/res2net.jl +++ b/src/convnets/resnets/res2net.jl @@ -29,17 +29,17 @@ function bottle2neck(inplanes::Integer, planes::Integer; stride::Integer = 1, width = fld(planes * base_width, 64) * cardinality outplanes = planes * 4 pool = is_first && scale > 1 ? MeanPool((3, 3); stride, pad = 1) : identity - conv_bns = [Chain(conv_norm((3, 3), width => width, activation; norm_layer, stride, + conv_bns = [Chain(conv_norm((3, 3), width, width, activation; norm_layer, stride, pad = 1, groups = cardinality)...) for _ in 1:max(1, scale - 1)] reslayer = is_first ? Parallel(cat_channels, pool, conv_bns...) : Parallel(cat_channels, identity, Chain(PairwiseFusion(+, conv_bns...))) tuplify = is_first ? x -> tuple(x...) : x -> tuple(x[1], tuple(x[2:end]...)) layers = [ - conv_norm((1, 1), inplanes => width * scale, activation; + conv_norm((1, 1), inplanes, width * scale, activation; norm_layer, revnorm)..., chunk$(; size = width, dims = 3), tuplify, reslayer, - conv_norm((1, 1), width * scale => outplanes, activation; + conv_norm((1, 1), width * scale, outplanes, activation; norm_layer, revnorm)..., attn_fn(outplanes), ] diff --git a/src/layers/Layers.jl b/src/layers/Layers.jl index 9bdf1f913..33d1d1aed 100644 --- a/src/layers/Layers.jl +++ b/src/layers/Layers.jl @@ -19,7 +19,7 @@ include("attention.jl") export MHAttention include("conv.jl") -export conv_norm, basic_conv_bn, dwsep_conv_bn +export conv_norm, basic_conv_bn, dwsep_conv_norm include("drop.jl") export DropBlock, DropPath diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 6581197cb..068546f5e 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -5,9 +5,6 @@ stride::Integer = 1, pad::Integer = 0, dilation::Integer = 1, groups::Integer = 1, [bias, weight, init]) - conv_norm(kernel_size::Dims{2}, inplanes => outplanes, activation = identity; - kwargs...) - Create a convolution + normalisation layer pair with activation. 
# Arguments
@@ -16,7 +13,9 @@ Create a convolution + normalisation layer pair with activation.
   - `inplanes`: number of input feature maps
   - `outplanes`: number of output feature maps
   - `activation`: the activation function for the final layer
-  - `norm_layer`: the normalisation layer used
+  - `norm_layer`: the normalisation layer used. Note that using `identity` as the normalisation
+    layer will result in no normalisation being applied, i.e. this will be the same as
+    setting `use_norm = false`.
   - `revnorm`: set to `true` to place the normalisation layer before the convolution
   - `preact`: set to `true` to place the activation function before the normalisation layer
     (only compatible with `revnorm = false`)
@@ -34,10 +33,14 @@ function conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
                    activation = relu; norm_layer = BatchNorm, revnorm::Bool = false,
                    eps::Float32 = 1.0f-5, preact::Bool = false, use_norm::Bool = true,
                    bias = !use_norm, kwargs...)
-    # no normalization layer
+    # no normalisation layer (including the case where the normalisation layer is `identity`)
+    use_norm = use_norm && norm_layer !== identity
     if !use_norm
         if preact || revnorm
-            throw(ArgumentError("`preact` only supported with `use_norm = true`"))
+            throw(ArgumentError("`preact` is only supported with `use_norm = true`. Check if
+                                `use_norm = false` is intended. Note that setting `norm_layer`
+                                to `identity` also triggers this error, since it gives the same
+                                behaviour as `use_norm = false`."))
         else
             # early return if no norm layer is required
             return [Conv(kernel_size, inplanes => outplanes, activation; kwargs...)]
@@ -45,10 +48,10 @@ function conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
     end
     # channels for norm layer and activation functions for both conv and norm
     if revnorm
-        activations = (conv = activation, bn = identity)
+        activations = (conv = activation, norm = identity)
         normplanes = inplanes
     else
-        activations = (conv = identity, bn = activation)
+        activations = (conv = identity, norm = activation)
         normplanes = outplanes
     end
     # handle pre-activation
@@ -56,19 +59,14 @@ function conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
         if revnorm
             throw(ArgumentError("`preact` and `revnorm` cannot be set at the same time"))
         else
-            activations = (conv = activation, bn = identity)
+            activations = (conv = activation, norm = identity)
         end
     end
     # layers
     layers = [Conv(kernel_size, inplanes => outplanes, activations.conv; bias, kwargs...),
-              norm_layer(normplanes, activations.bn; ϵ = eps)]
+              norm_layer(normplanes, activations.norm; ϵ = eps)]
     return revnorm ? reverse(layers) : layers
 end
-function conv_norm(kernel_size::Dims{2}, ch::Pair{<:Integer, <:Integer},
-                   activation = identity; kwargs...)
-    inplanes, outplanes = ch
-    return conv_norm(kernel_size, inplanes, outplanes, activation; kwargs...)
-end
 
 """
     basic_conv_bn(kernel_size::Dims{2}, inplanes, outplanes, activation = relu;
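As a quick illustration of the reworked `conv_norm` (a minimal sketch, not part of the patch; the `Metalhead.Layers` import path is an assumption):

```julia
using Flux
using Metalhead.Layers: conv_norm

# `conv_norm` returns a vector of layers, so it is splatted into a `Chain`
block = Chain(conv_norm((3, 3), 3, 16, relu; pad = 1)...)

# with this patch, `norm_layer = identity` skips the normalisation layer entirely,
# exactly as if `use_norm = false` had been passed
conv_only = conv_norm((3, 3), 3, 16, relu; norm_layer = identity, pad = 1)
length(conv_only)  # 1: only the `Conv` layer remains

x = rand(Float32, 32, 32, 3, 1)
size(block(x))  # (32, 32, 16, 1)
```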
diff --git a/src/layers/drop.jl b/src/layers/drop.jl
index 8a82d5b16..6f9dc7d1a 100644
--- a/src/layers/drop.jl
+++ b/src/layers/drop.jl
@@ -8,6 +8,8 @@ end
 
 ChainRulesCore.@non_differentiable _dropblock_mask(rng, x, gamma, clipped_block_size)
 
+# TODO add experimental `DropBlock` options from timm such as gaussian noise and
+# more precise `DropBlock` to deal with edges (#188)
 """
     dropblock([rng = rng_from_array(x)], x::AbstractArray{T, 4}, drop_block_prob,
               block_size, gamma_scale, active::Bool = true)
@@ -20,7 +22,7 @@ regions of size `block_size` in the input. Otherwise, it simply returns the inpu
 
   - `rng`: can be used to pass in a custom RNG instead of the default. Custom RNGs are only
     supported on the CPU.
   - `x`: input array
-  - `drop_block_prob`: probability of dropping a block. If `nothing` is passed, it returns 
+  - `drop_block_prob`: probability of dropping a block. If `nothing` is passed, it returns
    `identity`.
   - `block_size`: size of the block to drop
   - `gamma_scale`: multiplicative factor for `gamma` used. For the calculations,
@@ -28,8 +30,6 @@ regions of size `block_size` in the input. Otherwise, it simply returns the inpu
 
 If you are an end-user, you do not want this function. Use [`DropBlock`](#) instead.
 """
-# TODO add experimental `DropBlock` options from timm such as gaussian noise and
-# more precise `DropBlock` to deal with edges (#188)
 function dropblock(rng::AbstractRNG, x::AbstractArray{T, 4}, drop_block_prob,
                    block_size::Integer, gamma_scale) where {T}
     H, W, _, _ = size(x)
@@ -108,7 +108,7 @@ function (m::DropBlock)(x)
 end
 
 function Flux.testmode!(m::DropBlock, mode = true)
-    return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
+    return (m.active = (isnothing(mode) || mode === :auto) ? nothing : !mode; m)
 end
 
 function DropBlock(drop_block_prob = 0.1, block_size::Integer = 7, gamma_scale = 1.0,
diff --git a/src/layers/mbconv.jl b/src/layers/mbconv.jl
index e37a98406..93f029f9c 100644
--- a/src/layers/mbconv.jl
+++ b/src/layers/mbconv.jl
@@ -1,5 +1,5 @@
 """
-    dwsep_conv_bn(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
-                  activation = relu; eps::Float32 = 1.0f-5, revnorm::Bool = false,
-                  stride::Integer = 1, use_norm::NTuple{2, Bool} = (true, true),
-                  pad::Integer = 0, [bias, weight, init])
+    dwsep_conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
+                    activation = relu; norm_layer = BatchNorm, eps::Float32 = 1.0f-5,
+                    use_norm::NTuple{2, Bool} = (true, true), stride::Integer = 1,
+                    pad::Integer = 0, [bias, weight, init])
@@ -32,15 +32,15 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1).
   - `pad`: padding of the first convolution kernel
   - `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#))
 """
-function dwsep_conv_bn(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
-                       activation = relu; eps::Float32 = 1.0f-5, revnorm::Bool = false,
-                       stride::Integer = 1, use_norm::NTuple{2, Bool} = (true, true),
-                       bias::NTuple{2, Bool} = (!use_norm[1], !use_norm[2]), kwargs...)
+function dwsep_conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
+                         activation = relu; norm_layer = BatchNorm, eps::Float32 = 1.0f-5,
+                         use_norm::NTuple{2, Bool} = (true, true), stride::Integer = 1,
+                         bias::NTuple{2, Bool} = (!use_norm[1], !use_norm[2]), kwargs...)
+    return vcat(conv_norm(kernel_size, inplanes, inplanes, activation; eps, norm_layer,
+                          use_norm = use_norm[1], stride, bias = bias[1],
                           groups = inplanes, kwargs...),
-                conv_norm((1, 1), inplanes, outplanes, activation; eps,
-                          revnorm, use_norm = use_norm[2], bias = bias[2]))
+                conv_norm((1, 1), inplanes, outplanes, activation; eps, norm_layer,
+                          use_norm = use_norm[2], bias = bias[2]))
 end
 
 # TODO add support for stochastic depth to mbconv and fused_mbconv
diff --git a/src/layers/selayers.jl b/src/layers/selayers.jl
index 044d61dbf..8ac02e889 100644
--- a/src/layers/selayers.jl
+++ b/src/layers/selayers.jl
@@ -21,19 +21,13 @@ Creates a squeeze-and-excitation layer used in MobileNets, EfficientNets and SE-
   - `norm_layer`: The normalization layer to be used after the convolution layers
   - `rd_planes`: The number of hidden feature maps in a squeeze and excite layer
 """
-# TODO look into a `get_norm_act` layer that will return a closure over the norm layer
-# with the activation function passed in when the norm layer is not `identity`
-function squeeze_excite(inplanes::Integer, squeeze_planes::Integer;
-                        norm_layer = planes -> identity, activation = relu,
-                        gate_activation = sigmoid)
-    layers = [AdaptiveMeanPool((1, 1)),
-              Conv((1, 1), inplanes => squeeze_planes),
-              norm_layer(squeeze_planes),
-              activation,
-              Conv((1, 1), squeeze_planes => inplanes),
-              norm_layer(inplanes),
-              gate_activation]
-    return SkipConnection(Chain(filter!(!=(identity), layers)...), .*)
+function squeeze_excite(inplanes::Integer, squeeze_planes::Integer; norm_layer = identity,
+                        activation = relu, gate_activation = sigmoid)
+    return SkipConnection(Chain(AdaptiveMeanPool((1, 1)),
+                                conv_norm((1, 1), inplanes, squeeze_planes, activation;
+                                          norm_layer)...,
+                                conv_norm((1, 1), squeeze_planes, inplanes,
+                                          gate_activation; norm_layer)...), .*)
 end
 
 function squeeze_excite(inplanes::Integer; reduction::Real = 16,
                         round_fn = _round_channels, kwargs...)
diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl
index 43e74b394..84c7eb910 100644
--- a/src/vit-based/vit.jl
+++ b/src/vit-based/vit.jl
@@ -61,7 +61,7 @@ function vit(imsize::Dims{2} = (256, 256); inchannels::Integer = 3,
                       Dropout(emb_dropout_rate),
                       transformer_encoder(embedplanes, depth, nheads; mlp_ratio,
                                           dropout_rate),
-                      pool == :class ? x -> x[:, 1, :] : seconddimmean),
+                      pool === :class ? x -> x[:, 1, :] : seconddimmean),
          Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast)))
 end
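For context before the next patch in the series, a sketch of what the rewritten `squeeze_excite` builds (an illustration only; the import path is an assumption, and the exact number of hidden planes depends on `_round_channels`):

```julia
using Flux
using Metalhead.Layers: squeeze_excite

# a squeeze-and-excitation layer: global pool -> 1x1 "squeeze" conv -> 1x1 "excite"
# conv with a sigmoid gate, wrapped in a SkipConnection that rescales the input
# channel-wise via `.*`
se = squeeze_excite(64; reduction = 16)  # roughly 64 ÷ 16 hidden planes, rounded

x = rand(Float32, 8, 8, 64, 1)
size(se(x)) == size(x)  # true: an SE layer never changes the shape of its input
```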
From 4f487a07526848d89ad3f7c30e0ecb1c4cc93b77 Mon Sep 17 00:00:00 2001
From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com>
Date: Sat, 27 Aug 2022 16:20:33 +0530
Subject: [PATCH 3/8] Complete `StochasticDepth`

1. Rename `DropPath` to `StochasticDepth` to better reflect the paper
2. Rename the `_rate` options to `_prob`, which is more accurate
3. Add stochastic depth to the `mbconv` builders
---
 src/convnets/alexnet.jl                      | 10 ++--
 src/convnets/builders/irmodel.jl             |  8 +--
 src/convnets/builders/mbconv.jl              | 51 ++++++++++++--------
 src/convnets/builders/resblocks.jl           | 26 +++++-----
 src/convnets/convmixer.jl                    |  4 +-
 src/convnets/convnext.jl                     | 24 ++++-----
 src/convnets/densenet.jl                     | 14 +++---
 src/convnets/efficientnets/efficientnet.jl   |  4 +-
 src/convnets/efficientnets/efficientnetv2.jl |  4 +-
 src/convnets/inceptions/googlenet.jl         |  4 +-
 src/convnets/inceptions/inceptionresnetv2.jl |  8 +--
 src/convnets/inceptions/inceptionv3.jl       |  4 +-
 src/convnets/inceptions/inceptionv4.jl       |  8 +--
 src/convnets/inceptions/xception.jl          |  8 +--
 src/convnets/mobilenets/mnasnet.jl           |  4 +-
 src/convnets/mobilenets/mobilenetv1.jl       |  4 +-
 src/convnets/mobilenets/mobilenetv2.jl       |  4 +-
 src/convnets/mobilenets/mobilenetv3.jl       |  4 +-
 src/convnets/resnets/core.jl                 | 22 ++++-----
 src/convnets/vgg.jl                          | 26 +++++-----
 src/layers/Layers.jl                         |  2 +-
 src/layers/attention.jl                      | 12 ++---
 src/layers/classifier.jl                     | 16 +++---
 src/layers/drop.jl                           | 35 +++++++++-----
 src/layers/mbconv.jl                         | 11 ++---
 src/layers/mlp.jl                            | 20 ++++----
 src/mixers/core.jl                           | 13 ++---
 src/mixers/gmlp.jl                           | 13 ++---
 src/mixers/mlpmixer.jl                       | 20 ++++----
 src/mixers/resmlp.jl                         | 16 +++---
 src/vit-based/vit.jl                         | 28 +++++------
 test/convnets.jl                             |  6 +--
 32 files changed, 229 insertions(+), 204 deletions(-)

diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl
index 9f9bcacf0..3256e4ab8 100644
--- a/src/convnets/alexnet.jl
+++ b/src/convnets/alexnet.jl
@@ -1,16 +1,16 @@
 """
-    alexnet(; dropout_rate = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000)
+    alexnet(; dropout_prob = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000)
 
 Create an AlexNet model
 ([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)).
 
 # Arguments
 
-  - `dropout_rate`: dropout rate for the classifier
+  - `dropout_prob`: dropout probability for the classifier
   - `inchannels`: The number of input channels.
   - `nclasses`: the number of output classes
 """
-function alexnet(; dropout_rate = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000)
+function alexnet(; dropout_prob = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000)
     backbone = Chain(Conv((11, 11), inchannels => 64, relu; stride = 4, pad = 2),
                      MaxPool((3, 3); stride = 2),
                      Conv((5, 5), 64 => 192, relu; pad = 2),
@@ -20,9 +20,9 @@ function alexnet(; dropout_prob = 0.5, inchannels::Integer = 3, nclasses::Intege
                      Conv((3, 3), 256 => 256, relu; pad = 1),
                      MaxPool((3, 3); stride = 2))
     classifier = Chain(AdaptiveMeanPool((6, 6)), MLUtils.flatten,
-                       Dropout(dropout_rate),
+                       Dropout(dropout_prob),
                        Dense(256 * 6 * 6, 4096, relu),
-                       Dropout(dropout_rate),
+                       Dropout(dropout_prob),
                        Dense(4096, 4096, relu),
                        Dense(4096, nclasses))
     return Chain(backbone, classifier)
diff --git a/src/convnets/builders/irmodel.jl b/src/convnets/builders/irmodel.jl
index 7e604bbe0..d65ea6531 100644
--- a/src/convnets/builders/irmodel.jl
+++ b/src/convnets/builders/irmodel.jl
@@ -2,7 +2,7 @@ function irmodelbuilder(scalings::NTuple{2, Real}, block_configs::AbstractVector
                         inplanes::Integer = 32, connection = +, activation = relu,
                         norm_layer = BatchNorm, divisor::Integer = 8,
                         tail_conv::Bool = true, expanded_classifier::Bool = false,
-                        headplanes::Integer, dropout_rate = nothing,
+                        headplanes::Integer, dropout_prob = nothing,
                         inchannels::Integer = 3, nclasses::Integer = 1000, kwargs...)
     width_mult, _ = scalings
     # building first layer
@@ -24,14 +24,14 @@ function irmodelbuilder(scalings::NTuple{2, Real}, block_configs::AbstractVector
             append!(layers,
                     conv_norm((1, 1), outplanes, midplanes, activation; norm_layer))
             classifier = create_classifier(midplanes, headplanes, nclasses,
-                                           (hardswish, identity); dropout_rate)
+                                           (hardswish, identity); dropout_prob)
         else
             append!(layers,
                     conv_norm((1, 1), outplanes, headplanes, activation; norm_layer))
-            classifier = create_classifier(headplanes, nclasses; dropout_rate)
+            classifier = create_classifier(headplanes, nclasses; dropout_prob)
         end
     else
-        classifier = create_classifier(outplanes, nclasses; dropout_rate)
+        classifier = create_classifier(outplanes, nclasses; dropout_prob)
     end
     return Chain(Chain(layers...), classifier)
 end
diff --git a/src/convnets/builders/mbconv.jl b/src/convnets/builders/mbconv.jl
index 8190eb884..c430c247f 100644
--- a/src/convnets/builders/mbconv.jl
+++ b/src/convnets/builders/mbconv.jl
@@ -1,9 +1,10 @@
 # TODO - potentially make these builders more flexible to specify stuff like
 # activation functions and reductions that don't change over the stages
-function dwsepconv_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer,
-                           stage_idx::Integer, scalings::NTuple{2, Real};
-                           norm_layer = BatchNorm, divisor::Integer = 8, kwargs...)
+function _get_builder(::typeof(dwsep_conv_norm), block_configs::AbstractVector{<:Tuple},
+                      inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real};
+                      stochastic_depth_prob = nothing, norm_layer = BatchNorm,
+                      divisor::Integer = 8, kwargs...)
     width_mult, depth_mult = scalings
     block_fn, k, outplanes, stride, nrepeats, activation = block_configs[stage_idx]
     outplanes = _round_channels(outplanes * width_mult, divisor)
@@ -19,14 +20,15 @@ function dwsepconv_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Int
     end
     return get_layers, ceil(Int, nrepeats * depth_mult)
 end
-_get_builder(::typeof(dwsep_conv_norm)) = dwsepconv_builder
 
-function mbconv_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer,
-                        stage_idx::Integer, scalings::NTuple{2, Real};
-                        norm_layer = BatchNorm, divisor::Integer = 8,
-                        se_from_explanes::Bool = false, kwargs...)
+function _get_builder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple},
+                      inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real};
+                      stochastic_depth_prob = nothing, norm_layer = BatchNorm,
+                      divisor::Integer = 8, se_from_explanes::Bool = false, kwargs...)
     width_mult, depth_mult = scalings
-    block_fn, k, outplanes, expansion, stride, nrepeats, reduction, activation = block_configs[stage_idx]
+    block_repeats = [ceil(Int, block_configs[idx][end - 2] * depth_mult)
+                     for idx in eachindex(block_configs)]
+    block_fn, k, outplanes, expansion, stride, _, reduction, activation = block_configs[stage_idx]
     # calculate number of reduced channels for squeeze-excite layer from explanes instead of inplanes
     if !isnothing(reduction)
         reduction = !se_from_explanes ? reduction * expansion : reduction
@@ -35,41 +37,48 @@ function mbconv_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Intege
         inplanes = _round_channels(block_configs[stage_idx - 1][3] * width_mult, divisor)
     end
     outplanes = _round_channels(outplanes * width_mult, divisor)
+    pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
     function get_layers(block_idx::Integer)
         inplanes = block_idx == 1 ? inplanes : outplanes
         explanes = _round_channels(inplanes * expansion, divisor)
         stride = block_idx == 1 ? stride : 1
         block = block_fn((k, k), inplanes, explanes, outplanes, activation;
                          norm_layer, stride, reduction, kwargs...)
-        return stride == 1 && inplanes == outplanes ? (identity, block) : (block,)
+        schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx
+        drop_path = StochasticDepth(pathschedule[schedule_idx])
+        return stride == 1 && inplanes == outplanes ? (drop_path, block) : (block,)
     end
-    return get_layers, ceil(Int, nrepeats * depth_mult)
+    return get_layers, block_repeats[stage_idx]
 end
-_get_builder(::typeof(mbconv)) = mbconv_builder
 
-function fused_mbconv_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer,
-                              stage_idx::Integer, scalings::NTuple{2, Real};
-                              norm_layer = BatchNorm, divisor::Integer = 8, kwargs...)
-    width_mult, depth_mult = scalings
-    block_fn, k, outplanes, expansion, stride, nrepeats, activation = block_configs[stage_idx]
+function _get_builder(::typeof(fused_mbconv), block_configs::AbstractVector{<:Tuple},
+                      inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real};
+                      stochastic_depth_prob = nothing, norm_layer = BatchNorm,
+                      divisor::Integer = 8, kwargs...)
+    width_mult, depth_mult = scalings
+    block_repeats = [ceil(Int, block_configs[idx][end - 1] * depth_mult)
+                     for idx in eachindex(block_configs)]
+    block_fn, k, outplanes, expansion, stride, _, activation = block_configs[stage_idx]
     inplanes = stage_idx == 1 ? inplanes : block_configs[stage_idx - 1][3]
     outplanes = _round_channels(outplanes * width_mult, divisor)
+    pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
     function get_layers(block_idx::Integer)
         inplanes = block_idx == 1 ? inplanes : outplanes
         explanes = _round_channels(inplanes * expansion, divisor)
         stride = block_idx == 1 ? stride : 1
         block = block_fn((k, k), inplanes, explanes, outplanes, activation;
                          norm_layer, stride, kwargs...)
-        return stride == 1 && inplanes == outplanes ? (identity, block) : (block,)
+        schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx
+        drop_path = StochasticDepth(pathschedule[schedule_idx])
+        return stride == 1 && inplanes == outplanes ? (drop_path, block) : (block,)
     end
-    return get_layers, ceil(Int, nrepeats * depth_mult)
+    return get_layers, block_repeats[stage_idx]
 end
-_get_builder(::typeof(fused_mbconv)) = fused_mbconv_builder
 
 function mbconv_stage_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer,
                               scalings::NTuple{2, Real}; kwargs...)
-    builders = _get_builder.(first.(block_configs))
-    bxs = [builders[idx](block_configs, inplanes, idx, scalings; kwargs...)
+    bxs = [_get_builder(block_configs[idx][1], block_configs, inplanes, idx, scalings;
+                        kwargs...)
            for idx in eachindex(block_configs)]
    return (stage_idx, block_idx) -> first.(bxs)[stage_idx](block_idx), last.(bxs)
 end
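The per-block probabilities above come from a linear schedule over the total depth of the network. A toy illustration of the indexing the builders rely on (a sketch assuming `linear_scheduler` ramps linearly from 0 to the base probability and returns an indexable collection):

```julia
stochastic_depth_prob = 0.2
block_repeats = [2, 3, 4]       # blocks per stage
depth = sum(block_repeats)      # 9 blocks in total
pathschedule = range(0, stochastic_depth_prob; length = depth)

# stage 2, block 3 sits at global index sum(block_repeats[1:1]) + 3 = 5
schedule_idx = sum(block_repeats[1:(2 - 1)]) + 3
pathschedule[schedule_idx]      # 0.1: deeper blocks are dropped more aggressively
```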
diff --git a/src/convnets/builders/resblocks.jl b/src/convnets/builders/resblocks.jl
index 8343bf811..e82557305 100644
--- a/src/convnets/builders/resblocks.jl
+++ b/src/convnets/builders/resblocks.jl
@@ -3,17 +3,17 @@ function basicblock_builder(block_repeats::AbstractVector{<:Integer};
                             expansion::Integer = 1, norm_layer = BatchNorm,
                             revnorm::Bool = false, activation = relu,
                             attn_fn = planes -> identity,
-                            drop_block_rate = nothing, drop_path_rate = nothing,
+                            dropblock_prob = nothing, stochastic_depth_prob = nothing,
                             stride_fn = resnet_stride, planes_fn = resnet_planes,
                             downsample_tuple = (downsample_conv, downsample_identity))
-    # DropBlock, DropPath both take in rates based on a linear scaling schedule
+    # DropBlock, StochasticDepth both take in rates based on a linear scaling schedule
     # Also get `planes_vec` needed for block `inplanes` and `planes` calculations
-    pathschedule = linear_scheduler(drop_path_rate; depth = sum(block_repeats))
-    blockschedule = linear_scheduler(drop_block_rate; depth = sum(block_repeats))
+    pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
+    blockschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats))
     planes_vec = collect(planes_fn(block_repeats))
     # closure over `idxs`
     function get_layers(stage_idx::Integer, block_idx::Integer)
-        # DropBlock, DropPath both take in rates based on a linear scaling schedule
+        # DropBlock, StochasticDepth both take in rates based on a linear scaling schedule
         # This is also needed for block `inplanes` and `planes` calculations
         schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx
        planes = planes_vec[schedule_idx]
@@ -23,13 +23,13 @@ function basicblock_builder(block_repeats::AbstractVector{<:Integer};
         stride = stride_fn(stage_idx, block_idx)
         downsample_fn = stride != 1 || inplanes != planes * expansion ?
downsample_tuple[1] : downsample_tuple[2] - drop_path = DropPath(pathschedule[schedule_idx]) + drop_path = StochasticDepth(pathschedule[schedule_idx]) drop_block = DropBlock(blockschedule[schedule_idx]) block = basicblock(inplanes, planes; stride, reduction_factor, activation, norm_layer, revnorm, attn_fn, drop_path, drop_block) downsample = downsample_fn(inplanes, planes * expansion; stride, norm_layer, revnorm) - return block, downsample + return (downsample, block) end return get_layers end @@ -40,15 +40,15 @@ function bottleneck_builder(block_repeats::AbstractVector{<:Integer}; expansion::Integer = 4, norm_layer = BatchNorm, revnorm::Bool = false, activation = relu, attn_fn = planes -> identity, - drop_block_rate = nothing, drop_path_rate = nothing, + dropblock_prob = nothing, stochastic_depth_prob = nothing, stride_fn = resnet_stride, planes_fn = resnet_planes, downsample_tuple = (downsample_conv, downsample_identity)) - pathschedule = linear_scheduler(drop_path_rate; depth = sum(block_repeats)) - blockschedule = linear_scheduler(drop_block_rate; depth = sum(block_repeats)) + pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats)) + blockschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats)) planes_vec = collect(planes_fn(block_repeats)) # closure over `idxs` function get_layers(stage_idx::Integer, block_idx::Integer) - # DropBlock, DropPath both take in rates based on a linear scaling schedule + # DropBlock, StochasticDepth both take in rates based on a linear scaling schedule # This is also needed for block `inplanes` and `planes` calculations schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx planes = planes_vec[schedule_idx] @@ -58,14 +58,14 @@ function bottleneck_builder(block_repeats::AbstractVector{<:Integer}; stride = stride_fn(stage_idx, block_idx) downsample_fn = stride != 1 || inplanes != planes * expansion ? downsample_tuple[1] : downsample_tuple[2] - drop_path = DropPath(pathschedule[schedule_idx]) + drop_path = StochasticDepth(pathschedule[schedule_idx]) drop_block = DropBlock(blockschedule[schedule_idx]) block = bottleneck(inplanes, planes; stride, cardinality, base_width, reduction_factor, activation, norm_layer, revnorm, attn_fn, drop_path, drop_block) downsample = downsample_fn(inplanes, planes * expansion; stride, norm_layer, revnorm) - return block, downsample + return (downsample, block) end return get_layers end diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index bc1a71a5f..4cd22cb6b 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -17,7 +17,7 @@ Creates a ConvMixer model. - `nclasses`: number of classes in the output """ function convmixer(planes::Integer, depth::Integer; kernel_size::Dims{2} = (9, 9), - patch_size::Dims{2} = (7, 7), activation = gelu, dropout_rate = nothing, + patch_size::Dims{2} = (7, 7), activation = gelu, dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) layers = [] # stem of the model @@ -31,7 +31,7 @@ function convmixer(planes::Integer, depth::Integer; kernel_size::Dims{2} = (9, 9 conv_norm((1, 1), planes, planes, activation; preact = true)...) 
for _ in 1:depth] append!(layers, stages) - return Chain(Chain(layers...), create_classifier(planes, nclasses; dropout_rate)) + return Chain(Chain(layers...), create_classifier(planes, nclasses; dropout_prob)) end const CONVMIXER_CONFIGS = Dict(:base => ((1536, 20), diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index e11568d01..04513220d 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -1,5 +1,5 @@ """ - convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6) + convnextblock(planes::Integer, stochastic_depth_prob = 0.0, layerscale_init = 1.0f-6) Creates a single block of ConvNeXt. ([reference](https://arxiv.org/abs/2201.03545)) @@ -7,23 +7,24 @@ Creates a single block of ConvNeXt. # Arguments - `planes`: number of input channels. - - `drop_path_rate`: Stochastic depth rate. + - `stochastic_depth_prob`: Stochastic depth probability. - `layerscale_init`: Initial value for [`LayerScale`](#) """ -function convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6) +function convnextblock(planes::Integer, stochastic_depth_prob = 0.0, + layerscale_init = 1.0f-6) layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), swapdims((3, 1, 2, 4)), LayerNorm(planes; ϵ = 1.0f-6), mlp_block(planes, 4 * planes), LayerScale(planes, layerscale_init), swapdims((2, 3, 1, 4)), - DropPath(drop_path_rate)), +) + StochasticDepth(stochastic_depth_prob)), +) return layers end """ convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:Integer}; - drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3, + stochastic_depth_prob = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3, nclasses::Integer = 1000) Creates the layers for a ConvNeXt model. @@ -33,14 +34,15 @@ Creates the layers for a ConvNeXt model. - `depths`: list with configuration for depth of each block - `planes`: list with configuration for number of output channels in each block - - `drop_path_rate`: Stochastic depth rate. + - `stochastic_depth_prob`: Stochastic depth probability. - `layerscale_init`: Initial value for [`LayerScale`](#) ([reference](https://arxiv.org/abs/2103.17239)) - `inchannels`: number of input channels. 
- `nclasses`: number of output classes """ function convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:Integer}; - drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3, + stochastic_depth_prob = 0.0, layerscale_init = 1.0f-6, + inchannels::Integer = 3, nclasses::Integer = 1000) @assert length(depths) == length(planes) "`planes` should have exactly one value for each block" @@ -54,7 +56,7 @@ function convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:In norm_layer = ChannelLayerNorm, revnorm = true)...)) end stages = [] - dp_rates = linear_scheduler(drop_path_rate; depth = sum(depths)) + dp_rates = linear_scheduler(stochastic_depth_prob; depth = sum(depths)) cur = 0 for i in eachindex(depths) push!(stages, @@ -68,9 +70,9 @@ function convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:In return Chain(Chain(backbone...), classifier) end -function convnext(config::Symbol; drop_path_rate = 0.0, layerscale_init = 1.0f-6, +function convnext(config::Symbol; stochastic_depth_prob = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3, nclasses::Integer = 1000) - return convnext(CONVNEXT_CONFIGS[config]...; drop_path_rate, layerscale_init, + return convnext(CONVNEXT_CONFIGS[config]...; stochastic_depth_prob, layerscale_init, inchannels, nclasses) end @@ -106,7 +108,7 @@ struct ConvNeXt end @functor ConvNeXt -function ConvNeXt(config::Symbol; pretrain::Bool = true, inchannels::Integer = 3, +function ConvNeXt(config::Symbol; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(config, keys(CONVNEXT_CONFIGS)) layers = convnext(config; inchannels, nclasses) diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index a7c367c1c..0f5be78c8 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -55,7 +55,7 @@ function dense_block(inplanes::Integer, growth_rates) end """ - densenet(inplanes, growth_rates; reduction = 0.5, dropout_rate = nothing, + densenet(inplanes, growth_rates; reduction = 0.5, dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) Create a DenseNet model @@ -67,10 +67,10 @@ Create a DenseNet model - `growth_rates`: the growth rates of output feature maps within each [`dense_block`](#) (a vector of vectors) - `reduction`: the factor by which the number of feature maps is scaled across each transition - - `dropout_rate`: the dropout rate for the classifier head. Set to `nothing` to disable dropout. + - `dropout_prob`: the dropout probability for the classifier head. Set to `nothing` to disable dropout. - `nclasses`: the number of output classes """ -function densenet(inplanes::Integer, growth_rates; reduction = 0.5, dropout_rate = nothing, +function densenet(inplanes::Integer, growth_rates; reduction = 0.5, dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) layers = [] append!(layers, @@ -85,7 +85,7 @@ function densenet(inplanes::Integer, growth_rates; reduction = 0.5, dropout_rate inplanes = floor(Int, outplanes * reduction) end push!(layers, BatchNorm(outplanes, relu)) - return Chain(Chain(layers...), create_classifier(outplanes, nclasses; dropout_rate)) + return Chain(Chain(layers...), create_classifier(outplanes, nclasses; dropout_prob)) end """ @@ -97,15 +97,15 @@ Create a DenseNet model # Arguments - `nblocks`: number of dense blocks between transitions - - `growth_rate`: the output feature map growth rate of dense blocks (i.e. 
`k` in the ref)
+  - `growth_rate`: the output feature map growth rate of dense blocks (`k` in the reference)
   - `reduction`: the factor by which the number of feature maps is scaled across each transition
   - `nclasses`: the number of output classes
 """
 function densenet(nblocks::AbstractVector{<:Integer}; growth_rate::Integer = 32,
-                  reduction = 0.5, dropout_rate = nothing, inchannels::Integer = 3,
+                  reduction = 0.5, dropout_prob = nothing, inchannels::Integer = 3,
                   nclasses::Integer = 1000)
     return densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks];
-                    reduction, dropout_rate, inchannels, nclasses)
+                    reduction, dropout_prob, inchannels, nclasses)
 end
 
 const DENSENET_CONFIGS = Dict(121 => [6, 12, 24, 16],
diff --git a/src/convnets/efficientnets/efficientnet.jl b/src/convnets/efficientnets/efficientnet.jl
index abc818bea..08b4b87dd 100644
--- a/src/convnets/efficientnets/efficientnet.jl
+++ b/src/convnets/efficientnets/efficientnet.jl
@@ -32,14 +32,14 @@ const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)),
                                          :b8 => (672, (2.2, 3.6)))
 
 function efficientnet(config::Symbol; norm_layer = BatchNorm,
-                      dropout_rate = nothing, inchannels::Integer = 3,
+                      dropout_prob = nothing, inchannels::Integer = 3,
                       nclasses::Integer = 1000)
     _checkconfig(config, keys(EFFICIENTNET_GLOBAL_CONFIGS))
     scalings = EFFICIENTNET_GLOBAL_CONFIGS[config][2]
     return irmodelbuilder(scalings, EFFICIENTNET_BLOCK_CONFIGS; inplanes = 32,
                           norm_layer, activation = swish,
                           headplanes = EFFICIENTNET_BLOCK_CONFIGS[end][3] * 4,
-                          dropout_rate, inchannels, nclasses)
+                          dropout_prob, inchannels, nclasses)
 end
 
 """
diff --git a/src/convnets/efficientnets/efficientnetv2.jl b/src/convnets/efficientnets/efficientnetv2.jl
index 6eda393ed..e792cd1da 100644
--- a/src/convnets/efficientnets/efficientnetv2.jl
+++ b/src/convnets/efficientnets/efficientnetv2.jl
@@ -35,13 +35,13 @@ const EFFNETV2_CONFIGS = Dict(:small => [(fused_mbconv, 3, 24, 1, 1, 2, swish),
                                          (mbconv, 3, 512, 6, 2, 32, 4, swish),
                                          (mbconv, 3, 768, 6, 1, 8, 4, swish)])
 
-function efficientnetv2(config::Symbol; norm_layer = BatchNorm, dropout_rate = nothing,
+function efficientnetv2(config::Symbol; norm_layer = BatchNorm, dropout_prob = nothing,
                         inchannels::Integer = 3, nclasses::Integer = 1000)
     _checkconfig(config, keys(EFFNETV2_CONFIGS))
     block_configs = EFFNETV2_CONFIGS[config]
     return irmodelbuilder((1, 1), block_configs; activation = swish, norm_layer,
                           inplanes = block_configs[1][3], headplanes = 1280,
-                          dropout_rate, inchannels, nclasses)
+                          dropout_prob, inchannels, nclasses)
 end
 
 """
diff --git a/src/convnets/inceptions/googlenet.jl b/src/convnets/inceptions/googlenet.jl
index 1018ad99a..1f786bbb5 100644
--- a/src/convnets/inceptions/googlenet.jl
+++ b/src/convnets/inceptions/googlenet.jl
@@ -36,7 +36,7 @@ Create an Inception-v1 model (commonly referred to as GoogLeNet)
   - `nclasses`: the number of output classes
 """
-function googlenet(; dropout_rate = 0.4, inchannels::Integer = 3, nclasses::Integer = 1000)
+function googlenet(; dropout_prob = 0.4, inchannels::Integer = 3, nclasses::Integer = 1000)
     backbone = Chain(Conv((7, 7), inchannels => 64; stride = 2, pad = 3),
                      MaxPool((3, 3); stride = 2, pad = 1),
                      Conv((1, 1), 64 => 64),
@@ -53,7 +53,7 @@ function googlenet(; dropout_rate = 0.4, inchannels::Integer = 3, nclasses::Inte
                      MaxPool((3, 3); stride = 2, pad = 1),
                      inceptionblock(832, 256, 160, 320, 32, 128, 128),
                      inceptionblock(832, 384, 192, 384, 48, 128, 128))
-    return Chain(backbone, create_classifier(1024, nclasses; dropout_rate))
+    return Chain(backbone,
create_classifier(1024, nclasses; dropout_prob)) end """ diff --git a/src/convnets/inceptions/inceptionresnetv2.jl b/src/convnets/inceptions/inceptionresnetv2.jl index bd88648e9..d4a034bed 100644 --- a/src/convnets/inceptions/inceptionresnetv2.jl +++ b/src/convnets/inceptions/inceptionresnetv2.jl @@ -64,7 +64,7 @@ function block8(scale = 1.0f0; activation = identity) end """ - inceptionresnetv2(; inchannels::Integer = 3, dropout_rate = nothing, nclasses::Integer = 1000) + inceptionresnetv2(; inchannels::Integer = 3, dropout_prob = nothing, nclasses::Integer = 1000) Creates an InceptionResNetv2 model. ([reference](https://arxiv.org/abs/1602.07261)) @@ -72,10 +72,10 @@ Creates an InceptionResNetv2 model. # Arguments - `inchannels`: number of input channels. - - `dropout_rate`: rate of dropout in classifier head. Set to `nothing` to disable dropout. + - `dropout_prob`: probability of dropout in classifier head. Set to `nothing` to disable dropout. - `nclasses`: the number of output classes. """ -function inceptionresnetv2(; dropout_rate = nothing, inchannels::Integer = 3, +function inceptionresnetv2(; dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) backbone = Chain(basic_conv_bn((3, 3), inchannels, 32; stride = 2)..., basic_conv_bn((3, 3), 32, 32)..., @@ -92,7 +92,7 @@ function inceptionresnetv2(; dropout_rate = nothing, inchannels::Integer = 3, [block8(0.20f0) for _ in 1:9]..., block8(; activation = relu), basic_conv_bn((1, 1), 2080, 1536)...) - return Chain(backbone, create_classifier(1536, nclasses; dropout_rate)) + return Chain(backbone, create_classifier(1536, nclasses; dropout_prob)) end """ diff --git a/src/convnets/inceptions/inceptionv3.jl b/src/convnets/inceptions/inceptionv3.jl index 32fbbede5..c36273e63 100644 --- a/src/convnets/inceptions/inceptionv3.jl +++ b/src/convnets/inceptions/inceptionv3.jl @@ -133,7 +133,7 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). - `nclasses`: the number of output classes """ -function inceptionv3(; dropout_rate = 0.2, inchannels::Integer = 3, +function inceptionv3(; dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) backbone = Chain(basic_conv_bn((3, 3), inchannels, 32; stride = 2)..., basic_conv_bn((3, 3), 32, 32)..., @@ -153,7 +153,7 @@ function inceptionv3(; dropout_rate = 0.2, inchannels::Integer = 3, inceptionv3_d(768), inceptionv3_e(1280), inceptionv3_e(2048)) - return Chain(backbone, create_classifier(2048, nclasses; dropout_rate)) + return Chain(backbone, create_classifier(2048, nclasses; dropout_prob)) end """ diff --git a/src/convnets/inceptions/inceptionv4.jl b/src/convnets/inceptions/inceptionv4.jl index 13d40da25..e3b4e30f7 100644 --- a/src/convnets/inceptions/inceptionv4.jl +++ b/src/convnets/inceptions/inceptionv4.jl @@ -85,7 +85,7 @@ function inceptionv4_c() end """ - inceptionv4(; inchannels::Integer = 3, dropout_rate = nothing, nclasses::Integer = 1000) + inceptionv4(; inchannels::Integer = 3, dropout_prob = nothing, nclasses::Integer = 1000) Create an Inceptionv4 model. ([reference](https://arxiv.org/abs/1602.07261)) @@ -93,10 +93,10 @@ Create an Inceptionv4 model. # Arguments - `inchannels`: number of input channels. - - `dropout_rate`: rate of dropout in classifier head. Set to `nothing` to disable dropout. + - `dropout_prob`: probability of dropout in classifier head. Set to `nothing` to disable dropout. - `nclasses`: the number of output classes. 
""" -function inceptionv4(; dropout_rate = nothing, inchannels::Integer = 3, +function inceptionv4(; dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) backbone = Chain(basic_conv_bn((3, 3), inchannels, 32; stride = 2)..., basic_conv_bn((3, 3), 32, 32)..., @@ -107,7 +107,7 @@ function inceptionv4(; dropout_rate = nothing, inchannels::Integer = 3, [inceptionv4_b() for _ in 1:7]..., reduction_b(), # mixed_7a [inceptionv4_c() for _ in 1:3]...) - return Chain(backbone, create_classifier(1536, nclasses; dropout_rate)) + return Chain(backbone, create_classifier(1536, nclasses; dropout_prob)) end """ diff --git a/src/convnets/inceptions/xception.jl b/src/convnets/inceptions/xception.jl index 4c28644d0..efb60edd3 100644 --- a/src/convnets/inceptions/xception.jl +++ b/src/convnets/inceptions/xception.jl @@ -43,18 +43,18 @@ function xception_block(inchannels::Integer, outchannels::Integer, nrepeats::Int end """ - xception(; dropout_rate = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) + xception(; dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) Creates an Xception model. ([reference](https://arxiv.org/abs/1610.02357)) # Arguments - - `dropout_rate`: rate of dropout in classifier head. Set to `nothing` to disable dropout. + - `dropout_prob`: probability of dropout in classifier head. Set to `nothing` to disable dropout. - `inchannels`: number of input channels. - `nclasses`: the number of output classes. """ -function xception(; dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) +function xception(; dropout_prob = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) backbone = Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., conv_norm((3, 3), 32, 64)..., xception_block(64, 128, 2; stride = 2, start_with_relu = false), @@ -64,7 +64,7 @@ function xception(; dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integ xception_block(728, 1024, 2; stride = 2, grow_at_start = false), dwsep_conv_norm((3, 3), 1024, 1536; pad = 1)..., dwsep_conv_norm((3, 3), 1536, 2048; pad = 1)...) - return Chain(backbone, create_classifier(2048, nclasses; dropout_rate)) + return Chain(backbone, create_classifier(2048, nclasses; dropout_prob)) end """ diff --git a/src/convnets/mobilenets/mnasnet.jl b/src/convnets/mobilenets/mnasnet.jl index 6906d1b6d..6c9cd40d6 100644 --- a/src/convnets/mobilenets/mnasnet.jl +++ b/src/convnets/mobilenets/mnasnet.jl @@ -40,13 +40,13 @@ const MNASNET_CONFIGS = Dict(:B1 => (32, ])) function mnasnet(config::Symbol; width_mult::Real = 1, max_width::Integer = 1280, - dropout_rate = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) + dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(config, keys(MNASNET_CONFIGS)) # momentum used for BatchNorm is as per Tensorflow implementation norm_layer = (args...; kwargs...) -> BatchNorm(args...; momentum = 0.0003f0, kwargs...) 
inplanes, block_configs = MNASNET_CONFIGS[config] return irmodelbuilder(width_mult, block_configs; inplanes, norm_layer, - headplanes = max_width, dropout_rate, inchannels, nclasses) + headplanes = max_width, dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/mobilenets/mobilenetv1.jl b/src/convnets/mobilenets/mobilenetv1.jl index 092391030..e71c25451 100644 --- a/src/convnets/mobilenets/mobilenetv1.jl +++ b/src/convnets/mobilenets/mobilenetv1.jl @@ -14,11 +14,11 @@ const MOBILENETV1_CONFIGS = [ (dwsep_conv_norm, 3, 1024, 2, 2, relu6), ] -function mobilenetv1(width_mult::Real = 1; inplanes::Integer = 32, dropout_rate = nothing, +function mobilenetv1(width_mult::Real = 1; inplanes::Integer = 32, dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) return irmodelbuilder(width_mult, MOBILENETV1_CONFIGS; inplanes, inchannels, activation = relu6, connection = nothing, tail_conv = false, - headplanes = 1024, dropout_rate, nclasses) + headplanes = 1024, dropout_prob, nclasses) end """ diff --git a/src/convnets/mobilenets/mobilenetv2.jl b/src/convnets/mobilenets/mobilenetv2.jl index 8f3db168b..7f4279793 100644 --- a/src/convnets/mobilenets/mobilenetv2.jl +++ b/src/convnets/mobilenets/mobilenetv2.jl @@ -19,10 +19,10 @@ const MOBILENETV2_CONFIGS = [ ] function mobilenetv2(width_mult::Real = 1; max_width::Integer = 1280, - inplanes::Integer = 32, dropout_rate = 0.2, + inplanes::Integer = 32, dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) return irmodelbuilder(width_mult, MOBILENETV2_CONFIGS; activation = relu6, inplanes, - headplanes = max_width, dropout_rate, inchannels, nclasses) + headplanes = max_width, dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/mobilenets/mobilenetv3.jl b/src/convnets/mobilenets/mobilenetv3.jl index 2ad394b61..293435271 100644 --- a/src/convnets/mobilenets/mobilenetv3.jl +++ b/src/convnets/mobilenets/mobilenetv3.jl @@ -34,14 +34,14 @@ const MOBILENETV3_CONFIGS = Dict(:small => (1024, (mbconv, 5, 160, 6, 1, 3, 4, hardswish), ])) -function mobilenetv3(config::Symbol; width_mult::Real = 1, dropout_rate = 0.2, +function mobilenetv3(config::Symbol; width_mult::Real = 1, dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(config, [:small, :large]) max_width, block_configs = MOBILENETV3_CONFIGS[config] return irmodelbuilder(width_mult, block_configs; inplanes = 16, headplanes = max_width, activation = relu, se_from_explanes = true, se_round_fn = _round_channels, - expanded_classifier = true, dropout_rate, inchannels, nclasses) + expanded_classifier = true, dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl index c9c33392b..1b15c35f8 100644 --- a/src/convnets/resnets/core.jl +++ b/src/convnets/resnets/core.jl @@ -126,10 +126,10 @@ end # returns `DropBlock`s for each stage of the ResNet as in timm. 
# TODO - add experimental options for DropBlock as part of the API (#188) -# function _drop_blocks(drop_block_rate::AbstractFloat) +# function _drop_blocks(dropblock_prob::AbstractFloat) # return [ # identity, identity, -# DropBlock(drop_block_rate, 5, 0.25), DropBlock(drop_block_rate, 3, 1.00), +# DropBlock(dropblock_prob, 5, 0.25), DropBlock(dropblock_prob, 3, 1.00), # ] # end @@ -203,8 +203,8 @@ function resnet(block_type, block_repeats::AbstractVector{<:Integer}, stem_fn = resnet_stem, connection = addact, activation = relu, norm_layer = BatchNorm, revnorm::Bool = false, attn_fn = planes -> identity, pool_layer = AdaptiveMeanPool((1, 1)), - use_conv::Bool = false, drop_block_rate = nothing, - drop_path_rate = nothing, dropout_rate = nothing, + use_conv::Bool = false, dropblock_prob = nothing, + stochastic_depth_prob = nothing, dropout_prob = nothing, imsize::Dims{2} = (256, 256), inchannels::Integer = 3, nclasses::Integer = 1000, kwargs...) # Build stem @@ -215,22 +215,22 @@ function resnet(block_type, block_repeats::AbstractVector{<:Integer}, @assert base_width==64 "Base width must be 64 for `basicblock`" get_layers = basicblock_builder(block_repeats; inplanes, reduction_factor, activation, norm_layer, revnorm, attn_fn, - drop_block_rate, drop_path_rate, + dropblock_prob, stochastic_depth_prob, stride_fn = resnet_stride, planes_fn = resnet_planes, downsample_tuple = downsample_opt, kwargs...) elseif block_type == bottleneck get_layers = bottleneck_builder(block_repeats; inplanes, cardinality, base_width, reduction_factor, activation, norm_layer, revnorm, - attn_fn, drop_block_rate, drop_path_rate, + attn_fn, dropblock_prob, stochastic_depth_prob, stride_fn = resnet_stride, planes_fn = resnet_planes, downsample_tuple = downsample_opt, kwargs...) elseif block_type == bottle2neck - @assert isnothing(drop_block_rate) "DropBlock not supported for `bottle2neck`. - Set `drop_block_rate` to nothing." - @assert isnothing(drop_path_rate) "DropPath not supported for `bottle2neck`. - Set `drop_path_rate` to nothing." + @assert isnothing(dropblock_prob) "DropBlock not supported for `bottle2neck`. + Set `dropblock_prob` to nothing." + @assert isnothing(stochastic_depth_prob) "StochasticDepth not supported for `bottle2neck`. + Set `stochastic_depth_prob` to nothing." @assert reduction_factor==1 "Reduction factor not supported for `bottle2neck`. Set `reduction_factor` to 1." get_layers = bottle2neck_builder(block_repeats; inplanes, cardinality, base_width, @@ -242,7 +242,7 @@ function resnet(block_type, block_repeats::AbstractVector{<:Integer}, # TODO: write better message when we have link to dev docs for resnet throw(ArgumentError("Unknown block type $block_type")) end - classifier_fn = nfeatures -> create_classifier(nfeatures, nclasses; dropout_rate, + classifier_fn = nfeatures -> create_classifier(nfeatures, nclasses; dropout_prob, pool_layer, use_conv) return resnetbuilder((imsize..., inchannels), stem, get_layers, block_repeats, connection$activation, classifier_fn) diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index 163c13b68..345f1e275 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -53,7 +53,7 @@ function vgg_convolutional_layers(config::AbstractVector{<:Tuple}, batchnorm::Bo end """ - vgg_classifier_layers(imsize, nclasses, fcsize, dropout_rate) + vgg_classifier_layers(imsize, nclasses, fcsize, dropout_prob) Create VGG classifier (fully connected) layers ([reference](https://arxiv.org/abs/1409.1556v6)). 
@@ -64,20 +64,20 @@ Create VGG classifier (fully connected) layers the convolution layers (see [`Metalhead.vgg_convolutional_layers`](#)) - `nclasses`: number of output classes - `fcsize`: input and output size of the intermediate fully connected layer - - `dropout_rate`: the dropout level between each fully connected layer + - `dropout_prob`: the dropout level between each fully connected layer """ function vgg_classifier_layers(imsize::NTuple{3, <:Integer}, nclasses::Integer, - fcsize::Integer, dropout_rate) + fcsize::Integer, dropout_prob) return Chain(MLUtils.flatten, Dense(prod(imsize), fcsize, relu), - Dropout(dropout_rate), + Dropout(dropout_prob), Dense(fcsize, fcsize, relu), - Dropout(dropout_rate), + Dropout(dropout_prob), Dense(fcsize, nclasses)) end """ - vgg(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout_rate) + vgg(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout_prob) Create a VGG model ([reference](https://arxiv.org/abs/1409.1556v6)). @@ -92,13 +92,13 @@ Create a VGG model - `nclasses`: number of output classes - `fcsize`: intermediate fully connected layer size (see [`Metalhead.vgg_classifier_layers`](#)) - - `dropout_rate`: dropout level between fully connected layers + - `dropout_prob`: dropout level between fully connected layers """ function vgg(imsize::Dims{2}; config, batchnorm::Bool = false, fcsize::Integer = 4096, - dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) + dropout_prob = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) conv = vgg_convolutional_layers(config, batchnorm, inchannels) imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] - class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout_rate) + class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout_prob) return Chain(Chain(conv...), class) end @@ -110,7 +110,7 @@ const VGG_CONV_CONFIGS = Dict(:A => [(64, 1), (128, 1), (256, 2), (512, 2), (512 const VGG_CONFIGS = Dict(11 => :A, 13 => :B, 16 => :D, 19 => :E) """ - VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, dropout_rate) + VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, dropout_prob) Construct a VGG model with the specified input image size. Typically, the image size is `(224, 224)`. @@ -123,16 +123,16 @@ Construct a VGG model with the specified input image size. 
Typically, the image - `nclasses`: number of output classes - `fcsize`: intermediate fully connected layer size (see [`Metalhead.vgg_classifier_layers`](#)) - - `dropout_rate`: dropout level between fully connected layers + - `dropout_prob`: dropout level between fully connected layers """ struct VGG layers::Any end @functor VGG -function VGG(imsize::Dims{2}; config, batchnorm::Bool = false, dropout_rate = 0.5, +function VGG(imsize::Dims{2}; config, batchnorm::Bool = false, dropout_prob = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000) - layers = vgg(imsize; config, inchannels, batchnorm, nclasses, dropout_rate) + layers = vgg(imsize; config, inchannels, batchnorm, nclasses, dropout_prob) return VGG(layers) end diff --git a/src/layers/Layers.jl b/src/layers/Layers.jl index 33d1d1aed..423ff41ec 100644 --- a/src/layers/Layers.jl +++ b/src/layers/Layers.jl @@ -22,7 +22,7 @@ include("conv.jl") export conv_norm, basic_conv_bn, dwsep_conv_norm include("drop.jl") -export DropBlock, DropPath +export DropBlock, StochasticDepth include("embeddings.jl") export PatchEmbedding, ViPosEmbedding, ClassTokens diff --git a/src/layers/attention.jl b/src/layers/attention.jl index b8fd38165..b8560f12f 100644 --- a/src/layers/attention.jl +++ b/src/layers/attention.jl @@ -1,6 +1,6 @@ """ MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, - attn_dropout_rate = 0., proj_dropout_rate = 0.) + attn_dropout_prob = 0., proj_dropout_prob = 0.) Multi-head self-attention layer. @@ -9,8 +9,8 @@ Multi-head self-attention layer. - `planes`: number of input channels - `nheads`: number of heads - `qkv_bias`: whether to use bias in the layer to get the query, key and value - - `attn_dropout_rate`: dropout rate after the self-attention layer - - `proj_dropout_rate`: dropout rate after the projection layer + - `attn_dropout_prob`: dropout probability after the self-attention layer + - `proj_dropout_prob`: dropout probability after the projection layer """ struct MHAttention{P, Q, R} nheads::Int @@ -21,11 +21,11 @@ end @functor MHAttention function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, - attn_dropout_rate = 0.0, proj_dropout_rate = 0.0) + attn_dropout_prob = 0.0, proj_dropout_prob = 0.0) @assert planes % nheads==0 "planes should be divisible by nheads" qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) - attn_drop = Dropout(attn_dropout_rate) - proj = Chain(Dense(planes, planes), Dropout(proj_dropout_rate)) + attn_drop = Dropout(attn_dropout_prob) + proj = Chain(Dense(planes, planes), Dropout(proj_dropout_prob)) return MHAttention(nheads, qkv_layer, attn_drop, proj) end diff --git a/src/layers/classifier.jl b/src/layers/classifier.jl index bebdc4099..90ab5ba33 100644 --- a/src/layers/classifier.jl +++ b/src/layers/classifier.jl @@ -1,7 +1,7 @@ """ create_classifier(inplanes::Integer, nclasses::Integer, activation = identity; use_conv::Bool = false, pool_layer = AdaptiveMeanPool((1, 1)), - dropout_rate = nothing) + dropout_prob = nothing) Creates a classifier head to be used for models. @@ -13,11 +13,11 @@ Creates a classifier head to be used for models. - `use_conv`: whether to use a 1x1 convolutional layer instead of a `Dense` layer. - `pool_layer`: pooling layer to use. This is passed in with the layer instantiated with any arguments that are needed i.e. as `AdaptiveMeanPool((1, 1))`, for example. - - `dropout_rate`: dropout rate used in the classifier head. Set to `nothing` to disable dropout. 
+ - `dropout_prob`: dropout probability used in the classifier head. Set to `nothing` to disable dropout. """ function create_classifier(inplanes::Integer, nclasses::Integer, activation = identity; use_conv::Bool = false, pool_layer = AdaptiveMeanPool((1, 1)), - dropout_rate = nothing) + dropout_prob = nothing) # Decide whether to flatten the input or not flatten_in_pool = !use_conv && pool_layer !== identity if use_conv @@ -31,7 +31,7 @@ function create_classifier(inplanes::Integer, nclasses::Integer, activation = id push!(classifier, pool_layer) end # Dropout is applied after the pooling layer - isnothing(dropout_rate) ? nothing : push!(classifier, Dropout(dropout_rate)) + isnothing(dropout_prob) ? nothing : push!(classifier, Dropout(dropout_prob)) # Fully-connected layer if use_conv push!(classifier, Conv((1, 1), inplanes => nclasses, activation)) @@ -45,7 +45,7 @@ end create_classifier(inplanes::Integer, hidden_planes::Integer, nclasses::Integer, activations::NTuple{2} = (relu, identity); use_conv::NTuple{2, Bool} = (false, false), - pool_layer = AdaptiveMeanPool((1, 1)), dropout_rate = nothing) + pool_layer = AdaptiveMeanPool((1, 1)), dropout_prob = nothing) Creates a classifier head to be used for models with an extra hidden layer. @@ -62,12 +62,12 @@ Creates a classifier head to be used for models with an extra hidden layer. layer. - `pool_layer`: pooling layer to use. This is passed in with the layer instantiated with any arguments that are needed i.e. as `AdaptiveMeanPool((1, 1))`, for example. - - `dropout_rate`: dropout rate used in the classifier head. Set to `nothing` to disable dropout. + - `dropout_prob`: dropout probability used in the classifier head. Set to `nothing` to disable dropout. """ function create_classifier(inplanes::Integer, hidden_planes::Integer, nclasses::Integer, activations::NTuple{2, Any} = (relu, identity); use_conv::NTuple{2, Bool} = (false, false), - pool_layer = AdaptiveMeanPool((1, 1)), dropout_rate = nothing) + pool_layer = AdaptiveMeanPool((1, 1)), dropout_prob = nothing) fc_layers = [uc ? Conv$(1, 1) : Dense for uc in use_conv] # Decide whether to flatten the input or not flatten_in_pool = !use_conv[1] && pool_layer !== identity @@ -86,7 +86,7 @@ function create_classifier(inplanes::Integer, hidden_planes::Integer, nclasses:: push!(classifier, fc_layers[1](inplanes => hidden_planes, activations[1])) end # Dropout is applied after the first dense layer - isnothing(dropout_rate) ? nothing : push!(classifier, Dropout(dropout_rate)) + isnothing(dropout_prob) ? nothing : push!(classifier, Dropout(dropout_prob)) # second fully-connected layer push!(classifier, fc_layers[2](hidden_planes => nclasses, activations[2])) return Chain(classifier...) diff --git a/src/layers/drop.jl b/src/layers/drop.jl index 6f9dc7d1a..21ebbd56e 100644 --- a/src/layers/drop.jl +++ b/src/layers/drop.jl @@ -46,7 +46,8 @@ end # Dispatch for GPU dropblock_mask(rng::CUDA.RNG, x::CuArray, gamma, bs) = _dropblock_mask(rng, x, gamma, bs) function dropblock_mask(rng, x::CuArray, gamma, bs) - throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropblock only supports CUDA.RNG for CuArrays.")) + throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). 
dropblock only supports CUDA.RNG for CuArrays."))
+    throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropblock only supports
+                        CUDA.RNG for CuArrays."))
 end
 # Dispatch for CPU
 dropblock_mask(rng, x, gamma, bs) = _dropblock_mask(rng, x, gamma, bs)
@@ -93,7 +94,8 @@ end
 trainable(a::DropBlock) = (;)
 
 function _dropblock_checks(x::AbstractArray{<:Any, 4}, drop_block_prob, gamma_scale)
-    @assert 0≤drop_block_prob≤1 "drop_block_prob must be between 0 and 1, got $drop_block_prob"
+    @assert 0≤drop_block_prob≤1 "drop_block_prob must be between 0 and 1, got
+                                 $drop_block_prob"
     @assert 0≤gamma_scale≤1 "gamma_scale must be between 0 and 1, got $gamma_scale"
 end
 function _dropblock_checks(x, drop_block_prob, gamma_scale)
@@ -126,36 +128,47 @@ function Base.show(io::IO, d::DropBlock)
     return print(io, ")")
 end
 
-# TODO look into "row" mode for stochastic depth
 """
-    DropPath(p; [rng = rng_from_array(x)])
+    StochasticDepth(p, mode = :row; rng = rng_from_array())
 
-Implements Stochastic Depth - equivalent to `Dropout(p; dims = 4)` when `0 ≤ p ≤ 1` and
-`identity` if p is `nothing`.
+Implements Stochastic Depth. This is a `Dropout` layer from Flux that drops values
+with probability `p`.
 ([reference](https://arxiv.org/abs/1603.09382))
 
 This layer can be used to drop certain blocks in a residual structure and allow them to
 propagate completely through the skip connection. It can be used in two ways: either with
 all blocks having the same survival probability or with a linear scaling rule across the
-blocks. This is performed only at training time. At test time, the `DropPath` layer is
+blocks. This is performed only at training time. At test time, the `StochasticDepth` layer is
 equivalent to `identity`.
 
 !!! warning
 
     In the case of the linear scaling rule, the calculations of survival probabilities for each
     block may lead to a survival probability > 1 for a given block. This will lead to
-    `DropPath` erroring. This usually happens with a low number of blocks and a high base
+    `StochasticDepth` erroring. This usually happens with a low number of blocks and a high base
     survival probability, so in such cases it is recommended to use a fixed base survival
     probability across blocks. If this is not desired, then a lower base survival probability
     is recommended.
 
 # Arguments
 
-  - `p`: rate of Stochastic Depth.
+  - `p`: probability of Stochastic Depth.
+  - `mode`: Either `:batch` or `:row`. `:batch` randomly zeroes the entire input, while
+    `:row` zeroes randomly selected rows from the batch. The default is `:row`.
  - `rng`: can be used to pass in a custom RNG instead of the default.
    See `Flux.Dropout` for more information on the behaviour of this argument. Custom RNGs
    are only supported on the CPU.
"""
-function DropPath(p; rng = rng_from_array())
-    return isnothing(p) ? identity : Dropout(p; dims = 4, rng)
+function StochasticDepth(p, mode = :row; rng = rng_from_array())
+    if isnothing(p)
+        return identity
+    else
+        if mode === :batch
+            return Dropout(p; dims = 5, rng)
+        elseif mode === :row
+            return Dropout(p; dims = 4, rng)
+        else
+            throw(ArgumentError("mode must be either `:batch` or `:row`, got $mode"))
+        end
+    end
 end
diff --git a/src/layers/mbconv.jl b/src/layers/mbconv.jl
index 93f029f9c..eb6801aaa 100644
--- a/src/layers/mbconv.jl
+++ b/src/layers/mbconv.jl
@@ -1,8 +1,8 @@
 """
     dwsep_conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
-                     activation = relu; eps::Float32 = 1.0f-5, revnorm::Bool = false,
-                     stride::Integer = 1, use_norm::NTuple{2, Bool} = (true, true),
-                     pad::Integer = 0, [bias, weight, init])
+                    activation = relu; eps::Float32 = 1.0f-5, revnorm::Bool = false,
+                    stride::Integer = 1, use_norm::NTuple{2, Bool} = (true, true),
+                    pad::Integer = 0, [bias, weight, init])
 
 Create a depthwise separable convolution chain as used in MobileNetv1.
 This is sequence of layers:
@@ -38,12 +38,11 @@ function dwsep_conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Int
                          bias::NTuple{2, Bool} = (!use_norm[1], !use_norm[2]), kwargs...)
     return vcat(conv_norm(kernel_size, inplanes, inplanes, activation; eps, norm_layer,
                           use_norm = use_norm[1], stride, bias = bias[1],
-                          groups = inplanes, kwargs...),
+                          groups = inplanes, kwargs...), # depthwise convolution
                 conv_norm((1, 1), inplanes, outplanes, activation; eps, norm_layer,
-                          use_norm = use_norm[2], bias = bias[2]))
+                          use_norm = use_norm[2], bias = bias[2])) # pointwise convolution
 end
 
-# TODO add support for stochastic depth to mbconv and fused_mbconv
 """
     mbconv(kernel_size::Dims{2}, inplanes::Integer, explanes::Integer,
            outplanes::Integer, activation = relu; stride::Integer,
diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl
index e6336de9c..e6f3517fc 100644
--- a/src/layers/mlp.jl
+++ b/src/layers/mlp.jl
@@ -1,7 +1,7 @@
 # TODO @theabhirath figure out consistent behaviour for dropout rates - 0.0 vs `nothing`
 """
     mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes;
-              dropout_rate = 0., activation = gelu)
+              dropout_prob = 0., activation = gelu)
 
 Feedforward block used in many MLPMixer-like and vision-transformer models.
 
@@ -10,18 +10,18 @@ Feedforward block used in many MLPMixer-like and vision-transformer models.
   - `inplanes`: Number of dimensions in the input.
   - `hidden_planes`: Number of dimensions in the intermediate layer.
   - `outplanes`: Number of dimensions in the output - by default it is the same as `inplanes`.
-  - `dropout_rate`: Dropout rate.
+  - `dropout_prob`: Dropout probability.
   - `activation`: Activation function to use.
 """
 function mlp_block(inplanes::Integer, hidden_planes::Integer,
                    outplanes::Integer = inplanes;
-                   dropout_rate = 0.0, activation = gelu)
-    return Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout_rate),
-                 Dense(hidden_planes, outplanes), Dropout(dropout_rate))
+                   dropout_prob = 0.0, activation = gelu)
+    return Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout_prob),
+                 Dense(hidden_planes, outplanes), Dropout(dropout_prob))
 end
 
 """
     gated_mlp(gate_layer, inplanes::Integer, hidden_planes::Integer,
-              outplanes::Integer = inplanes; dropout_rate = 0.0, activation = gelu)
+              outplanes::Integer = inplanes; dropout_prob = 0.0, activation = gelu)
 
 Feedforward block based on the implementation in the paper "Pay Attention to MLPs".
([reference](https://arxiv.org/abs/2105.08050)) @@ -32,17 +32,17 @@ Feedforward block based on the implementation in the paper "Pay Attention to MLP - `inplanes`: Number of dimensions in the input. - `hidden_planes`: Number of dimensions in the intermediate layer. - `outplanes`: Number of dimensions in the output - by default it is the same as `inplanes`. - - `dropout_rate`: Dropout rate. + - `dropout_prob`: Dropout probability. - `activation`: Activation function to use. """ function gated_mlp_block(gate_layer, inplanes::Integer, hidden_planes::Integer, - outplanes::Integer = inplanes; dropout_rate = 0.0, + outplanes::Integer = inplanes; dropout_prob = 0.0, activation = gelu) @assert hidden_planes % 2==0 "`hidden_planes` must be even for gated MLP" return Chain(Dense(inplanes, hidden_planes, activation), - Dropout(dropout_rate), + Dropout(dropout_prob), gate_layer(hidden_planes), Dense(hidden_planes ÷ 2, outplanes), - Dropout(dropout_rate)) + Dropout(dropout_prob)) end gated_mlp_block(::typeof(identity), args...; kwargs...) = mlp_block(args...; kwargs...) diff --git a/src/mixers/core.jl b/src/mixers/core.jl index 875136b2e..c9d3949b9 100644 --- a/src/mixers/core.jl +++ b/src/mixers/core.jl @@ -1,6 +1,6 @@ """ mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels::Integer = 3, norm_layer = LayerNorm, - patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0., + patch_size::Dims{2} = (16, 16), embedplanes = 512, stochastic_depth_prob = 0., depth::Integer = 12, nclasses::Integer = 1000, kwargs...) Creates a model with the MLPMixer architecture. @@ -9,26 +9,27 @@ Creates a model with the MLPMixer architecture. # Arguments - `block`: the type of mixer block to use in the model - architecture dependent - (a constructor of the form `block(embedplanes, npatches; drop_path_rate, kwargs...)`) + (a constructor of the form `block(embedplanes, npatches; stochastic_depth_prob, kwargs...)`) - `imsize`: the size of the input image - `inchannels`: the number of input channels - `norm_layer`: the normalization layer to use in the model - `patch_size`: the size of the patches - `embedplanes`: the number of channels after the patch embedding (denotes the hidden dimension) - - `drop_path_rate`: Stochastic depth rate + - `stochastic_depth_prob`: Stochastic depth probability - `depth`: the number of blocks in the model - `nclasses`: number of output classes - `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if not specified. """ function mlpmixer(block, imsize::Dims{2} = (224, 224); norm_layer = LayerNorm, - patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0.0, + patch_size::Dims{2} = (16, 16), embedplanes = 512, + stochastic_depth_prob = 0.0, depth::Integer = 12, inchannels::Integer = 3, nclasses::Integer = 1000, kwargs...) npatches = prod(imsize .÷ patch_size) - dp_rates = linear_scheduler(drop_path_rate; depth) + dp_rates = linear_scheduler(stochastic_depth_prob; depth) layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], + Chain([block(embedplanes, npatches; stochastic_depth_prob = dp_rates[i], kwargs...) 
for i in 1:depth]...)) classifier = Chain(norm_layer(embedplanes), seconddimmean, Dense(embedplanes, nclasses)) diff --git a/src/mixers/gmlp.jl b/src/mixers/gmlp.jl index 9288a0129..08f64d65a 100644 --- a/src/mixers/gmlp.jl +++ b/src/mixers/gmlp.jl @@ -44,7 +44,7 @@ end """ spatial_gating_block(planes::Integer, npatches::Integer; mlp_ratio = 4.0, norm_layer = LayerNorm, mlp_layer = gated_mlp_block, - dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu) + dropout_prob = 0.0, stochastic_depth_prob = 0.0, activation = gelu) Creates a feedforward block based on the gMLP model architecture described in the paper. ([reference](https://arxiv.org/abs/2105.08050)) @@ -56,19 +56,20 @@ Creates a feedforward block based on the gMLP model architecture described in th - `mlp_ratio`: ratio of the number of hidden channels in the channel mixing MLP to the number of planes in the block - `norm_layer`: the normalisation layer to use - - `dropout_rate`: the dropout rate to use in the MLP blocks - - `drop_path_rate`: Stochastic depth rate + - `dropout_prob`: the dropout probability to use in the MLP blocks + - `stochastic_depth_prob`: Stochastic depth probability - `activation`: the activation function to use in the MLP blocks """ function spatial_gating_block(planes::Integer, npatches::Integer; mlp_ratio = 4.0, norm_layer = LayerNorm, mlp_layer = gated_mlp_block, - dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu) + dropout_prob = 0.0, stochastic_depth_prob = 0.0, + activation = gelu) channelplanes = floor(Int, mlp_ratio * planes) sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) return SkipConnection(Chain(norm_layer(planes), mlp_layer(sgu, planes, channelplanes; activation, - dropout_rate), - DropPath(drop_path_rate)), +) + dropout_prob), + StochasticDepth(stochastic_depth_prob)), +) end """ diff --git a/src/mixers/mlpmixer.jl b/src/mixers/mlpmixer.jl index 5f5079827..e4861d726 100644 --- a/src/mixers/mlpmixer.jl +++ b/src/mixers/mlpmixer.jl @@ -1,6 +1,6 @@ """ mixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, - mlp_ratio = (0.5, 4.0), dropout_rate = 0.0, drop_path_rate = 0.0, + mlp_ratio = (0.5, 4.0), dropout_prob = 0.0, stochastic_depth_prob = 0.0, activation = gelu) Creates a feedforward block for the MLPMixer architecture. @@ -13,24 +13,24 @@ Creates a feedforward block for the MLPMixer architecture. - `mlp_ratio`: number(s) that determine(s) the number of hidden channels in the token mixing MLP and/or the channel mixing MLP as a ratio to the number of planes in the block. 
- `mlp_layer`: the MLP layer to use in the block - - `dropout_rate`: the dropout rate to use in the MLP blocks - - `drop_path_rate`: Stochastic depth rate + - `dropout_prob`: the dropout probability to use in the MLP blocks + - `stochastic_depth_prob`: Stochastic depth probability - `activation`: the activation function to use in the MLP blocks """ function mixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, - mlp_ratio::NTuple{2, Number} = (0.5, 4.0), dropout_rate = 0.0, - drop_path_rate = 0.0, activation = gelu) + mlp_ratio::NTuple{2, Number} = (0.5, 4.0), dropout_prob = 0.0, + stochastic_depth_prob = 0.0, activation = gelu) tokenplanes, channelplanes = floor.(Int, planes .* mlp_ratio) return Chain(SkipConnection(Chain(LayerNorm(planes), swapdims((2, 1, 3)), mlp_layer(npatches, tokenplanes; activation, - dropout_rate), + dropout_prob), swapdims((2, 1, 3)), - DropPath(drop_path_rate)), +), + StochasticDepth(stochastic_depth_prob)), +), SkipConnection(Chain(LayerNorm(planes), mlp_layer(planes, channelplanes; activation, - dropout_rate), - DropPath(drop_path_rate)), +)) + dropout_prob), + StochasticDepth(stochastic_depth_prob)), +)) end """ @@ -45,7 +45,7 @@ Creates a model with the MLPMixer architecture. - `config`: the size of the model - one of `small`, `base`, `large` or `huge` - `patch_size`: the size of the patches - `imsize`: the size of the input image - - `drop_path_rate`: Stochastic depth rate + - `stochastic_depth_prob`: Stochastic depth probability - `inchannels`: the number of input channels - `nclasses`: number of output classes diff --git a/src/mixers/resmlp.jl b/src/mixers/resmlp.jl index 370ed04b2..b307a8e3e 100644 --- a/src/mixers/resmlp.jl +++ b/src/mixers/resmlp.jl @@ -1,5 +1,5 @@ """ - resmixerblock(planes, npatches; dropout_rate = 0., drop_path_rate = 0., mlp_ratio = 4.0, + resmixerblock(planes, npatches; dropout_prob = 0., stochastic_depth_prob = 0., mlp_ratio = 4.0, activation = gelu, layerscale_init = 1e-4) Creates a block for the ResMixer architecture. @@ -12,25 +12,25 @@ Creates a block for the ResMixer architecture. 
- `mlp_ratio`: ratio of the number of hidden channels in the channel mixing MLP to the number of planes in the block - `mlp_layer`: the MLP block to use - - `dropout_rate`: the dropout rate to use in the MLP blocks - - `drop_path_rate`: Stochastic depth rate + - `dropout_prob`: the dropout probability to use in the MLP blocks + - `stochastic_depth_prob`: Stochastic depth probability - `activation`: the activation function to use in the MLP blocks - `layerscale_init`: initialisation constant for the LayerScale """ function resmixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, - mlp_ratio = 4.0, layerscale_init = 1e-4, dropout_rate = 0.0, - drop_path_rate = 0.0, activation = gelu) + mlp_ratio = 4.0, layerscale_init = 1e-4, dropout_prob = 0.0, + stochastic_depth_prob = 0.0, activation = gelu) return Chain(SkipConnection(Chain(Flux.Scale(planes), swapdims((2, 1, 3)), Dense(npatches, npatches), swapdims((2, 1, 3)), LayerScale(planes, layerscale_init), - DropPath(drop_path_rate)), +), + StochasticDepth(stochastic_depth_prob)), +), SkipConnection(Chain(Flux.Scale(planes), mlp_layer(planes, floor(Int, mlp_ratio * planes); - dropout_rate, activation), + dropout_prob, activation), LayerScale(planes, layerscale_init), - DropPath(drop_path_rate)), +)) + StochasticDepth(stochastic_depth_prob)), +)) end """ diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 84c7eb910..d9688d229 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -1,5 +1,5 @@ """ - transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout_rate = 0.) + transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout_prob = 0.) Transformer as used in the base ViT architecture. ([reference](https://arxiv.org/abs/2010.11929)). @@ -10,26 +10,26 @@ Transformer as used in the base ViT architecture. - `depth`: number of attention blocks - `nheads`: number of attention heads - `mlp_ratio`: ratio of MLP layers to the number of input channels - - `dropout_rate`: dropout rate + - `dropout_prob`: dropout probability """ function transformer_encoder(planes::Integer, depth::Integer, nheads::Integer; - mlp_ratio = 4.0, dropout_rate = 0.0) + mlp_ratio = 4.0, dropout_prob = 0.0) layers = [Chain(SkipConnection(prenorm(planes, MHAttention(planes, nheads; - attn_dropout_rate = dropout_rate, - proj_dropout_rate = dropout_rate)), + attn_dropout_prob = dropout_prob, + proj_dropout_prob = dropout_prob)), +), SkipConnection(prenorm(planes, mlp_block(planes, floor(Int, mlp_ratio * planes); - dropout_rate)), +)) + dropout_prob)), +)) for _ in 1:depth] return Chain(layers) end """ vit(imsize::Dims{2} = (256, 256); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), - embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout_rate = 0.1, - emb_dropout_rate = 0.1, pool = :class, nclasses::Integer = 1000) + embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout_prob = 0.1, + emb_dropout_prob = 0.1, pool = :class, nclasses::Integer = 1000) Creates a Vision Transformer (ViT) model. ([reference](https://arxiv.org/abs/2010.11929)). @@ -43,24 +43,24 @@ Creates a Vision Transformer (ViT) model. 
- `depth`: number of blocks in the transformer - `nheads`: number of attention heads in the transformer - `mlpplanes`: number of hidden channels in the MLP block in the transformer - - `dropout_rate`: dropout rate - - `emb_dropout`: dropout rate for the positional embedding layer + - `dropout_prob`: dropout probability + - `emb_dropout`: dropout probability for the positional embedding layer - `pool`: pooling type, either :class or :mean - `nclasses`: number of classes in the output """ function vit(imsize::Dims{2} = (256, 256); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes::Integer = 768, - depth::Integer = 6, nheads::Integer = 16, mlp_ratio = 4.0, dropout_rate = 0.1, - emb_dropout_rate = 0.1, pool::Symbol = :class, nclasses::Integer = 1000) + depth::Integer = 6, nheads::Integer = 16, mlp_ratio = 4.0, dropout_prob = 0.1, + emb_dropout_prob = 0.1, pool::Symbol = :class, nclasses::Integer = 1000) @assert pool in [:class, :mean] "Pool type must be either `:class` (class token) or `:mean` (mean pooling)" npatches = prod(imsize .÷ patch_size) return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), ClassTokens(embedplanes), ViPosEmbedding(embedplanes, npatches + 1), - Dropout(emb_dropout_rate), + Dropout(emb_dropout_prob), transformer_encoder(embedplanes, depth, nheads; mlp_ratio, - dropout_rate), + dropout_prob), pool === :class ? x -> x[:, 1, :] : seconddimmean), Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) end diff --git a/test/convnets.jl b/test/convnets.jl index 0c796e24c..13079a8af 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -42,9 +42,9 @@ end ] @testset for layers in layer_list drop_list = [ - (dropout_rate = 0.1, drop_path_rate = 0.1, drop_block_rate = 0.1), - (dropout_rate = 0.5, drop_path_rate = 0.5, drop_block_rate = 0.5), - (dropout_rate = 0.8, drop_path_rate = 0.8, drop_block_rate = 0.8), + (dropout_prob = 0.1, stochastic_depth_prob = 0.1, dropblock_prob = 0.1), + (dropout_prob = 0.5, stochastic_depth_prob = 0.5, dropblock_prob = 0.5), + (dropout_prob = 0.8, stochastic_depth_prob = 0.8, dropblock_prob = 0.8), ] @testset for drop_rates in drop_list m = Metalhead.resnet(block_fn, layers; drop_rates...) 
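For readers following the renames in this patch, a minimal sketch of the new `StochasticDepth` layer is shown below. The `Metalhead.Layers` import path and the array shapes are assumptions for illustration; the layer itself is a thin wrapper over Flux's `Dropout` with the appropriate `dims`.

```julia
using Flux
using Metalhead.Layers: StochasticDepth  # assumed import path for the Layers submodule

sd_row = StochasticDepth(0.2)            # default :row mode, i.e. Dropout(0.2; dims = 4)
sd_batch = StochasticDepth(0.2, :batch)  # zeroes the whole input with probability 0.2

x = rand(Float32, 16, 16, 8, 4)  # a WHCN batch of four images
Flux.trainmode!(sd_row)
y = sd_row(x)  # each of the four batch members is kept or zeroed independently
```

Passing `p = nothing` returns `identity`, which is how the model builders disable stochastic depth entirely.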
From 4cdcaef3c34ce097d1ba854c7474693b66bae8c8 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sun, 28 Aug 2022 08:36:53 +0530 Subject: [PATCH 4/8] Add `StochasticDepth` to EfficientNets More cleanup --- src/convnets/builders/irmodel.jl | 19 +++++----- src/convnets/builders/mbconv.jl | 39 +++++++++++--------- src/convnets/builders/resblocks.jl | 20 +++++----- src/convnets/builders/resnet.jl | 4 +- src/convnets/convnext.jl | 4 +- src/convnets/efficientnets/efficientnet.jl | 10 ++--- src/convnets/efficientnets/efficientnetv2.jl | 11 +++--- src/convnets/mobilenets/mnasnet.jl | 4 +- src/convnets/mobilenets/mobilenetv1.jl | 6 +-- src/convnets/mobilenets/mobilenetv2.jl | 4 +- src/convnets/mobilenets/mobilenetv3.jl | 8 ++-- src/convnets/resnets/core.jl | 4 +- src/layers/drop.jl | 18 --------- src/layers/mlp.jl | 2 +- src/mixers/core.jl | 5 ++- src/utilities.jl | 17 +++++---- test/convnets.jl | 4 +- 17 files changed, 85 insertions(+), 94 deletions(-) diff --git a/src/convnets/builders/irmodel.jl b/src/convnets/builders/irmodel.jl index d65ea6531..9c16608f3 100644 --- a/src/convnets/builders/irmodel.jl +++ b/src/convnets/builders/irmodel.jl @@ -1,9 +1,9 @@ -function irmodelbuilder(scalings::NTuple{2, Real}, block_configs::AbstractVector{<:Tuple}; - inplanes::Integer = 32, connection = +, activation = relu, - norm_layer = BatchNorm, divisor::Integer = 8, - tail_conv::Bool = true, expanded_classifier::Bool = false, - headplanes::Integer, dropout_prob = nothing, - inchannels::Integer = 3, nclasses::Integer = 1000, kwargs...) +function build_irmodel(scalings::NTuple{2, Real}, block_configs::AbstractVector{<:Tuple}; + inplanes::Integer = 32, connection = +, activation = relu, + norm_layer = BatchNorm, divisor::Integer = 8, tail_conv::Bool = true, + expanded_classifier::Bool = false, stochastic_depth_prob = nothing, + headplanes::Integer, dropout_prob = nothing, inchannels::Integer = 3, + nclasses::Integer = 1000, kwargs...) width_mult, _ = scalings # building first layer inplanes = _round_channels(inplanes * width_mult, divisor) @@ -13,7 +13,8 @@ function irmodelbuilder(scalings::NTuple{2, Real}, block_configs::AbstractVector norm_layer)) # building inverted residual blocks get_layers, block_repeats = mbconv_stage_builder(block_configs, inplanes, scalings; - norm_layer, divisor, kwargs...) + stochastic_depth_prob, norm_layer, + divisor, kwargs...) append!(layers, cnn_stages(get_layers, block_repeats, connection)) # building last layers outplanes = _round_channels(block_configs[end][3] * width_mult, divisor) @@ -36,6 +37,6 @@ function irmodelbuilder(scalings::NTuple{2, Real}, block_configs::AbstractVector return Chain(Chain(layers...), classifier) end -function irmodelbuilder(width_mult::Real, block_configs::AbstractVector{<:Tuple}; kwargs...) - return irmodelbuilder((width_mult, 1), block_configs; kwargs...) +function build_irmodel(width_mult::Real, block_configs::AbstractVector{<:Tuple}; kwargs...) + return build_irmodel((width_mult, 1), block_configs; kwargs...) end diff --git a/src/convnets/builders/mbconv.jl b/src/convnets/builders/mbconv.jl index c430c247f..fbf52a03f 100644 --- a/src/convnets/builders/mbconv.jl +++ b/src/convnets/builders/mbconv.jl @@ -23,10 +23,10 @@ end function _get_builder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple}, inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; - norm_layer = BatchNorm, divisor::Integer = 8, - se_from_explanes::Bool = false, kwargs...) 
+ stochastic_depth_prob = nothing, norm_layer = BatchNorm, + divisor::Integer = 8, se_from_explanes::Bool = false, kwargs...) width_mult, depth_mult = scalings - block_repeats = [ceil(Int, block_configs[idx][end - 3] * depth_mult) + block_repeats = [ceil(Int, block_configs[idx][end - 2] * depth_mult) for idx in eachindex(block_configs)] block_fn, k, outplanes, expansion, stride, _, reduction, activation = block_configs[stage_idx] # calculate number of reduced channels for squeeze-excite layer from explanes instead of inplanes @@ -37,31 +37,37 @@ function _get_builder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple}, inplanes = _round_channels(block_configs[stage_idx - 1][3] * width_mult, divisor) end outplanes = _round_channels(outplanes * width_mult, divisor) - pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats)) + sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats)) function get_layers(block_idx::Integer) inplanes = block_idx == 1 ? inplanes : outplanes explanes = _round_channels(inplanes * expansion, divisor) stride = block_idx == 1 ? stride : 1 block = block_fn((k, k), inplanes, explanes, outplanes, activation; norm_layer, stride, reduction, kwargs...) - schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx - drop_path = StochasticDepth(pathschedule(schedule_idx)) - return stride == 1 && inplanes == outplanes ? (drop_path, block) : (block,) + use_skip = stride == 1 && inplanes == outplanes + if use_skip + schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx + + drop_path = StochasticDepth(sdschedule[schedule_idx]) + return (drop_path, block) + else + return (block,) + end end - return get_layers, ceil(Int, nrepeats * depth_mult) + return get_layers, block_repeats[stage_idx] end function _get_builder(::typeof(fused_mbconv), block_configs::AbstractVector{<:Tuple}, inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; - norm_layer = BatchNorm, divisor::Integer = 8, kwargs...) - width_mult, depth_mult = scaling - block_repeats = [ceil(Int, block_configs[idx][end - 2] * depth_mult) + stochastic_depth_prob = nothing, norm_layer = BatchNorm, + divisor::Integer = 8, kwargs...) + width_mult, depth_mult = scalings + block_repeats = [ceil(Int, block_configs[idx][end - 1] * depth_mult) for idx in eachindex(block_configs)] block_fn, k, outplanes, expansion, stride, _, activation = block_configs[stage_idx] inplanes = stage_idx == 1 ? inplanes : block_configs[stage_idx - 1][3] outplanes = _round_channels(outplanes * width_mult, divisor) - block_repeats = sum(block_configs[idx][4] for idx in 1:stage_idx) - pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats)) + sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats)) function get_layers(block_idx::Integer) inplanes = block_idx == 1 ? inplanes : outplanes explanes = _round_channels(inplanes * expansion, divisor) @@ -69,7 +75,7 @@ function _get_builder(::typeof(fused_mbconv), block_configs::AbstractVector{<:Tu block = block_fn((k, k), inplanes, explanes, outplanes, activation; norm_layer, stride, kwargs...) schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx - drop_path = StochasticDepth(pathschedule(schedule_idx)) + drop_path = StochasticDepth(sdschedule[schedule_idx]) return stride == 1 && inplanes == outplanes ? 
(drop_path, block) : (block,) end return get_layers, block_repeats[stage_idx] @@ -77,8 +83,7 @@ end function mbconv_stage_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer, scalings::NTuple{2, Real}; kwargs...) - bxs = [_get_builder(block_configs[1], block_configs, inplanes, idx, scalings; - kwargs...) - for idx in eachindex(block_configs)] + bxs = [_get_builder(block_configs[idx][1], block_configs, inplanes, idx, scalings; + kwargs...) for idx in eachindex(block_configs)] return (stage_idx, block_idx) -> first.(bxs)[stage_idx](block_idx), last.(bxs) end diff --git a/src/convnets/builders/resblocks.jl b/src/convnets/builders/resblocks.jl index e82557305..506c622e3 100644 --- a/src/convnets/builders/resblocks.jl +++ b/src/convnets/builders/resblocks.jl @@ -6,14 +6,14 @@ function basicblock_builder(block_repeats::AbstractVector{<:Integer}; dropblock_prob = nothing, stochastic_depth_prob = nothing, stride_fn = resnet_stride, planes_fn = resnet_planes, downsample_tuple = (downsample_conv, downsample_identity)) - # DropBlock, StochasticDepth both take in rates based on a linear scaling schedule + # DropBlock, StochasticDepth both take in probabilities based on a linear scaling schedule # Also get `planes_vec` needed for block `inplanes` and `planes` calculations - pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats)) - blockschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats)) + sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats)) + dbschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats)) planes_vec = collect(planes_fn(block_repeats)) # closure over `idxs` function get_layers(stage_idx::Integer, block_idx::Integer) - # DropBlock, StochasticDepth both take in rates based on a linear scaling schedule + # DropBlock, StochasticDepth both take in probabilities based on a linear scaling schedule # This is also needed for block `inplanes` and `planes` calculations schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx planes = planes_vec[schedule_idx] @@ -23,8 +23,8 @@ function basicblock_builder(block_repeats::AbstractVector{<:Integer}; stride = stride_fn(stage_idx, block_idx) downsample_fn = stride != 1 || inplanes != planes * expansion ? 
downsample_tuple[1] : downsample_tuple[2] - drop_path = StochasticDepth(pathschedule[schedule_idx]) - drop_block = DropBlock(blockschedule[schedule_idx]) + drop_path = StochasticDepth(sdschedule[schedule_idx]) + drop_block = DropBlock(dbschedule[schedule_idx]) block = basicblock(inplanes, planes; stride, reduction_factor, activation, norm_layer, revnorm, attn_fn, drop_path, drop_block) downsample = downsample_fn(inplanes, planes * expansion; stride, norm_layer, @@ -43,8 +43,8 @@ function bottleneck_builder(block_repeats::AbstractVector{<:Integer}; dropblock_prob = nothing, stochastic_depth_prob = nothing, stride_fn = resnet_stride, planes_fn = resnet_planes, downsample_tuple = (downsample_conv, downsample_identity)) - pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats)) - blockschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats)) + sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats)) + dbschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats)) planes_vec = collect(planes_fn(block_repeats)) # closure over `idxs` function get_layers(stage_idx::Integer, block_idx::Integer) @@ -58,8 +58,8 @@ function bottleneck_builder(block_repeats::AbstractVector{<:Integer}; stride = stride_fn(stage_idx, block_idx) downsample_fn = stride != 1 || inplanes != planes * expansion ? downsample_tuple[1] : downsample_tuple[2] - drop_path = StochasticDepth(pathschedule[schedule_idx]) - drop_block = DropBlock(blockschedule[schedule_idx]) + drop_path = StochasticDepth(sdschedule[schedule_idx]) + drop_block = DropBlock(dbschedule[schedule_idx]) block = bottleneck(inplanes, planes; stride, cardinality, base_width, reduction_factor, activation, norm_layer, revnorm, attn_fn, drop_path, drop_block) diff --git a/src/convnets/builders/resnet.jl b/src/convnets/builders/resnet.jl index a77ff4c6d..5bcc16960 100644 --- a/src/convnets/builders/resnet.jl +++ b/src/convnets/builders/resnet.jl @@ -1,5 +1,5 @@ -function resnetbuilder(img_dims, stem, get_layers, block_repeats::AbstractVector{<:Integer}, - connection, classifier_fn) +function build_resnet(img_dims, stem, get_layers, block_repeats::AbstractVector{<:Integer}, + connection, classifier_fn) # Build stages of the ResNet stage_blocks = cnn_stages(get_layers, block_repeats, connection) backbone = Chain(stem, stage_blocks...) 
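For context on how these builders are consumed, a hedged sketch of the mid-level `Metalhead.resnet` entry point follows. The keyword names track the renames in this patch and the updated tests; the block function and stage repeats are purely illustrative.

```julia
using Metalhead

# a ResNet-50-style model with all three regularisation schemes enabled;
# each probability is spread across the blocks by `linear_scheduler`
m = Metalhead.resnet(Metalhead.bottleneck, [3, 4, 6, 3];
                     dropout_prob = 0.1,           # classifier-head dropout
                     stochastic_depth_prob = 0.1,  # StochasticDepth on residual branches
                     dropblock_prob = 0.1,         # DropBlock inside the blocks
                     nclasses = 100)
```

With the `linear_scheduler` tweak later in this patch, `linear_scheduler(0.1; depth = 4)` yields `[0.0, 0.025, 0.05, 0.075]`: evenly spaced probabilities that exclude the endpoint.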
diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index 04513220d..283c8662e 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -56,11 +56,11 @@ function convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:In norm_layer = ChannelLayerNorm, revnorm = true)...)) end stages = [] - dp_rates = linear_scheduler(stochastic_depth_prob; depth = sum(depths)) + sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(depths)) cur = 0 for i in eachindex(depths) push!(stages, - [convnextblock(planes[i], dp_rates[cur + j], layerscale_init) + [convnextblock(planes[i], sdschedule[cur + j], layerscale_init) for j in 1:depths[i]]) cur += depths[i] end diff --git a/src/convnets/efficientnets/efficientnet.jl b/src/convnets/efficientnets/efficientnet.jl index 08b4b87dd..bbd864a1a 100644 --- a/src/convnets/efficientnets/efficientnet.jl +++ b/src/convnets/efficientnets/efficientnet.jl @@ -31,15 +31,15 @@ const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)), :b7 => (600, (2.0, 3.1)), :b8 => (672, (2.2, 3.6))) -function efficientnet(config::Symbol; norm_layer = BatchNorm, +function efficientnet(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2, dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(config, keys(EFFICIENTNET_GLOBAL_CONFIGS)) scalings = EFFICIENTNET_GLOBAL_CONFIGS[config][2] - return irmodelbuilder(scalings, EFFICIENTNET_BLOCK_CONFIGS; inplanes = 32, - norm_layer, activation = swish, - headplanes = EFFICIENTNET_BLOCK_CONFIGS[end][3] * 4, - dropout_prob, inchannels, nclasses) + return build_irmodel(scalings, EFFICIENTNET_BLOCK_CONFIGS; inplanes = 32, + norm_layer, stochastic_depth_prob, activation = swish, + headplanes = EFFICIENTNET_BLOCK_CONFIGS[end][3] * 4, + dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/efficientnets/efficientnetv2.jl b/src/convnets/efficientnets/efficientnetv2.jl index e792cd1da..5ea35740c 100644 --- a/src/convnets/efficientnets/efficientnetv2.jl +++ b/src/convnets/efficientnets/efficientnetv2.jl @@ -35,13 +35,14 @@ const EFFNETV2_CONFIGS = Dict(:small => [(fused_mbconv, 3, 24, 1, 1, 2, swish), (mbconv, 3, 512, 6, 2, 32, 4, swish), (mbconv, 3, 768, 6, 1, 8, 4, swish)]) -function efficientnetv2(config::Symbol; norm_layer = BatchNorm, dropout_prob = nothing, - inchannels::Integer = 3, nclasses::Integer = 1000) +function efficientnetv2(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2, + dropout_prob = nothing, inchannels::Integer = 3, + nclasses::Integer = 1000) _checkconfig(config, keys(EFFNETV2_CONFIGS)) block_configs = EFFNETV2_CONFIGS[config] - return irmodelbuilder((1, 1), block_configs; activation = swish, norm_layer, - inplanes = block_configs[1][3], headplanes = 1280, - dropout_prob, inchannels, nclasses) + return build_irmodel((1, 1), block_configs; activation = swish, norm_layer, + inplanes = block_configs[1][3], headplanes = 1280, + stochastic_depth_prob, dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/mobilenets/mnasnet.jl b/src/convnets/mobilenets/mnasnet.jl index 6c9cd40d6..cf1f597fb 100644 --- a/src/convnets/mobilenets/mnasnet.jl +++ b/src/convnets/mobilenets/mnasnet.jl @@ -45,8 +45,8 @@ function mnasnet(config::Symbol; width_mult::Real = 1, max_width::Integer = 1280 # momentum used for BatchNorm is as per Tensorflow implementation norm_layer = (args...; kwargs...) -> BatchNorm(args...; momentum = 0.0003f0, kwargs...) 
inplanes, block_configs = MNASNET_CONFIGS[config] - return irmodelbuilder(width_mult, block_configs; inplanes, norm_layer, - headplanes = max_width, dropout_prob, inchannels, nclasses) + return build_irmodel(width_mult, block_configs; inplanes, norm_layer, + headplanes = max_width, dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/mobilenets/mobilenetv1.jl b/src/convnets/mobilenets/mobilenetv1.jl index e71c25451..8d534d111 100644 --- a/src/convnets/mobilenets/mobilenetv1.jl +++ b/src/convnets/mobilenets/mobilenetv1.jl @@ -16,9 +16,9 @@ const MOBILENETV1_CONFIGS = [ function mobilenetv1(width_mult::Real = 1; inplanes::Integer = 32, dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) - return irmodelbuilder(width_mult, MOBILENETV1_CONFIGS; inplanes, inchannels, - activation = relu6, connection = nothing, tail_conv = false, - headplanes = 1024, dropout_prob, nclasses) + return build_irmodel(width_mult, MOBILENETV1_CONFIGS; inplanes, inchannels, + activation = relu6, connection = nothing, tail_conv = false, + headplanes = 1024, dropout_prob, nclasses) end """ diff --git a/src/convnets/mobilenets/mobilenetv2.jl b/src/convnets/mobilenets/mobilenetv2.jl index 7f4279793..271cb1e37 100644 --- a/src/convnets/mobilenets/mobilenetv2.jl +++ b/src/convnets/mobilenets/mobilenetv2.jl @@ -21,8 +21,8 @@ const MOBILENETV2_CONFIGS = [ function mobilenetv2(width_mult::Real = 1; max_width::Integer = 1280, inplanes::Integer = 32, dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) - return irmodelbuilder(width_mult, MOBILENETV2_CONFIGS; activation = relu6, inplanes, - headplanes = max_width, dropout_prob, inchannels, nclasses) + return build_irmodel(width_mult, MOBILENETV2_CONFIGS; activation = relu6, inplanes, + headplanes = max_width, dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/mobilenets/mobilenetv3.jl b/src/convnets/mobilenets/mobilenetv3.jl index 293435271..1fa4cab26 100644 --- a/src/convnets/mobilenets/mobilenetv3.jl +++ b/src/convnets/mobilenets/mobilenetv3.jl @@ -38,10 +38,10 @@ function mobilenetv3(config::Symbol; width_mult::Real = 1, dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(config, [:small, :large]) max_width, block_configs = MOBILENETV3_CONFIGS[config] - return irmodelbuilder(width_mult, block_configs; inplanes = 16, - headplanes = max_width, activation = relu, - se_from_explanes = true, se_round_fn = _round_channels, - expanded_classifier = true, dropout_prob, inchannels, nclasses) + return build_irmodel(width_mult, block_configs; inplanes = 16, + headplanes = max_width, activation = relu, + se_from_explanes = true, se_round_fn = _round_channels, + expanded_classifier = true, dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl index 1b15c35f8..763369388 100644 --- a/src/convnets/resnets/core.jl +++ b/src/convnets/resnets/core.jl @@ -244,8 +244,8 @@ function resnet(block_type, block_repeats::AbstractVector{<:Integer}, end classifier_fn = nfeatures -> create_classifier(nfeatures, nclasses; dropout_prob, pool_layer, use_conv) - return resnetbuilder((imsize..., inchannels), stem, get_layers, block_repeats, - connection$activation, classifier_fn) + return build_resnet((imsize..., inchannels), stem, get_layers, block_repeats, + connection$activation, classifier_fn) end function resnet(block_fn, block_repeats, downsample_opt::Symbol = :B; kwargs...) 
return resnet(block_fn, block_repeats, RESNET_SHORTCUTS[downsample_opt]; kwargs...) diff --git a/src/layers/drop.jl b/src/layers/drop.jl index 21ebbd56e..f9c516c3c 100644 --- a/src/layers/drop.jl +++ b/src/layers/drop.jl @@ -62,15 +62,6 @@ It can be used in two ways: either with all blocks having the same survival prob or with a linear scaling rule across the blocks. This is performed only at training time. At test time, the `DropBlock` layer is equivalent to `identity`. -!!! warning - - In the case of the linear scaling rule, the calculations of survival probabilities for each - block may lead to a survival probability > 1 for a given block. This will lead to - `DropBlock` erroring. This usually happens with a low number of blocks and a high base - survival probability, so in such cases it is recommended to use a fixed base survival - probability across blocks. If this is not desired, then a lower base survival probability - is recommended. - ([reference](https://arxiv.org/abs/1810.12890)) # Arguments @@ -141,15 +132,6 @@ all blocks having the same survival probability or with a linear scaling rule ac blocks. This is performed only at training time. At test time, the `StochasticDepth` layer is equivalent to `identity`. -!!! warning - - In the case of the linear scaling rule, the calculations of survival probabilities for each - block may lead to a survival probability > 1 for a given block. This will lead to - `StochasticDepth` erroring. This usually happens with a low number of blocks and a high base - survival probability, so in such cases it is recommended to use a fixed base survival - probability across blocks. If this is not desired, then a lower base survival probability - is recommended. - # Arguments - `p`: probability of Stochastic Depth. diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl index e6f3517fc..7aa9bc803 100644 --- a/src/layers/mlp.jl +++ b/src/layers/mlp.jl @@ -1,4 +1,4 @@ -# TODO @theabhirath figure out consistent behaviour for dropout rates - 0.0 vs `nothing` +# TODO @theabhirath figure out consistent behaviour for dropout probs - 0.0 vs `nothing` """ mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; dropout_prob = 0., activation = gelu) diff --git a/src/mixers/core.jl b/src/mixers/core.jl index c9d3949b9..4c9c31def 100644 --- a/src/mixers/core.jl +++ b/src/mixers/core.jl @@ -27,9 +27,10 @@ function mlpmixer(block, imsize::Dims{2} = (224, 224); norm_layer = LayerNorm, depth::Integer = 12, inchannels::Integer = 3, nclasses::Integer = 1000, kwargs...) npatches = prod(imsize .÷ patch_size) - dp_rates = linear_scheduler(stochastic_depth_prob; depth) + sdschedule = linear_scheduler(stochastic_depth_prob; depth) layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - Chain([block(embedplanes, npatches; stochastic_depth_prob = dp_rates[i], + Chain([block(embedplanes, npatches; + stochastic_depth_prob = sdschedule[i], kwargs...) 
for i in 1:depth]...)) classifier = Chain(norm_layer(embedplanes), seconddimmean, Dense(embedplanes, nclasses)) diff --git a/src/utilities.jl b/src/utilities.jl index 13b8ec385..cdcd7ee6e 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -66,17 +66,18 @@ function _maybe_big_show(io, model) end """ - linear_scheduler(drop_rate = 0.0; start_value = 0.0, depth) - linear_scheduler(drop_rate::Nothing; depth::Integer) + linear_scheduler(drop_prob = 0.0; start_value = 0.0, depth) + linear_scheduler(drop_prob::Nothing; depth::Integer) -Returns the dropout rates for a given depth using the linear scaling rule. If the -`drop_rate` is `nothing`, it returns a `Vector` of length `depth` with all values -equal to `nothing`. +Returns the dropout probabilities for a given depth using the linear scaling rule. Note +that this returns evenly spaced values between `start_value` and `drop_prob`, not including +`drop_prob`. If `drop_prob` is `nothing`, it returns a `Vector` of length `depth` with all +values equal to `nothing`. """ -function linear_scheduler(drop_rate = 0.0; depth::Integer, start_value = 0.0) - return LinRange(start_value, drop_rate, depth) +function linear_scheduler(drop_prob = 0.0; depth::Integer, start_value = 0.0) + return LinRange(start_value, drop_prob, depth + 1)[1:(end - 1)] end -linear_scheduler(drop_rate::Nothing; depth::Integer) = fill(drop_rate, depth) +linear_scheduler(drop_prob::Nothing; depth::Integer) = fill(drop_prob, depth) # Utility function for depth and configuration checks in models function _checkconfig(config, configs) diff --git a/test/convnets.jl b/test/convnets.jl index 13079a8af..7a6ed42d2 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -46,8 +46,8 @@ end (dropout_prob = 0.5, stochastic_depth_prob = 0.5, dropblock_prob = 0.5), (dropout_prob = 0.8, stochastic_depth_prob = 0.8, dropblock_prob = 0.8), ] - @testset for drop_rates in drop_list - m = Metalhead.resnet(block_fn, layers; drop_rates...) + @testset for drop_probs in drop_list + m = Metalhead.resnet(block_fn, layers; drop_probs...) 
@test size(m(x_224)) == (1000, 1) @test gradtest(m, x_224) _gc() From 7827ca6ec4ef7c5e07d04cd6d84a1a3b11289dc0 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sun, 28 Aug 2022 12:14:59 +0530 Subject: [PATCH 5/8] Add some docstrings --- Project.toml | 4 +- docs/dev-guide/contributing.md | 2 +- docs/tutorials/quickstart.md | 2 +- src/convnets/alexnet.jl | 2 +- src/convnets/builders/irmodel.jl | 1 - src/convnets/builders/mbconv.jl | 47 ++++-- src/convnets/builders/resblocks.jl | 141 ++++++++++++++-- src/convnets/builders/resnet.jl | 35 ++++ src/convnets/builders/stages.jl | 22 ++- src/convnets/convmixer.jl | 12 +- src/convnets/convnext.jl | 6 +- src/convnets/densenet.jl | 4 +- src/convnets/efficientnets/efficientnet.jl | 37 ++++- src/convnets/efficientnets/efficientnetv2.jl | 29 +++- src/convnets/inceptions/googlenet.jl | 2 +- src/convnets/inceptions/inceptionv3.jl | 2 +- src/convnets/mobilenets/mnasnet.jl | 23 ++- src/convnets/mobilenets/mobilenetv1.jl | 20 ++- src/convnets/mobilenets/mobilenetv2.jl | 22 ++- src/convnets/mobilenets/mobilenetv3.jl | 21 ++- src/convnets/resnets/core.jl | 161 ++++++++++++++++--- src/convnets/resnets/res2net.jl | 90 ++--------- src/convnets/resnets/resnet.jl | 4 +- src/convnets/resnets/resnext.jl | 2 +- src/convnets/resnets/seresnet.jl | 4 +- src/convnets/squeezenet.jl | 13 +- src/convnets/vgg.jl | 12 +- src/layers/conv.jl | 2 +- src/layers/drop.jl | 11 +- src/layers/mbconv.jl | 4 +- src/layers/selayers.jl | 2 +- src/mixers/gmlp.jl | 2 +- src/mixers/mlpmixer.jl | 2 +- src/mixers/resmlp.jl | 2 +- src/utilities.jl | 10 +- src/vit-based/vit.jl | 2 +- 36 files changed, 561 insertions(+), 196 deletions(-) diff --git a/Project.toml b/Project.toml index 9664dc79d..2f7df3ed4 100644 --- a/Project.toml +++ b/Project.toml @@ -19,10 +19,10 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] BSON = "0.3.2" -Flux = "0.13" -Functors = "0.2, 0.3" CUDA = "3" ChainRulesCore = "1" +Flux = "0.13" +Functors = "0.2, 0.3" MLUtils = "0.2.10" NNlib = "0.8" NNlibCUDA = "0.2" diff --git a/docs/dev-guide/contributing.md b/docs/dev-guide/contributing.md index 75574b033..f126d7bb8 100644 --- a/docs/dev-guide/contributing.md +++ b/docs/dev-guide/contributing.md @@ -16,7 +16,7 @@ To add a new model architecture to Metalhead.jl, you can [open a PR](https://git - reuse layers from Flux as much as possible (e.g. use `Parallel` before defining a `Bottleneck` struct) - adhere as closely as possible to a reference such as a published paper (i.e. the structure of your model should follow intuitively from the paper) -- use generic functional builders (e.g. [`resnet`](#) is the core function that builds "ResNet-like" models) +- use generic functional builders (e.g. [`Metalhead.resnet`](@ref) is the core function that builds "ResNet-like" models) - use multiple dispatch to add convenience constructors that wrap your functional builder When in doubt, just open a PR! We are more than happy to help review your code to help it align with the rest of the library. After adding a model, you might consider adding some pre-trained weights (see below). diff --git a/docs/tutorials/quickstart.md b/docs/tutorials/quickstart.md index 43149933a..cfb99eb72 100644 --- a/docs/tutorials/quickstart.md +++ b/docs/tutorials/quickstart.md @@ -5,7 +5,7 @@ using Flux, Metalhead ``` -Using a model from Metalhead is as simple as selecting a model from the table of [available models](#). For example, below we use the pre-trained ResNet-18 model. 
+Using a model from Metalhead is as simple as selecting a model from the table of [available models](@ref). For example, below we use the pre-trained ResNet-18 model.
 {cell=quickstart}
 ```julia
 using Flux, Metalhead
diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl
index 3256e4ab8..51cfbf029 100644
--- a/src/convnets/alexnet.jl
+++ b/src/convnets/alexnet.jl
@@ -45,7 +45,7 @@ Create a `AlexNet`.
 
 `AlexNet` does not currently support pretrained weights.
 
-See also [`alexnet`](#).
+See also [`Metalhead.alexnet`](@ref).
 """
 struct AlexNet
     layers::Any
 end
diff --git a/src/convnets/builders/irmodel.jl b/src/convnets/builders/irmodel.jl
index 9c16608f3..2552d2cb1 100644
--- a/src/convnets/builders/irmodel.jl
+++ b/src/convnets/builders/irmodel.jl
@@ -36,7 +36,6 @@ function build_irmodel(scalings::NTuple{2, Real}, block_configs::AbstractVector{
     end
     return Chain(Chain(layers...), classifier)
 end
-
 function build_irmodel(width_mult::Real, block_configs::AbstractVector{<:Tuple}; kwargs...)
     return build_irmodel((width_mult, 1), block_configs; kwargs...)
 end
diff --git a/src/convnets/builders/mbconv.jl b/src/convnets/builders/mbconv.jl
index fbf52a03f..73cd70c32 100644
--- a/src/convnets/builders/mbconv.jl
+++ b/src/convnets/builders/mbconv.jl
@@ -1,10 +1,24 @@
-# TODO - potentially make these builders more flexible to specify stuff like
-# activation functions and reductions that don't change over the stages
+"""
+    irblockbuilder(::typeof(irblockfn), block_configs::AbstractVector{<:Tuple},
+                   inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real};
+                   stochastic_depth_prob = nothing, norm_layer = BatchNorm,
+                   divisor::Integer = 8, kwargs...)
+
-function _get_builder(::typeof(dwsep_conv_norm), block_configs::AbstractVector{<:Tuple},
-                      inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real};
-                      stochastic_depth_prob = nothing, norm_layer = BatchNorm,
-                      divisor::Integer = 8, kwargs...)
+Constructs a collection of inverted residual blocks for a given stage. Note that
+this function is not intended to be called directly, but rather by the [`mbconv_stage_builder`](@ref)
+function. It only needs to be extended if the user wishes to add support for a custom
+inverted residual block type.
+
+# Arguments
+
+  - `irblockfn`: the inverted residual block function to use in the block builder. Metalhead
+    defines methods for [`dwsep_conv_norm`](@ref), [`mbconv`](@ref) and [`fused_mbconv`](@ref)
+    as inverted residual blocks.
+"""
+function irblockbuilder(::typeof(dwsep_conv_norm), block_configs::AbstractVector{<:Tuple},
+                        inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real};
+                        stochastic_depth_prob = nothing, norm_layer = BatchNorm,
+                        divisor::Integer = 8, kwargs...)
     width_mult, depth_mult = scalings
     block_fn, k, outplanes, stride, nrepeats, activation = block_configs[stage_idx]
     outplanes = _round_channels(outplanes * width_mult, divisor)
@@ -21,10 +35,10 @@ function _get_builder(::typeof(dwsep_conv_norm), block_configs::AbstractVector{<
     return get_layers, ceil(Int, nrepeats * depth_mult)
 end
 
-function _get_builder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple},
-                      inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real};
-                      stochastic_depth_prob = nothing, norm_layer = BatchNorm,
-                      divisor::Integer = 8, se_from_explanes::Bool = false, kwargs...)
+function irblockbuilder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple},
+                        inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real};
+                        stochastic_depth_prob = nothing, norm_layer = BatchNorm,
+                        divisor::Integer = 8, se_from_explanes::Bool = false, kwargs...)
+function irblockbuilder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple}, + inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; + stochastic_depth_prob = nothing, norm_layer = BatchNorm, + divisor::Integer = 8, se_from_explanes::Bool = false, kwargs...) width_mult, depth_mult = scalings block_repeats = [ceil(Int, block_configs[idx][end - 2] * depth_mult) for idx in eachindex(block_configs)] @@ -47,7 +61,6 @@ function _get_builder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple}, use_skip = stride == 1 && inplanes == outplanes if use_skip schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx - drop_path = StochasticDepth(sdschedule[schedule_idx]) return (drop_path, block) else @@ -57,10 +70,10 @@ function _get_builder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple}, return get_layers, block_repeats[stage_idx] end -function _get_builder(::typeof(fused_mbconv), block_configs::AbstractVector{<:Tuple}, - inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; - stochastic_depth_prob = nothing, norm_layer = BatchNorm, - divisor::Integer = 8, kwargs...) +function irblockbuilder(::typeof(fused_mbconv), block_configs::AbstractVector{<:Tuple}, + inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; + stochastic_depth_prob = nothing, norm_layer = BatchNorm, + divisor::Integer = 8, kwargs...) width_mult, depth_mult = scalings block_repeats = [ceil(Int, block_configs[idx][end - 1] * depth_mult) for idx in eachindex(block_configs)] @@ -83,7 +96,7 @@ end function mbconv_stage_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer, scalings::NTuple{2, Real}; kwargs...) - bxs = [_get_builder(block_configs[idx][1], block_configs, inplanes, idx, scalings; - kwargs...) for idx in eachindex(block_configs)] + bxs = [irblockbuilder(block_configs[idx][1], block_configs, inplanes, idx, scalings; + kwargs...) for idx in eachindex(block_configs)] return (stage_idx, block_idx) -> first.(bxs)[stage_idx](block_idx), last.(bxs) end diff --git a/src/convnets/builders/resblocks.jl b/src/convnets/builders/resblocks.jl index 506c622e3..b1435e4ff 100644 --- a/src/convnets/builders/resblocks.jl +++ b/src/convnets/builders/resblocks.jl @@ -1,10 +1,42 @@ +""" + basicblock_builder(block_repeats::AbstractVector{<:Integer}; + inplanes::Integer = 64, reduction_factor::Integer = 1, + expansion::Integer = 1, norm_layer = BatchNorm, + revnorm::Bool = false, activation = relu, + attn_fn = planes -> identity, + dropblock_prob = nothing, stochastic_depth_prob = nothing, + stride_fn = resnet_stride, planes_fn = resnet_planes, + downsample_tuple = (downsample_conv, downsample_identity)) + +Builder for creating a basic block for a ResNet model. +([reference](https://arxiv.org/abs/1512.03385)) + +# Arguments + + - `block_repeats`: number of repeats of a block in each stage + + - `inplanes`: number of input channels + - `reduction_factor`: reduction factor for the number of channels in each stage + - `expansion`: expansion factor for the number of channels for the block + - `norm_layer`: normalization layer to use + - `revnorm`: set to `true` to place normalization layer before the convolution + - `activation`: activation function to use + - `attn_fn`: attention function to use + - `dropblock_prob`: dropblock probability. Set to `nothing` to disable DropBlock + - `stochastic_depth_prob`: stochastic depth probability. 
Set to `nothing` to disable StochasticDepth + - `stride_fn`: function to use to compute the stride of the block + - `planes_fn`: function to use to compute the number of channels in each block + - `downsample_tuple`: two-element tuple of downsample functions to use. The first one + is used when the number of channels changes in the block, the second one is used + when the number of channels stays the same. +""" function basicblock_builder(block_repeats::AbstractVector{<:Integer}; inplanes::Integer = 64, reduction_factor::Integer = 1, expansion::Integer = 1, norm_layer = BatchNorm, revnorm::Bool = false, activation = relu, - attn_fn = planes -> identity, - dropblock_prob = nothing, stochastic_depth_prob = nothing, - stride_fn = resnet_stride, planes_fn = resnet_planes, + attn_fn = planes -> identity, dropblock_prob = nothing, + stochastic_depth_prob = nothing, stride_fn = resnet_stride, + planes_fn = resnet_planes, downsample_tuple = (downsample_conv, downsample_identity)) # DropBlock, StochasticDepth both take in probabilities based on a linear scaling schedule # Also get `planes_vec` needed for block `inplanes` and `planes` calculations @@ -18,8 +50,6 @@ function basicblock_builder(block_repeats::AbstractVector{<:Integer}; schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx planes = planes_vec[schedule_idx] inplanes = schedule_idx == 1 ? inplanes : planes_vec[schedule_idx - 1] * expansion - # `resnet_stride` is a callback that the user can tweak to change the stride of the - # blocks. It defaults to the standard behaviour as in the paper stride = stride_fn(stage_idx, block_idx) downsample_fn = stride != 1 || inplanes != planes * expansion ? downsample_tuple[1] : downsample_tuple[2] @@ -34,14 +64,48 @@ function basicblock_builder(block_repeats::AbstractVector{<:Integer}; return get_layers end +""" + bottleneck_builder(block_repeats::AbstractVector{<:Integer}; + inplanes::Integer = 64, cardinality::Integer = 1, + base_width::Integer = 64, reduction_factor::Integer = 1, + expansion::Integer = 4, norm_layer = BatchNorm, + revnorm::Bool = false, activation = relu, + attn_fn = planes -> identity, dropblock_prob = nothing, + stochastic_depth_prob = nothing, stride_fn = resnet_stride, + planes_fn = resnet_planes, + downsample_tuple = (downsample_conv, downsample_identity)) + +Builder for creating a bottleneck block for a ResNet/ResNeXt model. +([reference](https://arxiv.org/abs/1611.05431)) + +# Arguments + + - `block_repeats`: number of repeats of a block in each stage + - `inplanes`: number of input channels + - `cardinality`: number of groups for the convolutional layer + - `base_width`: base width for the convolutional layer + - `reduction_factor`: reduction factor for the number of channels in each stage + - `expansion`: expansion factor for the number of channels for the block + - `norm_layer`: normalization layer to use + - `revnorm`: set to `true` to place normalization layer before the convolution + - `activation`: activation function to use + - `attn_fn`: attention function to use + - `dropblock_prob`: dropblock probability. Set to `nothing` to disable DropBlock + - `stochastic_depth_prob`: stochastic depth probability. Set to `nothing` to disable StochasticDepth + - `stride_fn`: function to use to compute the stride of the block + - `planes_fn`: function to use to compute the number of channels in each block + - `downsample_tuple`: two-element tuple of downsample functions to use. 
The first one + is used when the number of channels changes in the block, the second one is used + when the number of channels stays the same. +""" function bottleneck_builder(block_repeats::AbstractVector{<:Integer}; inplanes::Integer = 64, cardinality::Integer = 1, base_width::Integer = 64, reduction_factor::Integer = 1, expansion::Integer = 4, norm_layer = BatchNorm, revnorm::Bool = false, activation = relu, - attn_fn = planes -> identity, - dropblock_prob = nothing, stochastic_depth_prob = nothing, - stride_fn = resnet_stride, planes_fn = resnet_planes, + attn_fn = planes -> identity, dropblock_prob = nothing, + stochastic_depth_prob = nothing, stride_fn = resnet_stride, + planes_fn = resnet_planes, downsample_tuple = (downsample_conv, downsample_identity)) sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats)) dbschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats)) @@ -53,8 +117,6 @@ function bottleneck_builder(block_repeats::AbstractVector{<:Integer}; schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx planes = planes_vec[schedule_idx] inplanes = schedule_idx == 1 ? inplanes : planes_vec[schedule_idx - 1] * expansion - # `resnet_stride` is a callback that the user can tweak to change the stride of the - # blocks. It defaults to the standard behaviour as in the paper stride = stride_fn(stage_idx, block_idx) downsample_fn = stride != 1 || inplanes != planes * expansion ? downsample_tuple[1] : downsample_tuple[2] @@ -69,3 +131,62 @@ function bottleneck_builder(block_repeats::AbstractVector{<:Integer}; end return get_layers end + +""" + bottle2neck_builder(block_repeats::AbstractVector{<:Integer}; + inplanes::Integer = 64, cardinality::Integer = 1, + base_width::Integer = 26, scale::Integer = 4, + expansion::Integer = 4, norm_layer = BatchNorm, + revnorm::Bool = false, activation = relu, + attn_fn = planes -> identity, stride_fn = resnet_stride, + planes_fn = resnet_planes, + downsample_tuple = (downsample_conv, downsample_identity)) + +Builder for creating a bottle2neck block for a Res2Net model. +([reference](https://arxiv.org/abs/1904.01169)) + +# Arguments + + - `block_repeats`: number of repeats of a block in each stage + - `inplanes`: number of input channels + - `cardinality`: number of groups for the convolutional layer + - `base_width`: base width for the convolutional layer + - `scale`: scale for the number of channels in each block + - `expansion`: expansion factor for the number of channels for the block + - `norm_layer`: normalization layer to use + - `revnorm`: set to `true` to place normalization layer before the convolution + - `activation`: activation function to use + - `attn_fn`: attention function to use + - `stride_fn`: function to use to compute the stride of the block + - `planes_fn`: function to use to compute the number of channels in each block + - `downsample_tuple`: two-element tuple of downsample functions to use. The first one + is used when the number of channels changes in the block, the second one is used + when the number of channels stays the same. 
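To make the builder contract concrete, here is a rough sketch of how such a builder plugs into the stage machinery. This is an illustration only: it assumes the internal helpers `basicblock_builder`, `cnn_stages`, and `addact` documented in this series, and the real `Metalhead.resnet` additionally wires in a stem and a classifier.

    using Metalhead, Flux

    # ResNet-18-style layout: two basic blocks per stage.
    block_repeats = [2, 2, 2, 2]

    # `get_layers(stage_idx, block_idx)` returns the (block, downsample) tuple
    # for every position in the network.
    get_layers = Metalhead.basicblock_builder(block_repeats; inplanes = 64)

    # `cnn_stages` repeatedly calls `get_layers` and splats each tuple into
    # `Parallel` using the residual connection below ("add, then activate").
    connection = (xs...) -> Metalhead.addact(relu, xs...)
    stages = Metalhead.cnn_stages(get_layers, block_repeats, connection)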
+""" +function bottle2neck_builder(block_repeats::AbstractVector{<:Integer}; + inplanes::Integer = 64, cardinality::Integer = 1, + base_width::Integer = 26, scale::Integer = 4, + expansion::Integer = 4, norm_layer = BatchNorm, + revnorm::Bool = false, activation = relu, + attn_fn = planes -> identity, stride_fn = resnet_stride, + planes_fn = resnet_planes, + downsample_tuple = (downsample_conv, downsample_identity)) + planes_vec = collect(planes_fn(block_repeats)) + # closure over `idxs` + function get_layers(stage_idx::Integer, block_idx::Integer) + # This is needed for block `inplanes` and `planes` calculations + schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx + planes = planes_vec[schedule_idx] + inplanes = schedule_idx == 1 ? inplanes : planes_vec[schedule_idx - 1] * expansion + stride = stride_fn(stage_idx, block_idx) + downsample_fn = (stride != 1 || inplanes != planes * expansion) ? + downsample_tuple[1] : downsample_tuple[2] + is_first = (stride > 1 || downsample_fn != downsample_tuple[2]) ? true : false + block = bottle2neck(inplanes, planes; stride, cardinality, base_width, scale, + activation, is_first, norm_layer, revnorm, attn_fn) + downsample = downsample_fn(inplanes, planes * expansion; stride, norm_layer, + revnorm) + return block, downsample + end + return get_layers +end diff --git a/src/convnets/builders/resnet.jl b/src/convnets/builders/resnet.jl index 5bcc16960..aa038f728 100644 --- a/src/convnets/builders/resnet.jl +++ b/src/convnets/builders/resnet.jl @@ -1,3 +1,38 @@ +""" + build_resnet(img_dims, stem, get_layers, block_repeats::AbstractVector{<:Integer}, + connection, classifier_fn) + +Creates a ResNet model. + +!!! info + + This is a very generic, flexible but low level API that can be used to create any of the ResNet + variants. For a more user friendly API, see the other version of `resnet`. + +# Arguments + + - `img_dims`: The dimensions of the input image. This is used to determine the number of feature + maps to be passed to the classifier. This should be a tuple of the form `(height, width, channels)`. + - `stem`: The stem of the ResNet model. The stem should be created outside of this function and + passed in as an argument. This is done to allow for more flexibility in creating the stem. + [`resnet_stem`](@ref) is a helper function that Metalhead provides which is recommended for + creating the stem. + - `get_layers` is a function that takes in two inputs - the `stage_idx`, or the index of + the stage, and the `block_idx`, or the index of the block within the stage. It returns a + tuple of layers. If the tuple returned by `get_layers` has more than one element, then + `connection` is used to splat this tuple into `Parallel` - if not, then the only element of + the tuple is directly inserted into the network. `get_layers` is a very specific function + and should not be created on its own. Instead, use one of the builders provided by Metalhead + to create it. + - `block_repeats`: This is a `Vector` of integers that specifies the number of repeats of each + block in each stage. + - `connection`: This is a function that determines the residual connection in the model. For + `resnets`, either of [`Metalhead.addact`](@ref) or [`Metalhead.actadd`](@ref) is recommended. + - `classifier_fn`: This is a function that takes in the number of feature maps and returns a + classifier. This is usually built as a closure using a function like [`Metalhead.create_classifier`](@ref). 
+ For example, if the number of output classes is `nclasses`, then the function can be defined as + `channels -> create_classifier(channels, nclasses)`. +""" function build_resnet(img_dims, stem, get_layers, block_repeats::AbstractVector{<:Integer}, connection, classifier_fn) # Build stages of the ResNet diff --git a/src/convnets/builders/stages.jl b/src/convnets/builders/stages.jl index f97f92ff9..13706fd02 100644 --- a/src/convnets/builders/stages.jl +++ b/src/convnets/builders/stages.jl @@ -1,3 +1,23 @@ +""" + cnn_stages(get_layers, block_repeats::AbstractVector{<:Integer}, [connection = nothing]) + +Creates a convolutional neural network backbone by calling the function `get_layers` +repeatedly. + +# Arguments + + - `get_layers` is a function that takes in two inputs - the `stage_idx`, or the index of + the stage, and the `block_idx`, or the index of the block within the stage. It returns a + tuple of layers. If the tuple returned by `get_layers` has more than one element, then + `connection` is used - if not, then the only element of the tuple is directly inserted + into the network. + - `block_repeats` is a `Vector` of integers, where each element specifies the number of + times the `get_layers` function should be called for that stage. + - `connection` defaults to `nothing` and is an optional argument that specifies the + connection type between the layers. It is passed to `Parallel` and is useful for residual + network structures. For example, for ResNet, the connection is simply `+`. If `connection` + is `nothing`, then every call to `get_layers` _must_ return a tuple of length 1. +""" function cnn_stages(get_layers, block_repeats::AbstractVector{<:Integer}, connection = nothing) # Construct each stage @@ -7,7 +27,7 @@ function cnn_stages(get_layers, block_repeats::AbstractVector{<:Integer}, blocks = map(1:nblocks) do block_idx branches = get_layers(stage_idx, block_idx) if isnothing(connection) - @assert length(branches)==1 "get_layers should return a single branch for + @assert length(branches)==1 "get_layers should return a single branch for each block if no connection is specified" end return length(branches) == 1 ? only(branches) : diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index 4cd22cb6b..83d8d4638 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -45,7 +45,8 @@ const CONVMIXER_CONFIGS = Dict(:base => ((1536, 20), patch_size = (7, 7)))) """ - ConvMixer(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) + ConvMixer(config::Symbol; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) Creates a ConvMixer model. ([reference](https://arxiv.org/abs/2201.09792)) @@ -53,18 +54,25 @@ Creates a ConvMixer model. # Arguments - `config`: the size of the model, either `:base`, `:small` or `:large` + - `pretrain`: whether to load the pre-trained weights for ImageNet - `inchannels`: number of input channels - `nclasses`: number of classes in the output + +See also [`Metalhead.convmixer`](@ref). 
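For illustration, a minimal construction call, assuming the keyword defaults documented above:

    using Metalhead

    # Randomly initialised ConvMixer; `pretrain = true` would attempt to fetch
    # ImageNet weights, which are not yet published for this model.
    model = ConvMixer(:small; pretrain = false, inchannels = 3, nclasses = 1000)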
""" struct ConvMixer layers::Any end @functor ConvMixer -function ConvMixer(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) +function ConvMixer(config::Symbol; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) _checkconfig(config, keys(CONVMIXER_CONFIGS)) layers = convmixer(CONVMIXER_CONFIGS[config][1]...; CONVMIXER_CONFIGS[config][2]..., inchannels, nclasses) + if pretrain + loadpretrain!(layers, "convmixer$config") + end return ConvMixer(layers) end diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index 283c8662e..02f9bbc7f 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -8,7 +8,7 @@ Creates a single block of ConvNeXt. - `planes`: number of input channels. - `stochastic_depth_prob`: Stochastic depth probability. - - `layerscale_init`: Initial value for [`LayerScale`](#) + - `layerscale_init`: Initial value for [`LayerScale`](@ref) """ function convnextblock(planes::Integer, stochastic_depth_prob = 0.0, layerscale_init = 1.0f-6) @@ -35,7 +35,7 @@ Creates the layers for a ConvNeXt model. - `depths`: list with configuration for depth of each block - `planes`: list with configuration for number of output channels in each block - `stochastic_depth_prob`: Stochastic depth probability. - - `layerscale_init`: Initial value for [`LayerScale`](#) + - `layerscale_init`: Initial value for [`LayerScale`](@ref) ([reference](https://arxiv.org/abs/2103.17239)) - `inchannels`: number of input channels. - `nclasses`: number of output classes @@ -101,7 +101,7 @@ Creates a ConvNeXt model. `ConvNeXt` does not currently support pretrained weights. -See also [`Metalhead.convnext`](#). +See also [`Metalhead.convnext`](@ref). """ struct ConvNeXt layers::Any diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index 0f5be78c8..4799e22a7 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -65,7 +65,7 @@ Create a DenseNet model - `inplanes`: the number of input feature maps to the first dense block - `growth_rates`: the growth rates of output feature maps within each - [`dense_block`](#) (a vector of vectors) + [`dense_block`](@ref) (a vector of vectors) - `reduction`: the factor by which the number of feature maps is scaled across each transition - `dropout_prob`: the dropout probability for the classifier head. Set to `nothing` to disable dropout. - `nclasses`: the number of output classes @@ -125,7 +125,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. `DenseNet` does not currently support pretrained weights. -See also [`Metalhead.densenet`](#). +See also [`Metalhead.densenet`](@ref). """ struct DenseNet layers::Any diff --git a/src/convnets/efficientnets/efficientnet.jl b/src/convnets/efficientnets/efficientnet.jl index bbd864a1a..3554eddba 100644 --- a/src/convnets/efficientnets/efficientnet.jl +++ b/src/convnets/efficientnets/efficientnet.jl @@ -31,6 +31,29 @@ const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)), :b7 => (600, (2.0, 3.1)), :b8 => (672, (2.2, 3.6))) +""" + efficientnet(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2, + dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) + +Create an EfficientNet model. ([reference](https://arxiv.org/abs/1905.11946v5)). + +# Arguments + + - `config`: size of the model. Can be one of `[:b0, :b1, :b2, :b3, :b4, :b5, :b6, :b7, :b8]`. + - `norm_layer`: normalization layer to use. + - `stochastic_depth_prob`: probability of stochastic depth. 
Set to `nothing` to disable + stochastic depth. + - `dropout_prob`: probability of dropout in the classifier head. Set to `nothing` to disable + dropout. + - `inchannels`: number of input channels. + - `nclasses`: number of output classes. + +!!! warning + + EfficientNet does not currently support pretrained weights. + +See also [`Metalhead.efficientnet`](@ref). +""" function efficientnet(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2, dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) @@ -43,16 +66,22 @@ function efficientnet(config::Symbol; norm_layer = BatchNorm, stochastic_depth_p end """ - EfficientNet(config::Symbol; pretrain::Bool = false) + EfficientNet(config::Symbol; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). -See also [`efficientnet`](#). +See also [`efficientnet`](@ref). # Arguments - - `config`: name of default configuration - (can be `:b0`, `:b1`, `:b2`, `:b3`, `:b4`, `:b5`, `:b6`, `:b7`, `:b8`) + - `config`: size of the model. Can be one of `[:b0, :b1, :b2, :b3, :b4, :b5, :b6, :b7, :b8]`. - `pretrain`: set to `true` to load the pre-trained weights for ImageNet + +!!! warning + + `EfficientNet` does not currently support pretrained weights. + +See also [`efficientnet`](#). """ struct EfficientNet layers::Any diff --git a/src/convnets/efficientnets/efficientnetv2.jl b/src/convnets/efficientnets/efficientnetv2.jl index 5ea35740c..e820f9989 100644 --- a/src/convnets/efficientnets/efficientnetv2.jl +++ b/src/convnets/efficientnets/efficientnetv2.jl @@ -35,6 +35,23 @@ const EFFNETV2_CONFIGS = Dict(:small => [(fused_mbconv, 3, 24, 1, 1, 2, swish), (mbconv, 3, 512, 6, 2, 32, 4, swish), (mbconv, 3, 768, 6, 1, 8, 4, swish)]) +""" + efficientnet(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2, + dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) + +Create an EfficientNetv2 model. ([reference](https://arxiv.org/abs/2104.00298)). + +# Arguments + + - `config`: size of the model. Can be one of `[:b0, :b1, :b2, :b3, :b4, :b5, :b6, :b7, :b8]`. + - `norm_layer`: normalization layer to use. + - `stochastic_depth_prob`: probability of stochastic depth. Set to `nothing` to disable + stochastic depth. + - `dropout_prob`: probability of dropout in the classifier head. Set to `nothing` to disable + dropout. + - `inchannels`: number of input channels. + - `nclasses`: number of output classes. +""" function efficientnetv2(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2, dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) @@ -46,8 +63,8 @@ function efficientnetv2(config::Symbol; norm_layer = BatchNorm, stochastic_depth end """ - EfficientNetv2(config::Symbol; pretrain::Bool = false, width_mult::Real = 1, - inchannels::Integer = 3, nclasses::Integer = 1000) + EfficientNetv2(config::Symbol; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) Create an EfficientNetv2 model ([reference](https://arxiv.org/abs/2104.00298)). @@ -55,10 +72,14 @@ Create an EfficientNetv2 model ([reference](https://arxiv.org/abs/2104.00298)). 
  - `config`: size of the network (one of `[:small, :medium, :large, :xlarge]`)
  - `pretrain`: whether to load the pre-trained weights for ImageNet
-  - `width_mult`: Controls the number of output feature maps in each block (with 1
-    being the default in the paper)
  - `inchannels`: number of input channels
  - `nclasses`: number of output classes
+
+!!! warning
+
+    `EfficientNetv2` does not currently support pretrained weights.
+
+See also [`Metalhead.efficientnetv2`](@ref).
 """
 struct EfficientNetv2
     layers::Any
diff --git a/src/convnets/inceptions/googlenet.jl b/src/convnets/inceptions/googlenet.jl
index 1f786bbb5..fefbbca9a 100644
--- a/src/convnets/inceptions/googlenet.jl
+++ b/src/convnets/inceptions/googlenet.jl
@@ -71,7 +71,7 @@ Create an Inception-v1 model (commonly referred to as `GoogLeNet`)
 
 `GoogLeNet` does not currently support pretrained weights.
 
-See also [`googlenet`](#).
+See also [`googlenet`](@ref).
 """
 struct GoogLeNet
     layers::Any
diff --git a/src/convnets/inceptions/inceptionv3.jl b/src/convnets/inceptions/inceptionv3.jl
index c36273e63..038d54818 100644
--- a/src/convnets/inceptions/inceptionv3.jl
+++ b/src/convnets/inceptions/inceptionv3.jl
@@ -160,7 +160,7 @@ end
     Inceptionv3(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000)
 
 Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)).
-See also [`inceptionv3`](#).
+See also [`inceptionv3`](@ref).
 
 # Arguments
diff --git a/src/convnets/mobilenets/mnasnet.jl b/src/convnets/mobilenets/mnasnet.jl
index cf1f597fb..c08595715 100644
--- a/src/convnets/mobilenets/mnasnet.jl
+++ b/src/convnets/mobilenets/mnasnet.jl
@@ -39,6 +39,24 @@ const MNASNET_CONFIGS = Dict(:B1 => (32,
                                  (mbconv, 3, 144, 6, 1, 1, nothing, relu),
                              ]))
 
+"""
+    mnasnet(config::Symbol; width_mult::Real = 1, max_width::Integer = 1280,
+            dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000)
+
+Create an MNASNet model. ([reference](https://arxiv.org/abs/1807.11626))
+
+# Arguments
+
+  - `config`: configuration of the model. One of `:B1`, `:A1` or `:small`. `:B1` is without
+    squeeze-and-excite layers, `:A1` is with squeeze-and-excite layers, and `:small` is a smaller
+    version of `:A1`.
+  - `width_mult`: Controls the number of output feature maps in each block
+    (with 1 being the default in the paper; this is usually a value between 0.1 and 1.4)
+  - `max_width`: Controls the maximum number of output feature maps in each block
+  - `dropout_prob`: Dropout probability for the classifier head. Set to `nothing` to disable dropout.
+  - `inchannels`: Number of input channels.
+  - `nclasses`: Number of output classes.
+"""
 function mnasnet(config::Symbol; width_mult::Real = 1, max_width::Integer = 1280,
                  dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000)
     _checkconfig(config, keys(MNASNET_CONFIGS))
@@ -59,8 +77,7 @@ Creates a MNASNet model with the specified configuration.
 
 # Arguments
 
   - `width_mult`: Controls the number of output feature maps in each block
-    (with 1 being the default in the paper;
-    this is usually a value between 0.1 and 1.4)
+    (with 1 being the default in the paper; this is usually a value between 0.1 and 1.4)
   - `pretrain`: Whether to load the pre-trained weights for ImageNet
   - `inchannels`: The number of input channels.
   - `nclasses`: The number of output classes
@@ -69,7 +86,7 @@ Creates a MNASNet model with the specified configuration.
 
 `MNASNet` does not currently support pretrained weights.
 
-See also [`mnasnet`](#).
+See also [`Metalhead.mnasnet`](@ref).
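A minimal usage sketch, assuming the constructor keywords documented above (`:A1` being the squeeze-and-excite variant):

    using Metalhead

    # MNASNet A1 at full width, randomly initialised.
    model = MNASNet(:A1; width_mult = 1.0, pretrain = false)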
""" struct MNASNet layers::Any diff --git a/src/convnets/mobilenets/mobilenetv1.jl b/src/convnets/mobilenets/mobilenetv1.jl index 8d534d111..7de7ac384 100644 --- a/src/convnets/mobilenets/mobilenetv1.jl +++ b/src/convnets/mobilenets/mobilenetv1.jl @@ -14,6 +14,21 @@ const MOBILENETV1_CONFIGS = [ (dwsep_conv_norm, 3, 1024, 2, 2, relu6), ] +""" + mobilenetv1(width_mult::Real = 1; inplanes::Integer = 32, dropout_prob = nothing, + inchannels::Integer = 3, nclasses::Integer = 1000) + +Create a MobileNetv1 model. ([reference](https://arxiv.org/abs/1704.04861v1)). + +# Arguments + + - `width_mult`: Controls the number of output feature maps in each block + (with 1 being the default in the paper; this is usually a value between 0.1 and 1.4) + - `inplanes`: Number of input channels to the first convolution layer + - `dropout_prob`: Dropout probability for the classifier head. Set to `nothing` to disable dropout. + - `inchannels`: Number of input channels. + - `nclasses`: Number of output classes. +""" function mobilenetv1(width_mult::Real = 1; inplanes::Integer = 32, dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) return build_irmodel(width_mult, MOBILENETV1_CONFIGS; inplanes, inchannels, @@ -31,8 +46,7 @@ Create a MobileNetv1 model with the baseline configuration # Arguments - `width_mult`: Controls the number of output feature maps in each block - (with 1 being the default in the paper; - this is usually a value between 0.1 and 1.4) + (with 1 being the default in the paper; this is usually a value between 0.1 and 1.4) - `pretrain`: Whether to load the pre-trained weights for ImageNet - `inchannels`: The number of input channels. - `nclasses`: The number of output classes @@ -41,7 +55,7 @@ Create a MobileNetv1 model with the baseline configuration `MobileNetv1` does not currently support pretrained weights. -See also [`mobilenetv1`](#). +See also [`Metalhead.mobilenetv1`](@ref). """ struct MobileNetv1 layers::Any diff --git a/src/convnets/mobilenets/mobilenetv2.jl b/src/convnets/mobilenets/mobilenetv2.jl index 271cb1e37..64e680bd8 100644 --- a/src/convnets/mobilenets/mobilenetv2.jl +++ b/src/convnets/mobilenets/mobilenetv2.jl @@ -18,6 +18,23 @@ const MOBILENETV2_CONFIGS = [ (mbconv, 3, 320, 6, 1, 1, nothing, relu6), ] +""" + mobilenetv2(width_mult::Real = 1; max_width::Integer = 1280, + inplanes::Integer = 32, dropout_prob = 0.2, + inchannels::Integer = 3, nclasses::Integer = 1000) + +Create a MobileNetv2 model. ([reference](https://arxiv.org/abs/1801.04381v1)). + +# Arguments + + - `width_mult`: Controls the number of output feature maps in each block + (with 1 being the default in the paper; this is usually a value between 0.1 and 1.4) + - `max_width`: The maximum width of the network. + - `inplanes`: Number of input channels to the first convolution layer + - `dropout_prob`: Dropout probability for the classifier head. Set to `nothing` to disable dropout. + - `inchannels`: Number of input channels. + - `nclasses`: Number of output classes. +""" function mobilenetv2(width_mult::Real = 1; max_width::Integer = 1280, inplanes::Integer = 32, dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) @@ -35,8 +52,7 @@ Create a MobileNetv2 model with the specified configuration. 
# Arguments - `width_mult`: Controls the number of output feature maps in each block - (with 1 being the default in the paper; - this is usually a value between 0.1 and 1.4) + (with 1 being the default in the paper; this is usually a value between 0.1 and 1.4) - `pretrain`: Whether to load the pre-trained weights for ImageNet - `inchannels`: The number of input channels. - `nclasses`: The number of output classes @@ -45,7 +61,7 @@ Create a MobileNetv2 model with the specified configuration. `MobileNetv2` does not currently support pretrained weights. -See also [`mobilenetv2`](#). +See also [`Metalhead.mobilenetv2`](@ref). """ struct MobileNetv2 layers::Any diff --git a/src/convnets/mobilenets/mobilenetv3.jl b/src/convnets/mobilenets/mobilenetv3.jl index 1fa4cab26..4bf748e9d 100644 --- a/src/convnets/mobilenets/mobilenetv3.jl +++ b/src/convnets/mobilenets/mobilenetv3.jl @@ -34,6 +34,22 @@ const MOBILENETV3_CONFIGS = Dict(:small => (1024, (mbconv, 5, 160, 6, 1, 3, 4, hardswish), ])) +""" + mobilenetv3(config::Symbol; width_mult::Real = 1, dropout_prob = 0.2, + inchannels::Integer = 3, nclasses::Integer = 1000) + +Create a MobileNetv3 model with the specified configuration. +([reference](https://arxiv.org/abs/1905.02244)). + +# Arguments + + - `config`: The configuration of the model. Can be either `small` or `large`. + - `width_mult`: Controls the number of output feature maps in each block + (with 1 being the default in the paper; this is usually a value between 0.1 and 1.4) + - `dropout_prob`: Dropout probability for the classifier head. Set to `nothing` to disable dropout. + - `inchannels`: The number of input channels. + - `nclasses`: The number of output classes. +""" function mobilenetv3(config::Symbol; width_mult::Real = 1, dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(config, [:small, :large]) @@ -56,8 +72,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. - `config`: :small or :large for the size of the model (see paper). - `width_mult`: Controls the number of output feature maps in each block - (with 1 being the default in the paper; - this is usually a value between 0.1 and 1.4) + (with 1 being the default in the paper; this is usually a value between 0.1 and 1.4) - `pretrain`: whether to load the pre-trained weights for ImageNet - `inchannels`: number of input channels - `nclasses`: the number of output classes @@ -66,7 +81,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. `MobileNetv3` does not currently support pretrained weights. -See also [`mobilenetv3`](#). +See also [`Metalhead.mobilenetv3`](@ref). """ struct MobileNetv3 layers::Any diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl index 763369388..923e2fa7b 100644 --- a/src/convnets/resnets/core.jl +++ b/src/convnets/resnets/core.jl @@ -1,3 +1,5 @@ +## ResNet blocks + """ basicblock(inplanes::Integer, planes::Integer; stride::Integer = 1, reduction_factor::Integer = 1, activation = relu, @@ -19,7 +21,7 @@ Creates a basic residual block (see [reference](https://arxiv.org/abs/1512.03385 - `revnorm`: set to `true` to place the normalisation layer before the convolution - `drop_block`: the drop block layer - `drop_path`: the drop path layer - - `attn_fn`: the attention function to use. See [`squeeze_excite`](#) for an example. + - `attn_fn`: the attention function to use. See [`squeeze_excite`](@ref) for an example. 
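As a quick sketch of the block in isolation, assuming the default keyword values above (note that the skip connection itself is added by the surrounding builder, not by `basicblock`):

    using Metalhead, Flux

    # A 64 => 64 basic block with stride 1.
    block = Metalhead.basicblock(64, 64)
    x = rand(Float32, 32, 32, 64, 1)
    size(block(x))  # (32, 32, 64, 1): channels and spatial size are preserved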
""" function basicblock(inplanes::Integer, planes::Integer; stride::Integer = 1, reduction_factor::Integer = 1, activation = relu, @@ -60,7 +62,7 @@ Creates a bottleneck residual block (see [reference](https://arxiv.org/abs/1512. - `revnorm`: set to `true` to place the normalisation layer before the convolution - `drop_block`: the drop block layer - `drop_path`: the drop path layer - - `attn_fn`: the attention function to use. See [`squeeze_excite`](#) for an example. + - `attn_fn`: the attention function to use. See [`squeeze_excite`](@ref) for an example. """ function bottleneck(inplanes::Integer, planes::Integer; stride::Integer, cardinality::Integer = 1, base_width::Integer = 64, @@ -81,14 +83,92 @@ function bottleneck(inplanes::Integer, planes::Integer; stride::Integer, return Chain(filter!(!=(identity), layers)...) end -# Downsample layer using convolutions. +""" + bottle2neck(inplanes::Integer, planes::Integer; stride::Integer = 1, + cardinality::Integer = 1, base_width::Integer = 26, + scale::Integer = 4, activation = relu, norm_layer = BatchNorm, + revnorm::Bool = false, attn_fn = planes -> identity) + +Creates a bottleneck block as described in the Res2Net paper. +([reference](https://arxiv.org/abs/1904.01169)) + +# Arguments + + - `inplanes`: number of input feature maps + - `planes`: number of feature maps for the block + - `stride`: the stride of the block + - `cardinality`: the number of groups in the 3x3 convolutions. + - `base_width`: the number of output feature maps for each convolutional group. + - `scale`: the number of feature groups in the block. See the [paper](https://arxiv.org/abs/1904.01169) + for more details. + - `activation`: the activation function to use. + - `norm_layer`: the normalization layer to use. + - `revnorm`: set to `true` to place the batch norm before the convolution + - `attn_fn`: the attention function to use. See [`squeeze_excite`](@ref) for an example. +""" +function bottle2neck(inplanes::Integer, planes::Integer; stride::Integer = 1, + cardinality::Integer = 1, base_width::Integer = 26, + scale::Integer = 4, activation = relu, is_first::Bool = false, + norm_layer = BatchNorm, revnorm::Bool = false, + attn_fn = planes -> identity) + width = fld(planes * base_width, 64) * cardinality + outplanes = planes * 4 + pool = is_first && scale > 1 ? MeanPool((3, 3); stride, pad = 1) : identity + conv_bns = [Chain(conv_norm((3, 3), width, width, activation; norm_layer, stride, + pad = 1, groups = cardinality)...) + for _ in 1:max(1, scale - 1)] + reslayer = is_first ? Parallel(cat_channels, pool, conv_bns...) : + Parallel(cat_channels, identity, Chain(PairwiseFusion(+, conv_bns...))) + tuplify = is_first ? x -> tuple(x...) : x -> tuple(x[1], tuple(x[2:end]...)) + layers = [ + conv_norm((1, 1), inplanes, width * scale, activation; + norm_layer, revnorm)..., + chunk$(; size = width, dims = 3), tuplify, reslayer, + conv_norm((1, 1), width * scale, outplanes, activation; + norm_layer, revnorm)..., + attn_fn(outplanes), + ] + return Chain(filter(!=(identity), layers)...) +end + +## Downsample layers + +""" + downsample_conv(inplanes::Integer, outplanes::Integer; stride::Integer = 1, + norm_layer = BatchNorm, revnorm::Bool = false) + +Creates a 1x1 convolutional downsample layer as used in ResNet. + +# Arguments + + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps + - `stride`: the stride of the convolution + - `norm_layer`: the normalization layer to use. 
+ - `revnorm`: set to `true` to place the normalisation layer before the convolution +""" function downsample_conv(inplanes::Integer, outplanes::Integer; stride::Integer = 1, norm_layer = BatchNorm, revnorm::Bool = false) return Chain(conv_norm((1, 1), inplanes, outplanes, identity; norm_layer, revnorm, pad = SamePad(), stride)...) end -# Downsample layer using max pooling +""" + downsample_pool(inplanes::Integer, outplanes::Integer; stride::Integer = 1, + norm_layer = BatchNorm, revnorm::Bool = false) + +Creates a pooling-based downsample layer as described in the +[Bag of Tricks](https://arxiv.org/abs/1812.01187v1) paper. This adds an average pooling layer +of size `(2, 2)` with `stride` followed by a 1x1 convolution. + +# Arguments + + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps + - `stride`: the stride of the convolution + - `norm_layer`: the normalization layer to use. + - `revnorm`: set to `true` to place the normalisation layer before the convolution +""" function downsample_pool(inplanes::Integer, outplanes::Integer; stride::Integer = 1, norm_layer = BatchNorm, revnorm::Bool = false) pool = stride == 1 ? identity : MeanPool((2, 2); stride, pad = SamePad()) @@ -97,33 +177,42 @@ function downsample_pool(inplanes::Integer, outplanes::Integer; stride::Integer revnorm)...) end -# Downsample layer which is an identity projection. Uses max pooling -# when the output size is more than the input size. # TODO - figure out how to make this work when outplanes < inplanes +""" + downsample_identity(inplanes::Integer, outplanes::Integer; kwargs...) + +Creates an identity downsample layer. This returns `identity` if `inplanes == outplanes`. +If `outplanes > inplanes`, it maps the input to `outplanes` channels using a 1x1 max pooling +layer and zero padding. + +!!! warning + + This does not currently support the scenario where `inplanes > outplanes`. + +# Arguments + + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps + +Note that kwargs are ignored and only included for compatibility with other downsample layers. +""" function downsample_identity(inplanes::Integer, outplanes::Integer; kwargs...) if outplanes > inplanes return Chain(MaxPool((1, 1); stride = 2), y -> cat_channels(y, - zeros(eltype(y), - size(y, 1), - size(y, 2), + zeros(eltype(y), size(y, 1), size(y, 2), outplanes - inplanes, size(y, 4)))) else return identity end end -# Shortcut configurations for the ResNet models +# Shortcut configurations for the ResNet variants const RESNET_SHORTCUTS = Dict(:A => (downsample_identity, downsample_identity), :B => (downsample_conv, downsample_identity), :C => (downsample_conv, downsample_conv), :D => (downsample_pool, downsample_identity)) -# Stride for each block in the ResNet model -function resnet_stride(stage_idx::Integer, block_idx::Integer) - return stage_idx == 1 || block_idx != 1 ? 1 : 2 -end - # returns `DropBlock`s for each stage of the ResNet as in timm. # TODO - add experimental options for DropBlock as part of the API (#188) # function _drop_blocks(dropblock_prob::AbstractFloat) @@ -137,7 +226,7 @@ end resnet_stem(; stem_type = :default, inchannels::Integer = 3, replace_stem_pool = false, norm_layer = BatchNorm, activation = relu) -Builds a stem to be used in a ResNet model. See the `stem` argument of [`resnet`](#) for details +Builds a stem to be used in a ResNet model. See the `stem` argument of [`resnet`](@ref) for details on how to use this function. 
# Arguments
@@ -147,9 +236,9 @@ on how to use this function.
       + `:default`: Builds a stem based on the default ResNet stem, which consists of a single
         7x7 convolution with stride 2 and a normalisation layer followed by a 3x3 max pooling
         layer with stride 2.
-      + `:deep`: This borrows ideas from other papers (InceptionResNet-v2, for example) in using
-        a deeper stem with 3 successive 3x3 convolutions having normalisation layers after each
-        one. This is followed by a 3x3 max pooling layer with stride 2.
+      + `:deep`: This borrows ideas from other papers ([InceptionResNetv2](https://arxiv.org/abs/1602.07261),
+        for example) in using a deeper stem with 3 successive 3x3 convolutions having normalisation
+        layers after each one. This is followed by a 3x3 max pooling layer with stride 2.
       + `:deep_tiered`: A variant of the `:deep` stem that has a larger width in the second
         convolution. This is an experimental variant from the `timm` library in Python that shows
         performance improvements over the `:deep` stem in some cases.
@@ -191,16 +280,46 @@ function resnet_stem(stem_type::Symbol = :default; inchannels::Integer = 3,
     return Chain(conv1, bn1, stempool)
 end
 
+# Callbacks for channel and stride calculations for each block in a ResNet
+
+"""
+    resnet_planes(block_repeats::AbstractVector{<:Integer})
+
+Default callback for determining the number of channels in each block in a ResNet model.
+
+# Arguments
+
+`block_repeats`: A `Vector` of integers specifying the number of times each block is repeated
+in each stage of the ResNet model. For example, `[3, 4, 6, 3]` is the configuration used in
+ResNet-50, which has 3 blocks in the first stage, 4 blocks in the second stage, 6 blocks in the
+third stage and 3 blocks in the fourth stage.
+"""
 function resnet_planes(block_repeats::AbstractVector{<:Integer})
     return Iterators.flatten((64 * 2^(stage_idx - 1) for _ in 1:stages)
                              for (stage_idx, stages) in enumerate(block_repeats))
 end
 
+"""
+    resnet_stride(stage_idx::Integer, block_idx::Integer)
+
+Default callback for determining the stride of a block in a ResNet model.
+Returns `2` for the first block in every stage except the first stage and `1` for all other
+blocks.
+
+# Arguments
+
+  - `stage_idx`: The index of the stage in the ResNet model.
+  - `block_idx`: The index of the block in the stage.
+"""
+function resnet_stride(stage_idx::Integer, block_idx::Integer)
+    return stage_idx == 1 || block_idx != 1 ? 1 : 2
+end
+
 function resnet(block_type, block_repeats::AbstractVector{<:Integer},
                 downsample_opt::NTuple{2, Any} = (downsample_conv, downsample_identity);
                 cardinality::Integer = 1, base_width::Integer = 64, inplanes::Integer = 64,
                 reduction_factor::Integer = 1,
-                stem_fn = resnet_stem, connection = addact, activation = relu,
+                connection = addact, activation = relu,
                 norm_layer = BatchNorm, revnorm::Bool = false,
                 attn_fn = planes -> identity,
                 pool_layer = AdaptiveMeanPool((1, 1)), use_conv::Bool = false,
                 dropblock_prob = nothing,
@@ -208,7 +327,7 @@ function resnet(block_type, block_repeats::AbstractVector{<:Integer},
                 imsize::Dims{2} = (256, 256), inchannels::Integer = 3,
                 nclasses::Integer = 1000, kwargs...)
# Build stem - stem = stem_fn(; inchannels) + stem = resnet_stem(; inchannels) # Block builder if block_type == basicblock @assert cardinality==1 "Cardinality must be 1 for `basicblock`" diff --git a/src/convnets/resnets/res2net.jl b/src/convnets/resnets/res2net.jl index 5ef4987ad..33b9fb961 100644 --- a/src/convnets/resnets/res2net.jl +++ b/src/convnets/resnets/res2net.jl @@ -1,81 +1,3 @@ -""" - bottle2neck(inplanes::Integer, planes::Integer; stride::Integer = 1, - cardinality::Integer = 1, base_width::Integer = 26, - scale::Integer = 4, activation = relu, norm_layer = BatchNorm, - revnorm::Bool = false, attn_fn = planes -> identity) - -Creates a bottleneck block as described in the Res2Net paper. -([reference](https://arxiv.org/abs/1904.01169)) - -# Arguments - - - `inplanes`: number of input feature maps - - `planes`: number of feature maps for the block - - `stride`: the stride of the block - - `cardinality`: the number of groups in the 3x3 convolutions. - - `base_width`: the number of output feature maps for each convolutional group. - - `scale`: the number of feature groups in the block. See the [paper](https://arxiv.org/abs/1904.01169) - for more details. - - `activation`: the activation function to use. - - `norm_layer`: the normalization layer to use. - - `revnorm`: set to `true` to place the batch norm before the convolution - - `attn_fn`: the attention function to use. See [`squeeze_excite`](#) for an example. -""" -function bottle2neck(inplanes::Integer, planes::Integer; stride::Integer = 1, - cardinality::Integer = 1, base_width::Integer = 26, - scale::Integer = 4, activation = relu, is_first::Bool = false, - norm_layer = BatchNorm, revnorm::Bool = false, - attn_fn = planes -> identity) - width = fld(planes * base_width, 64) * cardinality - outplanes = planes * 4 - pool = is_first && scale > 1 ? MeanPool((3, 3); stride, pad = 1) : identity - conv_bns = [Chain(conv_norm((3, 3), width, width, activation; norm_layer, stride, - pad = 1, groups = cardinality)...) - for _ in 1:max(1, scale - 1)] - reslayer = is_first ? Parallel(cat_channels, pool, conv_bns...) : - Parallel(cat_channels, identity, Chain(PairwiseFusion(+, conv_bns...))) - tuplify = is_first ? x -> tuple(x...) : x -> tuple(x[1], tuple(x[2:end]...)) - layers = [ - conv_norm((1, 1), inplanes, width * scale, activation; - norm_layer, revnorm)..., - chunk$(; size = width, dims = 3), tuplify, reslayer, - conv_norm((1, 1), width * scale, outplanes, activation; - norm_layer, revnorm)..., - attn_fn(outplanes), - ] - return Chain(filter(!=(identity), layers)...) -end - -function bottle2neck_builder(block_repeats::AbstractVector{<:Integer}; - inplanes::Integer = 64, cardinality::Integer = 1, - base_width::Integer = 26, scale::Integer = 4, - expansion::Integer = 4, norm_layer = BatchNorm, - revnorm::Bool = false, activation = relu, - attn_fn = planes -> identity, - stride_fn = resnet_stride, planes_fn = resnet_planes, - downsample_tuple = (downsample_conv, downsample_identity)) - planes_vec = collect(planes_fn(block_repeats)) - # closure over `idxs` - function get_layers(stage_idx::Integer, block_idx::Integer) - # This is needed for block `inplanes` and `planes` calculations - schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx - planes = planes_vec[schedule_idx] - inplanes = schedule_idx == 1 ? inplanes : planes_vec[schedule_idx - 1] * expansion - # `resnet_stride` is a callback that the user can tweak to change the stride of the - # blocks. 
It defaults to the standard behaviour as in the paper - stride = stride_fn(stage_idx, block_idx) - downsample_fn = (stride != 1 || inplanes != planes * expansion) ? - downsample_tuple[1] : downsample_tuple[2] - is_first = (stride > 1 || downsample_fn != downsample_tuple[2]) ? true : false - block = bottle2neck(inplanes, planes; stride, cardinality, base_width, scale, - activation, is_first, norm_layer, revnorm, attn_fn) - downsample = downsample_fn(inplanes, planes * expansion; stride, norm_layer, - revnorm) - return block, downsample - end - return get_layers -end - """ Res2Net(depth::Integer; pretrain::Bool = false, scale::Integer = 4, base_width::Integer = 26, inchannels::Integer = 3, @@ -93,6 +15,12 @@ Creates a Res2Net model with the specified depth, scale, and base width. - `base_width`: the number of feature maps in each group. - `inchannels`: the number of input channels. - `nclasses`: the number of output classes + +!!! warning + + `Res2Net` does not currently support pretrained weights. + +Advanced users who want more configuration options will be better served by using [`resnet`](@ref). """ struct Res2Net layers::Any @@ -134,6 +62,12 @@ Creates a Res2NeXt model with the specified depth, scale, base width and cardina - `cardinality`: the number of groups in the 3x3 convolutions. - `inchannels`: the number of input channels. - `nclasses`: the number of output classes + +!!! warning + + `Res2NeXt` does not currently support pretrained weights. + +Advanced users who want more configuration options will be better served by using [`resnet`](@ref). """ struct Res2NeXt layers::Any diff --git a/src/convnets/resnets/resnet.jl b/src/convnets/resnets/resnet.jl index b65b71072..a8c12eda8 100644 --- a/src/convnets/resnets/resnet.jl +++ b/src/convnets/resnets/resnet.jl @@ -11,7 +11,7 @@ Creates a ResNet model with the specified depth. - `inchannels`: The number of input channels. - `nclasses`: the number of output classes -Advanced users who want more configuration options will be better served by using [`resnet`](#). +Advanced users who want more configuration options will be better served by using [`resnet`](@ref). """ struct ResNet layers::Any @@ -48,7 +48,7 @@ The number of channels in outer 1x1 convolutions is the same. - `inchannels`: The number of input channels. - `nclasses`: The number of output classes -Advanced users who want more configuration options will be better served by using [`resnet`](#). +Advanced users who want more configuration options will be better served by using [`resnet`](@ref). """ struct WideResNet layers::Any diff --git a/src/convnets/resnets/resnext.jl b/src/convnets/resnets/resnext.jl index 2a8fbd561..b44989559 100644 --- a/src/convnets/resnets/resnext.jl +++ b/src/convnets/resnets/resnext.jl @@ -20,7 +20,7 @@ Creates a ResNeXt model with the specified depth, cardinality, and base width. - `inchannels`: the number of input channels. - `nclasses`: the number of output classes -Advanced users who want more configuration options will be better served by using [`resnet`](#). +Advanced users who want more configuration options will be better served by using [`resnet`](@ref). """ struct ResNeXt layers::Any diff --git a/src/convnets/resnets/seresnet.jl b/src/convnets/resnets/seresnet.jl index 487665518..44e32083d 100644 --- a/src/convnets/resnets/seresnet.jl +++ b/src/convnets/resnets/seresnet.jl @@ -15,7 +15,7 @@ Creates a SEResNet model with the specified depth. `SEResNet` does not currently support pretrained weights. 
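For example (randomly initialised, since pretrained weights are unavailable):

    using Metalhead

    # SEResNet-50 with random weights; `pretrain = true` would error until
    # pretrained weights are published.
    model = SEResNet(50)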
-Advanced users who want more configuration options will be better served by using [`resnet`](#). +Advanced users who want more configuration options will be better served by using [`resnet`](@ref). """ struct SEResNet layers::Any @@ -58,7 +58,7 @@ Creates a SEResNeXt model with the specified depth, cardinality, and base width. `SEResNeXt` does not currently support pretrained weights. -Advanced users who want more configuration options will be better served by using [`resnet`](#). +Advanced users who want more configuration options will be better served by using [`resnet`](@ref). """ struct SEResNeXt layers::Any diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl index 5c688a645..5d63a565d 100644 --- a/src/convnets/squeezenet.jl +++ b/src/convnets/squeezenet.jl @@ -21,17 +21,18 @@ function fire(inplanes::Integer, squeeze_planes::Integer, expand1x1_planes::Inte end """ - squeezenet(; inchannels::Integer = 3, nclasses::Integer = 1000) + squeezenet(; dropout_prob = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000) -Create a SqueezeNet +Create a SqueezeNet model. ([reference](https://arxiv.org/abs/1602.07360v4)). # Arguments + - `dropout_prob`: dropout probability for the classifier head. Set to `nothing` to disable dropout. - `inchannels`: number of input channels. - `nclasses`: the number of output classes. """ -function squeezenet(; inchannels::Integer = 3, nclasses::Integer = 1000) +function squeezenet(; dropout_prob = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000) backbone = Chain(Conv((3, 3), inchannels => 64, relu; stride = 2), MaxPool((3, 3); stride = 2), fire(64, 16, 64, 64), @@ -44,14 +45,14 @@ function squeezenet(; inchannels::Integer = 3, nclasses::Integer = 1000) fire(384, 48, 192, 192), fire(384, 64, 256, 256), fire(512, 64, 256, 256)) - classifier = Chain(Dropout(0.5), Conv((1, 1), 512 => nclasses, relu), + classifier = Chain(Dropout(dropout_prob), Conv((1, 1), 512 => nclasses, relu), AdaptiveMeanPool((1, 1)), MLUtils.flatten) return Chain(backbone, classifier) end """ SqueezeNet(; pretrain::Bool = false, inchannels::Integer = 3, - nclasses::Integer = 1000) + nclasses::Integer = 1000) Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). @@ -62,7 +63,7 @@ Create a SqueezeNet - `inchannels`: number of input channels. - `nclasses`: the number of output classes. -See also [`squeezenet`](#). +See also [`Metalhead.squeezenet`](@ref). 
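An illustrative call to the functional form, showing the newly exposed dropout knob (the values here are arbitrary):

    using Metalhead

    # Backbone + classifier with a 100-class head and the default dropout.
    m = Metalhead.squeezenet(; dropout_prob = 0.5, nclasses = 100)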
""" struct SqueezeNet layers::Any diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index 345f1e275..35d01e0b5 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -36,7 +36,7 @@ Create VGG convolution layers # Arguments - `config`: vector of tuples `(output_channels, num_convolutions)` - for each block (see [`Metalhead.vgg_block`](#)) + for each block (see [`Metalhead.vgg_block`](@ref)) - `batchnorm`: set to `true` to include batch normalization after each convolution - `inchannels`: number of input channels """ @@ -61,7 +61,7 @@ Create VGG classifier (fully connected) layers # Arguments - `imsize`: tuple `(width, height, channels)` indicating the size after - the convolution layers (see [`Metalhead.vgg_convolutional_layers`](#)) + the convolution layers (see [`Metalhead.vgg_convolutional_layers`](@ref)) - `nclasses`: number of output classes - `fcsize`: input and output size of the intermediate fully connected layer - `dropout_prob`: the dropout level between each fully connected layer @@ -86,12 +86,12 @@ Create a VGG model - `imsize`: input image width and height as a tuple - `config`: the configuration for the convolution layers - (see [`Metalhead.vgg_convolutional_layers`](#)) + (see [`Metalhead.vgg_convolutional_layers`](@ref)) - `inchannels`: number of input channels - `batchnorm`: set to `true` to use batch normalization after each convolution - `nclasses`: number of output classes - `fcsize`: intermediate fully connected layer size - (see [`Metalhead.vgg_classifier_layers`](#)) + (see [`Metalhead.vgg_classifier_layers`](@ref)) - `dropout_prob`: dropout level between fully connected layers """ function vgg(imsize::Dims{2}; config, batchnorm::Bool = false, fcsize::Integer = 4096, @@ -122,7 +122,7 @@ Construct a VGG model with the specified input image size. Typically, the image - `batchnorm`: set to `true` to use batch normalization after each convolution - `nclasses`: number of output classes - `fcsize`: intermediate fully connected layer size - (see [`Metalhead.vgg_classifier_layers`](#)) + (see [`Metalhead.vgg_classifier_layers`](@ref)) - `dropout_prob`: dropout level between fully connected layers """ struct VGG @@ -156,7 +156,7 @@ Create a VGG style model with specified `depth`. - `inchannels`: number of input channels - `nclasses`: number of output classes -See also [`vgg`](#). +See also [`vgg`](@ref). """ function VGG(depth::Integer; pretrain::Bool = false, batchnorm::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 068546f5e..a6ac6e073 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -27,7 +27,7 @@ Create a convolution + normalisation layer pair with activation. - `groups`: groups for the convolution kernel - `bias`: bias for the convolution kernel. This is set to `false` by default if `use_norm = true`. - - `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) + - `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](@ref)) """ function conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer, activation = relu; norm_layer = BatchNorm, revnorm::Bool = false, diff --git a/src/layers/drop.jl b/src/layers/drop.jl index f9c516c3c..f7f5d95bd 100644 --- a/src/layers/drop.jl +++ b/src/layers/drop.jl @@ -28,7 +28,8 @@ regions of size `block_size` in the input. Otherwise, it simply returns the inpu - `gamma_scale`: multiplicative factor for `gamma` used. 
For the calculations, refer to [the paper](https://arxiv.org/abs/1810.12890).
 
-If you are an end-user, you do not want this function. Use [`DropBlock`](#) instead.
+If you are not a package developer, you most likely do not want this function. Use [`DropBlock`](@ref)
+instead.
 """
 function dropblock(rng::AbstractRNG, x::AbstractArray{T, 4}, drop_block_prob,
                    block_size::Integer, gamma_scale) where {T}
@@ -67,7 +68,8 @@ At test time, the `DropBlock` layer is equivalent to `identity`.
 # Arguments
 
   - `drop_block_prob`: probability of dropping a block. If `nothing` is passed, it returns
-    `identity`.
+    `identity`. Note that some literature uses the term "survival probability" instead,
+    which is equivalent to `1 - drop_block_prob`.
   - `block_size`: size of the block to drop
   - `gamma_scale`: multiplicative factor for `gamma` used. For the calculation of gamma,
     refer to [the paper](https://arxiv.org/abs/1810.12890).
@@ -134,8 +136,9 @@ equivalent to `identity`.
 # Arguments
 
-  - `p`: probability of Stochastic Depth.
-  - `mode`: Either `:batch` or `:row`. :batch randomly zeroes the entire input, `row` zeroes
+  - `p`: probability of Stochastic Depth. Note that some literature uses the term "survival
+    probability" instead, which is equivalent to `1 - p`.
+  - `mode`: Either `:batch` or `:row`. `:batch` randomly zeroes the entire input, `:row` zeroes
     randomly selected rows from the batch. The default is `:row`.
   - `rng`: can be used to pass in a custom RNG instead of the default. See `Flux.Dropout` for
     more information on the behaviour of this argument. Custom RNGs are only supported
diff --git a/src/layers/mbconv.jl b/src/layers/mbconv.jl
index eb6801aaa..891c8ae0f 100644
--- a/src/layers/mbconv.jl
+++ b/src/layers/mbconv.jl
@@ -30,7 +30,7 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1).
     `use_norm[1] == true`.
   - `stride`: stride of the first convolution kernel
   - `pad`: padding of the first convolution kernel
-  - `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#))
+  - `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](@ref))
 """
 function dwsep_conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
                          activation = relu; norm_layer = BatchNorm, eps::Float32 = 1.0f-5,
@@ -75,7 +75,7 @@ First introduced in the MobileNetv2 paper.
   - `activation`: The activation function for the first two convolution layer
   - `stride`: The stride of the convolutional kernel, has to be either 1 or 2
   - `reduction`: The reduction factor for the number of hidden feature maps
-    in a squeeze and excite layer (see [`squeeze_excite`](#))
+    in a squeeze and excite layer (see [`squeeze_excite`](@ref))
   - `se_round_fn`: The function to round the number of reduced feature maps
     in the squeeze and excite layer
   - `norm_layer`: The normalization layer to use
diff --git a/src/layers/selayers.jl b/src/layers/selayers.jl
index 8ac02e889..4f99733bf 100644
--- a/src/layers/selayers.jl
+++ b/src/layers/selayers.jl
@@ -15,7 +15,7 @@ Creates a squeeze-and-excitation layer used in MobileNets, EfficientNets and SE-
   - `squeeze_planes`: The number of feature maps in the intermediate layers. Alternatively,
     specify the keyword arguments `reduction` and `rd_divisor`, which determine the number
     of feature maps in the intermediate layers from the number of input feature maps as:
-    `squeeze_planes = _round_channels(inplanes ÷ reduction)`. (See [`_round_channels`](#) for details.)
+    `squeeze_planes = _round_channels(inplanes ÷ reduction)`. 
(See [`_round_channels`](@ref) for details.) - `activation`: The activation function for the first convolution layer - `gate_activation`: The activation function for the gate layer - `norm_layer`: The normalization layer to be used after the convolution layers diff --git a/src/mixers/gmlp.jl b/src/mixers/gmlp.jl index 08f64d65a..845786686 100644 --- a/src/mixers/gmlp.jl +++ b/src/mixers/gmlp.jl @@ -87,7 +87,7 @@ Creates a model with the gMLP architecture. - `inchannels`: the number of input channels - `nclasses`: number of output classes -See also [`Metalhead.mlpmixer`](#). +See also [`Metalhead.mlpmixer`](@ref). """ struct gMLP layers::Any diff --git a/src/mixers/mlpmixer.jl b/src/mixers/mlpmixer.jl index e4861d726..24656bd63 100644 --- a/src/mixers/mlpmixer.jl +++ b/src/mixers/mlpmixer.jl @@ -49,7 +49,7 @@ Creates a model with the MLPMixer architecture. - `inchannels`: the number of input channels - `nclasses`: number of output classes -See also [`Metalhead.mlpmixer`](#). +See also [`Metalhead.mlpmixer`](@ref). """ struct MLPMixer layers::Any diff --git a/src/mixers/resmlp.jl b/src/mixers/resmlp.jl index b307a8e3e..b1ff44ea1 100644 --- a/src/mixers/resmlp.jl +++ b/src/mixers/resmlp.jl @@ -48,7 +48,7 @@ Creates a model with the ResMLP architecture. - `inchannels`: the number of input channels - `nclasses`: number of output classes -See also [`Metalhead.mlpmixer`](#). +See also [`Metalhead.mlpmixer`](@ref). """ struct ResMLP layers::Any diff --git a/src/utilities.jl b/src/utilities.jl index cdcd7ee6e..6f97e81d2 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -14,9 +14,9 @@ end Convenience function for applying an activation function to the output after summing up the input arrays. Useful as the `connection` argument for the block -function in [`resnet`](#). +function in [`resnet`](@ref). -See also [`reluadd`](#). +See also [`reluadd`](@ref). """ addact(activation = relu, xs...) = activation(sum(xs)) @@ -25,9 +25,9 @@ addact(activation = relu, xs...) = activation(sum(xs)) Convenience function for adding input arrays after applying an activation function to them. Useful as the `connection` argument for the block function in -[`resnet`](#). +[`resnet`](@ref). -See also [`addrelu`](#). +See also [`addrelu`](@ref). """ actadd(activation = relu, xs...) = sum(activation.(x) for x in xs) @@ -75,7 +75,7 @@ that this returns evenly spaced values between `start_value` and `drop_prob`, no values equal to `nothing`. """ function linear_scheduler(drop_prob = 0.0; depth::Integer, start_value = 0.0) - return LinRange(start_value, drop_prob, depth + 1)[1:(end - 1)] + return LinRange(start_value, drop_prob, depth + 1)[1:depth] end linear_scheduler(drop_prob::Nothing; depth::Integer) = fill(drop_prob, depth) diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index d9688d229..02c57941f 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -92,7 +92,7 @@ Creates a Vision Transformer (ViT) model. - `pool`: pooling type, either :class or :mean - `nclasses`: number of classes in the output -See also [`Metalhead.vit`](#). +See also [`Metalhead.vit`](@ref). 
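A minimal construction sketch, assuming the config-based constructor documented above:

    using Metalhead

    # Base-sized Vision Transformer with random weights.
    model = ViT(:base)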
""" struct ViT layers::Any From e048d78945e5a5d8d26bc7284163ce93781d1d48 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Thu, 1 Sep 2022 11:45:31 +0530 Subject: [PATCH 6/8] More docstrings --- src/Metalhead.jl | 8 +-- src/convnets/builders/mbconv.jl | 6 -- src/convnets/builders/resblocks.jl | 20 +++--- src/convnets/builders/resnet.jl | 6 +- src/convnets/efficientnets/efficientnet.jl | 13 ++-- src/convnets/efficientnets/efficientnetv2.jl | 6 +- src/convnets/inceptions/googlenet.jl | 6 +- src/convnets/inceptions/inceptionresnetv2.jl | 4 +- src/convnets/inceptions/inceptionv3.jl | 7 +- src/convnets/inceptions/inceptionv4.jl | 6 +- src/convnets/inceptions/xception.jl | 2 + src/convnets/mobilenets/mnasnet.jl | 7 +- src/convnets/resnets/core.jl | 67 ++++++++++++++++++-- src/convnets/resnets/resnext.jl | 4 +- 14 files changed, 112 insertions(+), 50 deletions(-) diff --git a/src/Metalhead.jl b/src/Metalhead.jl index 01fc149eb..feba14ca0 100644 --- a/src/Metalhead.jl +++ b/src/Metalhead.jl @@ -72,16 +72,16 @@ export AlexNet, VGG, VGG11, VGG13, VGG16, VGG19, DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, GoogLeNet, Inception3, Inceptionv3, Inceptionv4, InceptionResNetv2, Xception, SqueezeNet, MobileNetv1, MobileNetv2, MobileNetv3, MNASNet, - EfficientNet, EfficientNetv2, - MLPMixer, ResMLP, gMLP, ViT, ConvMixer, ConvNeXt + EfficientNet, EfficientNetv2, ConvMixer, ConvNeXt, + MLPMixer, ResMLP, gMLP, ViT # use Flux._big_show to pretty print large models for T in (:AlexNet, :VGG, :SqueezeNet, :ResNet, :WideResNet, :ResNeXt, :SEResNet, :SEResNeXt, :Res2Net, :Res2NeXt, :GoogLeNet, :DenseNet, :Inceptionv3, :Inceptionv4, :InceptionResNetv2, :Xception, :MobileNetv1, :MobileNetv2, :MobileNetv3, :MNASNet, - :EfficientNet, :EfficientNetv2, - :MLPMixer, :ResMLP, :gMLP, :ViT, :ConvMixer, :ConvNeXt) + :EfficientNet, :EfficientNetv2, :ConvMixer, :ConvNeXt, + :MLPMixer, :ResMLP, :gMLP, :ViT) @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) end diff --git a/src/convnets/builders/mbconv.jl b/src/convnets/builders/mbconv.jl index 73cd70c32..4c4fa9a10 100644 --- a/src/convnets/builders/mbconv.jl +++ b/src/convnets/builders/mbconv.jl @@ -8,12 +8,6 @@ Constructs a collection of inverted residual blocks for a given stage. Note that this function is not intended to be called directly, but rather by the [`mbconv_stage_builder`](@ref) function. This function must only be extended if the user wishes to extend a custom inverted residual block type. - -# Arguments - - - `irblockfn`: the inverted residual block function to use in the block builder. Metalhead - defines methods for [`dwsep_conv_norm`](@ref), [`mbconv`](@ref) and [`fused_mbconv`](@ref) - as inverted residual blocks. """ function irblockbuilder(::typeof(dwsep_conv_norm), block_configs::AbstractVector{<:Tuple}, inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; diff --git a/src/convnets/builders/resblocks.jl b/src/convnets/builders/resblocks.jl index b1435e4ff..bdf36c0f3 100644 --- a/src/convnets/builders/resblocks.jl +++ b/src/convnets/builders/resblocks.jl @@ -22,10 +22,10 @@ Builder for creating a basic block for a ResNet model. - `revnorm`: set to `true` to place normalization layer before the convolution - `activation`: activation function to use - `attn_fn`: attention function to use - - `dropblock_prob`: dropblock probability. Set to `nothing` to disable DropBlock - - `stochastic_depth_prob`: stochastic depth probability. 
Set to `nothing` to disable StochasticDepth - - `stride_fn`: function to use to compute the stride of the block - - `planes_fn`: function to use to compute the number of channels in each block + - `dropblock_prob`: dropblock probability. Set to `nothing` to disable `DropBlock` + - `stochastic_depth_prob`: stochastic depth probability. Set to `nothing` to disable `StochasticDepth` + - `stride_fn`: callback for computing the stride of the block + - `planes_fn`: callback for computing the number of channels in each block - `downsample_tuple`: two-element tuple of downsample functions to use. The first one is used when the number of channels changes in the block, the second one is used when the number of channels stays the same. @@ -90,10 +90,10 @@ Builder for creating a bottleneck block for a ResNet/ResNeXt model. - `revnorm`: set to `true` to place normalization layer before the convolution - `activation`: activation function to use - `attn_fn`: attention function to use - - `dropblock_prob`: dropblock probability. Set to `nothing` to disable DropBlock - - `stochastic_depth_prob`: stochastic depth probability. Set to `nothing` to disable StochasticDepth - - `stride_fn`: function to use to compute the stride of the block - - `planes_fn`: function to use to compute the number of channels in each block + - `dropblock_prob`: dropblock probability. Set to `nothing` to disable `DropBlock` + - `stochastic_depth_prob`: stochastic depth probability. Set to `nothing` to disable `StochasticDepth` + - `stride_fn`: callback for computing the stride of the block + - `planes_fn`: callback for computing the number of channels in each block - `downsample_tuple`: two-element tuple of downsample functions to use. The first one is used when the number of channels changes in the block, the second one is used when the number of channels stays the same. @@ -157,8 +157,8 @@ Builder for creating a bottle2neck block for a Res2Net model. - `revnorm`: set to `true` to place normalization layer before the convolution - `activation`: activation function to use - `attn_fn`: attention function to use - - `stride_fn`: function to use to compute the stride of the block - - `planes_fn`: function to use to compute the number of channels in each block + - `stride_fn`: callback for computing the stride of the block + - `planes_fn`: callback for computing the number of channels in each block - `downsample_tuple`: two-element tuple of downsample functions to use. The first one is used when the number of channels changes in the block, the second one is used when the number of channels stays the same. diff --git a/src/convnets/builders/resnet.jl b/src/convnets/builders/resnet.jl index aa038f728..580baaa34 100644 --- a/src/convnets/builders/resnet.jl +++ b/src/convnets/builders/resnet.jl @@ -2,12 +2,12 @@ build_resnet(img_dims, stem, get_layers, block_repeats::AbstractVector{<:Integer}, connection, classifier_fn) -Creates a ResNet model. +Creates a generic ResNet-like model. !!! info - This is a very generic, flexible but low level API that can be used to create any of the ResNet - variants. For a more user friendly API, see the other version of `resnet`. + This is a very generic, flexible but low level function that can be used to create any of the ResNet + variants. For a more user friendly function, see [`Metalhead.resnet`](@ref). 
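For orientation, here is a minimal sketch of the user-facing counterpart the admonition points to, using the mid-level `Metalhead.resnet` entry point documented later in this patch series (weights are randomly initialised):

```julia
using Metalhead

# A ResNet-18-style network: basic blocks repeated [2, 2, 2, 2] times
# across the four stages, with a 10-class head.
model = Metalhead.resnet(Metalhead.basicblock, [2, 2, 2, 2]; nclasses = 10)
```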
# Arguments diff --git a/src/convnets/efficientnets/efficientnet.jl b/src/convnets/efficientnets/efficientnet.jl index 3554eddba..9b8519e2d 100644 --- a/src/convnets/efficientnets/efficientnet.jl +++ b/src/convnets/efficientnets/efficientnet.jl @@ -47,12 +47,6 @@ Create an EfficientNet model. ([reference](https://arxiv.org/abs/1905.11946v5)). dropout. - `inchannels`: number of input channels. - `nclasses`: number of output classes. - -!!! warning - - EfficientNet does not currently support pretrained weights. - -See also [`Metalhead.efficientnet`](@ref). """ function efficientnet(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2, dropout_prob = nothing, inchannels::Integer = 3, @@ -70,18 +64,19 @@ end nclasses::Integer = 1000) Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). -See also [`efficientnet`](@ref). # Arguments - `config`: size of the model. Can be one of `[:b0, :b1, :b2, :b3, :b4, :b5, :b6, :b7, :b8]`. - `pretrain`: set to `true` to load the pre-trained weights for ImageNet + - `inchannels`: number of input channels. + - `nclasses`: number of output classes. !!! warning - `EfficientNet` does not currently support pretrained weights. + EfficientNet does not currently support pretrained weights. -See also [`efficientnet`](#). +See also [`Metalhead.efficientnet`](@ref). """ struct EfficientNet layers::Any diff --git a/src/convnets/efficientnets/efficientnetv2.jl b/src/convnets/efficientnets/efficientnetv2.jl index e820f9989..58c9abd35 100644 --- a/src/convnets/efficientnets/efficientnetv2.jl +++ b/src/convnets/efficientnets/efficientnetv2.jl @@ -36,14 +36,14 @@ const EFFNETV2_CONFIGS = Dict(:small => [(fused_mbconv, 3, 24, 1, 1, 2, swish), (mbconv, 3, 768, 6, 1, 8, 4, swish)]) """ - efficientnet(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2, - dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) + efficientnetv2(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2, + dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) Create an EfficientNetv2 model. ([reference](https://arxiv.org/abs/2104.00298)). # Arguments - - `config`: size of the model. Can be one of `[:b0, :b1, :b2, :b3, :b4, :b5, :b6, :b7, :b8]`. + - `config`: size of the network (one of `[:small, :medium, :large, :xlarge]`) - `norm_layer`: normalization layer to use. - `stochastic_depth_prob`: probability of stochastic depth. Set to `nothing` to disable stochastic depth. diff --git a/src/convnets/inceptions/googlenet.jl b/src/convnets/inceptions/googlenet.jl index fefbbca9a..c343eec72 100644 --- a/src/convnets/inceptions/googlenet.jl +++ b/src/convnets/inceptions/googlenet.jl @@ -27,13 +27,15 @@ function inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, p end """ - googlenet(; nclasses::Integer = 1000) + googlenet(; dropout_prob = 0.4, inchannels::Integer = 3, nclasses::Integer = 1000) Create an Inception-v1 model (commonly referred to as GoogLeNet) ([reference](https://arxiv.org/abs/1409.4842v1)). # Arguments + - `dropout_prob`: the dropout probability in the classifier head. Set to `nothing` to disable dropout. + - `inchannels`: the number of input channels - `nclasses`: the number of output classes """ function googlenet(; dropout_prob = 0.4, inchannels::Integer = 3, nclasses::Integer = 1000) @@ -71,7 +73,7 @@ Create an Inception-v1 model (commonly referred to as `GoogLeNet`) `GoogLeNet` does not currently support pretrained weights. -See also [`googlenet`](@ref). 
+See also [`Metalhead.googlenet`](@ref). """ struct GoogLeNet layers::Any diff --git a/src/convnets/inceptions/inceptionresnetv2.jl b/src/convnets/inceptions/inceptionresnetv2.jl index d4a034bed..98d686062 100644 --- a/src/convnets/inceptions/inceptionresnetv2.jl +++ b/src/convnets/inceptions/inceptionresnetv2.jl @@ -71,8 +71,8 @@ Creates an InceptionResNetv2 model. # Arguments - - `inchannels`: number of input channels. - `dropout_prob`: probability of dropout in classifier head. Set to `nothing` to disable dropout. + - `inchannels`: number of input channels. - `nclasses`: the number of output classes. """ function inceptionresnetv2(; dropout_prob = nothing, inchannels::Integer = 3, @@ -111,6 +111,8 @@ Creates an InceptionResNetv2 model. !!! warning `InceptionResNetv2` does not currently support pretrained weights. + +See also [`Metalhead.inceptionresnetv2`](@ref). """ struct InceptionResNetv2 layers::Any diff --git a/src/convnets/inceptions/inceptionv3.jl b/src/convnets/inceptions/inceptionv3.jl index 038d54818..41d7ae18e 100644 --- a/src/convnets/inceptions/inceptionv3.jl +++ b/src/convnets/inceptions/inceptionv3.jl @@ -125,12 +125,14 @@ function inceptionv3_e(inplanes) end """ - inceptionv3(; inchannels::Integer = 3, nclasses::Integer = 1000) + inceptionv3(; dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). # Arguments + - `dropout_prob`: the dropout probability in the classifier head. Set to `nothing` to disable dropout. + - `inchannels`: number of input feature maps - `nclasses`: the number of output classes """ function inceptionv3(; dropout_prob = 0.2, inchannels::Integer = 3, @@ -160,7 +162,6 @@ end Inceptionv3(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). -See also [`inceptionv3`](@ref). # Arguments @@ -171,6 +172,8 @@ See also [`inceptionv3`](@ref). !!! warning `Inceptionv3` does not currently support pretrained weights. + +See also [`Metalhead.inceptionv3`](@ref). """ struct Inceptionv3 layers::Any diff --git a/src/convnets/inceptions/inceptionv4.jl b/src/convnets/inceptions/inceptionv4.jl index e3b4e30f7..964afc362 100644 --- a/src/convnets/inceptions/inceptionv4.jl +++ b/src/convnets/inceptions/inceptionv4.jl @@ -85,15 +85,15 @@ function inceptionv4_c() end """ - inceptionv4(; inchannels::Integer = 3, dropout_prob = nothing, nclasses::Integer = 1000) + inceptionv4(; dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) Create an Inceptionv4 model. ([reference](https://arxiv.org/abs/1602.07261)) # Arguments - - `inchannels`: number of input channels. - `dropout_prob`: probability of dropout in classifier head. Set to `nothing` to disable dropout. + - `inchannels`: number of input channels. - `nclasses`: the number of output classes. """ function inceptionv4(; dropout_prob = nothing, inchannels::Integer = 3, @@ -126,6 +126,8 @@ Creates an Inceptionv4 model. !!! warning `Inceptionv4` does not currently support pretrained weights. + +See also [`Metalhead.inceptionv4`](@ref). """ struct Inceptionv4 layers::Any diff --git a/src/convnets/inceptions/xception.jl b/src/convnets/inceptions/xception.jl index efb60edd3..95c8ecf83 100644 --- a/src/convnets/inceptions/xception.jl +++ b/src/convnets/inceptions/xception.jl @@ -82,6 +82,8 @@ Creates an Xception model. !!! warning `Xception` does not currently support pretrained weights. 
+ +See also [`Metalhead.xception`](@ref). """ struct Xception layers::Any diff --git a/src/convnets/mobilenets/mnasnet.jl b/src/convnets/mobilenets/mnasnet.jl index c08595715..a035d1c77 100644 --- a/src/convnets/mobilenets/mnasnet.jl +++ b/src/convnets/mobilenets/mnasnet.jl @@ -68,14 +68,17 @@ function mnasnet(config::Symbol; width_mult::Real = 1, max_width::Integer = 1280 end """ - MNASNet(width_mult = 1; inchannels::Integer = 3, pretrain::Bool = false, - nclasses::Integer = 1000) + MNASNet(config::Symbol; width_mult::Real = 1, pretrain::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a MNASNet model with the specified configuration. ([reference](https://arxiv.org/abs/1807.11626)) # Arguments + - `config`: configuration of the model. One of `B1`, `A1` or `small`. `B1` is without + squeeze-and-excite layers, `A1` is with squeeze-and-excite layers, and `small` is a smaller + version of `A1`. - `width_mult`: Controls the number of output feature maps in each block (with 1 being the default in the paper; this is usually a value between 0.1 and 1.4) - `pretrain`: Whether to load the pre-trained weights for ImageNet diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl index 923e2fa7b..1eea1a306 100644 --- a/src/convnets/resnets/core.jl +++ b/src/convnets/resnets/core.jl @@ -8,6 +8,8 @@ attn_fn = planes -> identity) Creates a basic residual block (see [reference](https://arxiv.org/abs/1512.03385v1)). +This function creates the layers. For more configuration options and to see the function +used to build the block for the model, see [`Metalhead.basicblock_builder`](@ref). # Arguments @@ -47,6 +49,8 @@ end attn_fn = planes -> identity) Creates a bottleneck residual block (see [reference](https://arxiv.org/abs/1512.03385v1)). +This function creates the layers. For more configuration options and to see the function +used to build the block for the model, see [`Metalhead.bottleneck_builder`](@ref). # Arguments @@ -89,8 +93,9 @@ end scale::Integer = 4, activation = relu, norm_layer = BatchNorm, revnorm::Bool = false, attn_fn = planes -> identity) -Creates a bottleneck block as described in the Res2Net paper. -([reference](https://arxiv.org/abs/1904.01169)) +Creates a bottleneck block as described in the Res2Net paper. ([reference](https://arxiv.org/abs/1904.01169)) +This function creates the layers. For more configuration options and to see the function +used to build the block for the model, see [`Metalhead.bottle2neck_builder`](@ref). # Arguments @@ -252,9 +257,16 @@ on how to use this function. function resnet_stem(stem_type::Symbol = :default; inchannels::Integer = 3, replace_pool::Bool = false, activation = relu, norm_layer = BatchNorm, revnorm::Bool = false) - _checkconfig(stem_type, [:default, :deep, :deep_tiered]) + # Check for valid stem types + deep_stem = if stem_type === :deep || stem_type === :deep_tiered + true + elseif stem_type === :default + false + else + throw(ArgumentError("Unsupported stem type $stem_type. Must be one of + [:default, :deep, :deep_tiered]")) + end # Main stem - deep_stem = stem_type === :deep || stem_type === :deep_tiered inplanes = deep_stem ? stem_width * 2 : 64 # Deep stem that uses three successive 3x3 convolutions instead of a single 7x7 convolution if deep_stem @@ -315,6 +327,53 @@ function resnet_stride(stage_idx::Integer, block_idx::Integer) return stage_idx == 1 || block_idx != 1 ? 
1 : 2 end +""" + resnet(block_type, block_repeats::AbstractVector{<:Integer}, + downsample_opt::NTuple{2, Any} = (downsample_conv, downsample_identity); + cardinality::Integer = 1, base_width::Integer = 64, + inplanes::Integer = 64, reduction_factor::Integer = 1, + connection = addact, activation = relu, + norm_layer = BatchNorm, revnorm::Bool = false, + attn_fn = planes -> identity, pool_layer = AdaptiveMeanPool((1, 1)), + use_conv::Bool = false, dropblock_prob = nothing, + stochastic_depth_prob = nothing, dropout_prob = nothing, + imsize::Dims{2} = (256, 256), inchannels::Integer = 3, + nclasses::Integer = 1000, kwargs...) + +Creates a generic ResNet-like model that is used to create the higher level models like ResNet, +Wide ResNet, ResNeXt and Res2Net. For an _even_ more generic model API, see [`Metalhead.build_resnet`](@ref). + +# Arguments + + - `block_type`: The type of block to be used in the model. This can be one of [`Metalhead.basicblock`](@ref), + [`Metalhead.bottleneck`](@ref) and [`Metalhead.bottle2neck`](@ref). `basicblock` is used in the + original ResNet paper for ResNet-18 and ResNet-34, and `bottleneck` is used in the original ResNet-50 + and ResNet-101 models, as well as for the Wide ResNet and ResNeXt models. `bottle2neck` is introduced in + the `Res2Net` paper. + - `block_repeats`: A `Vector` of integers specifying the number of times each block is repeated + in each stage of the ResNet model. For example, `[3, 4, 6, 3]` is the configuration used in + ResNet-50, which has 3 blocks in the first stage, 4 blocks in the second stage, 6 blocks in the + third stage and 3 blocks in the fourth stage. + - `downsample_opt`: A `NTuple` of two callbacks that are used to determine the downsampling + operation to be used in the model. The first callback is used to determine the convolutional + operation to be used in the downsampling operation and the second callback is used to determine + the identity operation to be used in the downsampling operation. + - `cardinality`: The number of groups to be used in the 3x3 convolutional layer in the bottleneck + block. This is usually modified from the default value of `1` in the ResNet models to `32` or `64` + in the `ResNeXt` models. + - `base_width`: The base width of the convolutional layer in the blocks of the model. + - `inplanes`: The number of input channels in the first convolutional layer. + - `reduction_factor`: The reduction factor used in the model. + - `connection`: This is a function that determines the residual connection in the model. For + `resnets`, either of [`Metalhead.addact`](@ref) or [`Metalhead.actadd`](@ref) is recommended. + - `norm_layer`: The normalisation layer to be used in the model. + - `revnorm`: set to `true` to place the normalisation layers before the convolutions + - `attn_fn`: A callback that is used to determine the attention function to be used in the model. + See [`Metalhead.Layers.squeeze_excite`](@ref) for an example. + - `pool_layer`: A fully-insta + - `use_conv`: Set to true to use convolutions instead of identity operations in the model. + - `dropblock_prob`: The probability of using DropBlock in the model. 
+""" function resnet(block_type, block_repeats::AbstractVector{<:Integer}, downsample_opt::NTuple{2, Any} = (downsample_conv, downsample_identity); cardinality::Integer = 1, base_width::Integer = 64, diff --git a/src/convnets/resnets/resnext.jl b/src/convnets/resnets/resnext.jl index b44989559..6d65708d0 100644 --- a/src/convnets/resnets/resnext.jl +++ b/src/convnets/resnets/resnext.jl @@ -27,8 +27,6 @@ struct ResNeXt end @functor ResNeXt -(m::ResNeXt)(x) = m.layers(x) - function ResNeXt(depth::Integer; pretrain::Bool = false, cardinality::Integer = 32, base_width::Integer = 4, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(depth, keys(LRESNET_CONFIGS)) @@ -41,5 +39,7 @@ function ResNeXt(depth::Integer; pretrain::Bool = false, cardinality::Integer = return ResNeXt(layers) end +(m::ResNeXt)(x) = m.layers(x) + backbone(m::ResNeXt) = m.layers[1] classifier(m::ResNeXt) = m.layers[2] From da5d3a74ceb97d68cd2159702740b07a1d08f4af Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Fri, 2 Sep 2022 13:02:40 +0530 Subject: [PATCH 7/8] Some more cleanup --- src/convnets/inceptions/xception.jl | 2 +- src/convnets/resnets/core.jl | 40 +++++++++++++++++------------ src/layers/conv.jl | 36 +++++++++++--------------- src/layers/mbconv.jl | 31 +++++++++------------- src/layers/selayers.jl | 26 +++++++------------ 5 files changed, 61 insertions(+), 74 deletions(-) diff --git a/src/convnets/inceptions/xception.jl b/src/convnets/inceptions/xception.jl index 95c8ecf83..9dfd73f86 100644 --- a/src/convnets/inceptions/xception.jl +++ b/src/convnets/inceptions/xception.jl @@ -34,7 +34,7 @@ function xception_block(inchannels::Integer, outchannels::Integer, nrepeats::Int end push!(layers, relu) append!(layers, - dwsep_conv_norm((3, 3), inc, outc; pad = 1, use_norm = (false, false))) + dwsep_conv_norm((3, 3), inc, outc; pad = 1, norm_layer = identity)) push!(layers, BatchNorm(outc)) end layers = start_with_relu ? layers : layers[2:end] diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl index 1eea1a306..c5b2f1c7d 100644 --- a/src/convnets/resnets/core.jl +++ b/src/convnets/resnets/core.jl @@ -133,7 +133,7 @@ function bottle2neck(inplanes::Integer, planes::Integer; stride::Integer = 1, norm_layer, revnorm)..., attn_fn(outplanes), ] - return Chain(filter(!=(identity), layers)...) + return Chain(filter!(!=(identity), layers)...) end ## Downsample layers @@ -345,34 +345,42 @@ Wide ResNet, ResNeXt and Res2Net. For an _even_ more generic model API, see [`Me # Arguments - - `block_type`: The type of block to be used in the model. This can be one of [`Metalhead.basicblock`](@ref), - [`Metalhead.bottleneck`](@ref) and [`Metalhead.bottle2neck`](@ref). `basicblock` is used in the + - `block_type`: The type of block to be used in the model. This can be one of [`Metalhead.basicblock`](@ref), + [`Metalhead.bottleneck`](@ref) and [`Metalhead.bottle2neck`](@ref). `basicblock` is used in the original ResNet paper for ResNet-18 and ResNet-34, and `bottleneck` is used in the original ResNet-50 and ResNet-101 models, as well as for the Wide ResNet and ResNeXt models. `bottle2neck` is introduced in the `Res2Net` paper. - - `block_repeats`: A `Vector` of integers specifying the number of times each block is repeated + - `block_repeats`: A `Vector` of integers specifying the number of times each block is repeated in each stage of the ResNet model. 
For example, `[3, 4, 6, 3]` is the configuration used in
     ResNet-50, which has 3 blocks in the first stage, 4 blocks in the second stage, 6 blocks in the
     third stage and 3 blocks in the fourth stage.
-  - `downsample_opt`: A `NTuple` of two callbacks that are used to determine the downsampling
+  - `downsample_opt`: A `NTuple` of two callbacks that are used to determine the downsampling
     operation to be used in the model. The first callback is used to determine the convolutional
     operation to be used in the downsampling operation and the second callback is used to determine
     the identity operation to be used in the downsampling operation.
-  - `cardinality`: The number of groups to be used in the 3x3 convolutional layer in the bottleneck
+  - `cardinality`: The number of groups to be used in the 3x3 convolutional layer in the bottleneck
     block. This is usually modified from the default value of `1` in the ResNet models to `32` or `64`
     in the `ResNeXt` models.
-  - `base_width`: The base width of the convolutional layer in the blocks of the model.
-  - `inplanes`: The number of input channels in the first convolutional layer.
-  - `reduction_factor`: The reduction factor used in the model.
-  - `connection`: This is a function that determines the residual connection in the model. For
+  - `base_width`: The base width of the convolutional layer in the blocks of the model.
+  - `inplanes`: The number of input channels in the first convolutional layer.
+  - `reduction_factor`: The reduction factor used in the model.
+  - `connection`: This is a function that determines the residual connection in the model. For
     `resnets`, either of [`Metalhead.addact`](@ref) or [`Metalhead.actadd`](@ref) is recommended.
-  - `norm_layer`: The normalisation layer to be used in the model.
-  - `revnorm`: set to `true` to place the normalisation layers before the convolutions
-  - `attn_fn`: A callback that is used to determine the attention function to be used in the model.
+  - `norm_layer`: The normalisation layer to be used in the model.
+  - `revnorm`: set to `true` to place the normalisation layers before the convolutions
+  - `attn_fn`: A callback that is used to determine the attention function to be used in the model.
     See [`Metalhead.Layers.squeeze_excite`](@ref) for an example.
-  - `pool_layer`: A fully-insta
-  - `use_conv`: Set to true to use convolutions instead of identity operations in the model.
-  - `dropblock_prob`: The probability of using DropBlock in the model.
+  - `pool_layer`: A fully-instantiated pooling layer passed in to be used by the classifier head.
+    For example, `AdaptiveMeanPool((1, 1))` is used in the ResNet family by default, but something
+    like `MeanPool((3, 3))` should also work provided the dimensions after applying the pooling
+    layer are compatible with the rest of the classifier head.
+  - `use_conv`: Set to `true` to use a `1 x 1` convolution in the classifier head instead of a
+    `Dense` layer.
+  - `dropblock_prob`: `DropBlock` probability to be used in the model. Set to `nothing` to disable
+    `DropBlock`. See [`Metalhead.DropBlock`](@ref) for more details.
+  - `stochastic_depth_prob`: `StochasticDepth` probability to be used in the model. Set to `nothing`
+    to disable `StochasticDepth`. See [`Metalhead.StochasticDepth`](@ref) for more details.
+  - `dropout_prob`: `Dropout` probability to be used in the classifier head. Set to `nothing` to
+    disable `Dropout`.
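A minimal usage sketch tying the keywords above together (the attention callback assumes `squeeze_excite` is reachable through the `Metalhead.Layers` submodule, as the `@ref` above suggests):

```julia
using Metalhead
using Metalhead.Layers: squeeze_excite

# ResNet-50-style bottleneck stack with squeeze-and-excite attention,
# stochastic depth on the residual branches and dropout in the classifier.
model = Metalhead.resnet(Metalhead.bottleneck, [3, 4, 6, 3];
                         attn_fn = planes -> squeeze_excite(planes),
                         stochastic_depth_prob = 0.1, dropout_prob = 0.2)
```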
""" function resnet(block_type, block_repeats::AbstractVector{<:Integer}, downsample_opt::NTuple{2, Any} = (downsample_conv, downsample_identity); diff --git a/src/layers/conv.jl b/src/layers/conv.jl index a6ac6e073..cbee8ef03 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,9 +1,8 @@ """ conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer, activation = relu; norm_layer = BatchNorm, revnorm::Bool = false, - eps::Float32 = 1.0f-5, preact::Bool = false, use_norm::Bool = true, - stride::Integer = 1, pad::Integer = 0, dilation::Integer = 1, - groups::Integer = 1, [bias, weight, init]) + preact::Bool = false, stride::Integer = 1, pad::Integer = 0, + dilation::Integer = 1, groups::Integer = 1, [bias, weight, init]) Create a convolution + normalisation layer pair with activation. @@ -14,33 +13,27 @@ Create a convolution + normalisation layer pair with activation. - `outplanes`: number of output feature maps - `activation`: the activation function for the final layer - `norm_layer`: the normalisation layer used. Note that using `identity` as the normalisation - layer will result in no normalisation being applied i.e. this will be the same as - setting `use_norm = false`. + layer will result in no normalisation being applied. (This is only compatible with `preact` + and `revnorm` both set to `false`.) - `revnorm`: set to `true` to place the normalisation layer before the convolution - `preact`: set to `true` to place the activation function before the normalisation layer (only compatible with `revnorm = false`) - - `use_norm`: set to `false` to disable normalisation - (only compatible with `revnorm = false` and `preact = false`) + - `bias`: bias for the convolution kernel. This is set to `false` by default if + `norm_layer` is not `identity` and `true` otherwise. - `stride`: stride of the convolution kernel - `pad`: padding of the convolution kernel - `dilation`: dilation of the convolution kernel - `groups`: groups for the convolution kernel - - `bias`: bias for the convolution kernel. This is set to `false` by default if - `use_norm = true`. - `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](@ref)) """ function conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer, activation = relu; norm_layer = BatchNorm, revnorm::Bool = false, - eps::Float32 = 1.0f-5, preact::Bool = false, use_norm::Bool = true, - bias = !use_norm, kwargs...) - # no normalization layer (including case where normalization layer is identity) - use_norm = use_norm && norm_layer !== identity - if !use_norm + preact::Bool = false, bias = !(norm_layer !== identity), kwargs...) + # no normalization layer + if !(norm_layer !== identity) if preact || revnorm - throw(ArgumentError("`preact` only supported with `use_norm = true`. Check if - `use_norm = false` is intended. Note that it is also possible to trigger this - error if you set `norm_layer` to `identity` since that returns the same - behaviour as `use_norm`.")) + throw(ArgumentError("`preact` only supported with `norm_layer !== identity`. 
+                                Check if a non-`identity` norm layer is intended."))
         else
             # early return if no norm layer is required
             return [Conv(kernel_size, inplanes => outplanes, activation; kwargs...)]
         end
@@ -64,7 +57,7 @@ function conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
     end
     # layers
     layers = [Conv(kernel_size, inplanes => outplanes, activations.conv; bias, kwargs...),
-              norm_layer(normplanes, activations.norm; ϵ = eps)]
+              norm_layer(normplanes, activations.norm)]
     return revnorm ? reverse(layers) : layers
 end
 
@@ -86,6 +79,7 @@ TensorFlow implementation.
 """
 function basic_conv_bn(kernel_size::Dims{2}, inplanes, outplanes, activation = relu;
                        kwargs...)
-    return conv_norm(kernel_size, inplanes, outplanes, activation; norm_layer = BatchNorm,
-                     eps = 1.0f-3, kwargs...)
+    # TensorFlow uses a default epsilon of 1e-3 for BatchNorm
+    norm_layer = (args...; kwargs...) -> BatchNorm(args...; ϵ = 1.0f-3, kwargs...)
+    return conv_norm(kernel_size, inplanes, outplanes, activation; norm_layer, kwargs...)
 end
diff --git a/src/layers/mbconv.jl b/src/layers/mbconv.jl
index 891c8ae0f..13eef8ae1 100644
--- a/src/layers/mbconv.jl
+++ b/src/layers/mbconv.jl
@@ -1,17 +1,16 @@
 """
     dwsep_conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
-                    activation = relu; eps::Float32 = 1.0f-5, revnorm::Bool = false,
-                    stride::Integer = 1, use_norm::NTuple{2, Bool} = (true, true),
-                    pad::Integer = 0, [bias, weight, init])
+                    activation = relu; norm_layer = BatchNorm, stride::Integer = 1,
+                    bias::Bool = !(norm_layer !== identity), pad::Integer = 0, [weight, init])
 
 Create a depthwise separable convolution chain as used in MobileNetv1.
 This is a sequence of layers:
 
   - a `kernel_size` depthwise convolution from `inplanes => inplanes`
-  - a (batch) normalisation layer + `activation` (if `use_norm[1] == true`; otherwise
+  - a (batch) normalisation layer + `activation` (if `norm_layer !== identity`; otherwise
     `activation` is applied to the convolution output)
   - a `1 x 1` (pointwise) convolution from `inplanes => outplanes`
-  - a (batch) normalisation layer + `activation` (if `use_norm[2] == true`; otherwise
+  - a (batch) normalisation layer + `activation` (if `norm_layer !== identity`; otherwise
     `activation` is applied to the convolution output)
 
 See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1).
 
@@ -22,25 +21,19 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1).
   - `inplanes`: number of input feature maps
   - `outplanes`: number of output feature maps
   - `activation`: the activation function for the final layer
-  - `revnorm`: set to `true` to place the batch norm before the convolution
-  - `use_norm`: a tuple of two booleans to specify whether to use normalization for the first and
-    second convolution
-  - `bias`: a tuple of two booleans to specify whether to use bias for the first and second
-    convolution. This is set to `(false, false)` by default if `use_norm[0] == true` and
-    `use_norm[1] == true`.
+  - `norm_layer`: the normalisation layer used. Note that using `identity` as the normalisation
+    layer will result in no normalisation being applied.
+  - `bias`: whether to use bias in the convolution layers.
  - `stride`: stride of the first convolution kernel
  - `pad`: padding of the first convolution kernel
  - `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](@ref))
 """
 function dwsep_conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
-                         activation = relu; norm_layer = BatchNorm, eps::Float32 = 1.0f-5,
-                         use_norm::NTuple{2, Bool} = (true, true), stride::Integer = 1,
-                         bias::NTuple{2, Bool} = (!use_norm[1], !use_norm[2]), kwargs...)
-    return vcat(conv_norm(kernel_size, inplanes, inplanes, activation; eps, norm_layer,
-                          use_norm = use_norm[1], stride, bias = bias[1],
-                          groups = inplanes, kwargs...), # depthwise convolution
-                conv_norm((1, 1), inplanes, outplanes, activation; eps, norm_layer,
-                          use_norm = use_norm[2], bias = bias[2])) # pointwise convolution
+                         activation = relu; norm_layer = BatchNorm, stride::Integer = 1,
+                         bias::Bool = !(norm_layer !== identity), kwargs...)
+    return vcat(conv_norm(kernel_size, inplanes, inplanes, activation; norm_layer,
+                          stride, bias, groups = inplanes, kwargs...), # depthwise convolution
+                conv_norm((1, 1), inplanes, outplanes, activation; norm_layer, bias)) # pointwise convolution
 end
 
 """
diff --git a/src/layers/selayers.jl b/src/layers/selayers.jl
index 4f99733bf..40a6c027d 100644
--- a/src/layers/selayers.jl
+++ b/src/layers/selayers.jl
@@ -1,38 +1,30 @@
 """
-    squeeze_excite(inplanes::Integer, squeeze_planes::Integer;
-                   norm_layer = planes -> identity, activation = relu,
-                   gate_activation = sigmoid)
-
-    squeeze_excite(inplanes::Integer; reduction::Real = 16,
-                   norm_layer = planes -> identity, activation = relu,
-                   gate_activation = sigmoid)
+    squeeze_excite(inplanes::Integer; reduction::Real = 16, round_fn = _round_channels,
+                   norm_layer = identity, activation = relu, gate_activation = sigmoid)
 
 Creates a squeeze-and-excitation layer used in MobileNets, EfficientNets and SE-ResNets.
 
 # Arguments
 
   - `inplanes`: The number of input feature maps
-  - `squeeze_planes`: The number of feature maps in the intermediate layers. Alternatively,
-    specify the keyword arguments `reduction` and `rd_divisior`, which determine the number
-    of feature maps in the intermediate layers from the number of input feature maps as:
-    `squeeze_planes = _round_channels(inplanes ÷ reduction)`. (See [`_round_channels`](@ref) for details.)
+  - `reduction`: The reduction factor for the number of hidden feature maps in the
+    squeeze and excite layer. The number of hidden feature maps is calculated as
+    `round_fn(inplanes ÷ reduction)`.
+  - `round_fn`: The function to round the number of reduced feature maps.
  - `activation`: The activation function for the first convolution layer
  - `gate_activation`: The activation function for the gate layer
  - `norm_layer`: The normalization layer to be used after the convolution layers
 """
-function squeeze_excite(inplanes::Integer, squeeze_planes::Integer; norm_layer = identity,
-                        activation = relu, gate_activation = sigmoid)
+function squeeze_excite(inplanes::Integer; reduction::Real = 16, round_fn = _round_channels,
+                        norm_layer = identity, activation = relu, gate_activation = sigmoid)
+    squeeze_planes = round_fn(inplanes ÷ reduction)
     return SkipConnection(Chain(AdaptiveMeanPool((1, 1)),
                                 conv_norm((1, 1), inplanes, squeeze_planes,
                                           activation; norm_layer)...,
                                 conv_norm((1, 1), squeeze_planes, inplanes,
                                           gate_activation; norm_layer)...), .*)
 end
-function squeeze_excite(inplanes::Integer; reduction::Real = 16,
-                        round_fn = _round_channels, kwargs...)
-    return squeeze_excite(inplanes, round_fn(inplanes / reduction); kwargs...)
-end
 
 """
     effective_squeeze_excite(inplanes, gate_activation = sigmoid)

From ab96700d05f05ab21c93fb888ae40297ee952e29 Mon Sep 17 00:00:00 2001
From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com>
Date: Sat, 3 Sep 2022 06:28:46 +0530
Subject: [PATCH 8/8] Renaming `ir_` to `invres_`

---
 src/Metalhead.jl                              |  2 +-
 .../builders/{irmodel.jl => invresmodel.jl}   | 19 +++++----
 src/convnets/builders/mbconv.jl               | 39 ++++++++++---------
 src/convnets/convnext.jl                      | 15 ++++---
 src/convnets/efficientnets/efficientnet.jl    |  8 ++--
 src/convnets/efficientnets/efficientnetv2.jl  |  6 +--
 src/convnets/mobilenets/mnasnet.jl            |  4 +-
 src/convnets/mobilenets/mobilenetv1.jl        |  6 +--
 src/convnets/mobilenets/mobilenetv2.jl        |  4 +-
 src/convnets/mobilenets/mobilenetv3.jl        |  8 ++--
 10 files changed, 57 insertions(+), 54 deletions(-)
 rename src/convnets/builders/{irmodel.jl => invresmodel.jl} (66%)

diff --git a/src/Metalhead.jl b/src/Metalhead.jl
index feba14ca0..b67bf1faf 100644
--- a/src/Metalhead.jl
+++ b/src/Metalhead.jl
@@ -20,7 +20,7 @@ using .Layers
 # CNN models
 ## Builders
-include("convnets/builders/irmodel.jl")
+include("convnets/builders/invresmodel.jl")
 include("convnets/builders/mbconv.jl")
 include("convnets/builders/resblocks.jl")
 include("convnets/builders/resnet.jl")
diff --git a/src/convnets/builders/irmodel.jl b/src/convnets/builders/invresmodel.jl
similarity index 66%
rename from src/convnets/builders/irmodel.jl
rename to src/convnets/builders/invresmodel.jl
index 2552d2cb1..6faeca992 100644
--- a/src/convnets/builders/irmodel.jl
+++ b/src/convnets/builders/invresmodel.jl
@@ -1,9 +1,11 @@
-function build_irmodel(scalings::NTuple{2, Real}, block_configs::AbstractVector{<:Tuple};
-                       inplanes::Integer = 32, connection = +, activation = relu,
-                       norm_layer = BatchNorm, divisor::Integer = 8, tail_conv::Bool = true,
-                       expanded_classifier::Bool = false, stochastic_depth_prob = nothing,
-                       headplanes::Integer, dropout_prob = nothing, inchannels::Integer = 3,
-                       nclasses::Integer = 1000, kwargs...)
+function build_invresmodel(scalings::NTuple{2, Real},
+                           block_configs::AbstractVector{<:Tuple};
+                           inplanes::Integer = 32, connection = +, activation = relu,
+                           norm_layer = BatchNorm, divisor::Integer = 8,
+                           tail_conv::Bool = true, expanded_classifier::Bool = false,
+                           stochastic_depth_prob = nothing, headplanes::Integer,
+                           dropout_prob = nothing, inchannels::Integer = 3,
+                           nclasses::Integer = 1000, kwargs...)
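    # Editorial note: only the width multiplier is consumed here (for the stem width
    # and channel rounding); the depth multiplier in `scalings` is applied per stage
    # by the `invresbuilder` methods below.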
width_mult, _ = scalings # building first layer inplanes = _round_channels(inplanes * width_mult, divisor) @@ -36,6 +38,7 @@ function build_irmodel(scalings::NTuple{2, Real}, block_configs::AbstractVector{ end return Chain(Chain(layers...), classifier) end -function build_irmodel(width_mult::Real, block_configs::AbstractVector{<:Tuple}; kwargs...) - return build_irmodel((width_mult, 1), block_configs; kwargs...) +function build_invresmodel(width_mult::Real, block_configs::AbstractVector{<:Tuple}; + kwargs...) + return build_invresmodel((width_mult, 1), block_configs; kwargs...) end diff --git a/src/convnets/builders/mbconv.jl b/src/convnets/builders/mbconv.jl index 4c4fa9a10..7b68be300 100644 --- a/src/convnets/builders/mbconv.jl +++ b/src/convnets/builders/mbconv.jl @@ -1,18 +1,19 @@ """ - irblockbuilder(::typeof(irblockfn), block_configs::AbstractVector{<:Tuple}, + invresbuilder(::typeof(irblockfn), block_configs::AbstractVector{<:Tuple}, inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; stochastic_depth_prob = nothing, norm_layer = BatchNorm, divisor::Integer = 8, kwargs...) -Constructs a collection of inverted residual blocks for a given stage. Note that -this function is not intended to be called directly, but rather by the [`mbconv_stage_builder`](@ref) -function. This function must only be extended if the user wishes to extend a custom inverted -residual block type. +Creates a block builder for `irblockfn` within a given stage. +Note that this function is not intended to be called directly, but instead passed to +[`mbconv_stage_builder`](@ref) which will return a builder over all stages. +Users wanting to provide a custom inverted residual block type can extend this +function by defining `invresbuilder(::typeof(my_block), ...)`. """ -function irblockbuilder(::typeof(dwsep_conv_norm), block_configs::AbstractVector{<:Tuple}, - inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; - stochastic_depth_prob = nothing, norm_layer = BatchNorm, - divisor::Integer = 8, kwargs...) +function invresbuilder(::typeof(dwsep_conv_norm), block_configs::AbstractVector{<:Tuple}, + inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; + stochastic_depth_prob = nothing, norm_layer = BatchNorm, + divisor::Integer = 8, kwargs...) width_mult, depth_mult = scalings block_fn, k, outplanes, stride, nrepeats, activation = block_configs[stage_idx] outplanes = _round_channels(outplanes * width_mult, divisor) @@ -29,10 +30,10 @@ function irblockbuilder(::typeof(dwsep_conv_norm), block_configs::AbstractVector return get_layers, ceil(Int, nrepeats * depth_mult) end -function irblockbuilder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple}, - inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; - stochastic_depth_prob = nothing, norm_layer = BatchNorm, - divisor::Integer = 8, se_from_explanes::Bool = false, kwargs...) +function invresbuilder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple}, + inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; + stochastic_depth_prob = nothing, norm_layer = BatchNorm, + divisor::Integer = 8, se_from_explanes::Bool = false, kwargs...) 
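    # Editorial note: repeat counts are recomputed for all stages here, not just
    # `stage_idx`; presumably so that per-block schedules such as stochastic depth
    # can be spread over the network's total depth.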
width_mult, depth_mult = scalings block_repeats = [ceil(Int, block_configs[idx][end - 2] * depth_mult) for idx in eachindex(block_configs)] @@ -64,10 +65,10 @@ function irblockbuilder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple} return get_layers, block_repeats[stage_idx] end -function irblockbuilder(::typeof(fused_mbconv), block_configs::AbstractVector{<:Tuple}, - inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; - stochastic_depth_prob = nothing, norm_layer = BatchNorm, - divisor::Integer = 8, kwargs...) +function invresbuilder(::typeof(fused_mbconv), block_configs::AbstractVector{<:Tuple}, + inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real}; + stochastic_depth_prob = nothing, norm_layer = BatchNorm, + divisor::Integer = 8, kwargs...) width_mult, depth_mult = scalings block_repeats = [ceil(Int, block_configs[idx][end - 1] * depth_mult) for idx in eachindex(block_configs)] @@ -90,7 +91,7 @@ end function mbconv_stage_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer, scalings::NTuple{2, Real}; kwargs...) - bxs = [irblockbuilder(block_configs[idx][1], block_configs, inplanes, idx, scalings; - kwargs...) for idx in eachindex(block_configs)] + bxs = [invresbuilder(block_configs[idx][1], block_configs, inplanes, idx, scalings; + kwargs...) for idx in eachindex(block_configs)] return (stage_idx, block_idx) -> first.(bxs)[stage_idx](block_idx), last.(bxs) end diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index 02f9bbc7f..30b314556 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -12,14 +12,13 @@ Creates a single block of ConvNeXt. """ function convnextblock(planes::Integer, stochastic_depth_prob = 0.0, layerscale_init = 1.0f-6) - layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), - swapdims((3, 1, 2, 4)), - LayerNorm(planes; ϵ = 1.0f-6), - mlp_block(planes, 4 * planes), - LayerScale(planes, layerscale_init), - swapdims((2, 3, 1, 4)), - StochasticDepth(stochastic_depth_prob)), +) - return layers + return SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), + swapdims((3, 1, 2, 4)), + LayerNorm(planes; ϵ = 1.0f-6), + mlp_block(planes, 4 * planes), + LayerScale(planes, layerscale_init), + swapdims((2, 3, 1, 4)), + StochasticDepth(stochastic_depth_prob)), +) end """ diff --git a/src/convnets/efficientnets/efficientnet.jl b/src/convnets/efficientnets/efficientnet.jl index 9b8519e2d..2657a3884 100644 --- a/src/convnets/efficientnets/efficientnet.jl +++ b/src/convnets/efficientnets/efficientnet.jl @@ -53,10 +53,10 @@ function efficientnet(config::Symbol; norm_layer = BatchNorm, stochastic_depth_p nclasses::Integer = 1000) _checkconfig(config, keys(EFFICIENTNET_GLOBAL_CONFIGS)) scalings = EFFICIENTNET_GLOBAL_CONFIGS[config][2] - return build_irmodel(scalings, EFFICIENTNET_BLOCK_CONFIGS; inplanes = 32, - norm_layer, stochastic_depth_prob, activation = swish, - headplanes = EFFICIENTNET_BLOCK_CONFIGS[end][3] * 4, - dropout_prob, inchannels, nclasses) + return build_invresmodel(scalings, EFFICIENTNET_BLOCK_CONFIGS; inplanes = 32, + norm_layer, stochastic_depth_prob, activation = swish, + headplanes = EFFICIENTNET_BLOCK_CONFIGS[end][3] * 4, + dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/efficientnets/efficientnetv2.jl b/src/convnets/efficientnets/efficientnetv2.jl index 58c9abd35..dab159e68 100644 --- a/src/convnets/efficientnets/efficientnetv2.jl +++ b/src/convnets/efficientnets/efficientnetv2.jl @@ -57,9 +57,9 @@ function 
efficientnetv2(config::Symbol; norm_layer = BatchNorm, stochastic_depth nclasses::Integer = 1000) _checkconfig(config, keys(EFFNETV2_CONFIGS)) block_configs = EFFNETV2_CONFIGS[config] - return build_irmodel((1, 1), block_configs; activation = swish, norm_layer, - inplanes = block_configs[1][3], headplanes = 1280, - stochastic_depth_prob, dropout_prob, inchannels, nclasses) + return build_invresmodel((1, 1), block_configs; activation = swish, norm_layer, + inplanes = block_configs[1][3], headplanes = 1280, + stochastic_depth_prob, dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/mobilenets/mnasnet.jl b/src/convnets/mobilenets/mnasnet.jl index a035d1c77..98cd9d759 100644 --- a/src/convnets/mobilenets/mnasnet.jl +++ b/src/convnets/mobilenets/mnasnet.jl @@ -63,8 +63,8 @@ function mnasnet(config::Symbol; width_mult::Real = 1, max_width::Integer = 1280 # momentum used for BatchNorm is as per Tensorflow implementation norm_layer = (args...; kwargs...) -> BatchNorm(args...; momentum = 0.0003f0, kwargs...) inplanes, block_configs = MNASNET_CONFIGS[config] - return build_irmodel(width_mult, block_configs; inplanes, norm_layer, - headplanes = max_width, dropout_prob, inchannels, nclasses) + return build_invresmodel(width_mult, block_configs; inplanes, norm_layer, + headplanes = max_width, dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/mobilenets/mobilenetv1.jl b/src/convnets/mobilenets/mobilenetv1.jl index 7de7ac384..ca17dc4ac 100644 --- a/src/convnets/mobilenets/mobilenetv1.jl +++ b/src/convnets/mobilenets/mobilenetv1.jl @@ -31,9 +31,9 @@ Create a MobileNetv1 model. ([reference](https://arxiv.org/abs/1704.04861v1)). """ function mobilenetv1(width_mult::Real = 1; inplanes::Integer = 32, dropout_prob = nothing, inchannels::Integer = 3, nclasses::Integer = 1000) - return build_irmodel(width_mult, MOBILENETV1_CONFIGS; inplanes, inchannels, - activation = relu6, connection = nothing, tail_conv = false, - headplanes = 1024, dropout_prob, nclasses) + return build_invresmodel(width_mult, MOBILENETV1_CONFIGS; inplanes, inchannels, + activation = relu6, connection = nothing, tail_conv = false, + headplanes = 1024, dropout_prob, nclasses) end """ diff --git a/src/convnets/mobilenets/mobilenetv2.jl b/src/convnets/mobilenets/mobilenetv2.jl index 64e680bd8..6d0973130 100644 --- a/src/convnets/mobilenets/mobilenetv2.jl +++ b/src/convnets/mobilenets/mobilenetv2.jl @@ -38,8 +38,8 @@ Create a MobileNetv2 model. ([reference](https://arxiv.org/abs/1801.04381v1)). 
function mobilenetv2(width_mult::Real = 1; max_width::Integer = 1280, inplanes::Integer = 32, dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) - return build_irmodel(width_mult, MOBILENETV2_CONFIGS; activation = relu6, inplanes, - headplanes = max_width, dropout_prob, inchannels, nclasses) + return build_invresmodel(width_mult, MOBILENETV2_CONFIGS; activation = relu6, inplanes, + headplanes = max_width, dropout_prob, inchannels, nclasses) end """ diff --git a/src/convnets/mobilenets/mobilenetv3.jl b/src/convnets/mobilenets/mobilenetv3.jl index 4bf748e9d..07e4501eb 100644 --- a/src/convnets/mobilenets/mobilenetv3.jl +++ b/src/convnets/mobilenets/mobilenetv3.jl @@ -54,10 +54,10 @@ function mobilenetv3(config::Symbol; width_mult::Real = 1, dropout_prob = 0.2, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(config, [:small, :large]) max_width, block_configs = MOBILENETV3_CONFIGS[config] - return build_irmodel(width_mult, block_configs; inplanes = 16, - headplanes = max_width, activation = relu, - se_from_explanes = true, se_round_fn = _round_channels, - expanded_classifier = true, dropout_prob, inchannels, nclasses) + return build_invresmodel(width_mult, block_configs; inplanes = 16, + headplanes = max_width, activation = relu, + se_from_explanes = true, se_round_fn = _round_channels, + expanded_classifier = true, dropout_prob, inchannels, nclasses) end """
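To close the loop on the rename, here is a hedged sketch of driving `build_invresmodel` directly, mirroring the `mobilenetv1` call above (not an exported API; the config tuples follow the `(block_fn, kernel, outplanes, stride, nrepeats, activation)` layout used by the depthwise-separable configs in this PR):

```julia
using Flux
using Metalhead
using Metalhead: build_invresmodel, dwsep_conv_norm

# A toy two-stage depthwise-separable network; `headplanes` matches the
# final stage width, following the MobileNetv1 configuration pattern.
block_configs = [(dwsep_conv_norm, 3, 64, 1, 1, relu6),
                 (dwsep_conv_norm, 3, 128, 2, 2, relu6)]
model = build_invresmodel(1.0, block_configs; inplanes = 32,
                          connection = nothing, tail_conv = false,
                          headplanes = 128, nclasses = 10)
```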