From 944afd830e3d7659efb95e48667ce56a1be57b37 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 19 Feb 2022 18:06:18 +0530 Subject: [PATCH 1/9] Modified Diagonal to make bias optional 1. Allows for more flexibility to implement custom layers like LayerScale 2. Leaves current working unchanged (what worked before still will without any modifications) --- src/layers/basic.jl | 28 ++++++++++++++++++++-------- test/layers/basic.jl | 7 ++++--- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 3e22895e82..43521fdc18 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -174,27 +174,39 @@ end """ Diagonal(α, β) - Diagonal(size::Integer...) + Diagonal(size::Integer...; bias = true, init = ones32) Create an element-wise linear layer, which performs y = α .* x .+ β -The learnable arrays are initialised `α = ones(Float32, size)` and -`β = zeros(Float32, size)`. +if `bias` is true, and + + y = α .* x + +otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and +`β = zeros(Float32, size)`. If `init` is specified, the function given to it is +called and used to initialise α. Used by [`LayerNorm`](@ref). """ -struct Diagonal{T} - α::T - β::T +struct Diagonal{A, B} + α::A + β::B + function Diagonal(W::M, bias = true) where M<:AbstractArray + b = create_bias(W, bias, size(W)...) + new{M, typeof(b)}(W, b) + end end -Diagonal(sz::Integer...) = Diagonal(ones32(sz...), zeros32(sz...)) +Diagonal(sz::Integer...; bias = true, init = ones32) = Diagonal(init(sz...), bias) @functor Diagonal -(a::Diagonal)(x) = a.α .* x .+ a.β +function (a::Diagonal)(x) + x = a.α .* x + x = x .+ a.β +end function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", join(size(l.α), ", "), ")") diff --git a/test/layers/basic.jl b/test/layers/basic.jl index ca8e15a643..1221b2e1d1 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -91,16 +91,17 @@ import Flux: activations @test length(Flux.Diagonal(10)(randn(10))) == 10 @test length(Flux.Diagonal(10)(1)) == 10 @test length(Flux.Diagonal(10)(randn(1))) == 10 + @test length(Flux.Diagonal(10; bias = false)(randn(10))) == 10 @test_throws DimensionMismatch Flux.Diagonal(10)(randn(2)) @test Flux.Diagonal(2)([1 2]) == [1 2; 1 2] @test Flux.Diagonal(2)([1,2]) == [1,2] - @test Flux.Diagonal(2)([1 2; 3 4]) == [1 2; 3 4] + @test Flux.Diagonal(2; bias = false)([1 2; 3 4]) == [1 2; 3 4] @test Flux.Diagonal(2)(rand(2,3,4)) |> size == (2, 3, 4) @test Flux.Diagonal(2,3)(rand(2,3,4)) |> size == (2, 3, 4) - @test Flux.Diagonal(2,3,4)(rand(2,3,4)) |> size == (2, 3, 4) - @test Flux.Diagonal(2,3)(rand(2,1,4)) |> size == (2, 3, 4) + @test Flux.Diagonal(2, 3, 4; bias = false)(rand(2,3,4)) |> size == (2, 3, 4) + @test Flux.Diagonal(2, 3; bias = false)(rand(2,1,4)) |> size == (2, 3, 4) end @testset "Maxout" begin From 86f1d4f3faf10acbe2a9c15b27fc2d410032cd72 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 19 Feb 2022 18:12:06 +0530 Subject: [PATCH 2/9] Updated docstring --- src/layers/basic.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 43521fdc18..acb570ba68 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -186,7 +186,8 @@ if `bias` is true, and otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and `β = zeros(Float32, size)`. 
If `init` is specified, the function given to it is -called and used to initialise α. +called and used to initialise α. The weight matrix and/or the bias vector +(with the same size as x) may also be provided explicitly. Used by [`LayerNorm`](@ref). """ From a11f3985ff4741dbf8d6ffba57a1866f5d24dc4c Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 19 Feb 2022 18:06:18 +0530 Subject: [PATCH 3/9] Modified Diagonal to make bias optional 1. Allows for more flexibility to implement custom layers like LayerScale 2. Leaves current working unchanged (what worked before still will without any modifications) --- src/layers/basic.jl | 28 ++++++++++++++++++++-------- test/layers/basic.jl | 7 ++++--- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 8aaf4e7df0..0e4bf6f205 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -173,27 +173,39 @@ end """ Diagonal(α, β) - Diagonal(size::Integer...) + Diagonal(size::Integer...; bias = true, init = ones32) Create an element-wise linear layer, which performs y = α .* x .+ β -The learnable arrays are initialised `α = ones(Float32, size)` and -`β = zeros(Float32, size)`. +if `bias` is true, and + + y = α .* x + +otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and +`β = zeros(Float32, size)`. If `init` is specified, the function given to it is +called and used to initialise α. Used by [`LayerNorm`](@ref). """ -struct Diagonal{T} - α::T - β::T +struct Diagonal{A, B} + α::A + β::B + function Diagonal(W::M, bias = true) where M<:AbstractArray + b = create_bias(W, bias, size(W)...) + new{M, typeof(b)}(W, b) + end end -Diagonal(sz::Integer...) = Diagonal(ones32(sz...), zeros32(sz...)) +Diagonal(sz::Integer...; bias = true, init = ones32) = Diagonal(init(sz...), bias) @functor Diagonal -(a::Diagonal)(x) = a.α .* x .+ a.β +function (a::Diagonal)(x) + x = a.α .* x + x = x .+ a.β +end function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", join(size(l.α), ", "), ")") diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 5befed57b4..0c12b22d11 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -91,16 +91,17 @@ import Flux: activations @test length(Flux.Diagonal(10)(randn(10))) == 10 @test length(Flux.Diagonal(10)(1)) == 10 @test length(Flux.Diagonal(10)(randn(1))) == 10 + @test length(Flux.Diagonal(10; bias = false)(randn(10))) == 10 @test_throws DimensionMismatch Flux.Diagonal(10)(randn(2)) @test Flux.Diagonal(2)([1 2]) == [1 2; 1 2] @test Flux.Diagonal(2)([1,2]) == [1,2] - @test Flux.Diagonal(2)([1 2; 3 4]) == [1 2; 3 4] + @test Flux.Diagonal(2; bias = false)([1 2; 3 4]) == [1 2; 3 4] @test Flux.Diagonal(2)(rand(2,3,4)) |> size == (2, 3, 4) @test Flux.Diagonal(2,3)(rand(2,3,4)) |> size == (2, 3, 4) - @test Flux.Diagonal(2,3,4)(rand(2,3,4)) |> size == (2, 3, 4) - @test Flux.Diagonal(2,3)(rand(2,1,4)) |> size == (2, 3, 4) + @test Flux.Diagonal(2, 3, 4; bias = false)(rand(2,3,4)) |> size == (2, 3, 4) + @test Flux.Diagonal(2, 3; bias = false)(rand(2,1,4)) |> size == (2, 3, 4) end @testset "Maxout" begin From 46de848580d359fcc09f54dad380ead5e3fe3c37 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 19 Feb 2022 18:12:06 +0530 Subject: [PATCH 4/9] Updated docstring --- src/layers/basic.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 0e4bf6f205..10c997183c 100644 --- 
a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -185,7 +185,8 @@ if `bias` is true, and otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and `β = zeros(Float32, size)`. If `init` is specified, the function given to it is -called and used to initialise α. +called and used to initialise α. The weight matrix and/or the bias vector +(with the same size as x) may also be provided explicitly. Used by [`LayerNorm`](@ref). """ From 37ddbac4cfdba08667734c2941f04372fa43805e Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 5 Mar 2022 23:03:35 +0530 Subject: [PATCH 5/9] Rebase to do away with Zeros and cleanup --- src/layers/basic.jl | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 10c997183c..c562b7367c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -167,21 +167,21 @@ end function Base.show(io::IO, l::Dense) print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1)) l.σ == identity || print(io, ", ", l.σ) - l.bias == false && print(io, "; bias=false") + l.bias == false && print(io, "; bias = false") print(io, ")") end """ - Diagonal(α, β) + Diagonal(scale, bias) Diagonal(size::Integer...; bias = true, init = ones32) Create an element-wise linear layer, which performs - y = α .* x .+ β + y = scale .* x .+ bias if `bias` is true, and - y = α .* x + y = x .* scale otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and `β = zeros(Float32, size)`. If `init` is specified, the function given to it is @@ -190,9 +190,9 @@ called and used to initialise α. The weight matrix and/or the bias vector Used by [`LayerNorm`](@ref). """ -struct Diagonal{A, B} - α::A - β::B +struct Diagonal{A<:AbstractArray, B} + scale::A + bias::B function Diagonal(W::M, bias = true) where M<:AbstractArray b = create_bias(W, bias, size(W)...) new{M, typeof(b)}(W, b) @@ -203,13 +203,12 @@ Diagonal(sz::Integer...; bias = true, init = ones32) = Diagonal(init(sz...), bia @functor Diagonal -function (a::Diagonal)(x) - x = a.α .* x - x = x .+ a.β -end +(a::Diagonal)(x) = a.scale .* x .+ a.bias function Base.show(io::IO, l::Diagonal) - print(io, "Diagonal(", join(size(l.α), ", "), ")") + print(io, "Diagonal(", join(size(l.scale), ", ")) + l.bias == false && print(io, "; bias = false") + print(io, ")") end """ From 572e9dfa51551393a68e4608f168080828e5a713 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 5 Mar 2022 23:08:30 +0530 Subject: [PATCH 6/9] Docstring cleanup --- src/layers/basic.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index c562b7367c..0ad307c2af 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -110,7 +110,7 @@ as an `in × N` matrix, or any array with `size(x,1) == in`. The out `y` will be a vector of length `out`, or a batch with `size(y) == (out, size(x)[2:end]...)` -Keyword `bias=false` will switch off trainable bias for the layer. +Keyword `bias = false` will switch off trainable bias for the layer. The initialisation of the weight matrix is `W = init(out, in)`, calling the function given to keyword `init`, with default [`glorot_uniform`](@doc Flux.glorot_uniform). The weight matrix and/or the bias vector (of length `out`) may also be provided explicitly. 
@@ -127,7 +127,7 @@ julia> d(rand(Float32, 5, 1, 1, 64)) |> size # treated as three batch dimension (2, 1, 1, 64) julia> d1 = Dense(ones(2, 5), false, tanh) # using provided weight matrix -Dense(5 => 2, tanh; bias=false) # 10 parameters +Dense(5 => 2, tanh; bias = false) # 10 parameters julia> d1(ones(5)) 2-element Vector{Float64}: @@ -183,9 +183,9 @@ if `bias` is true, and y = x .* scale -otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and -`β = zeros(Float32, size)`. If `init` is specified, the function given to it is -called and used to initialise α. The weight matrix and/or the bias vector +otherwise. The learnable arrays are initialised `scale = ones(Float32, size)` and +`bias = zeros(Float32, size)`. If `init` is specified, the function given to it is +called and used to initialise scale. The weight matrix and/or the bias vector (with the same size as x) may also be provided explicitly. Used by [`LayerNorm`](@ref). @@ -330,7 +330,7 @@ which is accepted as the input to a `Chain`. If the two input sizes are the same, `in1 == in2`, then you may write `Bilinear(in => out, σ)`. The initialisation works as for [`Dense`](@ref) layer, with `W = init(out, in1, in2)`. -By default the bias vector is `zeros(Float32, out)`, option `bias=false` will switch off +By default the bias vector is `zeros(Float32, out)`, option `bias = false` will switch off trainable bias. Either of these may be provided explicitly. # Examples @@ -348,14 +348,14 @@ true julia> sc = SkipConnection( Chain(Dense(5 => 20, tanh), Dense(20 => 9, tanh)), - Flux.Bilinear((9, 5) => 3, bias=false), + Flux.Bilinear((9, 5) => 3, bias = false), ); # used as the recombinator, with skip as the second input julia> sc(x) |> size (3, 32) julia> Flux.Bilinear(rand(4,8,16), false, tanh) # first dim of weight is the output -Bilinear((8, 16) => 4, tanh; bias=false) # 512 parameters +Bilinear((8, 16) => 4, tanh; bias = false) # 512 parameters ``` """ struct Bilinear{F,A,B} @@ -406,7 +406,7 @@ function Base.show(io::IO, l::Bilinear) print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", size(l.weight, 1)) end l.σ == identity || print(io, ", ", l.σ) - l.bias === false && print(io, "; bias=false") + l.bias === false && print(io, "; bias = false") print(io, ")") end From a95d7c467d478e9f62b52b0ee8dd11be7e85c286 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 5 Mar 2022 23:45:09 +0530 Subject: [PATCH 7/9] More docstring cleanup --- src/layers/basic.jl | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 0ad307c2af..86c3b9fb14 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -110,7 +110,7 @@ as an `in × N` matrix, or any array with `size(x,1) == in`. The out `y` will be a vector of length `out`, or a batch with `size(y) == (out, size(x)[2:end]...)` -Keyword `bias = false` will switch off trainable bias for the layer. +Keyword `bias=false` will switch off trainable bias for the layer. The initialisation of the weight matrix is `W = init(out, in)`, calling the function given to keyword `init`, with default [`glorot_uniform`](@doc Flux.glorot_uniform). The weight matrix and/or the bias vector (of length `out`) may also be provided explicitly. 
@@ -127,7 +127,7 @@ julia> d(rand(Float32, 5, 1, 1, 64)) |> size # treated as three batch dimension (2, 1, 1, 64) julia> d1 = Dense(ones(2, 5), false, tanh) # using provided weight matrix -Dense(5 => 2, tanh; bias = false) # 10 parameters +Dense(5 => 2, tanh; bias=false) # 10 parameters julia> d1(ones(5)) 2-element Vector{Float64}: @@ -167,7 +167,7 @@ end function Base.show(io::IO, l::Dense) print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1)) l.σ == identity || print(io, ", ", l.σ) - l.bias == false && print(io, "; bias = false") + l.bias == false && print(io, "; bias=false") print(io, ")") end @@ -182,11 +182,13 @@ Create an element-wise linear layer, which performs if `bias` is true, and y = x .* scale + +otherwise. -otherwise. The learnable arrays are initialised `scale = ones(Float32, size)` and -`bias = zeros(Float32, size)`. If `init` is specified, the function given to it is -called and used to initialise scale. The weight matrix and/or the bias vector -(with the same size as x) may also be provided explicitly. +The learnable parameters are initialised `scale = Flux.ones32(size)` and +`bias = Flux.zeros32(size)`. Alternatively, specify `init` to customize how `scale` +is initialised. The weight matrix and/or the bias vector (with the same size as `x`) +may also be provided explicitly. Used by [`LayerNorm`](@ref). """ @@ -207,7 +209,7 @@ Diagonal(sz::Integer...; bias = true, init = ones32) = Diagonal(init(sz...), bia function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", join(size(l.scale), ", ")) - l.bias == false && print(io, "; bias = false") + l.bias == false && print(io, "; bias=false") print(io, ")") end @@ -330,7 +332,7 @@ which is accepted as the input to a `Chain`. If the two input sizes are the same, `in1 == in2`, then you may write `Bilinear(in => out, σ)`. The initialisation works as for [`Dense`](@ref) layer, with `W = init(out, in1, in2)`. -By default the bias vector is `zeros(Float32, out)`, option `bias = false` will switch off +By default the bias vector is `zeros(Float32, out)`, option `bias=false` will switch off trainable bias. Either of these may be provided explicitly. 
# Examples @@ -348,14 +350,14 @@ true julia> sc = SkipConnection( Chain(Dense(5 => 20, tanh), Dense(20 => 9, tanh)), - Flux.Bilinear((9, 5) => 3, bias = false), + Flux.Bilinear((9, 5) => 3, bias=false), ); # used as the recombinator, with skip as the second input julia> sc(x) |> size (3, 32) julia> Flux.Bilinear(rand(4,8,16), false, tanh) # first dim of weight is the output -Bilinear((8, 16) => 4, tanh; bias = false) # 512 parameters +Bilinear((8, 16) => 4, tanh; bias=false) # 512 parameters ``` """ struct Bilinear{F,A,B} @@ -406,7 +408,7 @@ function Base.show(io::IO, l::Bilinear) print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", size(l.weight, 1)) end l.σ == identity || print(io, ", ", l.σ) - l.bias === false && print(io, "; bias = false") + l.bias === false && print(io, "; bias=false") print(io, ")") end From cdbc5c5e432ae66cef5c64faec56ba24f3428b51 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sun, 6 Mar 2022 00:22:35 +0530 Subject: [PATCH 8/9] Even more docstring cleanup Co-authored-by: Michael Abbott <32575566+mcabbott@users.noreply.github.com> --- src/layers/basic.jl | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 86c3b9fb14..d6fa889d3d 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -172,23 +172,18 @@ function Base.show(io::IO, l::Dense) end """ - Diagonal(scale, bias) Diagonal(size::Integer...; bias = true, init = ones32) + Diagonal(scale::AbstractArray, [bias]) Create an element-wise linear layer, which performs y = scale .* x .+ bias -if `bias` is true, and - - y = x .* scale - -otherwise. - -The learnable parameters are initialised `scale = Flux.ones32(size)` and -`bias = Flux.zeros32(size)`. Alternatively, specify `init` to customize how `scale` -is initialised. The weight matrix and/or the bias vector (with the same size as `x`) -may also be provided explicitly. +with no activation function. + +The learnable parameters are initialised `scale = init(size...)` and +`bias = zeros32(size...)`, with `init = ones32` by default. You may specify the function `init`, +turn off trainable bias with `bias = false`, or provide the array(s) explicitly. Used by [`LayerNorm`](@ref). """ From 0b06bcb10e01b9050b7d3a047796843be6518d06 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 5 Mar 2022 17:30:01 -0500 Subject: [PATCH 9/9] let's go with no spaces for now then, although it's not a big deal --- src/layers/basic.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index d6fa889d3d..e76efea60a 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -172,7 +172,7 @@ function Base.show(io::IO, l::Dense) end """ - Diagonal(size::Integer...; bias = true, init = ones32) + Diagonal(size::Integer...; bias=true, init=ones32) Diagonal(scale::AbstractArray, [bias]) Create an element-wise linear layer, which performs @@ -181,9 +181,9 @@ Create an element-wise linear layer, which performs with no activation function. -The learnable parameters are initialised `scale = init(size...)` and -`bias = zeros32(size...)`, with `init = ones32` by default. You may specify the function `init`, -turn off trainable bias with `bias = false`, or provide the array(s) explicitly. +The learnable scale & bias are initialised `init(size...)` and `zeros32(size...)`, +with `init=ones32` by default. 
You may specify the function `init`, +turn off trainable bias with `bias=false`, or provide the array(s) explicitly. Used by [`LayerNorm`](@ref). """
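As a usage sketch of the API this series converges on — `Diagonal(size::Integer...; bias=true, init=ones32)` and `Diagonal(scale::AbstractArray, [bias])` — the optional bias is what makes a LayerScale-style layer (the motivating use case named in the first commit message) possible. The `LayerScale` helper name and the `1f-5` initial value below are illustrative assumptions following the usual LayerScale convention, not anything fixed by the patches themselves:

```julia
using Flux

# Hypothetical LayerScale helper: per-feature learnable scaling with no bias,
# initialised to a small constant λ. Both the name and the 1f-5 default are
# assumptions for illustration; the layer itself is the Diagonal from this PR.
LayerScale(planes::Integer, λ = 1f-5) = Flux.Diagonal(fill(Float32(λ), planes), false)

ls = LayerScale(8)                # scale is an 8-element vector of 1f-5, bias switched off
x = randn(Float32, 8, 4)          # 8 features, batch of 4
y = ls(x)                         # computes ls.scale .* x, with nothing added
size(y)                           # (8, 4)

# Equivalent via the keyword constructor, customising `init` instead of
# passing the scale array explicitly:
ls2 = Flux.Diagonal(8; bias = false, init = sz -> fill(1f-5, sz))
```

With `bias = false`, the stored bias is simply `false`, so the forward pass `a.scale .* x .+ a.bias` adds nothing and contributes no trainable parameters — the same `create_bias` mechanism `Dense` already uses.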