From 944afd830e3d7659efb95e48667ce56a1be57b37 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 19 Feb 2022 18:06:18 +0530 Subject: [PATCH 1/9] Modified Diagonal to make bias optional 1. Allows for more flexibility to implement custom layers like LayerScale 2. Leaves current working unchanged (what worked before still will without any modifications) --- src/layers/basic.jl | 28 ++++++++++++++++++++-------- test/layers/basic.jl | 7 ++++--- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 3e22895e82..43521fdc18 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -174,27 +174,39 @@ end """ Diagonal(α, β) - Diagonal(size::Integer...) + Diagonal(size::Integer...; bias = true, init = ones32) Create an element-wise linear layer, which performs y = α .* x .+ β -The learnable arrays are initialised `α = ones(Float32, size)` and -`β = zeros(Float32, size)`. +if `bias` is true, and + + y = α .* x + +otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and +`β = zeros(Float32, size)`. If `init` is specified, the function given to it is +called and used to initialise α. Used by [`LayerNorm`](@ref). """ -struct Diagonal{T} - α::T - β::T +struct Diagonal{A, B} + α::A + β::B + function Diagonal(W::M, bias = true) where M<:AbstractArray + b = create_bias(W, bias, size(W)...) + new{M, typeof(b)}(W, b) + end end -Diagonal(sz::Integer...) = Diagonal(ones32(sz...), zeros32(sz...)) +Diagonal(sz::Integer...; bias = true, init = ones32) = Diagonal(init(sz...), bias) @functor Diagonal -(a::Diagonal)(x) = a.α .* x .+ a.β +function (a::Diagonal)(x) + x = a.α .* x + x = x .+ a.β +end function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", join(size(l.α), ", "), ")") diff --git a/test/layers/basic.jl b/test/layers/basic.jl index ca8e15a643..1221b2e1d1 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -91,16 +91,17 @@ import Flux: activations @test length(Flux.Diagonal(10)(randn(10))) == 10 @test length(Flux.Diagonal(10)(1)) == 10 @test length(Flux.Diagonal(10)(randn(1))) == 10 + @test length(Flux.Diagonal(10; bias = false)(randn(10))) == 10 @test_throws DimensionMismatch Flux.Diagonal(10)(randn(2)) @test Flux.Diagonal(2)([1 2]) == [1 2; 1 2] @test Flux.Diagonal(2)([1,2]) == [1,2] - @test Flux.Diagonal(2)([1 2; 3 4]) == [1 2; 3 4] + @test Flux.Diagonal(2; bias = false)([1 2; 3 4]) == [1 2; 3 4] @test Flux.Diagonal(2)(rand(2,3,4)) |> size == (2, 3, 4) @test Flux.Diagonal(2,3)(rand(2,3,4)) |> size == (2, 3, 4) - @test Flux.Diagonal(2,3,4)(rand(2,3,4)) |> size == (2, 3, 4) - @test Flux.Diagonal(2,3)(rand(2,1,4)) |> size == (2, 3, 4) + @test Flux.Diagonal(2, 3, 4; bias = false)(rand(2,3,4)) |> size == (2, 3, 4) + @test Flux.Diagonal(2, 3; bias = false)(rand(2,1,4)) |> size == (2, 3, 4) end @testset "Maxout" begin From 86f1d4f3faf10acbe2a9c15b27fc2d410032cd72 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 19 Feb 2022 18:12:06 +0530 Subject: [PATCH 2/9] Updated docstring --- src/layers/basic.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 43521fdc18..acb570ba68 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -186,7 +186,8 @@ if `bias` is true, and otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and `β = zeros(Float32, size)`. 
If `init` is specified, the function given to it is -called and used to initialise α. +called and used to initialise α. The weight matrix and/or the bias vector +(with the same size as x) may also be provided explicitly. Used by [`LayerNorm`](@ref). """ From a11f3985ff4741dbf8d6ffba57a1866f5d24dc4c Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 19 Feb 2022 18:06:18 +0530 Subject: [PATCH 3/9] Modified Diagonal to make bias optional 1. Allows for more flexibility to implement custom layers like LayerScale 2. Leaves current working unchanged (what worked before still will without any modifications) --- src/layers/basic.jl | 28 ++++++++++++++++++++-------- test/layers/basic.jl | 7 ++++--- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 8aaf4e7df0..0e4bf6f205 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -173,27 +173,39 @@ end """ Diagonal(α, β) - Diagonal(size::Integer...) + Diagonal(size::Integer...; bias = true, init = ones32) Create an element-wise linear layer, which performs y = α .* x .+ β -The learnable arrays are initialised `α = ones(Float32, size)` and -`β = zeros(Float32, size)`. +if `bias` is true, and + + y = α .* x + +otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and +`β = zeros(Float32, size)`. If `init` is specified, the function given to it is +called and used to initialise α. Used by [`LayerNorm`](@ref). """ -struct Diagonal{T} - α::T - β::T +struct Diagonal{A, B} + α::A + β::B + function Diagonal(W::M, bias = true) where M<:AbstractArray + b = create_bias(W, bias, size(W)...) + new{M, typeof(b)}(W, b) + end end -Diagonal(sz::Integer...) = Diagonal(ones32(sz...), zeros32(sz...)) +Diagonal(sz::Integer...; bias = true, init = ones32) = Diagonal(init(sz...), bias) @functor Diagonal -(a::Diagonal)(x) = a.α .* x .+ a.β +function (a::Diagonal)(x) + x = a.α .* x + x = x .+ a.β +end function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", join(size(l.α), ", "), ")") diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 5befed57b4..0c12b22d11 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -91,16 +91,17 @@ import Flux: activations @test length(Flux.Diagonal(10)(randn(10))) == 10 @test length(Flux.Diagonal(10)(1)) == 10 @test length(Flux.Diagonal(10)(randn(1))) == 10 + @test length(Flux.Diagonal(10; bias = false)(randn(10))) == 10 @test_throws DimensionMismatch Flux.Diagonal(10)(randn(2)) @test Flux.Diagonal(2)([1 2]) == [1 2; 1 2] @test Flux.Diagonal(2)([1,2]) == [1,2] - @test Flux.Diagonal(2)([1 2; 3 4]) == [1 2; 3 4] + @test Flux.Diagonal(2; bias = false)([1 2; 3 4]) == [1 2; 3 4] @test Flux.Diagonal(2)(rand(2,3,4)) |> size == (2, 3, 4) @test Flux.Diagonal(2,3)(rand(2,3,4)) |> size == (2, 3, 4) - @test Flux.Diagonal(2,3,4)(rand(2,3,4)) |> size == (2, 3, 4) - @test Flux.Diagonal(2,3)(rand(2,1,4)) |> size == (2, 3, 4) + @test Flux.Diagonal(2, 3, 4; bias = false)(rand(2,3,4)) |> size == (2, 3, 4) + @test Flux.Diagonal(2, 3; bias = false)(rand(2,1,4)) |> size == (2, 3, 4) end @testset "Maxout" begin From 46de848580d359fcc09f54dad380ead5e3fe3c37 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 19 Feb 2022 18:12:06 +0530 Subject: [PATCH 4/9] Updated docstring --- src/layers/basic.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 0e4bf6f205..10c997183c 100644 --- 
a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -185,7 +185,8 @@ if `bias` is true, and otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and `β = zeros(Float32, size)`. If `init` is specified, the function given to it is -called and used to initialise α. +called and used to initialise α. The weight matrix and/or the bias vector +(with the same size as x) may also be provided explicitly. Used by [`LayerNorm`](@ref). """ From 37ddbac4cfdba08667734c2941f04372fa43805e Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 5 Mar 2022 23:03:35 +0530 Subject: [PATCH 5/9] Rebase to do away with Zeros and cleanup --- src/layers/basic.jl | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 10c997183c..c562b7367c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -167,21 +167,21 @@ end function Base.show(io::IO, l::Dense) print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1)) l.σ == identity || print(io, ", ", l.σ) - l.bias == false && print(io, "; bias=false") + l.bias == false && print(io, "; bias = false") print(io, ")") end """ - Diagonal(α, β) + Diagonal(scale, bias) Diagonal(size::Integer...; bias = true, init = ones32) Create an element-wise linear layer, which performs - y = α .* x .+ β + y = scale .* x .+ bias if `bias` is true, and - y = α .* x + y = x .* scale otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and `β = zeros(Float32, size)`. If `init` is specified, the function given to it is @@ -190,9 +190,9 @@ called and used to initialise α. The weight matrix and/or the bias vector Used by [`LayerNorm`](@ref). """ -struct Diagonal{A, B} - α::A - β::B +struct Diagonal{A<:AbstractArray, B} + scale::A + bias::B function Diagonal(W::M, bias = true) where M<:AbstractArray b = create_bias(W, bias, size(W)...) new{M, typeof(b)}(W, b) @@ -203,13 +203,12 @@ Diagonal(sz::Integer...; bias = true, init = ones32) = Diagonal(init(sz...), bia @functor Diagonal -function (a::Diagonal)(x) - x = a.α .* x - x = x .+ a.β -end +(a::Diagonal)(x) = a.scale .* x .+ a.bias function Base.show(io::IO, l::Diagonal) - print(io, "Diagonal(", join(size(l.α), ", "), ")") + print(io, "Diagonal(", join(size(l.scale), ", ")) + l.bias == false && print(io, "; bias = false") + print(io, ")") end """ From 572e9dfa51551393a68e4608f168080828e5a713 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 5 Mar 2022 23:08:30 +0530 Subject: [PATCH 6/9] Docstring cleanup --- src/layers/basic.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index c562b7367c..0ad307c2af 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -110,7 +110,7 @@ as an `in × N` matrix, or any array with `size(x,1) == in`. The out `y` will be a vector of length `out`, or a batch with `size(y) == (out, size(x)[2:end]...)` -Keyword `bias=false` will switch off trainable bias for the layer. +Keyword `bias = false` will switch off trainable bias for the layer. The initialisation of the weight matrix is `W = init(out, in)`, calling the function given to keyword `init`, with default [`glorot_uniform`](@doc Flux.glorot_uniform). The weight matrix and/or the bias vector (of length `out`) may also be provided explicitly. 
@@ -127,7 +127,7 @@ julia> d(rand(Float32, 5, 1, 1, 64)) |> size # treated as three batch dimension (2, 1, 1, 64) julia> d1 = Dense(ones(2, 5), false, tanh) # using provided weight matrix -Dense(5 => 2, tanh; bias=false) # 10 parameters +Dense(5 => 2, tanh; bias = false) # 10 parameters julia> d1(ones(5)) 2-element Vector{Float64}: @@ -183,9 +183,9 @@ if `bias` is true, and y = x .* scale -otherwise. The learnable arrays are initialised `α = ones(Float32, size)` and -`β = zeros(Float32, size)`. If `init` is specified, the function given to it is -called and used to initialise α. The weight matrix and/or the bias vector +otherwise. The learnable arrays are initialised `scale = ones(Float32, size)` and +`bias = zeros(Float32, size)`. If `init` is specified, the function given to it is +called and used to initialise scale. The weight matrix and/or the bias vector (with the same size as x) may also be provided explicitly. Used by [`LayerNorm`](@ref). @@ -330,7 +330,7 @@ which is accepted as the input to a `Chain`. If the two input sizes are the same, `in1 == in2`, then you may write `Bilinear(in => out, σ)`. The initialisation works as for [`Dense`](@ref) layer, with `W = init(out, in1, in2)`. -By default the bias vector is `zeros(Float32, out)`, option `bias=false` will switch off +By default the bias vector is `zeros(Float32, out)`, option `bias = false` will switch off trainable bias. Either of these may be provided explicitly. # Examples @@ -348,14 +348,14 @@ true julia> sc = SkipConnection( Chain(Dense(5 => 20, tanh), Dense(20 => 9, tanh)), - Flux.Bilinear((9, 5) => 3, bias=false), + Flux.Bilinear((9, 5) => 3, bias = false), ); # used as the recombinator, with skip as the second input julia> sc(x) |> size (3, 32) julia> Flux.Bilinear(rand(4,8,16), false, tanh) # first dim of weight is the output -Bilinear((8, 16) => 4, tanh; bias=false) # 512 parameters +Bilinear((8, 16) => 4, tanh; bias = false) # 512 parameters ``` """ struct Bilinear{F,A,B} @@ -406,7 +406,7 @@ function Base.show(io::IO, l::Bilinear) print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", size(l.weight, 1)) end l.σ == identity || print(io, ", ", l.σ) - l.bias === false && print(io, "; bias=false") + l.bias === false && print(io, "; bias = false") print(io, ")") end From a95d7c467d478e9f62b52b0ee8dd11be7e85c286 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 5 Mar 2022 23:45:09 +0530 Subject: [PATCH 7/9] More docstring cleanup --- src/layers/basic.jl | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 0ad307c2af..86c3b9fb14 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -110,7 +110,7 @@ as an `in × N` matrix, or any array with `size(x,1) == in`. The out `y` will be a vector of length `out`, or a batch with `size(y) == (out, size(x)[2:end]...)` -Keyword `bias = false` will switch off trainable bias for the layer. +Keyword `bias=false` will switch off trainable bias for the layer. The initialisation of the weight matrix is `W = init(out, in)`, calling the function given to keyword `init`, with default [`glorot_uniform`](@doc Flux.glorot_uniform). The weight matrix and/or the bias vector (of length `out`) may also be provided explicitly. 
@@ -127,7 +127,7 @@ julia> d(rand(Float32, 5, 1, 1, 64)) |> size # treated as three batch dimension (2, 1, 1, 64) julia> d1 = Dense(ones(2, 5), false, tanh) # using provided weight matrix -Dense(5 => 2, tanh; bias = false) # 10 parameters +Dense(5 => 2, tanh; bias=false) # 10 parameters julia> d1(ones(5)) 2-element Vector{Float64}: @@ -167,7 +167,7 @@ end function Base.show(io::IO, l::Dense) print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1)) l.σ == identity || print(io, ", ", l.σ) - l.bias == false && print(io, "; bias = false") + l.bias == false && print(io, "; bias=false") print(io, ")") end @@ -182,11 +182,13 @@ Create an element-wise linear layer, which performs if `bias` is true, and y = x .* scale + +otherwise. -otherwise. The learnable arrays are initialised `scale = ones(Float32, size)` and -`bias = zeros(Float32, size)`. If `init` is specified, the function given to it is -called and used to initialise scale. The weight matrix and/or the bias vector -(with the same size as x) may also be provided explicitly. +The learnable parameters are initialised `scale = Flux.ones32(size)` and +`bias = Flux.zeros32(size)`. Alternatively, specify `init` to customize how `scale` +is initialised. The weight matrix and/or the bias vector (with the same size as `x`) +may also be provided explicitly. Used by [`LayerNorm`](@ref). """ @@ -207,7 +209,7 @@ Diagonal(sz::Integer...; bias = true, init = ones32) = Diagonal(init(sz...), bia function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", join(size(l.scale), ", ")) - l.bias == false && print(io, "; bias = false") + l.bias == false && print(io, "; bias=false") print(io, ")") end @@ -330,7 +332,7 @@ which is accepted as the input to a `Chain`. If the two input sizes are the same, `in1 == in2`, then you may write `Bilinear(in => out, σ)`. The initialisation works as for [`Dense`](@ref) layer, with `W = init(out, in1, in2)`. -By default the bias vector is `zeros(Float32, out)`, option `bias = false` will switch off +By default the bias vector is `zeros(Float32, out)`, option `bias=false` will switch off trainable bias. Either of these may be provided explicitly. 
# Examples @@ -348,14 +350,14 @@ true julia> sc = SkipConnection( Chain(Dense(5 => 20, tanh), Dense(20 => 9, tanh)), - Flux.Bilinear((9, 5) => 3, bias = false), + Flux.Bilinear((9, 5) => 3, bias=false), ); # used as the recombinator, with skip as the second input julia> sc(x) |> size (3, 32) julia> Flux.Bilinear(rand(4,8,16), false, tanh) # first dim of weight is the output -Bilinear((8, 16) => 4, tanh; bias = false) # 512 parameters +Bilinear((8, 16) => 4, tanh; bias=false) # 512 parameters ``` """ struct Bilinear{F,A,B} @@ -406,7 +408,7 @@ function Base.show(io::IO, l::Bilinear) print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", size(l.weight, 1)) end l.σ == identity || print(io, ", ", l.σ) - l.bias === false && print(io, "; bias = false") + l.bias === false && print(io, "; bias=false") print(io, ")") end From cdbc5c5e432ae66cef5c64faec56ba24f3428b51 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sun, 6 Mar 2022 00:22:35 +0530 Subject: [PATCH 8/9] Even more docstring cleanup Co-authored-by: Michael Abbott <32575566+mcabbott@users.noreply.github.com> --- src/layers/basic.jl | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 86c3b9fb14..d6fa889d3d 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -172,23 +172,18 @@ function Base.show(io::IO, l::Dense) end """ - Diagonal(scale, bias) Diagonal(size::Integer...; bias = true, init = ones32) + Diagonal(scale::AbstractArray, [bias]) Create an element-wise linear layer, which performs y = scale .* x .+ bias -if `bias` is true, and - - y = x .* scale - -otherwise. - -The learnable parameters are initialised `scale = Flux.ones32(size)` and -`bias = Flux.zeros32(size)`. Alternatively, specify `init` to customize how `scale` -is initialised. The weight matrix and/or the bias vector (with the same size as `x`) -may also be provided explicitly. +with no activation function. + +The learnable parameters are initialised `scale = init(size...)` and +`bias = zeros32(size...)`, with `init = ones32` by default. You may specify the function `init`, +turn off trainable bias with `bias = false`, or provide the array(s) explicitly. Used by [`LayerNorm`](@ref). """ From 0b06bcb10e01b9050b7d3a047796843be6518d06 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 5 Mar 2022 17:30:01 -0500 Subject: [PATCH 9/9] let's go with no spaces for now then, although it's not a big deal --- src/layers/basic.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index d6fa889d3d..e76efea60a 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -172,7 +172,7 @@ function Base.show(io::IO, l::Dense) end """ - Diagonal(size::Integer...; bias = true, init = ones32) + Diagonal(size::Integer...; bias=true, init=ones32) Diagonal(scale::AbstractArray, [bias]) Create an element-wise linear layer, which performs @@ -181,9 +181,9 @@ Create an element-wise linear layer, which performs with no activation function. -The learnable parameters are initialised `scale = init(size...)` and -`bias = zeros32(size...)`, with `init = ones32` by default. You may specify the function `init`, -turn off trainable bias with `bias = false`, or provide the array(s) explicitly. +The learnable scale & bias are initialised `init(size...)` and `zeros32(size...)`, +with `init=ones32` by default. 
You may specify the function `init`, +turn off trainable bias with `bias=false`, or provide the array(s) explicitly. Used by [`LayerNorm`](@ref). """
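As a usage sketch of the API this series converges on — `Diagonal(size::Integer...; bias=true, init=ones32)` and `Diagonal(scale::AbstractArray, [bias])` — the optional bias is what makes a LayerScale-style layer (the motivating use case named in the first commit message) possible. The `LayerScale` helper name and the `1f-5` initial value below are illustrative assumptions following the usual LayerScale convention, not anything fixed by the patches themselves:

```julia
using Flux

# Hypothetical LayerScale helper: per-feature learnable scaling with no bias,
# initialised to a small constant λ. Both the name and the 1f-5 default are
# assumptions for illustration; the layer itself is the Diagonal from this PR.
LayerScale(planes::Integer, λ = 1f-5) = Flux.Diagonal(fill(Float32(λ), planes), false)

ls = LayerScale(8)                # scale is an 8-element vector of 1f-5, bias switched off
x = randn(Float32, 8, 4)          # 8 features, batch of 4
y = ls(x)                         # computes ls.scale .* x, with nothing added
size(y)                           # (8, 4)

# Equivalent via the keyword constructor, customising `init` instead of
# passing the scale array explicitly:
ls2 = Flux.Diagonal(8; bias = false, init = sz -> fill(1f-5, sz))
```

With `bias = false`, the stored bias is simply `false`, so the forward pass `a.scale .* x .+ a.bias` adds nothing and contributes no trainable parameters — the same `create_bias` mechanism `Dense` already uses.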