@@ -16,7 +16,7 @@ For each parameter `p` and its gradient `dp`, this runs `p -= η*dp`.
 - Learning rate (`η`): Amount by which gradients are discounted before updating
   the weights.
 """
-struct Descent{T}
+struct Descent{T} <: AbstractRule
   eta::T
 end
 Descent() = Descent(1f-1)
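Every struct touched in this diff now subtypes `AbstractRule`, so each rule plugs into the same `Optimisers.setup`/`Optimisers.update` workflow used in the docstrings. Below is a minimal sketch of one `Descent` step on a toy NamedTuple "model"; the names `m`, `s`, `g` and the numbers are invented for illustration.

```julia
using Optimisers

m = (w = [1.0, 2.0, 3.0],)              # toy parameters; any nested structure of arrays works
s = Optimisers.setup(Descent(0.1), m)   # per-parameter optimiser state
g = (w = [10.0, 10.0, 10.0],)           # a made-up gradient with the same structure as m
s, m = Optimisers.update(s, m, g)       # one step of p -= η*dp

m.w   # ≈ [0.0, 1.0, 2.0]
```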
@@ -40,7 +40,7 @@ Gradient descent optimizer with learning rate `η` and momentum `ρ`.
 - Momentum (`ρ`): Controls the acceleration of gradient descent in the
   prominent direction, in effect dampening oscillations.
 """
-struct Momentum{T}
+struct Momentum{T} <: AbstractRule
   eta::T
   rho::T
 end
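For intuition about what `eta` and `rho` do, here is the textbook momentum update in plain Julia; `momentum_step!` is an invented helper for illustration and may differ in detail from the package's own `apply!`.

```julia
# Classical "heavy ball" momentum: accumulate a velocity, then step against it.
function momentum_step!(p, v, dp; η = 0.01, ρ = 0.9)
    @. v = ρ * v + η * dp   # velocity remembers the prominent descent direction
    @. p -= v               # stepping along it dampens oscillations
    return p, v
end
```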
@@ -66,7 +66,7 @@ Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`.
 - Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the
   prominent direction, in effect dampening oscillations.
 """
-struct Nesterov{T}
+struct Nesterov{T} <: AbstractRule
   eta::T
   rho::T
 end
@@ -104,7 +104,7 @@ gradients by an estimate of their variance, instead of their second moment.
 - Keyword `centred` (or `centered`): Indicates whether to use centred variant
   of the algorithm.
 """
-struct RMSProp{T}
+struct RMSProp{T} <: AbstractRule
   eta::T
   rho::T
   epsilon::T
@@ -148,7 +148,7 @@
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct Adam{T}
+struct Adam{T} <: AbstractRule
   eta::T
   beta::Tuple{T, T}
   epsilon::T
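As a reminder of what `eta`, `beta`, and `epsilon` feed into, here is the textbook Adam step in plain Julia; `adam_step!` and its defaults are illustrative only, not the package's exact `apply!`.

```julia
# Adam: bias-corrected moving averages of the gradient and its square,
# followed by a per-coordinate scaled step.
function adam_step!(p, m, v, dp, t; η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
    β1, β2 = β
    @. m = β1 * m + (1 - β1) * dp       # first-moment estimate
    @. v = β2 * v + (1 - β2) * dp^2     # second-moment estimate
    m̂ = m ./ (1 - β1^t)                 # bias corrections matter for small t
    v̂ = v ./ (1 - β2^t)
    @. p -= η * m̂ / (sqrt(v̂) + ϵ)       # ϵ guards the division
    return p, m, v
end
```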
@@ -183,7 +183,7 @@
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct RAdam{T}
+struct RAdam{T} <: AbstractRule
   eta::T
   beta::Tuple{T, T}
   epsilon::T
@@ -224,7 +224,7 @@
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct AdaMax{T}
+struct AdaMax{T} <: AbstractRule
   eta::T
   beta::Tuple{T, T}
   epsilon::T
@@ -258,7 +258,7 @@ is a variant of Adam adding an "optimistic" term suitable for adversarial training.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct OAdam{T}
+struct OAdam{T} <: AbstractRule
   eta::T
   beta::Tuple{T, T}
   epsilon::T
@@ -293,7 +293,7 @@ Parameters don't need tuning.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct AdaGrad{T}
+struct AdaGrad{T} <: AbstractRule
   eta::T
   epsilon::T
 end
@@ -323,7 +323,7 @@ Parameters don't need tuning.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct AdaDelta{T}
+struct AdaDelta{T} <: AbstractRule
   rho::T
   epsilon::T
 end
@@ -357,7 +357,7 @@ optimiser. Parameters don't need tuning.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct AMSGrad{T}
+struct AMSGrad{T} <: AbstractRule
   eta::T
   beta::Tuple{T, T}
   epsilon::T
@@ -393,7 +393,7 @@ Parameters don't need tuning.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct NAdam{T}
+struct NAdam{T} <: AbstractRule
   eta::T
   beta::Tuple{T, T}
   epsilon::T
@@ -447,7 +447,7 @@ Adam optimiser.
 - Machine epsilon (`ϵ::Float32`): Constant to prevent division by zero
   (no need to change default)
 """
-struct AdaBelief{T}
+struct AdaBelief{T} <: AbstractRule
   eta::T
   beta::Tuple{T, T}
   epsilon::T
@@ -479,7 +479,7 @@ This is equivalent to adding ``L_2`` regularization with coefficient ``γ`` to the loss.
 # Parameters
 - Weight decay (`γ`): Decay applied to weights during optimisation.
 """
-struct WeightDecay{T}
+struct WeightDecay{T} <: AbstractRule
   gamma::T
 end
 WeightDecay() = WeightDecay(5f-4)
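`WeightDecay` never moves parameters on its own; it adds a `γ .* p` term to the incoming gradient, which is the gradient of a quadratic penalty `(γ/2) * sum(abs2, p)` on the loss. It is therefore normally placed first in a chain, as in this hedged sketch (rule choice and values invented):

```julia
using Optimisers

# Decay rewrites the gradient, then Descent takes the actual step.
rule = OptimiserChain(WeightDecay(1f-4), Descent(0.1))
```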
@@ -499,7 +499,7 @@ Restricts every gradient component to obey `-δ ≤ dx[i] ≤ δ`.

 See also [`ClipNorm`](@ref).
 """
-struct ClipGrad{T<:Real}
+struct ClipGrad{T<:Real} <: AbstractRule
   delta::T
 end
 ClipGrad() = ClipGrad(10f0)
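The per-component effect of `ClipGrad(δ)` is just a clamp; a quick illustration with made-up numbers:

```julia
δ = 10f0
dx = Float32[0.3, -25, 100]
clamp.(dx, -δ, δ)   # Float32[0.3, -10.0, 10.0] is what later rules would see
```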
@@ -524,7 +524,7 @@ which you can turn off with `throw = false`.

 See also [`ClipGrad`](@ref).
 """
-struct ClipNorm{T<:Real}
+struct ClipNorm{T<:Real} <: AbstractRule
   omega::T
   p::T
   throw::Bool
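For intuition, the rescaling driven by `omega` and `p` looks roughly like the sketch below, using `LinearAlgebra.norm`; `clipnorm_sketch` is an invented name and it omits the zero-norm `throw` handling mentioned above.

```julia
using LinearAlgebra

function clipnorm_sketch(dx; ω = 10f0, p = 2)
    n = norm(dx, p)
    n > ω ? dx .* (ω / n) : dx   # shrink so the p-norm is at most ω, else leave unchanged
end
```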
@@ -566,7 +566,7 @@ julia> Optimisers.update(s, m, ([0.3, 1, 7],))[2] # clips before discounting
 ([-0.03, -0.1, -0.1],)
 ```
 """
-struct OptimiserChain{O<:Tuple}
+struct OptimiserChain{O<:Tuple} <: AbstractRule
   opts::O
 end
 OptimiserChain(opts...) = OptimiserChain(opts)
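Because each wrapped rule is itself an `AbstractRule`, the chain composes and runs like any single rule; a hedged sketch that reproduces the doctest result above, with the NamedTuple model invented for illustration:

```julia
using Optimisers

rule = OptimiserChain(ClipGrad(1.0), Descent(0.1))   # gradients flow left to right
m = (w = [0.0, 0.0, 0.0],)
s = Optimisers.setup(rule, m)
s, m = Optimisers.update(s, m, (w = [0.3, 1.0, 7.0],))
m.w   # ≈ [-0.03, -0.1, -0.1], matching the doctest output shown above
```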