@@ -43,16 +43,60 @@ function Base.show(io::IO, opt::FluxState)
   end
 end

+_DESCENT_EXAMPLE = """# Implicit-style example
+This usage matches Flux ≤ v0.13:
+```
+opt = Flux.Descent(0.3)
+
+ps = Flux.params(model)  # returns a Zygote.Params object
+
+gs = gradient(ps) do  # gradient takes a zero-argument anonymous function
+  loss3(model, x, y)  # ... which depends on the global model
+end  # ... and returns a Zygote.Grads object
+
+Flux.update!(opt, ps, gs)
+```
+New in Flux v0.14 is a method `train!(loss, ps, opt)` which performs one step,
+rather than iterating over `data`. It is equivalent to the `gradient` and `update!` calls above:
+```
+Flux.train!(ps, opt) do
+  loss3(model, x, y)
+end
+```
+
+# Explicit-style example
+
+This no longer uses `Flux.params`; instead it uses the model itself:
+```
+opt = Flux.Descent(0.3)  # the same FluxState object
+
+Flux.train!(model, opt) do m  # now explicitly depends on the model
+  loss3(m, x, y)
+end
+```
+"""
 for opt in [
   :Descent, :Adam, :Momentum, :Nesterov, :RMSProp,
   :AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :AdamW, :RAdam, :OAdam, :AdaBelief,
-  # :InvDecay, :ExpDecay, :WeightDecay, :stop, :skip, :Optimiser,
-  # :ClipValue, :ClipNorm,
-  # TODO check that parameters line up nicely old-vs-new, and include the remaining rules
+  # :InvDecay, :ExpDecay, :WeightDecay, :Optimiser,
+  :ClipGrad, :ClipNorm,
+  # TODO sort out the remaining rules
 ]
-  @eval $opt(parameters...; kw...) = FluxState(Optimisers.$opt(parameters...; kw...), missing)
+  @eval begin
+    $opt(parameters...; kw...) = FluxState(Optimisers.$opt(parameters...; kw...), missing)
+    str = string("""    Flux.$($opt)(args...)
+
+Returns a `FluxState` wrapper around the following rule definition from Optimisers.jl,
+allowing its use with `Flux.train!` (in the same manner as `Flux.AbstractOptimiser` objects in Flux ≤ v0.13).
+Accepts the same arguments, with the same defaults, as the underlying rule:
+
+""", @doc(Optimisers.$opt), $opt == Descent ? _DESCENT_EXAMPLE : "")
+    @doc str $opt
+  end
 end

+@deprecate ClipValue ClipGrad
+

 ### Two styles of gradient, and their `train!` functions

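For illustration, here is a minimal sketch of how the wrappers generated by the `@eval` loop above would be used with the explicit-style `train!` that the `_DESCENT_EXAMPLE` docstring describes. The model, data, and the definition of `loss3` are assumptions made for this example, not part of the commit:

```julia
using Flux

# Any small model and batch of data will do for the sketch.
model = Flux.Dense(2 => 1)
x, y = rand(Float32, 2, 8), rand(Float32, 1, 8)
loss3(m, x, y) = Flux.Losses.mse(m(x), y)   # a loss that takes the model explicitly

# On this branch, Flux.Adam is generated by the @eval loop above: it returns a
# FluxState wrapping Optimisers.Adam, with the same arguments and defaults.
opt = Flux.Adam(0.01)

# Explicit style, as in the docstring example: the do-block receives the model.
Flux.train!(model, opt) do m
    loss3(m, x, y)
end
```

The same `opt` could also be used in the implicit style (`Flux.params` plus `gradient`/`update!`), which `_DESCENT_EXAMPLE` above also documents.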