|
20 | 20 | Base.:(==)(a::Leaf, b::Leaf) = children(a) == children(b) |
21 | 21 |
|
22 | 22 | function setup(rule::AbstractRule, model) |
23 | | - cnt = Ref(0) |
24 | | - # Rely on Functors to identify shared arrays, they will share a Leaf in this tree: |
25 | | - tree = fmapstructure(model, exclude = isnumeric) do x |
26 | | - cnt[] += 1 |
27 | | - Leaf(rule, init(rule, x)) |
28 | | - end |
29 | | - cnt[] == 0 && @warn "setup found no parameters in the given model" |
| 23 | + cache = IdDict() |
| 24 | + tree = _setup(rule, model; cache) |
| 25 | + isempty(cache) && @warn "setup found no trainable parameters in this model" |
30 | 26 | tree |
31 | 27 | end |
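For context, a hedged usage sketch (the NamedTuple model is invented; setup and Descent are Optimisers.jl's exported API): an array reached twice in the model gets a single shared Leaf in the returned tree, which is exactly what the IdDict cache guarantees.

using Optimisers

shared = [1.0, 2.0]                        # one array, reachable along two paths
model = (layers = (shared, shared), bias = [0.0])
tree = Optimisers.setup(Descent(0.1), model)
tree.layers[1] === tree.layers[2]   # true: the cache returns the same Leaf twice
tree.layers[1] === tree.bias        # false: distinct arrays get distinct Leafs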
32 | 28 |
|
| 29 | +# _setup is almost fmapstructure, but needs a _trainable walk, and a cache which skips numbers and other isbits values (they have no identity to track).
| 30 | +function _setup(rule, x; cache) |
| 31 | + haskey(cache, x) && return cache[x] |
| 32 | + if isnumeric(x) |
| 33 | + ℓ = Leaf(rule, init(rule, x)) |
| 34 | + if isbits(x) |
| 35 | + cache[nothing] = nothing # sentinel entry: a parameter was found, so setup's warning won't fire
| 36 | + ℓ |
| 37 | + else |
| 38 | + cache[x] = ℓ |
| 39 | + end |
| 40 | + else |
| 41 | + map(xᵢ -> _setup(rule, xᵢ; cache), _trainable(x)) |
| 42 | + end |
| 43 | +end |
| 44 | + |
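A hedged sketch of what the isbits branch refuses to do (assuming StaticArrays supplies an isbits parameter): identity is meaningless for isbits values, so they are never entered into the cache and never tied; the cache[nothing] sentinel only marks that a parameter was seen, keeping setup's warning quiet.

using Optimisers, StaticArrays

v = SA[1.0, 2.0]                    # an SVector is isbits: no identity to share
m = (a = v, b = v)
t = Optimisers.setup(Descent(0.1), m)
t.a === t.b    # false: two separate Leafs, despite v appearing twice
t.a == t.b     # true: equal children, per the == method above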
33 | 45 | function Base.show(io::IO, ℓ::Leaf) # show method is mostly to hide its long type! |
34 | 46 | ioc = IOContext(io, :compact => true) |
35 | 47 | print(ioc, "Leaf(", ℓ.rule, ", ") |
|
41 | 53 | ### update |
42 | 54 | ### |
43 | 55 |
|
44 | | -function update!(tree, model, grad) |
| 56 | +function update(tree, model, grad, higher...) |
| 57 | + t′ = fmap(copy, tree; exclude = maywrite) # walks inside Leaf |
| 58 | + x′ = fmap(copy, model; exclude = maywrite) |
| 59 | + update!(t′, x′, grad, higher...) |
| 60 | +end |
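A hedged sketch of the contract this gives update (toy model, public API): everything writable is copied first, so the caller's tree and model come back untouched.

m = (w = [1.0, 2.0],)
t = Optimisers.setup(Descent(0.5), m)
t2, m2 = Optimisers.update(t, m, (w = [1.0, 1.0],))
m.w    # [1.0, 2.0] -- original untouched, update copied before mutating
m2.w   # [0.5, 1.5] -- x .- 0.5 .* grad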
| 61 | + |
| 62 | +function update!(tree, model, grad, higher...) |
45 | 63 | # First walk is to accumulate the gradient. This recursion visits every copy of |
46 | 64 | # shared leaves, but stops when branches are absent from the gradient: |
47 | | - dict = IdDict{Leaf, Any}() |
48 | | - grads!(dict, tree, model, grad) |
49 | | - # Second walk is to update the model. The walk taken follows Leaf identity |
50 | | - newmodel = fmap(tree, model; exclude = ℓ -> ℓ isa Leaf, walk = _second_walk, cache = LeafCache()) do ℓ, x |
51 | | - haskey(dict, ℓ) || return x # no gradient seen, nothing to do |
52 | | - s′, x̄′ = apply!(ℓ.rule, ℓ.state, x, dict[ℓ]) |
53 | | - ℓ.state = s′ # to get state out of here, rely on mutability of Leaf |
| 65 | + grads = IdDict{Leaf, Any}() |
| 66 | + _grads!(grads, tree, model, grad, higher...) |
| 67 | + # Second walk is to update the model. The params cache is indexed by (tree,x),
| 68 | + # so that identical (===) Leafs can tie isbits parameters, but setup won't create such ties for you:
| 69 | + newmodel = _update!(tree, model; grads, params = IdDict()) |
| 70 | + tree, newmodel # note that tree is guaranteed to be updated. Also that it's not necessarily a tree.
| 71 | +end |
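For contrast, a hedged sketch of the mutating path (same toy model): update! writes into writable arrays in place, so old handles into the model observe the step, and the returned tree replaces the one passed in.

m = (w = [1.0, 2.0],)
t = Optimisers.setup(Descent(0.5), m)
t2, m2 = Optimisers.update!(t, m, (w = [1.0, 1.0],))
m.w === m2.w   # true: subtract! wrote into the original array
m.w            # [0.5, 1.5]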
| 72 | + |
| 73 | +function _update!(tree, x; grads, params) |
| 74 | + haskey(params, (tree,x)) && return params[(tree,x)] |
| 75 | + isbits(tree) && return x # trivial isbits subtrees like () and (((),),) are returned unchanged and never cached
| 76 | + x′, re = functor(x) |
| 77 | + x′′ = map((tᵢ, xᵢ) -> _update!(tᵢ, xᵢ; grads, params), tree, x′) |
| 78 | + params[(tree,x)] = re(x′′) |
| 79 | +end |
| 80 | +function _update!(ℓ::Leaf, x; grads, params) |
| 81 | + haskey(params, (ℓ,x)) && return params[(ℓ,x)] |
| 82 | + params[(ℓ,x)] = if haskey(grads, ℓ) |
| 83 | + ℓ.state, x̄′ = apply!(ℓ.rule, ℓ.state, x, grads[ℓ]...) |
54 | 84 | subtract!(x, x̄′) |
| 85 | + else |
| 86 | + x # no gradient seen |
55 | 87 | end |
56 | | - tree, newmodel # note that tree is guaranteed to be updated |
57 | 88 | end |
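The (tree, x) keying is what ties isbits parameters under identical Leafs: if a hand-built tree (not one from setup) places one Leaf at two isbits positions, both positions resolve to one cache entry. A hedged sketch, assuming the two-field Leaf constructor from this diff and StaticArrays:

using Optimisers, StaticArrays

ℓ = Optimisers.Leaf(Descent(0.1), nothing)   # Descent's init state is nothing
t = (a = ℓ, b = ℓ)                           # hand-tied: one Leaf, two positions
m = (a = SA[1.0], b = SA[1.0])
_, m2 = Optimisers.update!(t, m, (a = SA[1.0], b = SA[1.0]))
m2.a           # SA[0.8]: both gradients were summed before apply! ran once
m2.a === m2.b  # true: one (tree, x) key served both positions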
58 | 89 |
|
59 | 90 | subtract!(x, x̄) = maywrite(x) ? (x .= x .- x̄) : eltype(x).(x .- x̄) |
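A hedged illustration of the two branches (maywrite is defined elsewhere in the package; StaticArrays supplies the immutable case):

x = [1.0, 2.0]
Optimisers.subtract!(x, [0.1, 0.1])    # writable: mutates, x is now [0.9, 1.9]

y = SA[1.0, 2.0]
Optimisers.subtract!(y, SA[0.1, 0.1])  # immutable: returns a new SVector, SA[0.9, 1.9]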
60 | 91 |
|
61 | | -grads!(dict::IdDict, ℓ::Leaf, x, ::Zero) = nothing |
62 | | -function grads!(dict::IdDict, ℓ::Leaf, x, x̄) |
63 | | - x̄₀ = get(dict, ℓ, ZeroTangent()) |
64 | | - dict[ℓ] = x̄ + x̄₀ # adding Zero should be free. Lazy accumulation broadcasted(+, x̄, x̄₀) also possible. |
| 92 | +_grads!(dict::IdDict, ℓ::Leaf, x, ::Zero...) = nothing |
| 93 | +function _grads!(dict::IdDict, ℓ::Leaf, x, x̄s...) |
| 94 | + x̄s₀ = get(dict, ℓ, map(_ -> ZeroTangent(), x̄s)) |
| 95 | + dict[ℓ] = map(+, x̄s, x̄s₀) # adding Zero should be free. Lazy accumulation via broadcasted(+, x̄s, x̄s₀) is also possible.
65 | 96 | nothing |
66 | 97 | end |
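A hedged sketch of the accumulation (tied arrays, public API): a gradient reaching one Leaf along several paths is summed here before apply! runs, so tied weights take one combined step.

w = [1.0, 2.0]
m = (a = w, b = w)                    # setup ties these to one shared Leaf
t = Optimisers.setup(Descent(0.1), m)
_, m2 = Optimisers.update!(t, m, (a = [1.0, 1.0], b = [1.0, 1.0]))
m2.a   # [0.8, 1.8]: the effective gradient is [2.0, 2.0], the sum over both paths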
67 | | -grads!(dict::IdDict, t, x, ::Zero) = nothing |
68 | | -function grads!(dict::IdDict, tree, x, x̄s...) |
69 | | - # The only reason grads! takes model is that functor(typeof(x), base(x̄)) may differ from |
| 98 | +_grads!(dict::IdDict, t, x, ::Zero...) = nothing |
| 99 | +function _grads!(dict::IdDict, tree, x, x̄s...) |
| 100 | + # The only reason _grads! takes the model is that functor(typeof(x), base(x̄)) may differ from
70 | 101 | # functor(typeof(tree), base(x̄)), for things like Transpose |
71 | 102 | x̄s′ = map(x̄ -> functor(typeof(x), base(x̄))[1], x̄s) |
72 | 103 | x′, _ = functor(typeof(x), x) |
73 | | - foreach((tᵢ, xᵢ, x̄sᵢ...) -> grads!(dict, tᵢ, xᵢ, x̄sᵢ...), tree, x′, x̄s′...) |
74 | | -end |
75 | | - |
76 | | -function update(tree, x, x̄s...) |
77 | | - t′ = fmap(copy, tree; exclude = maywrite) # goes inside Leaf |
78 | | - x′ = fmap(copy, x; exclude = maywrite) |
79 | | - update!(t′, x′, x̄s...) |
80 | | -end |
81 | | - |
82 | | -# This differs from _default_walk(f,x,y) in taking re from 2nd argument, but cache will still operate on the first |
83 | | -function _second_walk(f, x, y) |
84 | | - x′, _ = functor(typeof(y), x) |
85 | | - y′, re = functor(y) |
86 | | - re(map(f, x′, y′)) |
87 | | -end |
88 | | - |
89 | | -# When fmap reconstructs for update!, it should not cache results with trivial nodes like () in the state. |
90 | | -# This cache type has just enough methods to work in Functors, which possibly should be upgraded to just work. |
91 | | -struct LeafCache <: AbstractDict{Leaf,Any} |
92 | | - dict::IdDict{Leaf,Any} |
| 104 | + foreach((tᵢ, xᵢ, x̄sᵢ...) -> _grads!(dict, tᵢ, xᵢ, x̄sᵢ...), tree, x′, x̄s′...) |
93 | 105 | end |
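A hedged sketch of the Transpose case this comment is about (relying on Functors.jl's handling of LinearAlgebra wrappers; details may differ): setup places the Leaf at the parent array, and re-functoring the gradient with typeof(x) lets a plain matrix gradient, shaped like the transposed view, reach that Leaf correctly.

A = ones(2, 3)
m = (w = transpose(A),)               # the Leaf ends up at t.w.parent, i.e. at A
t = Optimisers.setup(Descent(0.1), m)
ḡ = (w = fill(2.0, 3, 2),)            # "natural" gradient, shaped like w, not like A
Optimisers.update!(t, m, ḡ)
A      # all entries now 0.8: the gradient was re-wrapped into parent space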
94 | | -LeafCache() = LeafCache(IdDict{Leaf,Any}()) |
95 | | - |
96 | | -Base.setindex!(c::LeafCache, x, ℓ::Leaf) = setindex!(c.dict, x, ℓ) |
97 | | -Base.setindex!(c::LeafCache, x, _) = nothing |
98 | | -Base.in(k, c::LeafCache) = k in c.dict |
99 | | -Base.haskey(c::LeafCache, k) = haskey(c.dict, k) |
100 | | -Base.getindex(c::LeafCache, ℓ::Leaf) = getindex(c.dict, ℓ) |
101 | | -Base.iterate(c::LeafCache, i = 0) = iterate(c.dict, i) |
102 | | -Base.length(c::LeafCache) = length(c.dict) |
103 | 106 |
|
104 | 107 | # default all rules to first order calls |
105 | 108 | apply!(o, state, x, dx, dx2, dxs...) = apply!(o, state, x, dx) |
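A hedged sketch of this fallback (g2 is a hypothetical second-order gradient): a rule defining only the first-order method, like Descent, accepts the extra term through update! and silently drops it.

m = (w = [1.0],)
t = Optimisers.setup(Descent(0.1), m)
g  = (w = [2.0],)
g2 = (w = [99.0],)                    # hypothetical second-order gradient
_, m2 = Optimisers.update!(t, m, g, g2)
m2.w   # [0.8]: only g was used; the fallback above discarded g2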
|