
Commit f7c1a7f

destructure, take II
1 parent 4155bcd commit f7c1a7f

6 files changed: +206 −3 lines changed


docs/src/api.md

Lines changed: 6 additions & 0 deletions
@@ -42,6 +42,12 @@ optimiser to act on all suitable fields. To restrict this, define `trainable`:
 Optimisers.trainable
 ```
 
+Such restrictions are also obeyed by this function for flattening a model:
+
+```@docs
+Optimisers.destructure
+```
+
 ## Rule Definition
 
 ```@docs
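For orientation, a minimal usage sketch of the function being documented here, mirroring the `m3` model from test/destructure.jl below (names and values are illustrative only):

using Optimisers

model = (x = [1.0, 2.0, 3.0], y = sin, z = [4.0, 5.0, 6.0])   # like m3 in the tests

flat, re = destructure(model)   # flat == [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]; `sin` is not numeric, so it is skipped
model2 = re(10 .* flat)         # rebuilds the same nesting from any vector of the right length
model2.z                        # == [40.0, 50.0, 60.0]; model2.y === sin, passed through unchanged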

src/Optimisers.jl

Lines changed: 3 additions & 1 deletion
@@ -4,8 +4,10 @@ using Functors: functor, fmap, isleaf
 using LinearAlgebra
 
 include("interface.jl")
-include("rules.jl")
+include("destructure.jl")
+export destructure
 
+include("rules.jl")
 export Descent, ADAM, Momentum, Nesterov, RMSProp,
        ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM, OADAM, AdaBelief,
        WeightDecay, ClipGrad, ClipNorm, OptimiserChain

src/destructure.jl

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
+
+using ChainRulesCore: ChainRulesCore, NoTangent, ProjectTo
+const NoT = NoTangent()
+
+"""
+    destructure([T], model) -> vector, reconstructor
+
+Copies all [`trainable`](@ref), [`isnumeric`](@ref) parameters in the model
+to a `Vector{T}`, and returns also a function which reverses this transformation.
+Differentiable.
+"""
+function destructure(::Type{T}, x) where T
+  flat, off = alpha!(x, T[])
+  len = length(flat)
+  # flat, newflat -> beta(x, off, newflat; len)
+  flat, Restucture(x, off, len)
+end
+
+struct Restucture{T,S}
+  model::T
+  offsets::S
+  length::Int
+end
+(re::Restucture)(flat) = beta(re.model, re.offsets, flat; len = re.length)
+Base.show(io::IO, re::Restucture{T}) where T = print(io, "Restructure(", T.name.name, ", ..., ", re.length, ")")
+
+# This flattens a model, and returns a web of offsets for later use:
+function alpha!(x, flat::AbstractVector)
+  isempty(flat) || error("this won't work")
+  isnumeric(x) && return append!(flat, x), 0  # trivial case
+  off = fmap(x; exclude = isnumeric, walk = (f, z) -> map(f, _trainable(z))) do y
+    append!(flat, y)
+    length(flat) - length(y)
+  end
+  flat, off
+end
+
+function ChainRulesCore.rrule(::typeof(alpha!), x, flat)
+  flat′, off = alpha!(x, flat)
+  len = length(flat′)
+  alpha_back((dflat, _)) = (NoT, beta(x, off, dflat; walk = _Tangent_biwalk, prune = NoT, len), NoT)
+  (flat′, off), alpha_back
+end
+
+# This reconstructs either a model like x, or a gradient for it:
+function beta(x, off, flat::AbstractVector; len, walk = _trainable_biwalk, kw...)
+  len == length(flat) || error("wrong length")
+  fmap(x, off; exclude = isnumeric, walk, kw...) do y, o
+    _getat(y, o, flat)
+  end
+end
+
+_getat(y::Number, o::Int, flat::AbstractVector) = ProjectTo(y)(flat[o + 1])
+_getat(y::AbstractArray, o::Int, flat::AbstractVector) =
+  ProjectTo(y)(reshape(flat[o .+ (1:length(y))], axes(y)))  # ProjectTo is just correcting eltypes
+
+function _trainable_biwalk(f, x, aux)
+  ch, re = functor(typeof(x), x)
+  au, _ = functor(typeof(x), aux)
+  trainmap(f, ch, _trainable(x), au) |> re
+end
+
+function trainmap(f, ch, tr, aux)
+  map(ch, tr, aux) do c, t, a
+    isnothing(t) ? c : f(t, a)
+  end
+end
+
+function _Tangent_biwalk(f, x, aux)  # use with prune = true
+  ch, re = functor(typeof(x), x)
+  au, _ = functor(typeof(x), aux)
+  y = trainmap(f, ch, _trainable(x), au)
+  y isa Tuple{} && return NoT
+  Tangent{typeof(x), typeof(y)}(y)
+end
+# _Tangent_biwalk(f, x::Tuple{}, aux) = NoT
+
+function ChainRulesCore.rrule(::typeof(beta), x, off, flat; len)
+  dflat = map!(zero, similar(flat, float(eltype(flat))), flat)
+  beta_back(dx) = (NoT, NoT, NoT, gamma!(x, dx, off, dflat))
+  beta(x, off, flat; len), beta_back
+end
+
+# This is the gradient of model reconstruction, accumulating duplicates:
+function gamma!(x, dx, off, flat::AbstractVector)
+  x′, _ = functor(typeof(x), x)
+  dx′, _ = functor(typeof(x), dx)
+  off′, _ = functor(typeof(x), off)
+  foreach((xᵢ, dxᵢ, oᵢ) -> gamma!(xᵢ, dxᵢ, oᵢ, flat), x′, dx′, off′)
+  flat
+end
+function gamma!(x, dx, off::Integer, flat::AbstractVector)
+  @views flat[off .+ (1:length(x))] .+= dx  # must visit all tied nodes, hence no fmap.
+  flat
+end
+gamma!(x, dx::Zero, off, flat::AbstractVector) = nothing
+gamma!(x, dx::Zero, off::Integer, flat::AbstractVector) = nothing  # ambiguity
+
+# Least importantly, this infers the eltype if one is not given:
+destructure(x) = destructure(omega(x), x)
+function omega(x)
+  T = Bool
+  fmap(x; exclude = isnumeric, walk = (f, z) -> foreach(f, _trainable(z))) do y
+    T = promote_type(T, eltype(y))
+  end
+  T
+end
+ChainRulesCore.@non_differentiable omega(::Any)
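The division of labour in this new file: `alpha!` flattens the trainable numeric leaves into one vector and records, in a tree the same shape as the model, the offset at which each leaf starts; `beta` rebuilds a model (or a tangent) by slicing the flat vector at those offsets; `gamma!` is the reverse of rebuilding, writing gradients back into one flat vector and accumulating the contributions of tied arrays (such as `m4.x === m4.y` in the tests). A much-simplified, standalone sketch of the offset bookkeeping, ignoring Functors, trainability, `ProjectTo` and ties; `flatten` and `rebuild` are hypothetical helpers, not package code:

# Handles only nested Tuples/NamedTuples whose numeric leaves are arrays.
function flatten(x, flat = Float64[])
  if x isa AbstractArray{<:Number}
    off = length(flat)                         # this leaf starts after `off` entries
    append!(flat, x)
    return flat, off
  elseif x isa Union{Tuple, NamedTuple}
    offs = map(xi -> flatten(xi, flat)[2], x)  # offsets, same shape as x
    return flat, offs
  else
    return flat, nothing                       # non-numeric leaf, e.g. an activation function
  end
end

function rebuild(x, offs, flat)
  if x isa AbstractArray{<:Number}
    return reshape(flat[offs .+ (1:length(x))], size(x))
  elseif x isa Union{Tuple, NamedTuple}
    return map((xi, oi) -> rebuild(xi, oi, flat), x, offs)
  else
    return x                                   # pass non-numeric leaves through unchanged
  end
end

m = (x = [1.0, 2.0], y = (sin, [3.0, 4.0]))
flat, offs = flatten(m)        # flat == [1, 2, 3, 4], offs == (x = 0, y = (nothing, 2))
rebuild(m, offs, 10 .* flat)   # (x = [10.0, 20.0], y = (sin, [30.0, 40.0]))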

src/interface.jl

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ trainable(x) = functor(x)[1]
 
 _trainable(x) = _trainable(functor(x)[1], trainable(x))
 _trainable(ch::NamedTuple, tr::NamedTuple) = merge(map(_ -> nothing, ch), tr)
-_trainable(ch::Tuple, tr::Tuple) = tr
+_trainable(ch::Tuple{Vararg{Any,N}}, tr::Tuple{Vararg{Any,N}}) where N = tr
 function _trainable(ch::NamedTuple, tr::Tuple)  # for old Flux-style no-names tuple
   @warn "trainable(x) should now return a NamedTuple with the field names, not a Tuple"
   map(c -> c in tr ? c : nothing, ch)
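The new `Vararg{Any,N}` signature matches only when the functor children and the `trainable` tuple have the same length `N`; a mismatched pair no longer hits this method, so presumably the intent is that a length disagreement now fails loudly rather than silently returning the shorter tuple. A standalone sketch of that dispatch behaviour, using a hypothetical `samelen` rather than package code:

samelen(a::Tuple{Vararg{Any,N}}, b::Tuple{Vararg{Any,N}}) where {N} = b   # same trick as _trainable above

samelen((1, 2), (:a, :b))      # (:a, :b), lengths agree with N = 2
samelen((1, 2), (:a, :b, :c))  # MethodError: no single N fits both tuples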

test/destructure.jl

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+
+m1 = collect(1:3.0)
+m2 = (collect(1:3.0), collect(4:6.0))
+m3 = (x = m1, y = sin, z = collect(4:6.0))
+m4 = (x = m1, y = m1, z = collect(4:6.0))
+m5 = (a = (m3, true), b = (m1, false), c = (m4, true))
+m6 = (a = m1, b = [4.0 + im], c = m1)
+m7 = TwoThirds((sin, collect(1:3.0)), (cos, collect(4:6.0)), (tan, collect(7:9.0)))
+
+@testset "flatten & restore" begin
+  @test destructure(Int, m1)[1] isa Vector{Int}
+  @test destructure(m1)[1] isa Vector{Float64}
+
+  @test destructure(m1)[1] == 1:3
+  @test destructure(m2)[1] == 1:6
+  @test destructure(m3)[1] == 1:6
+  @test destructure(m4)[1] == 1:6
+  @test destructure(m5)[1] == vcat(1:6, 4:6)
+  @test destructure(m6)[1] == vcat(1:3, 4 + im)
+
+  @test destructure(m1)[2](7:9) == [7,8,9]
+  @test destructure(m2)[2](4:9) == ([4,5,6], [7,8,9])
+  @test destructure(m3)[2](4:9) == (x = [4,5,6], y = sin, z = [7,8,9])
+  m4′ = destructure(m4)[2](4:9)
+  @test m4′ == (x = [4,5,6], y = [4,5,6], z = [7,8,9])
+  @test m4′.x === m4′.y
+  m5′ = destructure(m5)[2](reverse(1:9))
+  @test m5′.a[1].x === m5′.b[1]
+  @test m5′.b[2] === false
+  m6′ = destructure(m6)[2]((4:7) .+ (1:4) .* im)
+  @test m6′.a isa Vector{Float64}
+  @test m6′.a == 4:6
+  @test m6′.a === m6′.c
+  @test m6′.b == [7 + 4im]
+
+  @test destructure(m7)[1] == 1:3
+  m7′ = destructure(m7)[2]([10,20,30])
+  @test m7′.a == (sin, [10,20,30])
+  @test m7′.b == (cos, [4,5,6])
+  @test m7′.c == (tan, [7,8,9])
+
+  @test_throws Exception destructure(m7)[2]([10,20])
+  @test_throws Exception destructure(m7)[2]([10,20,30,40])
+end
+
+@testset "gradient of flatten" begin
+  @test gradient(m -> destructure(m)[1][1], m1)[1] == [1,0,0]
+  @test gradient(m -> destructure(m)[1][2], m2)[1] == ([0,1,0], [0,0,0])
+  @test gradient(m -> destructure(m)[1][3], (m1, m1))[1] == ([0,0,1], nothing)
+  @test gradient(m -> destructure(m)[1][1], m3)[1] == (x = [1,0,0], y = nothing, z = [0,0,0])
+  @test gradient(m -> destructure(m)[1][2], m4)[1] == (x = [0,1,0], y = nothing, z = [0,0,0])
+
+  g5 = gradient(m -> destructure(m)[1][3], m5)[1]
+  @test g5.a[1].x == [0,0,1]
+  @test g5.a[2] === nothing
+
+  g6 = gradient(m -> imag(destructure(m)[1][4]), m6)[1]
+  @test g6.a == [0,0,0]
+  @test g6.a isa Vector{Float64}
+  @test g6.b == [0+im]
+end
+
+@testset "gradient of rebuild" begin
+  re1 = destructure(m1)[2]
+  @test gradient(x -> re1(x)[1], rand(3))[1] == [1,0,0]
+  re2 = destructure(m2)[2]
+  @test gradient(x -> re2(x)[1][2], rand(6))[1] == [0,1,0,0,0,0]
+  re3 = destructure(m3)[2]
+  @test gradient(x -> re3(x).x[3], rand(6))[1] == [0,0,1,0,0,0]
+  @test gradient(x -> re3(x).z[1], rand(6))[1] == [0,0,0,1,0,0]
+
+  re4 = destructure(m4)[2]
+  @test gradient(x -> re4(x).x[1], rand(6))[1] == [1,0,0,0,0,0]
+  @test gradient(x -> re4(x).y[2], rand(6))[1] == [0,1,0,0,0,0]
+  @test gradient(rand(6)) do x
+    m = re4(x)
+    m.x[1] + 2*m.y[2] + 3*m.z[3]
+  end[1] == [1,2,0, 0,0,3]
+
+  re7 = destructure(m7)[2]
+  @test gradient(x -> re7(x).a[2][3], rand(3))[1] == [0,0,1]
+  @test gradient(x -> re7(x).b[2][2], rand(3))[1] == [0,0,0]
+  @test gradient(x -> re7(x).c[2][1], rand(3))[1] == [0,0,0]
+end
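These three testsets differentiate through both directions of `destructure`. A sketch of the round trip they exercise, assuming `gradient` is provided by Zygote as it appears to be in this test suite (the model, `loss`, and values are illustrative):

using Optimisers, Zygote

model = (x = [1.0, 2.0, 3.0], y = sin, z = [4.0, 5.0, 6.0])
flat, re = destructure(model)

loss(v) = sum(abs2, re(v).x) + sum(re(v).z)   # any scalar function of the rebuilt model
gradient(loss, flat)[1]                       # == [2, 4, 6, 1, 1, 1], a single flat vector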

test/runtests.jl

Lines changed: 4 additions & 1 deletion
@@ -164,8 +164,11 @@ Optimisers.trainable(x::TwoThirds) = (a = x.a,)
 @test_throws ArgumentError Optimisers.setup(ADAMW(), m2)
 end
 
-@info "finished feature testing"
 end
+@testset verbose=true "Optimisation Rules" begin
+  include("destructure.jl")
+end
+@info "finished feature testing"
 @testset verbose=true "Optimisation Rules" begin
   include("rules.jl")
 end
