using BenchmarkTools
using ChainRulesCore
using Functors
using Optimisers
using Test
using Zygote, Flux
# Collect every trainable numeric array in the (possibly nested) model `x`.
#
# Walks `x` with `Optimisers._TrainableStructWalk` so that only fields marked
# trainable by Optimisers.jl are visited, and gathers each numeric leaf into a
# flat vector. Returns a `Vector{AbstractArray}` (heterogeneous eltypes are
# expected, e.g. weight matrices and bias vectors of different shapes).
function trainables1(x)
    params = AbstractArray[]
    exclude(node) = Optimisers.isnumeric(node)
    fmap(x; exclude, walk = Optimisers._TrainableStructWalk()) do leaf
        push!(params, leaf)
        return leaf
    end
    return params
end
1616
# Pullback helper for `trainables1`: scatter the flat cotangent vector `Δ`
# back into a structural tangent shaped like `x`.
#
# Re-walks `x` with the same exclude/walk pair as `trainables1`, so the k-th
# leaf visited here corresponds to the k-th array collected by `trainables1`;
# a counter pairs each leaf with its cotangent `Δ[k]`. `fmapstructure`
# returns plain nested (named)tuples rather than reconstructing model types.
function ∇trainables1(x, Δ)
    exclude(node) = Optimisers.isnumeric(node)
    k = 0
    return fmapstructure(x; exclude, walk = Optimisers._TrainableStructWalk()) do _
        k += 1
        return Δ[k]
    end
end
24+
25+
# Custom reverse rule so Zygote can differentiate through `trainables1`,
# whose `push!`-based implementation is otherwise non-differentiable
# (mutation). The pullback maps the flat cotangent back onto the structure
# of `x` via `∇trainables1`; `NoTangent()` is for the function itself.
function ChainRulesCore.rrule(::typeof(trainables1), x)
    ps = trainables1(x)
    function trainables_back(Δ)
        return (NoTangent(), ∇trainables1(x, unthunk(Δ)))
    end
    return ps, trainables_back
end
1732# ###########
1833
1934using Functors: AbstractWalk, _map, _values, execute, ExcludeWalk
4964
5065
# Toy loss over a collection of parameter arrays: the total sum of squares.
# Materializes the per-array sums (rather than using a lazy generator) so the
# expression stays friendly to Zygote differentiation in the benchmarks below.
function floss(ps)
    return sum(map(p -> sum(abs2, p), ps))
end
5469
5570using Flux
5671
# Benchmark the three `trainables*` prototypes on a small Flux model:
# forward collection (wrapped in `floss` so results are comparable) and the
# full Zygote gradient. `trainables2`/`trainables3` are defined elsewhere in
# this file (outside this hunk).
function perf()
    # Model mixes Dense layers, a BatchNorm (has non-trainable statistics),
    # and an anonymous activation, to exercise the structural walk.
    m = Chain(Dense(128 => 128, relu),
              Dense(128 => 128, relu),
              BatchNorm(128),
              x -> x^2,
              Dense(128 => 128, relu),
              Dense(128 => 128, relu))

    println("trainables1")
    @btime floss(trainables1($m))
    println("trainables2")
    @btime floss(trainables2($m))
    println("trainables3")
    @btime floss(trainables3($m))
    println()

    # trainables1 is differentiable only via its custom rrule above.
    println("gradient trainables1")
    @btime gradient(m -> floss(trainables1(m)), $m)
    println("gradient trainables2")
    @btime gradient(m -> floss(trainables2(m)), $m)
    println("gradient trainables3")
    @btime gradient(m -> floss(trainables3(m)), $m)

    return nothing
end
7997
Zygote.refresh()
perf()


# Correctness check: the custom-rrule path (trainables1) must agree with the
# independent implementation trainables2 on the same model.
m = Chain(Dense(128 => 128, relu),
          Dense(128 => 128, relu),
          BatchNorm(128),
          x -> x^2,
          Dense(128 => 128, relu),
          Dense(128 => 128, relu))

floss(trainables1(m))  # smoke-test the forward pass before differentiating
g1 = gradient(m -> floss(trainables1(m)), m)[1]
g2 = gradient(m -> floss(trainables2(m)), m)[1]
@test g1.layers[1].weight ≈ g2.layers[1].weight
# BUG FIX: the original asserted layers[1].weight twice (copy-paste
# duplicate); compare a second trainable layer instead.
@test g1.layers[2].weight ≈ g2.layers[2].weight
# BatchNorm's running statistics are non-trainable, so neither gradient
# should carry a tangent for μ.
@test g1.layers[3].μ === nothing
@test g2.layers[3].μ === nothing
0 commit comments