Merge pull request #987 from SciML/avik-pal-patch-1

ChrisRackauckas · web-flow · commit cc55450e0b9d · 2025-09-28T08:22:01.000Z
fix: force a recent version of Lux to avoid ForwardDiff regression
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -23,14 +23,13 @@ steps:
   - label: "Documentation"
     plugins:
       - JuliaCI/julia#v1:
-          version: "1.10"
+          version: "1"
     command: |
       julia --project -e '
         println("--- :julia: Instantiating project")
         using Pkg
         Pkg.instantiate()
         Pkg.activate("docs")
-        Pkg.develop(PackageSpec(path=pwd()))
         Pkg.instantiate()
         push!(LOAD_PATH, @__DIR__)
         println("+++ :julia: Building documentation")
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "DiffEqFlux"
 uuid = "aae7a2af-3d4f-5e19-a356-7da93b79d9d0"
 authors = ["Chris Rackauckas <accounts@chrisrackauckas.com>"]
-version = "4.4.0"
+version = "4.4.1"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -30,7 +30,7 @@ DiffEqFluxDataInterpolationsExt = "DataInterpolations"
 ADTypes = "1.5"
 Aqua = "0.8.7"
 BenchmarkTools = "1.5.0"
-Boltz = "1"
+Boltz = "1.7"
 ChainRulesCore = "1"
 ComponentArrays = "0.15.17"
 ConcreteStructs = "0.2"
@@ -43,11 +43,11 @@ Distributions = "0.25"
 DistributionsAD = "0.6.55"
 ExplicitImports = "1.9"
 Flux = "0.16"
-ForwardDiff = "0.10"
+ForwardDiff = "0.10, 1"
 Hwloc = "3"
 InteractiveUtils = "<0.0.1, 1"
 LinearAlgebra = "1.10"
-Lux = "1"
+Lux = "1.22"
 LuxCUDA = "0.3.2"
 LuxCore = "1"
 LuxLib = "1.2"
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -2,13 +2,12 @@
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
-DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DiffEqFlux = "aae7a2af-3d4f-5e19-a356-7da93b79d9d0"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
-Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -27,27 +26,24 @@ OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1"
-StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StochasticDiffEq = "789caeaf-c7a9-5a7d-9973-96adeb23e2a0"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
+[sources]
+DiffEqFlux = {path = ".."}
+
 [compat]
-CSV = "0.10"
 CUDA = "5"
 ComponentArrays = "0.15"
-DataDeps = "0.7"
 DataFrames = "1"
 DiffEqFlux = "4"
 Distances = "0.10.7"
 Distributions = "0.25.78"
 Documenter = "1"
-Flux = "0.14, 0.15, 0.16"
-ForwardDiff = "0.10"
+ForwardDiff = "0.10, 1"
 IterTools = "1"
 LinearAlgebra = "1"
 Lux = "1"
@@ -65,11 +61,8 @@ OrdinaryDiffEq = "6.31"
 Plots = "1.36"
 Printf = "1"
 Random = "1"
-ReverseDiff = "1.14"
 SciMLBase = "2"
 SciMLSensitivity = "7.11"
-StableRNGs = "1"
 Statistics = "1"
 StochasticDiffEq = "6.56"
-Test = "1"
 Zygote = "0.6.62, 0.7"
diff --git a/docs/make.jl b/docs/make.jl
@@ -13,8 +13,8 @@ makedocs(; sitename = "DiffEqFlux.jl",
     authors = "Chris Rackauckas et al.",
     clean = true,
     doctest = false,
-    linkcheck = true,
-    warnonly = [:docs_block, :missing_docs],
+    # linkcheck = true,
+    warnonly = [:docs_block, :missing_docs, :linkcheck],
     modules = [DiffEqFlux],
     format = Documenter.HTML(; assets = ["assets/favicon.ico"],
         canonical = "https://docs.sciml.ai/DiffEqFlux/stable/"),
diff --git a/docs/src/examples/hamiltonian_nn.md b/docs/src/examples/hamiltonian_nn.md
@@ -26,18 +26,19 @@ dpdt = -2π_32 .* q_t
 data = cat(q_t, p_t; dims = 1)
 target = cat(dqdt, dpdt; dims = 1)
 B = 256
-NEPOCHS = 500
+NEPOCHS = 125
 dataloader = DataLoader((data, target); batchsize = B)
 
-hnn = Layers.HamiltonianNN{true}(Layers.MLP(2, (1028, 1)); autodiff = AutoZygote())
+hnn = Layers.HamiltonianNN{true}(Layers.MLP(2, (32, 32, 1), gelu); autodiff = AutoZygote())
 ps, st = Lux.setup(Xoshiro(0), hnn)
+model = StatefulLuxLayer(hnn, ps, st)
 ps_c = ps |> ComponentArray
 
-opt = OptimizationOptimisers.Adam(0.01f0)
+opt = OptimizationOptimisers.Adam(0.003f0)
 
 function loss_function(ps, databatch)
     data, target = databatch
-    pred, st_ = hnn(data, ps, st)
+    pred = model(data, ps)
     return mean(abs2, pred .- target)
 end
 
@@ -53,10 +54,10 @@ res = Optimization.solve(opt_prob, opt; callback, epochs = NEPOCHS)
 
 ps_trained = res.u
 
-model = NeuralODE(
+nhde = NeuralODE(
     hnn, (0.0f0, 1.0f0), Tsit5(); save_everystep = false, save_start = true, saveat = t)
 
-pred = Array(first(model(data[:, 1], ps_trained, st)))
+pred = Array(first(nhde(data[:, 1], ps_trained, st)))
 plot(data[1, :], data[2, :]; lw = 4, label = "Original")
 plot!(pred[1, :], pred[2, :]; lw = 4, label = "Predicted")
 xlabel!("Position (q)")
@@ -69,7 +70,7 @@ ylabel!("Momentum (p)")
 
 The HNN predicts the gradients ``(\dot q, \dot p)`` given ``(q, p)``. Hence, we generate the pairs ``(q, p)`` using the equations given at the top. Additionally, to supervise the training, we also generate the gradients. Next, we use Flux DataLoader for automatically batching our dataset.
 
-```@example hamiltonian
+```julia
 using Lux, DiffEqFlux, OrdinaryDiffEq, Statistics, Plots, Zygote, ForwardDiff, Random,
       ComponentArrays, Optimization, OptimizationOptimisers, MLUtils
 
@@ -83,25 +84,25 @@ dpdt = -2π_32 .* q_t
 data = cat(q_t, p_t; dims = 1)
 target = cat(dqdt, dpdt; dims = 1)
 B = 256
-NEPOCHS = 500
+NEPOCHS = 125
 dataloader = DataLoader((data, target); batchsize = B)
 ```
 
 ### Training the HamiltonianNN
 
 We parameterize the  with a small MultiLayered Perceptron. HNNs are trained by optimizing the gradients of the Neural Network. Zygote currently doesn't support nesting itself, so we will be using ForwardDiff in the training loop to compute the gradients of the HNN Layer for Optimization.
 
-```@example hamiltonian
-hnn = Layers.HamiltonianNN{true}(Layers.MLP(2, (1028, 1)); autodiff = AutoZygote())
+```julia
+hnn = Layers.HamiltonianNN{true}(Layers.MLP(2, (32, 32, 1), gelu); autodiff = AutoZygote())
 ps, st = Lux.setup(Xoshiro(0), hnn)
+model = StatefulLuxLayer(hnn, ps, st)
 ps_c = ps |> ComponentArray
-hnn_stateful = StatefulLuxLayer{true}(hnn, ps_c, st)
 
-opt = OptimizationOptimisers.Adam(0.005f0)
+opt = OptimizationOptimisers.Adam(0.003f0)
 
 function loss_function(ps, databatch)
-    (data, target) = databatch
-    pred = hnn_stateful(data, ps)
+    data, target = databatch
+    pred = model(data, ps)
     return mean(abs2, pred .- target)
 end
 
@@ -110,7 +111,7 @@ function callback(state, loss)
     return false
 end
 
-opt_func = OptimizationFunction(loss_function, Optimization.AutoZygote())
+opt_func = OptimizationFunction(loss_function, Optimization.AutoForwardDiff())
 opt_prob = OptimizationProblem(opt_func, ps_c, dataloader)
 
 res = Optimization.solve(opt_prob, opt; callback, epochs = NEPOCHS)
@@ -123,11 +124,11 @@ ps_trained = res.u
 In order to visualize the learned trajectories, we need to solve the ODE. We will use the
 `NeuralODE` layer with `HamiltonianNN` layer, and solves the ODE.
 
-```@example hamiltonian
-model = NeuralODE(
+```julia
+nhde = NeuralODE(
     hnn, (0.0f0, 1.0f0), Tsit5(); save_everystep = false, save_start = true, saveat = t)
 
-pred = Array(first(model(data[:, 1], ps_trained, st)))
+pred = Array(first(nhde(data[:, 1], ps_trained, st)))
 plot(data[1, :], data[2, :]; lw = 4, label = "Original")
 plot!(pred[1, :], pred[2, :]; lw = 4, label = "Predicted")
 xlabel!("Position (q)")
diff --git a/docs/src/examples/neural_ode_weather_forecast.md b/docs/src/examples/neural_ode_weather_forecast.md
@@ -9,17 +9,18 @@ The data is a four-dimensional dataset of daily temperature, humidity, wind spee
 
 ```@example weather_forecast
 using Random, Dates, Optimization, ComponentArrays, Lux, OptimizationOptimisers, DiffEqFlux,
-      OrdinaryDiffEq, CSV, DataFrames, Dates, Statistics, Plots, DataDeps
+      OrdinaryDiffEq, CSV, DataFrames, Dates, Statistics, Plots
+using Downloads: download
 
 function download_data(
         data_url = "https://raw.githubusercontent.com/SebastianCallh/neural-ode-weather-forecast/master/data/",
         data_local_path = "./delhi")
     function load(file_name)
-        data_dep = DataDep("delhi/train", "", "$data_url/$file_name")
-        Base.download(data_dep, data_local_path; i_accept_the_terms_of_use = true)
-        CSV.read(joinpath(data_local_path, file_name), DataFrame)
+        download("$data_url/$file_name", joinpath(data_local_path, file_name))
+        return CSV.read(joinpath(data_local_path, file_name), DataFrame)
     end
 
+    mkpath(data_local_path)
     train_df = load("DailyDelhiClimateTrain.csv")
     test_df = load("DailyDelhiClimateTest.csv")
     return vcat(train_df, test_df)
@@ -102,7 +103,7 @@ We are now ready to construct and train our model! To avoid local minimas we wil
 function neural_ode(t, data_dim)
     f = Chain(Dense(data_dim => 64, swish), Dense(64 => 32, swish), Dense(32 => data_dim))
 
-    node = NeuralODE(f, extrema(t), Tsit5(); saveat = t, abstol = 1e-9, reltol = 1e-9)
+    node = NeuralODE(f, extrema(t), Tsit5(); saveat = t, abstol = 1e-6, reltol = 1e-3)
 
     rng = Xoshiro(0)
     p, state = Lux.setup(rng, f)
@@ -151,7 +152,7 @@ ps, state, losses = train(t_train, y_train, obs_grid, maxiters, lr, rng; progres
 We can now animate the training to get a better understanding of the fit.
 
 ```@example weather_forecast
-predict(y0, t, p, state) = begin
+function predict(y0, t, p, state)
     node, _, _ = neural_ode(t, length(y0))
     Array(node(y0, p, state)[1])
 end