From 618263266cd1677c7d2f021a3e9ea000b77c951f Mon Sep 17 00:00:00 2001 From: David Widmann Date: Wed, 26 Nov 2025 11:20:02 +0100 Subject: [PATCH] Do not (mis)use objective as state --- Project.toml | 2 +- src/Manifolds.jl | 11 +- src/Optim.jl | 3 - src/api.jl | 9 +- src/multivariate/optimize/optimize.jl | 110 ++++---- .../solvers/constrained/fminbox.jl | 237 ++++++++---------- .../solvers/constrained/ipnewton/interior.jl | 130 +++++----- .../solvers/constrained/ipnewton/ipnewton.jl | 159 ++++++------ .../ipnewton/utilities/assess_convergence.jl | 6 +- .../constrained/ipnewton/utilities/trace.jl | 10 +- src/multivariate/solvers/constrained/samin.jl | 84 +++---- .../accelerated_gradient_descent.jl | 33 +-- src/multivariate/solvers/first_order/adam.jl | 101 ++++---- .../solvers/first_order/adamax.jl | 103 ++++---- src/multivariate/solvers/first_order/bfgs.jl | 84 ++++--- src/multivariate/solvers/first_order/cg.jl | 66 +++-- .../solvers/first_order/gradient_descent.jl | 45 ++-- .../solvers/first_order/l_bfgs.jl | 67 ++--- .../first_order/momentum_gradient_descent.jl | 29 ++- .../solvers/first_order/ngmres.jl | 110 ++++---- .../second_order/krylov_trust_region.jl | 44 ++-- .../solvers/second_order/newton.jl | 52 ++-- .../second_order/newton_trust_region.jl | 94 ++++--- .../solvers/zeroth_order/nelder_mead.jl | 38 +-- .../solvers/zeroth_order/particle_swarm.jl | 27 +- .../zeroth_order/simulated_annealing.jl | 55 ++-- .../solvers/zeroth_order/zeroth_utils.jl | 6 +- src/types.jl | 4 +- src/utilities/assess_convergence.jl | 79 ++---- src/utilities/generic.jl | 4 +- src/utilities/perform_linesearch.jl | 26 +- src/utilities/trace.jl | 8 +- test/general/convergence.jl | 30 ++- test/general/objective_types.jl | 66 ++--- .../constrained/ipnewton/constraints.jl | 87 ++++--- .../solvers/first_order/ngmres.jl | 4 +- 36 files changed, 1037 insertions(+), 986 deletions(-) diff --git a/Project.toml b/Project.toml index 5bac458c1..33ff5a79a 100644 --- a/Project.toml +++ b/Project.toml @@ -30,7 +30,7 @@ ExplicitImports = "1.13.2" FillArrays = "0.6.2, 0.7, 0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1" ForwardDiff = "0.10, 1" JET = "0.9, 0.10" -LineSearches = "7.4.0" +LineSearches = "7.5.1" LinearAlgebra = "<0.0.1, 1.6" MathOptInterface = "1.17" Measurements = "2.14.1" diff --git a/src/Manifolds.jl b/src/Manifolds.jl index 8d02b553a..b016dd629 100644 --- a/src/Manifolds.jl +++ b/src/Manifolds.jl @@ -28,16 +28,7 @@ end # TODO: is it safe here to call retract! and change x? function NLSolversBase.value!(obj::ManifoldObjective, x) xin = retract(obj.manifold, x) - value!(obj.inner_obj, xin) -end -function NLSolversBase.value(obj::ManifoldObjective) - value(obj.inner_obj) -end -function NLSolversBase.gradient(obj::ManifoldObjective) - gradient(obj.inner_obj) -end -function NLSolversBase.gradient(obj::ManifoldObjective, i::Int) - gradient(obj.inner_obj, i) + return value!(obj.inner_obj, xin) end function NLSolversBase.gradient!(obj::ManifoldObjective, x) xin = retract(obj.manifold, x) diff --git a/src/Optim.jl b/src/Optim.jl index a311e1af9..e5be7198c 100644 --- a/src/Optim.jl +++ b/src/Optim.jl @@ -41,10 +41,7 @@ using NLSolversBase: TwiceDifferentiableConstraints, nconstraints, nconstraints_x, - hessian, hessian!, - hessian!!, - hv_product, hv_product! 
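Stepping back to the Manifolds.jl change above: the removed no-argument accessors (`value(obj)`, `gradient(obj)`) read cached values out of the objective, which is exactly the objective-as-state pattern this patch retires; the retained mutating methods all follow the same retract-then-delegate shape. A minimal, self-contained sketch of that shape (`UnitSphere`, `ToyObjective`, and `ToyManifoldObjective` are illustrative stand-ins, not Optim's actual types):

    using LinearAlgebra

    # Toy manifold: the retraction projects an ambient point onto the unit sphere
    struct UnitSphere end
    retract(::UnitSphere, x) = x ./ norm(x)

    # Toy objective that caches its last value, as NLSolversBase objectives do
    mutable struct ToyObjective{F}
        f::F
        F_cache::Float64
    end
    value!(obj::ToyObjective, x) = (obj.F_cache = obj.f(x); obj.F_cache)

    # Same shape as NLSolversBase.value!(obj::ManifoldObjective, x) above:
    # retract first, then delegate to the inner objective
    struct ToyManifoldObjective{M,O}
        manifold::M
        inner_obj::O
    end
    function value!(obj::ToyManifoldObjective, x)
        xin = retract(obj.manifold, x)
        return value!(obj.inner_obj, xin)
    end

    obj = ToyManifoldObjective(UnitSphere(), ToyObjective(sum, NaN))
    value!(obj, [3.0, 4.0])  # evaluates sum at the retracted point [0.6, 0.8]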
# var for NelderMead diff --git a/src/api.jl b/src/api.jl index 72e0eca7b..467488ff4 100644 --- a/src/api.jl +++ b/src/api.jl @@ -100,18 +100,15 @@ g_norm_trace(r::OptimizationResults) = g_norm_trace(r::MultivariateOptimizationResults) = [state.g_norm for state in trace(r)] f_calls(r::OptimizationResults) = r.f_calls -f_calls(d) = first(d.f_calls) +f_calls(d::AbstractObjective) = NLSolversBase.f_calls(d) g_calls(r::OptimizationResults) = error("g_calls is not implemented for $(summary(r)).") g_calls(r::MultivariateOptimizationResults) = r.g_calls -g_calls(d::NonDifferentiable) = 0 -g_calls(d) = first(d.df_calls) +g_calls(d::AbstractObjective) = NLSolversBase.g_calls(d) h_calls(r::OptimizationResults) = error("h_calls is not implemented for $(summary(r)).") h_calls(r::MultivariateOptimizationResults) = r.h_calls -h_calls(d::Union{NonDifferentiable,OnceDifferentiable}) = 0 -h_calls(d) = first(d.h_calls) -h_calls(d::TwiceDifferentiableHV) = first(d.hv_calls) +h_calls(d::AbstractObjective) = NLSolversBase.h_calls(d) + NLSolversBase.hv_calls(d) converged(r::UnivariateOptimizationResults) = r.stopped_by.converged function converged(r::MultivariateOptimizationResults) diff --git a/src/multivariate/optimize/optimize.jl b/src/multivariate/optimize/optimize.jl index 73498df30..6af2533c6 100644 --- a/src/multivariate/optimize/optimize.jl +++ b/src/multivariate/optimize/optimize.jl @@ -1,37 +1,42 @@ -update_g!(d, state, method) = nothing -function update_g!(d, state, method::FirstOrderOptimizer) - # Update the function value and gradient - value_gradient!(d, state.x) - project_tangent!(method.manifold, gradient(d), state.x) +# Update function value, gradient and Hessian +function update_fgh!(d, state, ::ZerothOrderOptimizer) + f_x = value!(d, state.x) + state.f_x = f_x + return nothing end -function update_g!(d, state, method::Newton) - # Update the function value and gradient - value_gradient!(d, state.x) -end -update_fg!(d, state, method) = nothing -update_fg!(d, state, method::ZerothOrderOptimizer) = value!(d, state.x) -function update_fg!(d, state, method::FirstOrderOptimizer) - value_gradient!(d, state.x) - project_tangent!(method.manifold, gradient(d), state.x) +function update_fgh!(d, state, method::FirstOrderOptimizer) + f_x, g_x = value_gradient!(d, state.x) + if hasproperty(method, :manifold) + project_tangent!(method.manifold, g_x, state.x) + end + state.f_x = f_x + copyto!(state.g_x, g_x) + return nothing end -function update_fg!(d, state, method::Newton) - value_gradient!(d, state.x) +function update_fgh!(d, state, method::SecondOrderOptimizer) + # Manifold optimization is currently not supported for second order optimization algorithms + @assert !hasproperty(method, :manifold) + + # TODO: Switch to `value_gradient_hessian!` when it becomes available + f_x, g_x = value_gradient!(d, state.x) + H_x = hessian!(d, state.x) + state.f_x = f_x + copyto!(state.g_x, g_x) + copyto!(state.H_x, H_x) + + return nothing end -# Update the Hessian -update_h!(d, state, method) = nothing -update_h!(d, state, method::SecondOrderOptimizer) = hessian!(d, state.x) - after_while!(d, state, method, options) = nothing -function initial_convergence(d, state, method::AbstractOptimizer, initial_x, options) - gradient!(d, initial_x) - stopped = !isfinite(value(d)) || any(!isfinite, gradient(d)) - g_residual(d, state) <= options.g_abstol, stopped +function initial_convergence(state::AbstractOptimizerState, options::Options) + stopped = !isfinite(state.f_x) || any(!isfinite, state.g_x) + return g_residual(state) <= 
options.g_abstol, stopped end -function initial_convergence(d, state, method::ZerothOrderOptimizer, initial_x, options) +function initial_convergence(::ZerothOrderState, ::Options) false, false end + function optimize( d::D, initial_x::Tx, @@ -41,7 +46,7 @@ function optimize( ) where {D<:AbstractObjective,M<:AbstractOptimizer,Tx<:AbstractArray,T,TCallback} t0 = time() # Initial time stamp used to control early stopping by options.time_limit - tr = OptimizationTrace{typeof(value(d)),typeof(method)}() + tr = OptimizationTrace{typeof(state.f_x),typeof(method)}() tracing = options.store_trace || options.show_trace || @@ -51,7 +56,7 @@ function optimize( f_limit_reached, g_limit_reached, h_limit_reached = false, false, false x_converged, f_converged, f_increased, counter_f_tol = false, false, false, 0 - g_converged, stopped = initial_convergence(d, state, method, initial_x, options) + g_converged, stopped = initial_convergence(state, options) converged = g_converged || stopped # prepare iteration counter (used to make "initial state" trace entry) iteration = 0 @@ -62,22 +67,29 @@ function optimize( ls_success::Bool = true while !converged && !stopped && iteration < options.iterations iteration += 1 + + # Convention: When `update_state!` is called, then `state` satisfies: + # - `state.x`: Current state + # - `state.f_x`: Objective function value of the current state, i.e. `d(state.x)` + # - `state.g_x` (if available): Gradient of the objective function at the current state, i.e. `gradient(d, state.x)` + # - `state.H_x` (if available): Hessian of the objective function at the current state, i.e. `hessian(d, state.x)` ls_success = !update_state!(d, state, method) if !ls_success break # it returns true if it's forced by something in update! to stop (e.g. dx_dg == 0.0 in BFGS, or linesearch errors) end - if !(method isa NewtonTrustRegion) - update_g!(d, state, method) # TODO: Should this be `update_fg!`? - end + + # Update function value, gradient and Hessian matrix (skipped by some methods that already update those in `update_state!`) + # TODO: Already perform in `update_state!`? + update_fgh!(d, state, method) + + # Check convergence x_converged, f_converged, g_converged, f_increased = assess_convergence(state, d, options) # For some problems it may be useful to require `f_converged` to be hit multiple times # TODO: Do the same for x_tol? counter_f_tol = f_converged ? counter_f_tol + 1 : 0 converged = x_converged || g_converged || (counter_f_tol > options.successive_f_tol) - if !(converged && method isa Newton) && !(method isa NewtonTrustRegion) - update_h!(d, state, method) # only relevant if not converged - end + if tracing # update trace; callbacks can stop routine early by returning true stopped_by_callback = @@ -113,11 +125,11 @@ function optimize( end end - if g_calls(d) > 0 && !all(isfinite, gradient(d)) + if hasproperty(state, :g_x) && !all(isfinite, state.g_x) options.show_warnings && @warn "Terminated early due to NaN in gradient." break end - if h_calls(d) > 0 && !(d isa TwiceDifferentiableHV) && !all(isfinite, hessian(d)) + if hasproperty(state, :H_x) && !all(isfinite, state.H_x) options.show_warnings && @warn "Terminated early due to NaN in Hessian."
break end @@ -127,7 +139,7 @@ function optimize( # we can just check minimum, as we've earlier enforced same types/eltypes # in variables besides the option settings - Tf = typeof(value(d)) + Tf = typeof(state.f_x) f_incr_pick = f_increased && !options.allow_f_increases stopped_by = (x_converged, f_converged, g_converged, f_limit_reached = f_limit_reached, @@ -141,7 +153,7 @@ function optimize( ) termination_code = - _termination_code(d, g_residual(d, state), state, stopped_by, options) + _termination_code(d, g_residual(state), state, stopped_by, options) return MultivariateOptimizationResults{ typeof(method), @@ -154,7 +166,7 @@ function optimize( method, initial_x, pick_best_x(f_incr_pick, state), - pick_best_f(f_incr_pick, state, d), + pick_best_f(f_incr_pick, state), iteration, Tf(options.x_abstol), Tf(options.x_reltol), @@ -162,10 +174,10 @@ function optimize( x_relchange(state), Tf(options.f_abstol), Tf(options.f_reltol), - f_abschange(d, state), - f_relchange(d, state), + f_abschange(state), + f_relchange(state), Tf(options.g_abstol), - g_residual(d, state), + g_residual(state), tr, f_calls(d), g_calls(d), @@ -186,13 +198,13 @@ function _termination_code(d, gres, state, stopped_by, options) elseif (iszero(options.x_abstol) && x_abschange(state) <= options.x_abstol) || (iszero(options.x_reltol) && x_relchange(state) <= options.x_reltol) TerminationCode.NoXChange - elseif (iszero(options.f_abstol) && f_abschange(d, state) <= options.f_abstol) || - (iszero(options.f_reltol) && f_relchange(d, state) <= options.f_reltol) + elseif (iszero(options.f_abstol) && f_abschange(state) <= options.f_abstol) || + (iszero(options.f_reltol) && f_relchange(state) <= options.f_reltol) TerminationCode.NoObjectiveChange elseif x_abschange(state) <= options.x_abstol || x_relchange(state) <= options.x_reltol TerminationCode.SmallXChange - elseif f_abschange(d, state) <= options.f_abstol || - f_relchange(d, state) <= options.f_reltol + elseif f_abschange(state) <= options.f_abstol || + f_relchange(state) <= options.f_reltol TerminationCode.SmallObjectiveChange elseif stopped_by.ls_failed TerminationCode.FailedLinesearch @@ -210,11 +222,11 @@ function _termination_code(d, gres, state, stopped_by, options) TerminationCode.HessianCalls elseif stopped_by.f_increased TerminationCode.ObjectiveIncreased - elseif f_calls(d) > 0 && !isfinite(value(d)) - TerminationCode.GradientNotFinite - elseif g_calls(d) > 0 && !all(isfinite, gradient(d)) + elseif !isfinite(state.f_x) + TerminationCode.ObjectiveNotFinite + elseif hasproperty(state, :g_x) && !all(isfinite, state.g_x) TerminationCode.GradientNotFinite - elseif h_calls(d) > 0 && !(d isa TwiceDifferentiableHV) && !all(isfinite, hessian(d)) + elseif hasproperty(state, :H_x) && !all(isfinite, state.H_x) TerminationCode.HessianNotFinite else TerminationCode.NotImplemented diff --git a/src/multivariate/solvers/constrained/fminbox.jl b/src/multivariate/solvers/constrained/fminbox.jl index 5ac0c0e54..9837ba79f 100644 --- a/src/multivariate/solvers/constrained/fminbox.jl +++ b/src/multivariate/solvers/constrained/fminbox.jl @@ -1,5 +1,5 @@ using NLSolversBase: - value, value!, value!!, gradient, gradient!, value_gradient!, value_gradient!! + value, value!, gradient!, value_gradient! 
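A recurring idiom in the new `optimize` loop above is interrogating the state object itself (`hasproperty(state, :g_x)`, `hasproperty(state, :H_x)`) rather than consulting call counters on the objective. A small self-contained sketch of that dispatch-by-field idiom; the toy state structs are assumptions for illustration, not Optim's actual types:

    # Zeroth-order states carry no gradient field; first-order states do
    struct ToyZerothState
        x::Vector{Float64}
        f_x::Float64
    end
    struct ToyFirstState
        x::Vector{Float64}
        f_x::Float64
        g_x::Vector{Float64}
    end

    # Same shape as the non-finite-gradient check in optimize.jl above
    bad_gradient(state) = hasproperty(state, :g_x) && !all(isfinite, state.g_x)

    bad_gradient(ToyZerothState([0.0], 1.0))        # false: no gradient stored
    bad_gradient(ToyFirstState([0.0], 1.0, [NaN]))  # true: non-finite gradient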
####### FIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIX THE MIDDLE OF BOX CASE THAT WAS THERE mutable struct BarrierWrapper{TO,TB,Tm,TF,TDF} <: AbstractObjective obj::TO @@ -10,13 +10,16 @@ mutable struct BarrierWrapper{TO,TB,Tm,TF,TDF} <: AbstractObjective DFb::TDF DFtotal::TDF end -f_calls(obj::BarrierWrapper) = f_calls(obj.obj) -g_calls(obj::BarrierWrapper) = g_calls(obj.obj) -h_calls(obj::BarrierWrapper) = h_calls(obj.obj) + +NLSolversBase.f_calls(obj::BarrierWrapper) = NLSolversBase.f_calls(obj.obj) +NLSolversBase.g_calls(obj::BarrierWrapper) = NLSolversBase.g_calls(obj.obj) +NLSolversBase.h_calls(obj::BarrierWrapper) = NLSolversBase.h_calls(obj.obj) +NLSolversBase.hv_calls(obj::BarrierWrapper) = NLSolversBase.hv_calls(obj.obj) + function BarrierWrapper(obj::NonDifferentiable, mu, lower, upper) barrier_term = BoxBarrier(lower, upper) - BarrierWrapper(obj, barrier_term, mu, copy(obj.F), copy(obj.F), nothing, nothing) + BarrierWrapper(obj, barrier_term, mu, oftype(obj.F, NaN), oftype(obj.F, NaN), nothing, nothing) end function BarrierWrapper(obj::OnceDifferentiable, mu, lower, upper) barrier_term = BoxBarrier(lower, upper) @@ -25,10 +28,10 @@ function BarrierWrapper(obj::OnceDifferentiable, mu, lower, upper) obj, barrier_term, mu, - copy(obj.F), - copy(obj.F), - copy(obj.DF), - copy(obj.DF), + oftype(obj.F, NaN), + oftype(obj.F, NaN), + fill!(copy(obj.DF), NaN), + fill!(copy(obj.DF), NaN), ) end @@ -69,29 +72,6 @@ function _barrier_term_gradient(x::T, l, u) where {T} end # Wrappers -function NLSolversBase.value!!(bw::BarrierWrapper, x) - bw.Fb = _barrier_value(bw.b, x) - if in_box(bw, x) - F = value!!(bw.obj, x) - bw.Ftotal = muladd(bw.mu, bw.Fb, F) - else - bw.Ftotal = bw.mu * bw.Fb - end - return bw.Ftotal -end -function NLSolversBase.value_gradient!!(bw::BarrierWrapper, x) - bw.Fb = _barrier_value(bw.b, x) - bw.DFb .= _barrier_term_gradient.(x, bw.b.lower, bw.b.upper) - if in_box(bw, x) - F, DF = value_gradient!!(bw.obj, x) - bw.Ftotal = muladd(bw.mu, bw.Fb, F) - bw.DFtotal .= muladd.(bw.mu, bw.DFb, DF) - else - bw.Ftotal = bw.mu * bw.Fb - bw.DFtotal .= bw.mu .* bw.DFb - end - return bw.Ftotal, bw.DFtotal -end function NLSolversBase.value_gradient!(bb::BarrierWrapper, x) bb.DFb .= _barrier_term_gradient.(x, bb.b.lower, bb.b.upper) bb.Fb = _barrier_value(bb.b, x) @@ -115,7 +95,6 @@ function NLSolversBase.value!(obj::BarrierWrapper, x) end return obj.Ftotal end -NLSolversBase.value(obj::BarrierWrapper) = obj.Ftotal function NLSolversBase.value(obj::BarrierWrapper, x) Fb = _barrier_value(obj.b, x) if in_box(obj, x) @@ -134,45 +113,6 @@ function NLSolversBase.gradient!(obj::BarrierWrapper, x) end return obj.DFtotal end -NLSolversBase.gradient(obj::BarrierWrapper) = obj.DFtotal - -# this mutates mu but not the gradients -# Super unsafe in that it depends on x_df being correct! -function initial_mu(obj::BarrierWrapper, F) - T = typeof(obj.Fb) # this will not work if F is real, G is complex - gbarrier = map( - x -> - (isfinite.(x[2]) ? one(T) / (x[1] - x[2]) : zero(T)) + - (isfinite(x[3]) ? one(T) / (x[3] - x[1]) : zero(T)), - zip(obj.obj.x_f, obj.b.lower, obj.b.upper), - ) - - # obj.mu = initial_mu(gradient(obj.obj), gradient(obj.b, obj.DFb, obj.obj.x_df), T(F.mufactor), T(F.mu0)) - obj.mu = initial_mu(gradient(obj.obj), gbarrier, T(F.mufactor), T(F.mu0)) -end -# Attempt to compute a reasonable default mu: at the starting -# position, the gradient of the input function should dominate the -# gradient of the barrier. 
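The comment above describes the default-μ heuristic that `initial_mu` implements both before and after this patch: μ is scaled so that the barrier contributes only a small fraction (`mufactor`) of the objective's gradient 1-norm at the starting point. A worked toy example with made-up numbers:

    gfunc = [4.0, -2.0]       # gradient of the objective at the starting point
    gbarrier = [0.5, 0.25]    # gradient of the barrier at the starting point
    mufactor = 1 / 1000
    # mu = mufactor * ||gfunc||_1 / ||gbarrier||_1 = 0.001 * 6.0 / 0.75 = 0.008
    mu = mufactor * sum(abs, gfunc) / sum(abs, gbarrier)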
-function initial_mu( - gfunc::AbstractArray{T}, - gbarrier::AbstractArray{T}, - mu0factor::T = T(1) / 1000, - mu0::T = convert(T, NaN), -) where {T} - - if isnan(mu0) - gbarriernorm = sum(abs, gbarrier) - if gbarriernorm > 0 - mu = mu0factor * sum(abs, gfunc) / gbarriernorm - else - # Presumably, there is no barrier function - mu = zero(T) - end - else - mu = mu0 - end - return mu -end function limits_box( x::AbstractArray{T}, @@ -279,6 +219,36 @@ barrier_method( precondprep, ) = m # use `m` as is +struct BoxState{T,Tx} <: ZerothOrderState + x::Tx + f_x::T + x_previous::Tx + f_x_previous::T +end + +# Attempt to compute a reasonable default mu: at the starting # position, the gradient of the input function should dominate the # gradient of the barrier. +function initial_mu(box::BoxBarrier, x::AbstractArray, g_x::AbstractArray, F::Fminbox) + # Compute the 1-norms of the gradient of the input function and the gradient of the barrier + _gnorm = sum(abs, g_x) + _gbarrier_norm = sum(Broadcast.instantiate(Broadcast.broadcasted((xi, li, ui) -> abs(_barrier_term_gradient(xi, li, ui)), x, box.lower, box.upper))) + + gnorm, gbarrier_norm, mufactor, mu0 = promote(_gnorm, _gbarrier_norm, F.mufactor, F.mu0) + mu = if isnan(mu0) + if gbarrier_norm > 0 + mufactor * gnorm / gbarrier_norm + else + # Presumably, there is no barrier function + zero(gnorm) + end + else + mu0 + end + + return mu +end + function optimize( f, l::AbstractArray, @@ -483,22 +453,18 @@ function optimize( # barrier-aware optimization method instance (precondition relevance) _optimizer = barrier_method(F.method, P, (P, x) -> F.precondprep(P, x, l, u, dfbox)) - state = initial_state(_optimizer, options, dfbox, x) - # we wait until state has been initialized to set the initial mu because - # we need the gradient of the objective and initial_state will value_gradient!! - # the objective, so that forces an evaluation - if F.method isa NelderMead - gradient!(dfbox, x) - end - dfbox.mu = initial_mu(dfbox, F) - if F.method isa NelderMead - for i = 1:length(state.f_simplex) - x = state.simplex[i] - boxval = _barrier_value(dfbox.b, x) - state.f_simplex[i] += boxval - end - state.i_order = sortperm(state.f_simplex) + # we wait until state has been initialized to set the initial mu because we need the gradient of the objective + state = initial_state(_optimizer, options, df, x) + @assert state.x == x + f_x = state.f_x + g_x = if hasproperty(state, :g_x) + copy(state.g_x) + else + copy(gradient!(dfbox, x)) end + box = BoxBarrier(l, u) + mu = initial_mu(box, x, g_x, F) + if show_trace > 0 println("Fminbox") println("-------") @@ -507,17 +473,9 @@ function optimize( println("\n") end - g = copy(x) - fval_all = Vector{Vector{T}}() - - # Count the total number of outer iterations + # First iteration iteration = 1 - - # define the function (dfbox) to optimize by the inner optimizer - - xold = copy(x) _time = time() - fval0 = dfbox.obj.F # Optimize with current setting of mu if show_trace > 0 @@ -530,15 +488,15 @@ function optimize( println("(numbers below include barrier contribution)") end - # we need to update the +mu*barrier_grad part. Since we're using the - # value_gradient! not !!
as in initial_state, we won't make a superfluous - # evaluation - if !(F.method isa NelderMead) - value_gradient!(dfbox, x) - reset!(_optimizer, state, dfbox, x) - else - value!(dfbox, x) - end + # We add the barrier term to the objective function + # Since this changes the objective of the inner optimizer, we have to reset its state + dfbox = BarrierWrapper(df, mu, l, u) + reset!(_optimizer, state, dfbox, x) + + # Store current state + x_previous = copy(x) + f_x_previous = f_x + results = optimize(dfbox, x, _optimizer, options, state) stopped_by_callback = results.stopped_by.callback dfbox.obj.f_calls[1] = 0 @@ -548,7 +506,12 @@ function optimize( if hasfield(typeof(dfbox.obj), :h_calls) dfbox.obj.h_calls[1] = 0 end + + # Compute function value and gradient (without barrier term) copyto!(x, minimizer(results)) + f_x, _g_x = value_gradient!(df, x) + copyto!(g_x, _g_x) + boxdist = Base.minimum(((xi, li, ui),) -> min(xi - li, ui - xi), zip(x, l, u)) # Base.minimum !== minimum if show_trace > 0 println() @@ -559,16 +522,14 @@ function optimize( println("Decreasing barrier term μ.\n") end - # Decrease mu - dfbox.mu *= T(F.mufactor) # Test for convergence - g = x .- min.(max.(x .- gradient(dfbox.obj), l), u) + g = x .- clamp.(x .- g_x, l, u) _x_converged, _f_converged, _g_converged, f_increased = assess_convergence( x, - xold, - minimum(results), - fval0, + x_previous, + f_x, + f_x_previous, g, options.outer_x_abstol, options.outer_x_reltol, @@ -589,11 +550,12 @@ function optimize( @warn("f(x) increased: stopping optimization") else while !converged && !stopped && iteration < outer_iterations - fval0 = dfbox.obj.F # Increment the number of steps we've had to perform iteration += 1 - copyto!(xold, x) + # Decrease mu + mu *= T(F.mufactor) + # Optimize with current setting of mu if show_trace > 0 header_string = "Fminbox iteration $iteration" @@ -605,16 +567,15 @@ function optimize( println("(numbers below include barrier contribution)") end - # we need to update the +mu*barrier_grad part. Since we're using the - # value_gradient! not !! as in initial_state, we won't make a superfluous - # evaluation - - if !(F.method isa NelderMead) - value_gradient!(dfbox, x) - else - value!(dfbox, x) - end + # We need to update the barrier term of the objective function. 
+ # Since this changes the objective of the inner optimizer, we have to reset its state + dfbox = BarrierWrapper(df, mu, l, u) reset!(_optimizer, state, dfbox, x) + + # Store current state + copyto!(x_previous, x) + f_x_previous = f_x + resultsnew = optimize(dfbox, x, _optimizer, options, state) stopped_by_callback = resultsnew.stopped_by.callback append!(results, resultsnew) @@ -625,7 +586,12 @@ function optimize( if hasfield(typeof(dfbox.obj), :h_calls) dfbox.obj.h_calls[1] = 0 end + + # Compute function value and gradient (without barrier term) copyto!(x, minimizer(results)) + f_x, _g_x = value_gradient!(df, x) + copyto!(g_x, _g_x) + boxdist = Base.minimum(((xi, li, ui),) -> min(xi - li, ui - xi), zip(x, l, u)) # Base.minimum !== minimum if show_trace > 0 println() @@ -636,16 +602,14 @@ function optimize( println("Decreasing barrier term μ.\n") end - # Decrease mu - dfbox.mu *= T(F.mufactor) # Test for convergence - g = x .- min.(max.(x .- gradient(dfbox.obj), l), u) + g .= x .- clamp.(x .- g_x, l, u) _x_converged, _f_converged, _g_converged, f_increased = assess_convergence( x, - xold, - minimum(results), - fval0, + x_previous, + f_x, + f_x_previous, g, options.outer_x_abstol, options.outer_x_reltol, @@ -681,25 +645,24 @@ function optimize( f_converged = _f_converged, g_converged = _g_converged, ) - box_state = (; x, x_previous = xold, f_x_previous = fval0) - termination_code = _termination_code(df, g_residual(g), box_state, stopped_by, options) + termination_code = _termination_code(df, g_residual(g), BoxState(x, f_x, x_previous, f_x_previous), stopped_by, options) return MultivariateOptimizationResults( F, initial_x, - minimizer(results), - df.f(minimizer(results)), + x, + f_x, iteration, results.x_abstol, results.x_reltol, - norm(x - xold, Inf), - norm(x - xold, Inf) / norm(x, Inf), + x_abschange(x, x_previous), + x_relchange(x, x_previous), results.f_abstol, results.f_reltol, - f_abschange(minimum(results), fval0), - f_relchange(minimum(results), fval0), + f_abschange(f_x, f_x_previous), + f_relchange(f_x, f_x_previous), results.g_abstol, - g_residual(g, Inf), + g_residual(g), results.trace, results.f_calls, results.g_calls, diff --git a/src/multivariate/solvers/constrained/ipnewton/interior.jl b/src/multivariate/solvers/constrained/ipnewton/interior.jl index 9c197e019..df9bfcaf6 100644 --- a/src/multivariate/solvers/constrained/ipnewton/interior.jl +++ b/src/multivariate/solvers/constrained/ipnewton/interior.jl @@ -1,7 +1,7 @@ # TODO: when Optim supports sparse arrays, make a SparseMatrixCSC version of jacobianx -abstract type AbstractBarrierState end +abstract type AbstractBarrierState <: AbstractOptimizerState end # These are used not only for the current state, but also for the step and the gradient struct BarrierStateVars{T} @@ -209,9 +209,8 @@ ls_update!( function initial_convergence(d, state, method::ConstrainedOptimizer, initial_x, options) # TODO: Make sure state.bgrad has been evaluated at initial_x # state.bgrad normally comes from constraints.c!(..., initial_x) in initial_state - gradient!(d, initial_x) - stopped = !isfinite(value(d)) || any(!isfinite, gradient(d)) - g_residual(d, state) + norm(state.bgrad, Inf) < options.g_abstol, stopped + stopped = !isfinite(state.f_x) || any(!isfinite, state.g_x) + norm(state.g_x, Inf) + norm(state.bgrad, Inf) < options.g_abstol, stopped end function optimize( @@ -253,7 +252,7 @@ function optimize( t0 = time() # Initial time stamp used to control early stopping by options.time_limit _time = t0 - tr =
OptimizationTrace{typeof(value(d)),typeof(method)}() + tr = OptimizationTrace{typeof(state.f_x),typeof(method)}() tracing = options.store_trace || options.show_trace || @@ -275,9 +274,20 @@ function optimize( while !converged && !stopped && iteration < options.iterations iteration += 1 - update_state!(d, constraints, state, method, options) && break # it returns true if it's forced by something in update! to stop (eg dx_dg == 0.0 in BFGS or linesearch errors) + # Convention: When `update_state!` is called, then `state` satisfies: + # - `state.x`: Current state + # - `state.f_x`: Objective function value of the current state, i.e. `d(state.x)` + # - `state.g_x` (if available): Gradient of the objective function at the current state, i.e. `gradient(d, state.x)` + # - `state.H_x` (if available): Hessian of the objective function at the current state, i.e. `hessian(d, state.x)` + fail = update_state!(d, constraints, state, method, options) + if fail + # `fail = true` e.g. if it's forced by something in update! to stop (e.g. dx_dg == 0.0 in BFGS or linesearch errors) + break + end - update_fg!(d, constraints, state, method) + # Update function value, gradient and Hessian matrix (skipped by some methods that already update those in `update_state!`) + # TODO: Already perform in `update_state!`? + update_fgh!(d, constraints, state, method) # TODO: Do we need to rethink f_increased for `ConstrainedOptimizer`s? x_converged, f_converged, g_converged, f_increased = @@ -292,10 +302,6 @@ function optimize( counter_f_tol = f_converged ? counter_f_tol + 1 : 0 converged = x_converged || g_converged || (counter_f_tol > options.successive_f_tol) - # We don't use the Hessian for anything if we have declared convergence, - # so we might as well not make the (expensive) update if converged == true - !converged && update_h!(d, constraints, state, method) - if tracing # update trace; callbacks can stop routine early by returning true stopped_by_callback = trace!(tr, d, state, iteration, method, options) @@ -327,14 +333,13 @@ function optimize( # we can just check minimum, as we've earlier enforced same types/eltypes # in variables besides the option settings T = typeof(options.f_reltol) - Tf = typeof(value(d)) f_incr_pick = f_increased && !options.allow_f_increases termination_code = TerminationCode.NotImplemented return MultivariateOptimizationResults( method, initial_x, pick_best_x(f_incr_pick, state), - pick_best_f(f_incr_pick, state, d), + pick_best_f(f_incr_pick, state), iteration, T(options.x_abstol), T(options.x_reltol), x_abschange(state), x_relchange(state), T(options.f_abstol), T(options.f_reltol), - f_abschange(d, state), - f_relchange(d, state), + f_abschange(state), + f_relchange(state), T(options.g_abstol), - g_residual(d, state), + g_residual(state), tr, f_calls(d), g_calls(d), @@ -357,13 +362,11 @@ function optimize( ) end -# Fallbacks (for methods that don't need these) +# Fallback (for methods that don't need this) after_while!(d, constraints::AbstractConstraints, state, method, options) = nothing -update_h!(d, constraints::AbstractConstraints, state, method) = nothing """ - initialize_μ_λ!(state, bounds, μ0=:auto, β=0.01) - initialize_μ_λ!(state, bounds, (Hobj,HcI), μ0=:auto, β=0.01) + initialize_μ_λ!(state, bounds, HcI, μ0=:auto, β=0.01) Pick μ and λ to ensure that the equality constraints are satisfied locally (at the current `state.x`), and that the initial gradient including the barrier would be a descent direction for the problem without the barrier (μ
= 0). This ensures that the search isn't pushed out of the basin of the user-supplied initial guess. -Upon entry, the objective function gradient, constraint values, and -constraint jacobian must be set in `state.g`, `state.c`, and `state.J` -respectively. If you also wish to ensure that the projection of -Hessian is minimally-perturbed along the initial gradient, supply the -hessian of the objective (`Hobj`) and +Upon entry, the objective function gradient and Hessian, constraint values, and +constraint jacobian must be set in `state.g_x`, `state.H_x`, `state.c`, and `state.J`, +respectively. To ensure that the projection of Hessian is minimally-perturbed +along the initial gradient, additionally supply HcI = ∑_i (σ_i/s_i)∇∇ c_{Ii} @@ -390,7 +392,7 @@ values of `λ` are set using the chosen `μ`. function initialize_μ_λ!( state, bounds::ConstraintBounds, - Hinfo, + HcI::AbstractMatrix{<:Real}, μ0::Union{Symbol,Number}, β::Number = 1 // 100, ) @@ -399,7 +401,8 @@ function initialize_μ_λ!( fill!(state.bstate, 0) return state end - gf = state.g # must be pre-set to ∇f + copyto!(state.g_L_x, state.g_x) + gf = state.g_L_x # Calculate projection of ∇f into the subspace spanned by the # equality constraint Jacobian JE = jacobianE(state, bounds) @@ -420,7 +423,7 @@ function initialize_μ_λ!( Δg = JI' * σdivs PperpΔg = Δg - JE' * (Cc \ (JE * Δg)) DI = dot(PperpΔg, PperpΔg) - κperp, κI = hessian_projections(Hinfo, Pperpg, (JI * Pperpg) ./ s) + κperp, κI = hessian_projections((state.H_x, HcI), Pperpg, (JI * Pperpg) ./ s) # Calculate μ and λI μ = β * (κperp == 0 ? sqrt(Dperp / DI) : min(sqrt(Dperp / DI), abs(κperp / κI))) if !isfinite(μ) @@ -449,14 +452,6 @@ function initialize_μ_λ!( k == length(λE) || error("Something is wrong when initializing μ and λ.") state end -function initialize_μ_λ!( - state, - bounds::ConstraintBounds, - μ0::Union{Number,Symbol}, - β::Number = 1 // 100, -) - initialize_μ_λ!(state, bounds, nothing, μ0, β) -end function hessian_projections(Hinfo::Tuple{AbstractMatrix,AbstractMatrix}, Pperpg, y) κperp = dot(Hinfo[1] * Pperpg, Pperpg) @@ -545,32 +540,38 @@ userλ(λcI, constraints) = userλ(λcI, constraints.bounds) ## Computation of the Lagrangian and its gradient # This is in a parametrization that is also useful during linesearch # TODO: `lagrangian` does not seem to be used (IPNewton)? 
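As a reading aid for this section: the merit value assembled below is the objective plus the μ-weighted barrier plus the equality violation. A scalar toy illustration with made-up numbers (`barrier_value` and `equality_violation` stand for the real implementations above):

    f_x = 2.0                  # objective value at x
    μ = 0.1
    slack = 0.5                # a single slack variable, illustratively
    barrier = -μ * log(slack)  # stands in for barrier_value(bounds, x, bstate, μ)
    ev = 0.05                  # stands in for equality_violation(bounds, x, c, bstate)
    L_x = f_x + barrier + ev   # mirrors `L_x = f_x + barrier_value(...) + ev` below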
-function lagrangian(d, bounds::ConstraintBounds, x, c, bstate::BarrierStateVars, μ) - f_x = NLSolversBase.value!(d, x) +function lagrangian(bounds::ConstraintBounds, x, f_x, c, bstate::BarrierStateVars, μ) ev = equality_violation(bounds, x, c, bstate) - L_xsλ = f_x + barrier_value(bounds, x, bstate, μ) + ev - f_x, L_xsλ, ev + L_x = f_x + barrier_value(bounds, x, bstate, μ) + ev + return L_x, ev end function lagrangian_fg!( - gx, + g_L_x, bgrad, - d, bounds::ConstraintBounds, x, + f_x, + g_x, c, J, bstate::BarrierStateVars, μ, ) - fill!(bgrad, 0) - f_x, g_x = NLSolversBase.value_gradient!(d, x) - gx .= g_x + # Compute the value of the Lagrangian at x ev = equality_violation(bounds, x, c, bstate) - L_xsλ = f_x + barrier_value(bounds, x, bstate, μ) + ev + L_x = f_x + barrier_value(bounds, x, bstate, μ) + ev + + # Compute gradient of the Lagrangian at x: + # - store the gradient of the barrier at x in `bgrad` + # - store the gradient of the Lagrangian at x in `g_L_x` + copyto!(g_L_x, g_x) + fill!(bgrad, 0) barrier_grad!(bgrad, bounds, x, bstate, μ) - equality_grad!(gx, bgrad, bounds, x, c, J, bstate) - f_x, L_xsλ, ev + equality_grad!(g_L_x, bgrad, bounds, x, c, J, bstate) + + + return L_x, ev end # TODO: do we need lagrangian_vec? Maybe for automatic differentiation? @@ -642,11 +643,13 @@ function lagrangian_linefunc(αs, d, constraints, state) end function _lagrangian_linefunc(αs, d, constraints, state) - b_ls, bounds = state.b_ls, constraints.bounds + b_ls = state.b_ls ls_update!(state.x_ls, state.x, state.s, alphax(αs)) ls_update!(b_ls.bstate, state.bstate, state.bstep, αs) constraints.c!(b_ls.c, state.x_ls) - lagrangian(d, constraints.bounds, state.x_ls, b_ls.c, b_ls.bstate, state.μ) + f_x_ls = NLSolversBase.value!(d, state.x_ls) + L_x_ls, ev_ls = lagrangian(constraints.bounds, state.x_ls, f_x_ls, b_ls.c, b_ls.bstate, state.μ) + return f_x_ls, L_x_ls, ev_ls end alphax(α::Number) = α alphax(αs::Union{Tuple,AbstractVector}) = αs[1] @@ -658,10 +661,8 @@ function lagrangian_linefunc!( state, method::IPOptimizer{typeof(backtrack_constrained)}, ) - # For backtrack_constrained, the last evaluation is the one we - # keep, so it's safe to store the results in state - state.f_x, state.L, state.ev = _lagrangian_linefunc(α, d, constraints, state) - state.L + _, L_x_ls, _ = _lagrangian_linefunc(α, d, constraints, state) + return L_x_ls end lagrangian_linefunc!(α, d, constraints, state, method) = lagrangian_linefunc(α, d, constraints, state) @@ -669,8 +670,8 @@ lagrangian_linefunc!(α, d, constraints, state, method) = ## for line searches that do use the gradient along the line function lagrangian_lineslope(αs, d, constraints, state) - f_x, L, ev, slope = _lagrangian_lineslope(αs, d, constraints, state) - L, slope + _, L_x_ls, _, slope = _lagrangian_lineslope(αs, d, constraints, state) + return L_x_ls, slope end function _lagrangian_lineslope(αs, d, constraints, state) @@ -680,19 +681,22 @@ function _lagrangian_lineslope(αs, d, constraints, state) ls_update!(b_ls.bstate, state.bstate, bstep, αs) constraints.c!(b_ls.c, state.x_ls) constraints.jacobian!(b_ls.J, state.x_ls) - f_x, L, ev = lagrangian_fg!( - state.g, + f_x_ls, g_x_ls = value_gradient!(d, state.x_ls) + g_L_x_ls = copy(state.g_L_x) + L_x_ls, ev_ls = lagrangian_fg!( + g_L_x_ls, bgrad, - d, bounds, state.x_ls, + f_x_ls, + g_x_ls, b_ls.c, b_ls.J, b_ls.bstate, state.μ, ) - slopeα = slopealpha(state.s, state.g, bstep, bgrad) - f_x, L, ev, slopeα + slopeα = slopealpha(state.s, g_L_x_ls, bstep, bgrad) + f_x_ls, L_x_ls, ev_ls, slopeα end function 
lagrangian_lineslope!( @@ -704,8 +708,8 @@ function lagrangian_lineslope!( ) # For backtrack_constrained, the last evaluation is the one we # keep, so it's safe to store the results in state - state.f_x, state.L, state.ev, slope = _lagrangian_lineslope(αs, d, constraints, state) - state.L, slope + state.f_x, state.L_x, state.ev, slope = _lagrangian_lineslope(αs, d, constraints, state) + state.L_x, slope end lagrangian_lineslope!(αs, d, constraints, state, method) = lagrangian_lineslope(αs, d, constraints, state) diff --git a/src/multivariate/solvers/constrained/ipnewton/ipnewton.jl b/src/multivariate/solvers/constrained/ipnewton/ipnewton.jl index cc8a20914..189f6a8c9 100644 --- a/src/multivariate/solvers/constrained/ipnewton/ipnewton.jl +++ b/src/multivariate/solvers/constrained/ipnewton/ipnewton.jl @@ -59,21 +59,21 @@ IPNewton(; show_linesearch::Bool = false, ) = IPNewton(linesearch, μ0, show_linesearch) -mutable struct IPNewtonState{T,Tx} <: AbstractBarrierState - x::Tx - f_x::T - x_previous::Tx - g::Tx - f_x_previous::T - H::Matrix{T} # Hessian of the Lagrangian? - HP::Any # TODO: remove HP? It's not used - Hd::Vector{Int8} # TODO: remove Hd? It's not used - s::Tx # step for x +mutable struct IPNewtonState{T,Tx,Tg} <: AbstractBarrierState + x::Tx # Current state + g_x::Tg # Gradient of the objective function at x + H_x::Matrix{T} # Hessian of the objective function at x + f_x::T # Value of objective function at x + x_previous::Tx # Previous state + f_x_previous::T # Value of the objective function at x_previous + s::Tx # Step for x # Barrier penalty fields - μ::T # coefficient of the barrier penalty - μnext::T # μ for the next iteration - L::T # value of the Lagrangian (objective + barrier + equality) - L_previous::T + μ::T # coefficient of the barrier penalty + μnext::T # μ for the next iteration + L_x::T # value of the Lagrangian (objective + barrier + equality) at x + g_L_x::Tg # gradient of the Lagrangian at x + H_L_x::Matrix{T} # Hessian of the Lagrangian at x + L_x_previous::T # value of the Lagrangian at x_previous bstate::BarrierStateVars{T} # value of slack and λ variables (current "position") bgrad::BarrierStateVars{T} # gradient of slack and λ variables at current "position" bstep::BarrierStateVars{T} # search direction for slack and λ @@ -86,39 +86,6 @@ mutable struct IPNewtonState{T,Tx} <: AbstractBarrierState Htilde::Any # Positive Cholesky factorization of H from PositiveFactorizations.jl end -# TODO: Do we need this convert thing? 
(It seems to be used with `show(IPNewtonState)`) -function Base.convert( - ::Type{IPNewtonState{T,Tx}}, - state::IPNewtonState{S,Sx}, -) where {T,Tx,S,Sx} - IPNewtonState( - convert(Tx, state.x), - T(state.f_x), - convert(Tx, state.x_previous), - convert(Tx, state.g), - T(state.f_x_previous), - convert(Matrix{T}, state.H), - state.HP, - state.Hd, - convert(Tx, state.s), - T(state.μ), - T(state.μnext), - T(state.L), - T(state.L_previous), - convert(BarrierStateVars{T}, state.bstate), - convert(BarrierStateVars{T}, state.bgrad), - convert(BarrierStateVars{T}, state.bstep), - convert(Vector{T}, state.constr_c), - convert(Matrix{T}, state.constr_J), - T(state.ev), - convert(Tx, state.x_ls), - T(state.alpha), - convert(BarrierLineSearchGrad{T}, state.b_ls), - convert(Tx, state.gtilde), - state.Htilde, - ) -end - function initial_state( method::IPNewton, options::Options, @@ -139,18 +106,12 @@ function initial_state( end # Allocate fields for the objective function n = length(initial_x) - g = similar(initial_x) - s = similar(initial_x) - f_x_previous = NaN - f_x, g_x = value_gradient!(d, initial_x) - g .= g_x # needs to be a separate copy of g_x - hessian!(d, initial_x) - H = collect(T, hessian(d)) - Hd = zeros(Int8, n) + + # TODO: Switch to `value_gradient_hessian!` + f_x, g_x, H_x = NLSolversBase.value_gradient_hessian!!(d, initial_x) # More constraints constr_J = fill!(Matrix{T}(undef, mc, n), NaN) - gtilde = copy(g) constraints.jacobian!(constr_J, initial_x) μ = T(1) bstate = BarrierStateVars(constraints.bounds, initial_x, constr_c) @@ -161,18 +122,18 @@ function initial_state( state = IPNewtonState( copy(initial_x), # Maintain current state in state.x + copy(g_x), # Store current gradient in state.g_x + copy(H_x), # Store current Hessian in state.H_x f_x, # Store current f in state.f_x - copy(initial_x), # Maintain previous state in state.x_previous - g, # Store current gradient in state.g (TODO: includes Lagrangian calculation?) 
- T(NaN), # Store previous f in state.f_x_previous - H, - 0, # will be replaced - Hd, - similar(initial_x), # Maintain current x-search direction in state.s + fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous + oftype(f_x, NaN), # Store previous f in state.f_x_previous + fill!(similar(initial_x), NaN), # Maintain current x-search direction in state.s μ, μ, - T(NaN), - T(NaN), + oftype(f_x, NaN), # Store Lagrangian at x + fill!(similar(g_x), NaN), # Store gradient of Lagrangian at x + fill!(similar(H_x), NaN), # Store Hessian of Lagrangian at x + oftype(f_x, NaN), # Store Lagrangian at x_previous bstate, bgrad, bstep, @@ -181,31 +142,48 @@ function initial_state( T(NaN), @initial_linesearch()..., # Maintain a cache for line search results in state.lsr b_ls, - gtilde, + fill!(similar(g_x), NaN), # Modified gradient at x 0, ) - Hinfo = (state.H, hessianI(initial_x, constraints, 1 ./ bstate.slack_c, 1)) - initialize_μ_λ!(state, constraints.bounds, Hinfo, method.μ0) - update_fg!(d, constraints, state, method) - update_h!(d, constraints, state, method) + HcI = hessianI(initial_x, constraints, 1 ./ bstate.slack_c, 1) + initialize_μ_λ!(state, constraints.bounds, HcI, method.μ0) + + # Update function value, gradient and Hessian matrix + update_fgh!(d, constraints, state, method) state end -function update_fg!(d, constraints::TwiceDifferentiableConstraints, state, method::IPNewton) - state.f_x, state.L, state.ev = lagrangian_fg!( - state.g, +function update_fgh!(d, constraints::TwiceDifferentiableConstraints, state, method::IPNewton) + # Compute objective function, gradient and Hessian matrix + # TODO: Switch to `value_gradient_hessian!` when it becomes available + f_x, g_x, H_x = NLSolversBase.value_gradient_hessian!!(d, state.x) + copyto!(state.g_x, g_x) + copyto!(state.H_x, H_x) + state.f_x = f_x + + # Compute value and gradient of the Lagrangian + state.L_x, state.ev = lagrangian_fg!( + state.g_L_x, state.bgrad, - d, constraints.bounds, state.x, + state.f_x, + state.g_x, state.constr_c, state.constr_J, state.bstate, state.μ, ) + + # Calculate the modified gradient (see below) update_gtilde!(d, constraints, state, method) + + # Compute the Hessian of the Lagrangian + update_lagrangian_h!(d, constraints, state, method) + + return nothing end function update_gtilde!( d, constraints::TwiceDifferentiableConstraints, state, method::IPNewton, ) # Calculate the modified x-gradient for the block-eliminated problem # gtilde is the gradient for the affine-scaling problem, i.e., # with μ=0, used in the adaptive setting of μ.
Once we calculate μ we'll correct it - gtilde, bstate, bgrad = state.gtilde, state.bstate, state.bgrad + (; gtilde, bstate, bgrad) = state bounds = constraints.bounds - copyto!(gtilde, state.g) + copyto!(gtilde, state.g_L_x) JIc = view(state.constr_J, bounds.ineqc, :) if !isempty(JIc) Hssc = Diagonal(bstate.λc ./ bstate.slack_c) @@ -234,13 +212,14 @@ function update_gtilde!( state end -function update_h!(d, constraints::TwiceDifferentiableConstraints, state, method::IPNewton) - x, μ, Hxx, J = state.x, state.μ, state.H, state.constr_J +function update_lagrangian_h!(d, constraints::TwiceDifferentiableConstraints, state, method::IPNewton) + x, μ, Hxx, J = state.x, state.μ, state.H_L_x, state.constr_J bstate, bgrad, bounds = state.bstate, state.bgrad, constraints.bounds m, n = size(J, 1), size(J, 2) - hessian!(d, state.x) # objective's Hessian - copyto!(Hxx, hessian(d)) # objective's Hessian + # Initialize the Hessian of the Lagrangian with the Hessian of the objective function + copyto!(Hxx, state.H_x) # objective's Hessian + # accumulate the constraint second derivatives λ = userλ(bstate.λc, constraints) λ[bounds.eqc] = -bstate.λcE # the negative sign is from the Hessian @@ -265,18 +244,24 @@ function update_h!(d, constraints::TwiceDifferentiableConstraints, state, method state end +# TODO: This only works for method.linesearch = backtracking_constrained_grad +# TODO: How are we meant to implement backtracking_constrained? +# It requires both an alpha and an alphaI (αmax and αImax) ... function update_state!( d, constraints::TwiceDifferentiableConstraints, state::IPNewtonState{T}, - method::IPNewton, + method::IPNewton{typeof(backtrack_constrained_grad)}, options::Options, ) where {T} - state.f_x_previous, state.L_previous = state.f_x, state.L + state.f_x_previous, state.L_x_previous = state.f_x, state.L_x bstate, bstep, bounds = state.bstate, state.bstep, constraints.bounds qp = solve_step!(state, constraints, options, method.show_linesearch) # If a step α=1 will not change any of the parameters, we can quit now. # This prevents a futile linesearch. + if !(qp isa NTuple{3,Any}) + return false + end if is_smaller_eps(state.x, state.s) && is_smaller_eps(bstate.slack_x, bstep.slack_x) && is_smaller_eps(bstate.slack_c, bstep.slack_c) && @@ -294,9 +279,7 @@ function update_state!( # Determine the actual distance of movement along the search line ϕ = linesearch_anon(d, constraints, state, method) - # TODO: This only works for method.linesearch = backtracking_constrained_grad - # TODO: How are we meant to implement backtracking_constrained?. - # It requires both an alpha and an alphaI (αmax and αImax) ...
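As an aside (not part of the patch): `qp` packs the value, slope, and curvature of the one-dimensional model of the Lagrangian along the step, which the backtracking line search below interpolates. A toy version with made-up numbers:

    L0, slope, curv = 1.0, -2.0, 4.0        # the three entries solve_step! packs into qp
    q(α) = L0 + slope * α + curv * α^2 / 2  # quadratic model of the Lagrangian along the step
    α_model = -slope / curv                 # its minimizer, 0.5 here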
+ state.alpha = method.linesearch!(ϕ, T(1), αmax, qp; show_linesearch = method.show_linesearch) @@ -395,9 +378,9 @@ function solve_step!( k == length(ΔλE) || error("exhausted targets before ΔλE") solve_slack!(bstep, Δx, bounds, bstate, bgrad, J, μ) # Solve for the quadratic parameters (use the real H, not the posdef H) - Hstepx, HstepλE = state.H * Δx - JE' * ΔλE, -JE * Δx - qp = state.L, - slopealpha(state.s, state.g, bstep, bgrad), + Hstepx, HstepλE = state.H_L_x * Δx - JE' * ΔλE, -JE * Δx + qp = state.L_x, + slopealpha(state.s, state.g_L_x, bstep, bgrad), dot(Δx, Hstepx) + dot(ΔλE, HstepλE) qp end diff --git a/src/multivariate/solvers/constrained/ipnewton/utilities/assess_convergence.jl b/src/multivariate/solvers/constrained/ipnewton/utilities/assess_convergence.jl index f928adba2..1d754d104 100644 --- a/src/multivariate/solvers/constrained/ipnewton/utilities/assess_convergence.jl +++ b/src/multivariate/solvers/constrained/ipnewton/utilities/assess_convergence.jl @@ -4,9 +4,9 @@ function assess_convergence(state::IPNewtonState, d, options::Options) assess_convergence( state.x, state.x_previous, - state.L, - state.L_previous, - [state.g; bgrad.slack_x; bgrad.slack_c; bgrad.λx; bgrad.λc; bgrad.λxE; bgrad.λcE], + state.L_x, + state.L_x_previous, + [state.g_x; bgrad.slack_x; bgrad.slack_c; bgrad.λx; bgrad.λc; bgrad.λxE; bgrad.λcE], options.x_abstol, options.f_reltol, options.g_abstol, diff --git a/src/multivariate/solvers/constrained/ipnewton/utilities/trace.jl b/src/multivariate/solvers/constrained/ipnewton/utilities/trace.jl index 05de5f4ef..35ebd1b9a 100644 --- a/src/multivariate/solvers/constrained/ipnewton/utilities/trace.jl +++ b/src/multivariate/solvers/constrained/ipnewton/utilities/trace.jl @@ -25,15 +25,15 @@ end function trace!(tr, d, state, iteration, method::IPOptimizer, options, curr_time = time()) dt = Dict() - dt["Lagrangian"] = state.L + dt["Lagrangian"] = state.L_x dt["μ"] = state.μ dt["ev"] = abs(state.ev) dt["time"] = curr_time if options.extended_trace dt["α"] = state.alpha dt["x"] = copy(state.x) - dt["g(x)"] = copy(state.g) - dt["h(x)"] = copy(state.H) + dt["g(x)"] = copy(state.g_x) + dt["h(x)"] = copy(state.H_x) if !isempty(state.bstate) dt["gtilde(x)"] = copy(state.gtilde) dt["bstate"] = copy(state.bstate) @@ -41,11 +41,11 @@ function trace!(tr, d, state, iteration, method::IPOptimizer, options, curr_time dt["c"] = copy(state.constr_c) end end - g_norm = norm(state.g, Inf) + norm(state.bgrad, Inf) + g_norm = norm(state.g_x, Inf) + norm(state.bgrad, Inf) update!( tr, iteration, - value(d), + state.f_x, g_norm, dt, options.store_trace, diff --git a/src/multivariate/solvers/constrained/samin.jl b/src/multivariate/solvers/constrained/samin.jl index a8a9f7f33..a7ec24040 100644 --- a/src/multivariate/solvers/constrained/samin.jl +++ b/src/multivariate/solvers/constrained/samin.jl @@ -62,7 +62,7 @@ function optimize( obj_fn, lb::AbstractArray, ub::AbstractArray, - x::AbstractArray{Tx}, + initial_x::AbstractArray{Tx}, method::SAMIN, options::Options = Options(), ) where {Tx} @@ -70,9 +70,11 @@ function optimize( time0 = time() # Initial time stamp used to control early stopping by options.time_limit hline = "="^80 + x = copy(initial_x) d = NonDifferentiable(obj_fn, x) + f_x = value!(d, x) - tr = OptimizationTrace{typeof(value(d)),typeof(method)}() + tr = OptimizationTrace{typeof(f_x),typeof(method)}() tracing = options.store_trace || options.show_trace || @@ -84,7 +86,6 @@ function optimize( x_tol, f_tol = options.x_abstol, options.f_abstol - x0 = copy(x) n = size(x, 1) # 
dimension of parameter # Set initial values nacc = 0 # total accepted trials t = 2.0 # temperature - will initially rise or fall to cover parameter space. Then it will fall converge = 0 # convergence indicator 0 (failure), 1 (normal success), or 2 (convergence but near bounds) f_absΔ = Inf # most recent values, to compare to when checking convergence fstar = typemax(Float64) * ones(neps) - # Initial obj_value - xopt = copy(x) - f_old = value!(d, x) - fopt = copy(f_old) # give it something to compare to - details = [f_calls(d) t fopt xopt'] + # Best values + f_opt = f_x + x_opt = copy(x) + details = [f_calls(d) t f_opt x_opt'] bounds = ub - lb # check for out-of-bounds starting values for i = 1:n @@ -114,7 +114,7 @@ function optimize( trace!( tr, d, - (x = xopt, iteration = iteration), + (; x, f_x, iteration = iteration), iteration, method, options, ) stopped_by_callback = false # main loop, first increase temperature until parameter space covered, then reduce until convergence + xp = copy(x) # proposal while converge == 0 # statistics to report at each temp change, set back to zero nup = 0 @@ -141,37 +142,35 @@ function optimize( # new Sept 2011, if bounds are same, skip the search for that vbl. # Allows restrictions without complicated programming if (lb[h] != ub[h]) - xp = copy(x) + copyto!(xp, x) xp[h] += (Tx(2.0) * rand(Tx) - Tx(1.0)) * bounds[h] if (xp[h] < lb[h]) || (xp[h] > ub[h]) xp[h] = lb[h] + (ub[h] - lb[h]) * rand(Tx) lnobds += 1 end # Evaluate function at new point - f_proposal = value(d, xp) + f_proposal = value!(d, xp) # Accept the new point if the function value decreases - if (f_proposal <= f_old) - x = copy(xp) - f_old = f_proposal + if (f_proposal <= f_x) + copyto!(x, xp) + f_x = f_proposal nacc += 1 # total number of acceptances nacp[h] += 1 # acceptances for this parameter nup += 1 # If lower than any other point, record as new optimum - if f_proposal < fopt - xopt = copy(xp) - fopt = f_proposal - d.F = f_proposal + if f_proposal < f_opt + copyto!(x_opt, xp) + f_opt = f_proposal nnew += 1 details = [details; [f_calls(d) t f_proposal xp']] end # If the point is higher, use the Metropolis criterion to decide on # acceptance or rejection. else - p = exp(-(f_proposal - f_old) / t) + p = exp(-(f_proposal - f_x) / t) if rand(Tx) < p - x = copy(xp) - f_old = copy(f_proposal) - d.F = f_proposal + copyto!(x, xp) + f_x = f_proposal nacc += 1 nacp[h] += 1 ndown += 1 @@ -186,7 +185,7 @@ function optimize( stopped_by_callback = trace!( tr, d, - (x = xopt, iteration = iteration), + (; x = x_opt, f_x = f_opt, iteration = iteration), iteration, method, options, @@ -204,10 +203,10 @@ function optimize( println(hline) println("SAMIN results") println("NO CONVERGENCE: MAXEVALS exceeded") - @printf("\n Obj. value: %16.5f\n\n", fopt) + @printf("\n Obj.
value: %16.5f\n\n", f_opt) println(" parameter search width") for i = 1:n - @printf("%16.5f %16.5f \n", xopt[i], bounds[i]) + @printf("%16.5f %16.5f \n", x_opt[i], bounds[i]) end println(hline) end @@ -215,9 +214,9 @@ function optimize( termination_code = TerminationCode.NotImplemented return MultivariateOptimizationResults( method, - x0,# initial_x, - xopt, #pick_best_x(f_incr_pick, state), - fopt, # pick_best_f(f_incr_pick, state, d), + initial_x, + x_opt, #pick_best_x(f_incr_pick, state), + f_opt, # pick_best_f(f_incr_pick, state), f_calls(d), #iteration, x_tol,#T(options.x_tol), 0.0,#T(options.x_tol), @@ -225,10 +224,10 @@ function optimize( NaN,# x_abschange(state), f_tol,#T(options.f_tol), 0.0,#T(options.f_tol), - f_absΔ,#f_abschange(d, state), - NaN,#f_abschange(d, state), + f_absΔ,#f_abschange(state), + NaN,#f_abschange(state), 0.0,#T(options.g_tol), - NaN,#g_residual(d), + NaN,#g_residual(state), tr, f_calls(d), g_calls(d), @@ -274,7 +273,7 @@ function optimize( println(hline) println("samin: intermediate results before next temperature change") println("temperature: ", round(t, digits = 5)) - println("current best function value: ", round(fopt, digits = 5)) + println("current best function value: ", round(f_opt, digits = 5)) println("total evaluations so far: ", f_calls(d)) println("total moves since last temperature reduction: ", nup + ndown + nrej) println("downhill: ", nup) @@ -285,16 +284,16 @@ function optimize( println() println(" parameter search width") for i = 1:n - @printf("%16.5f %16.5f \n", xopt[i], bounds[i]) + @printf("%16.5f %16.5f \n", x_opt[i], bounds[i]) end println(hline * "\n") end # Check for convergence, if we have covered the parameter space if coverage_ok # last value close enough to last neps values? - fstar[1] = f_old - f_absΔ = abs.(fopt - f_old) # close enough to best so far? - if all((abs.(fopt .- fstar)) .< f_tol) # within to for last neps trials? + fstar[1] = f_x + f_absΔ = abs.(f_opt - f_x) # close enough to best so far? + if all((abs.(f_opt .- fstar)) .< f_tol) # within tol for last neps trials? f_converged = true # check for bounds narrow enough for parameter convergence if any(bounds .> x_tol) converge = 0 # no convergence @@ -336,10 +335,10 @@ function optimize( ) end println("total number of objective function evaluations: ", f_calls(d)) - @printf("\n Obj. value: %16.10f\n\n", fopt) + @printf("\n Obj.
diff --git a/src/multivariate/solvers/first_order/accelerated_gradient_descent.jl b/src/multivariate/solvers/first_order/accelerated_gradient_descent.jl
index 27896487c..7e1ca1d53 100644
--- a/src/multivariate/solvers/first_order/accelerated_gradient_descent.jl
+++ b/src/multivariate/solvers/first_order/accelerated_gradient_descent.jl
@@ -22,8 +22,10 @@ function AcceleratedGradientDescent(;
     AcceleratedGradientDescent(_alphaguess(alphaguess), linesearch, manifold)
 end
 
-mutable struct AcceleratedGradientDescentState{T,Tx} <: AbstractOptimizerState
+mutable struct AcceleratedGradientDescentState{T,Tx,Tg} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    f_x::T
     x_previous::Tx
     f_x_previous::T
     iteration::Int
@@ -35,25 +37,25 @@ end
 
 function initial_state(
     method::AcceleratedGradientDescent,
-    options,
+    ::Options,
     d,
-    initial_x::AbstractArray{T},
-) where {T}
+    initial_x::AbstractArray,
+)
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
-
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     AcceleratedGradientDescentState(
-        copy(initial_x), # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        real(T)(NaN), # Store previous f in state.f_x_previous
+        initial_x, # Maintain current state in state.x
+        copy(g_x), # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
         0, # Iteration
         copy(initial_x), # Maintain intermediary current state in state.y
-        similar(initial_x), # Maintain intermediary state in state.y_previous
-        similar(initial_x), # Maintain current search direction in state.s
+        fill!(similar(initial_x), NaN), # Maintain intermediary state in state.y_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
         @initial_linesearch()...,
     )
 end
@@ -63,11 +65,10 @@ function update_state!(
     state::AcceleratedGradientDescentState,
     method::AcceleratedGradientDescent,
 )
-    value_gradient!(d, state.x)
     state.iteration += 1
-    project_tangent!(method.manifold, gradient(d), state.x)
+
     # Search direction is always the negative gradient
-    state.s .= .-gradient(d)
+    state.s .= .-state.g_x
 
     # Determine the distance of movement along the search line
     lssuccess = perform_linesearch!(state, method, ManifoldObjective(method.manifold, d))
diff --git a/src/multivariate/solvers/first_order/adam.jl b/src/multivariate/solvers/first_order/adam.jl
index b8fef5b09..6eb608210 100644
--- a/src/multivariate/solvers/first_order/adam.jl
+++ b/src/multivariate/solvers/first_order/adam.jl
@@ -45,81 +45,96 @@ function default_options(method::Adam)
     (; allow_f_increases = true, iterations = 10_000)
 end
 
-mutable struct AdamState{Tx,T,Tm,Tu,Ti} <: AbstractOptimizerState
+mutable struct AdamState{Tx,T,Tg,Tu,Ti} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    f_x::T
     x_previous::Tx
     f_x_previous::T
     s::Tx
-    m::Tm
+    m::Tg
     u::Tu
     alpha::T
     iter::Ti
 end
-function reset!(method, state::AdamState, obj, x)
-    value_gradient!!(obj, x)
-end
-function _get_init_params(method::Adam{T}) where {T<:Real}
-    method.α, method.β₁, method.β₂
+function reset!(method::Adam, state::AdamState, obj, x)
+    # Update function value and gradient
+    copyto!(state.x, x)
+    retract!(method.manifold, state.x)
+    f_x, g_x = value_gradient!(obj, state.x)
+    copyto!(state.g_x, g_x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_x = f_x
+
+    # Reset history
+    fill!(state.x_previous, NaN)
+    state.f_x_previous = oftype(state.f_x_previous, NaN)
+    fill!(state.s, NaN)
+
+    # Reset momentum
+    copyto!(state.m, state.g_x)
+    fill!(state.u, false)
+
+    return nothing
 end
-function _get_init_params(method::Adam)
-    method.α(1), method.β₁, method.β₂
+function _init_alpha(method::Adam)
+    (; α) = method
+    return α isa Real ? α : α(1)
 end
 
-function initial_state(method::Adam, options::Options, d, initial_x::AbstractArray{T}) where {T}
+function initial_state(method::Adam, ::Options, d, initial_x::AbstractArray)
+    # Compute function value and gradient
     initial_x = copy(initial_x)
-
-    value_gradient!!(d, initial_x)
-    α, β₁, β₂ = _get_init_params(method)
-
-    m = copy(gradient(d))
-    u = zero(m)
-    iter = 0
+    retract!(method.manifold, initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    g_x = copy(g_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     AdamState(
         initial_x, # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        real(T(NaN)), # Store previous f in state.f_x_previous
-        similar(initial_x), # Maintain current search direction in state.s
-        m,
-        u,
-        α,
-        iter,
+        g_x, # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
+        copy(g_x), # m
+        zero(g_x), # u
+        _init_alpha(method), # alpha
+        0, # iter
     )
 end
 
-function _update_iter_alpha_in_state!(state::AdamState, method::Adam{T}) where {T<:Real}
+function update_state!(d, state::AdamState, method::Adam)
+    state.iter += 1
 
-    state.iter = state.iter + 1
-end
-
-function _update_iter_alpha_in_state!(state::AdamState, method::Adam)
+    # Update α parameter if it is not constant
+    if !(method.α isa Real)
+        state.alpha = method.α(state.iter)
+    end
 
-    state.iter = state.iter + 1
-    state.alpha = method.α(state.iter)
-end
-
-function update_state!(d, state::AdamState{T}, method::Adam) where {T}
-
-    _update_iter_alpha_in_state!(state, method)
-    value_gradient!(d, state.x)
-
-    α, β₁, β₂, ϵ = state.alpha, method.β₁, method.β₂, method.ϵ
+    # Unpack parameters
+    α = state.alpha
+    (; β₁, β₂, ϵ) = method
     a = 1 - β₁
     b = 1 - β₂
     m, u = state.m, state.u
     v = u
 
-    m .= β₁ .* m .+ a .* gradient(d)
-    v .= β₂ .* v .+ b .* gradient(d) .^ 2
+    m .= β₁ .* m .+ a .* state.g_x
+    v .= β₂ .* v .+ b .* state.g_x .^ 2
     # m̂ = m./(1-β₁^state.iter)
     # v̂ = v./(1-β₂^state.iter)
     #@. z = z - α*m̂/(sqrt(v̂+ϵ))
     αₜ = α * sqrt(1 - β₂^state.iter) / (1 - β₁^state.iter)
+
+    # Update current state
+    copyto!(state.x_previous, state.x)
+    state.f_x_previous = state.f_x
     @. state.x = state.x - αₜ * m / (sqrt(v) + ϵ)
-    # Update current position # x = x + alpha * s
-    false # break on linesearch error
+
+    false # no error
 end
 
 function trace!(tr, d, state::AdamState, iteration::Integer, method::Adam, options::Options, curr_time = time())
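(Aside — the update above folds both of Adam's bias corrections into the single step size αₜ. A self-contained, illustrative sketch of the same step; the helper name and defaults are assumptions, not the package API:

    # One Adam step: m, v are first/second moment buffers, g the gradient at x.
    function adam_step!(x, m, v, g; α = 0.001, β₁ = 0.9, β₂ = 0.999, ϵ = 1e-8, t = 1)
        @. m = β₁ * m + (1 - β₁) * g
        @. v = β₂ * v + (1 - β₂) * g^2
        αₜ = α * sqrt(1 - β₂^t) / (1 - β₁^t)    # bias-corrected step size
        @. x -= αₜ * m / (sqrt(v) + ϵ)
        return x
    end
)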
diff --git a/src/multivariate/solvers/first_order/adamax.jl b/src/multivariate/solvers/first_order/adamax.jl
index 782c3d7e5..678a683d4 100644
--- a/src/multivariate/solvers/first_order/adamax.jl
+++ b/src/multivariate/solvers/first_order/adamax.jl
@@ -44,75 +44,90 @@ function default_options(method::AdaMax)
     (; allow_f_increases = true, iterations = 10_000)
 end
 
-
-mutable struct AdaMaxState{Tx,T,Tm,Tu,Ti} <: AbstractOptimizerState
+mutable struct AdaMaxState{Tx,T,Tg} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    f_x::T
     x_previous::Tx
     f_x_previous::T
     s::Tx
-    m::Tm
-    u::Tu
+    m::Tg
+    u::Tg
     alpha::T
-    iter::Ti
-end
-function reset!(method, state::AdaMaxState, obj, x)
-    value_gradient!!(obj, x)
+    iter::Int
 end
-function _get_init_params(method::AdaMax{T}) where {T<:Real}
-    method.α, method.β₁, method.β₂
+function reset!(method::AdaMax, state::AdaMaxState, obj, x)
+    # Update function value and gradient
+    copyto!(state.x, x)
+    retract!(method.manifold, state.x)
+    f_x, g_x = value_gradient!(obj, state.x)
+    copyto!(state.g_x, g_x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_x = f_x
+
+    # Delete history
+    fill!(state.x_previous, NaN)
+    state.f_x_previous = oftype(state.f_x_previous, NaN)
+    fill!(state.s, NaN)
+
+    # Update momentum
+    copyto!(state.m, state.g_x)
+    fill!(state.u, false)
+
+    return nothing
 end
-function _get_init_params(method::AdaMax)
-    method.α(1), method.β₁, method.β₂
+function _init_alpha(method::AdaMax)
+    (; α) = method
+    return α isa Real ? α : α(1)
 end
 
 function initial_state(method::AdaMax, options::Options, d, initial_x::AbstractArray{T}) where {T}
+    # Compute function value and gradient
     initial_x = copy(initial_x)
-
-    value_gradient!!(d, initial_x)
-    α, β₁, β₂ = _get_init_params(method)
-
-    m = copy(gradient(d))
-    u = zero(m)
-    iter = 0
+    retract!(method.manifold, initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    g_x = copy(g_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     AdaMaxState(
         initial_x, # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        real(T(NaN)), # Store previous f in state.f_x_previous
-        similar(initial_x), # Maintain current search direction in state.s
-        m,
-        u,
-        α,
-        iter,
+        g_x, # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
+        copy(g_x), # m
+        zero(g_x), # u
+        _init_alpha(method), # alpha
+        0, # iter
     )
 end
 
-function _update_iter_alpha_in_state!(state::AdaMaxState, method::AdaMax{T}) where {T<:Real}
+function update_state!(d, state::AdaMaxState, method::AdaMax)
+    state.iter += 1
 
-    state.iter = state.iter + 1
-end
+    # Update step size alpha if it is not constant
+    if !(method.α isa Real)
+        state.alpha = method.α(state.iter)
+    end
 
-function _update_iter_alpha_in_state!(state::AdaMaxState, method::AdaMax)
-
-    state.iter = state.iter + 1
-    state.alpha = method.α(state.iter)
-end
-
-function update_state!(d, state::AdaMaxState{T}, method::AdaMax) where {T}
-    _update_iter_alpha_in_state!(state, method)
-    value_gradient!(d, state.x)
-    α, β₁, β₂, ϵ = state.alpha, method.β₁, method.β₂, method.ϵ
+    # Unpack parameters
+    α = state.alpha
+    (; β₁, β₂, ϵ) = method
     a = 1 - β₁
-    m, u = state.m, state.u
-    m .= β₁ .* m .+ a .* gradient(d)
-    u .= max.(ϵ, max.(β₂ .* u, abs.(gradient(d)))) # I know it's not there in the paper but if m and u start at 0 for some element... NaN occurs next
+    (; g_x, m, u) = state
+    m .= β₁ .* m .+ a .* g_x
+    u .= max.(ϵ, max.(β₂ .* u, abs.(g_x))) # I know it's not there in the paper but if m and u start at 0 for some element... NaN occurs next
 
+    # Update current state
+    copyto!(state.x_previous, state.x)
+    state.f_x_previous = state.f_x
     @. state.x = state.x - (α / (1 - β₁^state.iter)) * m / u
-    # Update current position # x = x + alpha * s
-    false # break on linesearch error
+
+    false # no error
 end
 
 function trace!(tr, d, state::AdaMaxState, iteration::Integer, method::AdaMax, options::Options, curr_time = time())
diff --git a/src/multivariate/solvers/first_order/bfgs.jl b/src/multivariate/solvers/first_order/bfgs.jl
index ed1c3b455..9e6041b51 100644
--- a/src/multivariate/solvers/first_order/bfgs.jl
+++ b/src/multivariate/solvers/first_order/bfgs.jl
@@ -49,8 +49,10 @@ end
 
 mutable struct BFGSState{Tx,Tm,T,G} <: AbstractOptimizerState
     x::Tx
+    g_x::G
+    f_x::T
     x_previous::Tx
-    g_previous::G
+    g_x_previous::G
     f_x_previous::T
     dx::Tx
     dg::Tx
@@ -69,74 +71,82 @@ function _init_identity_matrix(x::AbstractArray{T}, scale::T = T(1)) where {T}
 end
 
 function reset!(method, state::BFGSState, obj, x)
-    n = length(x)
-    T = eltype(x)
+    # Update function value and gradient
     retract!(method.manifold, x)
-    value_gradient!(obj, x)
-    project_tangent!(method.manifold, gradient(obj), x)
+    f_x, g_x = value_gradient!(obj, x)
+    project_tangent!(method.manifold, g_x, x)
+    state.f_x = f_x
+    copyto!(state.g_x, g_x)
 
+    # Delete history
+    fill!(state.x_previous, NaN)
+    fill!(state.g_x_previous, NaN)
+    state.f_x_previous = oftype(state.f_x_previous, NaN)
+
+    # Update approximation of inverse Hessian
     if method.initial_invH === nothing
         if method.initial_stepnorm === nothing
             # Identity matrix of size n x n
             state.invH = _init_identity_matrix(x)
         else
-            initial_scale = T(method.initial_stepnorm) * inv(norm(gradient(obj), Inf))
+            T = eltype(state.invH)
+            initial_scale = T(method.initial_stepnorm) * inv(norm(g_x, Inf))
             state.invH = _init_identity_matrix(x, initial_scale)
         end
     else
         state.invH .= method.initial_invH(x)
    end
+
+    return nothing
 end
 
-function initial_state(method::BFGS, options::Options, d, initial_x::AbstractArray{T}) where {T}
-    n = length(initial_x)
+function initial_state(method::BFGS, ::Options, d, initial_x::AbstractArray)
+    # Compute function value and gradient
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
-
+    # Initialize approximation of inverse Hessian
     if method.initial_invH === nothing
         if method.initial_stepnorm === nothing
             # Identity matrix of size n x n
             invH0 = _init_identity_matrix(initial_x)
         else
-            initial_scale = T(method.initial_stepnorm) * inv(norm(gradient(d), Inf))
+            T = eltype(g_x)
+            initial_scale = T(method.initial_stepnorm) * inv(norm(g_x, Inf))
             invH0 = _init_identity_matrix(initial_x, initial_scale)
         end
     else
         invH0 = method.initial_invH(initial_x)
     end
+
     # Maintain a cache for line search results
     # Trace the history of states visited
     BFGSState(
         initial_x, # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        copy(gradient(d)), # Store previous gradient in state.g_previous
-        real(T)(NaN), # Store previous f in state.f_x_previous
-        similar(initial_x), # Store changes in position in state.dx
-        similar(initial_x), # Store changes in gradient in state.dg
-        similar(initial_x), # Buffer stored in state.u
+        copy(g_x), # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        fill!(similar(g_x), NaN), # Store previous gradient in state.g_x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Store changes in position in state.dx
+        fill!(similar(initial_x), NaN), # Store changes in gradient in state.dg
+        fill!(similar(initial_x), NaN), # Buffer stored in state.u
         invH0, # Store current invH in state.invH
-        similar(initial_x), # Store current search direction in state.s
+        fill!(similar(initial_x), NaN), # Store current search direction in state.s
         @initial_linesearch()...,
     )
 end
 
 function update_state!(d, state::BFGSState, method::BFGS)
-    n = length(state.x)
-    T = eltype(state.s)
     # Set the search direction
     # Search direction is the negative gradient divided by the approximate Hessian
-    mul!(vec(state.s), state.invH, vec(gradient(d)))
-    rmul!(state.s, T(-1))
+    mul!(vec(state.s), state.invH, vec(state.g_x))
+    rmul!(state.s, eltype(state.s)(-1))
     project_tangent!(method.manifold, state.s, state.x)
 
-    # Maintain a record of the previous gradient
-    copyto!(state.g_previous, gradient(d))
-
     # Determine the distance of movement along the search line
     # This call resets invH to initial_invH if the former is not positive
     # semi-definite
@@ -150,11 +160,17 @@ function update_state!(d, state::BFGSState, method::BFGS)
     return !lssuccess # break on linesearch error
 end
 
-function update_h!(d, state, method::BFGS)
-    n = length(state.x)
+function update_fgh!(d, state::BFGSState, method::BFGS)
     (; invH, dx, dg, u) = state
+
+    # Update function value and gradient
+    f_x, g_x = value_gradient!(d, state.x)
+    copyto!(state.g_x, g_x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_x = f_x
+
     # Measure the change in the gradient
-    dg .= gradient(d) .- state.g_previous
+    dg .= state.g_x .- state.g_x_previous
 
     # Update the inverse Hessian approximation using Sherman-Morrison
     dx_dg = real(dot(dx, dg))
@@ -166,6 +182,7 @@
     # invH = invH + c1 * (s * s') - c2 * (u * s' + s * u')
     if (invH isa Array) # i.e. not a CuArray
+        n = length(dx)
         @inbounds for j = 1:n
             c1dxj = c1 * dx[j]'
             c2dxj = c2 * dx[j]'
@@ -191,16 +208,15 @@ function trace!(tr, d, state::BFGSState, iteration::Integer, method::BFGS, optio
     dt["time"] = curr_time
     if options.extended_trace
         dt["x"] = copy(state.x)
-        dt["g(x)"] = copy(gradient(d))
+        dt["g(x)"] = copy(state.g_x)
         dt["~inv(H)"] = copy(state.invH)
         dt["Current step size"] = state.alpha
     end
-    g_norm = norm(gradient(d), Inf)
     update!(
         tr,
         iteration,
-        value(d),
-        g_norm,
+        state.f_x,
+        g_residual(state),
         dt,
         options.store_trace,
         options.show_trace,
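(Aside — the @inbounds loop in the bfgs.jl hunk above realizes, without allocations, the rank-two update written in its own comment, `invH = invH + c1 * (s * s') - c2 * (u * s' + s * u')`. An equivalent dense formulation, as an illustrative sketch only:

    using LinearAlgebra: dot

    # Dense BFGS inverse-Hessian update (sketch; u = invH * dg).
    function bfgs_update(invH, dx, dg)
        dx_dg = dot(dx, dg)
        dx_dg > 0 || return invH                 # skip if curvature condition fails
        u = invH * dg
        c1 = (dx_dg + dot(dg, u)) / dx_dg^2
        c2 = 1 / dx_dg
        return invH + c1 * (dx * dx') - c2 * (u * dx' + dx * u')
    end
)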
diff --git a/src/multivariate/solvers/first_order/cg.jl b/src/multivariate/solvers/first_order/cg.jl
index fdc777f68..99fd9d724 100644
--- a/src/multivariate/solvers/first_order/cg.jl
+++ b/src/multivariate/solvers/first_order/cg.jl
@@ -93,8 +93,10 @@ end
 
 mutable struct ConjugateGradientState{Tx,T,G} <: AbstractOptimizerState
     x::Tx
+    g_x::G
+    f_x::T
     x_previous::Tx
-    g_previous::G
+    g_x_previous::G
     f_x_previous::T
     y::Tx
     py::Tx
@@ -104,24 +106,30 @@ mutable struct ConjugateGradientState{Tx,T,G} <: AbstractOptimizerState
 
 function reset!(cg::ConjugateGradient, cgs::ConjugateGradientState, obj, x)
-    cgs.x .= x
-    _precondition!(cgs.pg, cg, x, gradient(obj))
-
+    copyto!(cgs.x, x)
+    retract!(cg.manifold, cgs.x)
+    f_x, g_x = value_gradient!(obj, cgs.x)
+    project_tangent!(cg.manifold, g_x, cgs.x)
+    cgs.f_x = f_x
+    copyto!(cgs.g_x, g_x)
+
+    fill!(cgs.x_previous, NaN)
+    cgs.f_x_previous = oftype(cgs.f_x_previous, NaN)
+    fill!(cgs.g_x_previous, NaN)
+
+    _precondition!(cgs.pg, cg, x, g_x)
     if cg.P !== nothing
         project_tangent!(cg.manifold, cgs.pg, x)
     end
     cgs.s .= .-cgs.pg
-    cgs.f_x_previous = typeof(cgs.f_x_previous)(NaN)
+
+    return nothing
 end
 
-function initial_state(method::ConjugateGradient, options::Options, d, initial_x)
-    T = eltype(initial_x)
+function initial_state(method::ConjugateGradient, ::Options, d, initial_x)
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
-
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
-    pg = copy(gradient(d))
+    f_x, g_x = value_gradient!(d, initial_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     # Could move this out? as a general check?
     #=
@@ -136,16 +144,19 @@ function initial_state(method::ConjugateGradient, options::Options, d, initial_x
     # Determine the intial search direction
     # if we don't precondition, then this is an extra superfluous copy
     # TODO: consider allowing a reference for pg instead of a copy
-    _precondition!(pg, method, initial_x, gradient(d))
+    pg = copy(g_x)
+    _precondition!(pg, method, initial_x, g_x)
     if method.P !== nothing
         project_tangent!(method.manifold, pg, initial_x)
     end
 
     ConjugateGradientState(
         initial_x, # Maintain current state in state.x
-        0 .* (initial_x), # Maintain previous state in state.x_previous
-        0 .* (gradient(d)), # Store previous gradient in state.g_previous
-        real(T)(NaN), # Store previous f in state.f_x_previous
+        copy(g_x), # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        fill!(similar(g_x), NaN), # Store previous gradient in state.g_x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
         0 .* (initial_x), # Intermediate value in CG calculation
         0 .* (initial_x), # Preconditioned intermediate value in CG calculation
         pg, # Maintain the preconditioned gradient in pg
@@ -158,21 +169,23 @@ function update_state!(d, state::ConjugateGradientState, method::ConjugateGradie
     # Search direction is predetermined
 
     # Maintain a record of the previous gradient
-    copyto!(state.g_previous, gradient(d))
+    copyto!(state.g_x_previous, state.g_x)
 
     # Determine the distance of movement along the search line
     lssuccess = perform_linesearch!(state, method, ManifoldObjective(method.manifold, d))
 
     # Update current position
     # x = x + alpha * s
-    @. state.x = state.x + state.alpha * state.s
+    state.x .= muladd.(state.alpha, state.s, state.x)
     retract!(method.manifold, state.x)
 
     # Update the function value and gradient
-    value_gradient!(d, state.x)
-    project_tangent!(method.manifold, gradient(d), state.x)
+    f_x, g_x = value_gradient!(d, state.x)
+    project_tangent!(method.manifold, g_x, state.x)
+    state.f_x = f_x
+    copyto!(state.g_x, g_x)
 
     # Check sanity of function and gradient
-    isfinite(value(d)) || error("Non-finite f(x) while optimizing ($(value(d)))")
+    isfinite(f_x) || error(LazyString("Non-finite f(x) while optimizing (", f_x, ")"))
 
     # Determine the next search direction using HZ's CG rule
     # Calculate the beta factor (HZ2013)
@@ -186,19 +199,19 @@ function update_state!(d, state::ConjugateGradientState, method::ConjugateGradie
     # also updates P for the preconditioning step below
     _apply_precondprep(method, state.x)
     dPd = _inverse_precondition(method, state)
-    etak = method.eta * real(dot(state.s, state.g_previous)) / dPd # New in HZ2013
-    state.y .= gradient(d) .- state.g_previous
+    etak = method.eta * real(dot(state.s, state.g_x_previous)) / dPd # New in HZ2013
+    state.y .= g_x .- state.g_x_previous
     ydots = real(dot(state.y, state.s))
     copyto!(state.py, state.pg) # below, store pg - pg_previous in py
     # P already updated in _apply_precondprep above
-    __precondition!(state.pg, method.P, gradient(d))
+    __precondition!(state.pg, method.P, g_x)
     state.py .= state.pg .- state.py
     # ydots may be zero if f is not strongly convex or the line search does not satisfy Wolfe
     betak = (
         real(dot(state.y, state.pg)) -
-        real(dot(state.y, state.py)) * real(dot(gradient(d), state.s)) / ydots
+        real(dot(state.y, state.py)) * real(dot(g_x, state.s)) / ydots
     ) / ydots
     # betak may be undefined if ydots is zero (may due to f not strongly convex or non-Wolfe linesearch)
     beta = NaNMath.max(betak, etak) # TODO: Set to zero if betak is NaN?
@@ -207,7 +220,8 @@ function update_state!(d, state::ConjugateGradientState, method::ConjugateGradie
     return !lssuccess # break on linesearch error
 end
 
-update_g!(d, state, method::ConjugateGradient) = nothing
+# Function value, gradient and Hessian are already updated in `update_state!`
+update_fgh!(d, state, ::ConjugateGradient) = nothing
 
 function trace!(
     tr,
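(Aside — written out, the β computed above is the Hager–Zhang rule with the η safeguard noted as "New in HZ2013". An unpreconditioned, illustrative sketch (P = I); the `eta = 0.4` default here is only a placeholder:

    using LinearAlgebra: dot

    # Hager–Zhang direction update: s ← -g + max(βₖ, ηₖ) s
    function hz_direction(s, g, g_prev, eta = 0.4)
        y = g .- g_prev
        ydots = dot(y, s)                         # can be ≈ 0 without a Wolfe line search
        betak = (dot(y, g) - dot(y, y) * dot(g, s) / ydots) / ydots
        etak = eta * dot(s, g_prev) / dot(s, s)   # lower bound on β
        return -g .+ max(betak, etak) .* s        # the patch uses NaNMath.max here
    end
)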
diff --git a/src/multivariate/solvers/first_order/gradient_descent.jl b/src/multivariate/solvers/first_order/gradient_descent.jl
index b6abe20ec..b999f2096 100644
--- a/src/multivariate/solvers/first_order/gradient_descent.jl
+++ b/src/multivariate/solvers/first_order/gradient_descent.jl
@@ -39,47 +39,59 @@ function GradientDescent(;
     GradientDescent(_alphaguess(alphaguess), linesearch, P, precondprep, manifold)
 end
 
-mutable struct GradientDescentState{Tx,T} <: AbstractOptimizerState
+mutable struct GradientDescentState{Tx,Tg,T} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    f_x::T
     x_previous::Tx
     f_x_previous::T
     s::Tx
     @add_linesearch_fields()
 end
 
-function reset!(method, state::GradientDescentState, obj, x)
-    retract!(method.manifold, x)
-    value_gradient!!(obj, x)
+function reset!(method::GradientDescent, state::GradientDescentState, obj, x)
+    # Update function value and gradient
+    copyto!(state.x, x)
+    retract!(method.manifold, state.x)
+    f_x, g_x = value_gradient!(obj, state.x)
+    copyto!(state.g_x, g_x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_x = f_x
+
+    # Delete history
+    fill!(state.x_previous, NaN)
+    state.f_x_previous = oftype(state.f_x_previous, NaN)
+    fill!(state.s, NaN)
 
-    project_tangent!(method.manifold, gradient(obj), x)
+    return nothing
 end
 
 function initial_state(
     method::GradientDescent,
-    options,
+    ::Options,
     d,
     initial_x::AbstractArray{T},
 ) where {T}
+    # Compute function value and gradient
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
-
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    g_x = copy(g_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     GradientDescentState(
         initial_x, # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        real(T(NaN)), # Store previous f in state.f_x_previous
-        similar(initial_x), # Maintain current search direction in state.s
+        g_x, # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
        @initial_linesearch()...,
     )
 end
 
 function update_state!(d, state::GradientDescentState{T}, method::GradientDescent) where {T}
-    value_gradient!(d, state.x)
     # Search direction is always the negative preconditioned gradient
-    project_tangent!(method.manifold, gradient(d), state.x)
-    _precondition!(state.s, method, state.x, gradient(d))
+    _precondition!(state.s, method, state.x, state.g_x)
     rmul!(state.s, eltype(state.s)(-1))
     if method.P !== nothing
         project_tangent!(method.manifold, state.s, state.x)
@@ -91,6 +103,7 @@ function update_state!(d, state::GradientDescentState{T}, method::GradientDescen
     # Update current position
     # x = x + alpha * s
     @. state.x = state.x + state.alpha * state.s
     retract!(method.manifold, state.x)
+
     return !lssuccess # break on linesearch error
 end
diff --git a/src/multivariate/solvers/first_order/l_bfgs.jl b/src/multivariate/solvers/first_order/l_bfgs.jl
index 2d074a7c6..23aa175c7 100644
--- a/src/multivariate/solvers/first_order/l_bfgs.jl
+++ b/src/multivariate/solvers/first_order/l_bfgs.jl
@@ -137,8 +137,10 @@ Base.summary(io::IO, ::LBFGS) = print(io, "L-BFGS")
 
 mutable struct LBFGSState{Tx,Tdx,Tdg,T,G} <: AbstractOptimizerState
     x::Tx
+    g_x::G
+    f_x::T
     x_previous::Tx
-    g_previous::G
+    g_x_previous::G
     rho::Vector{T}
     dx_history::Tdx
     dg_history::Tdg
@@ -154,45 +156,52 @@ mutable struct LBFGSState{Tx,Tdx,Tdg,T,G} <: AbstractOptimizerState
 end
 function reset!(method::LBFGS, state::LBFGSState, obj, x::AbstractArray)
     retract!(method.manifold, x)
-    value_gradient!(obj, x)
-    project_tangent!(method.manifold, gradient(obj), x)
+    f_x, g_x = value_gradient!(obj, x)
+    project_tangent!(method.manifold, g_x, x)
+    state.f_x = f_x
+    copyto!(state.g_x, g_x)
+
+    fill!(state.x_previous, NaN)
+    fill!(state.g_x_previous, NaN)
+    state.f_x_previous = oftype(state.f_x_previous, NaN)
     state.pseudo_iteration = 0
+
+    return nothing
 end
 
 function initial_state(method::LBFGS, options::Options, d, initial_x::AbstractArray)
     T = real(eltype(initial_x))
-    n = length(initial_x)
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
-
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     LBFGSState(
         initial_x, # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        copy(gradient(d)), # Store previous gradient in state.g_previous
+        copy(g_x), # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        fill!(similar(g_x), NaN), # Store previous gradient in state.g_x_previous
         fill!(Vector{T}(undef, method.m), NaN), # state.rho
-        [similar(initial_x) for i = 1:method.m], # Store changes in position in state.dx_history
-        [eltype(gradient(d))(NaN) .* gradient(d) for i = 1:method.m], # Store changes in position in state.dg_history
-        T(NaN) * initial_x, # Buffer for new entry in state.dx_history
-        T(NaN) * initial_x, # Buffer for new entry in state.dg_history
-        T(NaN) * initial_x, # Buffer stored in state.u
-        real(T)(NaN), # Store previous f in state.f_x_previous
-        similar(initial_x), #Buffer for use by twoloop
-        Vector{T}(undef, method.m), #Buffer for use by twoloop
+        [fill!(similar(initial_x), NaN) for i = 1:method.m], # Store changes in position in state.dx_history
+        [fill!(similar(g_x), NaN) for i = 1:method.m], # Store changes in gradient in state.dg_history
+        fill!(similar(initial_x), NaN), # Buffer for new entry in state.dx_history
+        fill!(similar(initial_x), NaN), # Buffer for new entry in state.dg_history
+        fill!(similar(initial_x), NaN), # Buffer stored in state.u
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), #Buffer for use by twoloop
+        fill!(Vector{T}(undef, method.m), NaN),#Buffer for use by twoloop
         0,
-        eltype(gradient(d))(NaN) .* gradient(d), # Store current search direction in state.s
+        fill!(similar(g_x), NaN), # Store current search direction in state.s
        @initial_linesearch()...,
     )
 end
 
 function update_state!(d, state::LBFGSState, method::LBFGS)
-    n = length(state.x)
     # Increment the number of steps we've had to perform
     state.pseudo_iteration += 1
 
-    project_tangent!(method.manifold, gradient(d), state.x)
+    # TODO: Can this be removed?
+    project_tangent!(method.manifold, state.g_x, state.x)
 
     # update the preconditioner
     _apply_precondprep(method, state.x)
@@ -200,7 +209,7 @@ function update_state!(d, state::LBFGSState, method::LBFGS)
     # Determine the L-BFGS search direction # FIXME just pass state and method?
     twoloop!(
         state.s,
-        gradient(d),
+        state.g_x,
         state.rho,
         state.dx_history,
         state.dg_history,
@@ -213,9 +222,6 @@ function update_state!(d, state::LBFGSState, method::LBFGS)
     )
     project_tangent!(method.manifold, state.s, state.x)
 
-    # Save g value to prepare for update_g! call
-    copyto!(state.g_previous, gradient(d))
-
     # Determine the distance of movement along the search line
     lssuccess = perform_linesearch!(state, method, ManifoldObjective(method.manifold, d))
@@ -228,10 +234,15 @@
 end
 
-function update_h!(d, state::LBFGSState, method::LBFGS)
-    n = length(state.x)
+function update_fgh!(d, state::LBFGSState, method::LBFGS)
+    # Update function value and gradient
+    f_x, g_x = value_gradient!(d, state.x)
+    copyto!(state.g_x, g_x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_x = f_x
+
     # Measure the change in the gradient
-    state.dg .= gradient(d) .- state.g_previous
+    state.dg .= state.g_x .- state.g_x_previous
 
     # Update the L-BFGS history of positions and gradients
     rho_iteration = one(eltype(state.dx)) / real(dot(state.dx, state.dg))
diff --git a/src/multivariate/solvers/first_order/momentum_gradient_descent.jl b/src/multivariate/solvers/first_order/momentum_gradient_descent.jl
index b9eca6af1..7b074e17c 100644
--- a/src/multivariate/solvers/first_order/momentum_gradient_descent.jl
+++ b/src/multivariate/solvers/first_order/momentum_gradient_descent.jl
@@ -19,8 +19,10 @@ function MomentumGradientDescent(;
     MomentumGradientDescent(mu, _alphaguess(alphaguess), linesearch, manifold)
 end
 
-mutable struct MomentumGradientDescentState{Tx,T} <: AbstractOptimizerState
+mutable struct MomentumGradientDescentState{Tx,Tg,T} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    f_x::T
     x_previous::Tx
     x_momentum::Tx
     f_x_previous::T
@@ -28,21 +30,21 @@ mutable struct MomentumGradientDescentState{Tx,T} <: AbstractOptimizerState
     @add_linesearch_fields()
 end
 
-function initial_state(method::MomentumGradientDescent, options::Options, d, initial_x::AbstractArray)
-    T = eltype(initial_x)
+function initial_state(method::MomentumGradientDescent, ::Options, d, initial_x::AbstractArray)
+    # Compute function value and gradient
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
-
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     MomentumGradientDescentState(
         initial_x, # Maintain current state in state.x
+        copy(g_x), # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
         copy(initial_x), # Maintain previous state in state.x_previous
-        similar(initial_x), # Record momentum correction direction in state.x_momentum
-        real(T)(NaN), # Store previous f in state.f_x_previous
-        similar(initial_x), # Maintain current search direction in state.s
+        fill!(similar(initial_x), NaN), # Record momentum correction direction in state.x_momentum
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
         @initial_linesearch()...,
     )
 end
@@ -52,18 +54,19 @@ function update_state!(
     state::MomentumGradientDescentState,
     method::MomentumGradientDescent,
 )
-    project_tangent!(method.manifold, gradient(d), state.x)
     # Search direction is always the negative gradient
-    state.s .= .-gradient(d)
+    state.s .= .-state.g_x
 
-    # Update position, and backup current one
+    # Update momentum
     state.x_momentum .= state.x .- state.x_previous
 
     # Determine the distance of movement along the search line
     lssuccess = perform_linesearch!(state, method, ManifoldObjective(method.manifold, d))
 
+    # Update state
     state.x .+= state.alpha .* state.s .+ method.mu .* state.x_momentum
     retract!(method.manifold, state.x)
+
     return !lssuccess # break on linesearch error
 end
diff --git a/src/multivariate/solvers/first_order/ngmres.jl b/src/multivariate/solvers/first_order/ngmres.jl
index 8dd2ee32f..9c74b4695 100644
--- a/src/multivariate/solvers/first_order/ngmres.jl
+++ b/src/multivariate/solvers/first_order/ngmres.jl
@@ -136,20 +136,22 @@ function OACCEL(;
 end
 
-mutable struct NGMRESState{P,Tx,Te,T,eTx} <:
-               AbstractOptimizerState where {P<:AbstractOptimizerState}
+mutable struct NGMRESState{P<:AbstractOptimizerState,eTx,Tx<:AbstractArray{eTx},eTg,Tg<:AbstractArray{eTg},T,Te<:VecOrMat{T}} <:
+               AbstractOptimizerState
     # eTx is the eltype of Tx
-    x::Tx # Reference to nlpreconstate.x
-    x_previous::Tx # Reference to nlpreconstate.x_previous
+    x::Tx # Current state. Reference to nlpreconstate.x
+    g_x::Tg # Gradient at x. Reference to nlpreconstate.g_x
+    f_x::T # Function value of x
+    x_previous::Tx # Previous state. Reference to nlpreconstate.x_previous
     x_previous_0::Tx # Used to deal with assess_convergence of NGMRES
-    f_x_previous::T
+    f_x_previous::T # Function value of previous state.
     f_x_previous_0::T # Used to deal with assess_convergence of NGMRES
     f_xP::T # For tracing purposes
     grnorm_xP::T # For tracing purposes
     s::Tx # Search direction for linesearch between xP and xA
     nlpreconstate::P # Nonlinear preconditioner state
     X::Array{eTx,2} # Solution vectors in the window
-    R::Array{eTx,2} # Gradient vectors in the window
+    R::Array{eTg,2} # Gradient vectors in the window
     Q::Array{T,2} # Storage to create linear system (TODO: make Symmetric?)
     ξ::Te # Storage to create linear system
     curw::Int # Counter for current window size
@@ -158,7 +160,7 @@ mutable struct NGMRESState{P,Tx,Te,T,eTx} <:
     xA::Vector{eTx} # Container for accelerated step
     k::Int # Used for indexing where to put values in the Storage containers
     restart::Bool # Restart flag
-    g_abstol::T # Exit tolerance to be checked after nonlinear preconditioner apply
+    g_abstol::T # Exit tolerance to be checked after nonlinear preconditioner apply
     subspacealpha::Vector{T} # Storage for coefficients in the subspace for the acceleration step
     @add_linesearch_fields()
 end
@@ -225,8 +227,8 @@ function initial_state(
     method::AbstractNGMRES,
     options::Options,
     d,
-    initial_x::AbstractArray{eTx},
-) where {eTx}
+    initial_x::AbstractArray,
+)
     if !(typeof(method.nlprecon) <: Union{GradientDescent,LBFGS})
         if !ngmres_oaccel_warned[]
             @warn "Use caution. N-GMRES/O-ACCEL has only been tested with Gradient Descent and L-BFGS preconditioning."
@@ -235,36 +237,38 @@ end
 
     nlpreconstate = initial_state(method.nlprecon, method.nlpreconopts, d, initial_x)
+    (; f_x, x, g_x) = nlpreconstate
     # Manifold comment:
     # We assume nlprecon calls retract! and project_tangent! on
-    # nlpreconstate.x and gradient(d)
-    T = real(eTx)
-
-    n = length(nlpreconstate.x)
+    # nlpreconstate.x and nlpreconstate.g_x
+
+    T = typeof(f_x)
+    n = length(x)
     wmax = method.wmax
-    X = Array{eTx}(undef, n, wmax)
-    R = Array{eTx}(undef, n, wmax)
-    Q = Array{T}(undef, wmax, wmax)
+    X = fill!(Array{eltype(x)}(undef, n, wmax), NaN)
+    R = fill!(Array{eltype(g_x)}(undef, n, wmax), NaN)
+    Q = fill!(Array{typeof(f_x)}(undef, wmax, wmax), NaN)
     ξ = if typeof(method) <: OACCEL
-        Array{T}(undef, wmax, 2)
+        fill!(Array{T}(undef, wmax, 2), NaN)
     else
-        Array{T}(undef, wmax)
+        fill!(Array{T}(undef, wmax), NaN)
     end
 
-    copyto!(view(X, :, 1), nlpreconstate.x)
-    copyto!(view(R, :, 1), gradient(d))
-
+    copyto!(view(X, :, 1), x)
+    copyto!(view(R, :, 1), g_x)
     _updateQ!(Q, 1, 1, X, R, method)
 
     NGMRESState(
-        nlpreconstate.x, # Maintain current state in state.x. Use same vector as preconditioner.
-        nlpreconstate.x_previous, # Maintain in state.x_previous. Use same vector as preconditioner.
-        copy(nlpreconstate.x), # Maintain state at the beginning of an iteration in state.x_previous_0. Used for convergence asessment.
+        nlpreconstate.x, # Maintain current state in state.x. Uses same vector as preconditioner.
+        nlpreconstate.g_x, # Maintain current gradient in state.g_x. Uses same vector as preconditioner.
+        nlpreconstate.f_x, # Maintain current f in state.f_x.
+        nlpreconstate.x_previous, # Maintain in state.x_previous. Uses same vector as preconditioner.
+        fill!(similar(nlpreconstate.x), NaN), # Maintain state at the beginning of an iteration in state.x_previous_0. Used for convergence assessment.
         T(NaN), # Store previous f in state.f_x_previous
         T(NaN), # Store f value from the beginning of an iteration in state.f_x_previous_0. Used for convergence asessment.
         T(NaN), # Store value f_xP of f(x^P) for tracing purposes
         T(NaN), # Store value grnorm_xP of |g(x^P)| for tracing purposes
-        similar(initial_x), # Maintain current search direction in state.s
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
         nlpreconstate, # State storage for preconditioner
         X,
         R,
@@ -282,17 +286,15 @@ function initial_state(
     )
 end
 
-nlprecon_post_optimize!(d, state, method) = update_h!(d, state.nlpreconstate, method)
-
-nlprecon_post_accelerate!(d, state, method) = update_h!(d, state.nlpreconstate, method)
-
+nlprecon_post_optimize!(d, state, method) = update_fgh!(d, state.nlpreconstate, method)
+nlprecon_post_accelerate!(d, state, method) = update_fgh!(d, state.nlpreconstate, method)
 function nlprecon_post_accelerate!(
     d,
     state::NGMRESState{X,T},
     method::LBFGS,
 ) where {X} where {T}
     state.nlpreconstate.pseudo_iteration += 1
-    update_h!(d, state.nlpreconstate, method)
+    update_fgh!(d, state.nlpreconstate, method)
 end
 
@@ -303,32 +305,26 @@ function update_state!(
 ) where {X} where {T}
     # Maintain a record of previous position, for convergence assessment
     copyto!(state.x_previous_0, state.x)
-    state.f_x_previous_0 = value(d)
+    state.f_x_previous_0 = state.f_x
 
     state.k += 1
     curw = state.curw
 
     # Step 1: Call preconditioner to get x^P
-    res = optimize(d, state.x, method.nlprecon, method.nlpreconopts, state.nlpreconstate)
+    optimize(d, state.x, method.nlprecon, method.nlpreconopts, state.nlpreconstate)
     # TODO: Is project_tangent! necessary, or is it called by nlprecon before exit?
-    project_tangent!(method.manifold, gradient(d), state.x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_xP = state.nlpreconstate.f_x
+    state.f_x = state.nlpreconstate.f_x
 
-    if any(.!isfinite.(state.x)) || any(.!isfinite.(gradient(d))) || !isfinite(value(d))
+    if !all(isfinite, state.x) || !all(isfinite, state.g_x) || !isfinite(state.f_xP)
         @warn("Non-finite values attained from preconditioner $(summary(method.nlprecon)).")
         return true
     end
+    state.grnorm_xP = g_residual(state.g_x)
 
-    # Calling value_gradient! in normally done on state.x in optimize or update_g! above,
-    # but there are corner cases where we need this.
-    state.f_xP, _g = value_gradient!(d, state.x)
-    # Manifold start
-    project_tangent!(method.manifold, gradient(d), state.x)
-    # Manifold stop
-    gP = gradient(d)
-    state.grnorm_xP = g_residual(gP)
-
-    if g_residual(gP) ≤ state.g_abstol
+    if state.grnorm_xP ≤ state.g_abstol
         return false # Exit on gradient norm convergence
     end
@@ -336,11 +332,11 @@ function update_state!(
     nlprecon_post_optimize!(d, state, method.nlprecon)
 
     # Step 2: Do acceleration calculation
-    η = _updateη(state.x, gP, method)
+    η = _updateη(state.x, state.g_x, method)
 
     for i = 1:curw
         # Update storage vectors according to method {NGMRES, OACCEL}
-        _updateξ!(state.ξ, i, state.X, state.x, state.R, gP, method)
+        _updateξ!(state.ξ, i, state.X, state.x, state.R, state.g_x, method)
         _updateb!(state.b, i, state.ξ, η, method)
     end
@@ -377,7 +373,8 @@ function update_state!(
     end
 
     # 3: Perform condition checks
-    if real(dot(state.s, gP)) ≥ 0 || !isfinite(real(dot(state.s, gP)))
+    sTg_x = real(dot(state.s, state.g_x))
+    if sTg_x ≥ 0 || !isfinite(sTg_x)
         # Moving from xP to xA is *not* a descent direction
         # Discard xA
         state.restart = true # TODO: expand restart heuristics
@@ -428,11 +425,13 @@ function update_state!(
     return !lssuccess # Break on linesearch error
 end
 
-function update_g!(d, state, method::AbstractNGMRES)
+function update_fgh!(d, state, method::AbstractNGMRES)
     # Update the function value and gradient
     # TODO: do we need a retract! on state.x here?
-    value_gradient!(d, state.x)
-    project_tangent!(method.manifold, gradient(d), state.x)
+    f_x, g_x = value_gradient!(d, state.x)
+    project_tangent!(method.manifold, g_x, state.x)
+    state.f_x = f_x
+    copyto!(state.g_x, g_x)
 
     if state.restart
         state.k = 0
@@ -443,11 +442,13 @@
 
     j = mod(state.k, method.wmax) + 1
     copyto!(view(state.X, :, j), vec(state.x))
-    copyto!(view(state.R, :, j), vec(gradient(d)))
+    copyto!(view(state.R, :, j), vec(g_x))
 
     for i = 1:state.curw
         _updateQ!(state.Q, i, j, state.X, state.R, method)
     end
+
+    return nothing
 end
 
 function trace!(
@@ -463,7 +464,7 @@ function trace!(
     dt["time"] = curr_time
     if options.extended_trace
         dt["x"] = copy(state.x)
-        dt["g(x)"] = copy(gradient(d))
+        dt["g(x)"] = copy(state.g_x)
         dt["subspace-α"] = state.subspacealpha[1:state.curw-1]
         if state.restart
             dt["Current step size"] = NaN
@@ -481,12 +482,11 @@ function trace!(
             dt["|g(x^P)|"] = state.grnorm_xP
         end
     end
-    g_norm = g_residual(d)
     update!(
         tr,
         iteration,
-        value(d),
-        g_norm,
+        state.f_x,
+        g_residual(state),
         dt,
         options.store_trace,
         options.show_trace,
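(Aside — schematically, the acceleration step assembled from X, R, Q and ξ above proposes xᴬ = xᴾ + Σᵢ αᵢ (xᵢ − xᴾ), with coefficients α solved from a small least-squares system over the moving window; a line search between xᴾ and xᴬ then picks the next iterate. An illustrative sketch of the combination step only, not the package's exact code:

    # Combine window iterates X[:, 1:curw] around the preconditioned point xP.
    function accelerate(xP, X, α, curw)
        xA = copy(xP)
        for i = 1:curw
            xA .+= α[i] .* (view(X, :, i) .- xP)
        end
        return xA
    end
)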
diff --git a/src/multivariate/solvers/second_order/krylov_trust_region.jl b/src/multivariate/solvers/second_order/krylov_trust_region.jl
index 476b49239..30b1c164a 100644
--- a/src/multivariate/solvers/second_order/krylov_trust_region.jl
+++ b/src/multivariate/solvers/second_order/krylov_trust_region.jl
@@ -17,11 +17,11 @@ KrylovTrustRegion(;
     cg_tol::Real = 0.01,
 ) = KrylovTrustRegion(initial_radius, max_radius, eta, rho_lower, rho_upper, cg_tol)
 
-update_h!(d, state, method::KrylovTrustRegion) = nothing
-
 # TODO: support x::Array{T,N} et al.?
 mutable struct KrylovTrustRegionState{T} <: AbstractOptimizerState
     x::Vector{T}
+    f_x::T
+    g_x::Vector{T}
     x_previous::Vector{T}
     f_x_previous::T
     s::Vector{T}
@@ -45,10 +45,12 @@ function initial_state(method::KrylovTrustRegion, options::Options, d, initial_x
     @assert(method.rho_lower < method.rho_upper)
     @assert(method.rho_lower >= 0)
 
-    value_gradient!!(d, initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
 
     KrylovTrustRegionState(
         copy(initial_x), # Maintain current state in state.x
+        f_x, # Maintain f of current state in state.f_x
+        g_x, # Maintain gradient of current state in state.g_x
         copy(initial_x), # x_previous
         zero(T), # f_x_previous
         similar(initial_x), # Maintain current search direction in state.s
@@ -87,12 +89,11 @@ function trace!(
         dt["f_diff"] = state.f_diff
         dt["cg_iters"] = state.cg_iters
     end
-    g_norm = norm(gradient(d), Inf)
     update!(
         tr,
         iteration,
-        value(d),
-        g_norm,
+        state.f_x,
+        g_residual(state),
         dt,
         options.store_trace,
         options.show_trace,
@@ -107,19 +108,18 @@ function cg_steihaug!(
     state::KrylovTrustRegionState{T},
     method::KrylovTrustRegion,
 ) where {T}
-    n = length(state.x)
-    x, g, d, r, z, Hd =
-        state.x, gradient(objective), state.d, state.r, state.s, hv_product(objective)
+    (; x, g_x, d, r) = state
+    z = state.s
 
     fill!(z, 0.0) # the search direction is initialized to the 0 vector,
-    r .= g # so at first the whole gradient is the residual.
+    copyto!(r, g_x) # so at first the whole gradient is the residual.
     d .= .-r # the first direction is the direction of steepest descent.
     rho0 = 1e100 # just a big number
 
     state.cg_iters = 0
-    for i = 1:n
+    for i = 1:length(x)
         state.cg_iters += 1
-        hv_product!(objective, x, d)
+        Hd = hv_product!(objective, x, d)
         dHd = dot(d, Hd)
         if -1e-15 < dHd < 1e-15
             break
@@ -151,8 +151,8 @@
         end
     end
 
-    hv_product!(objective, x, z)
-    return dot(g, z) + 0.5 * dot(z, Hd)
+    Hd = hv_product!(objective, x, z)
+    return dot(g_x, z) + 0.5 * dot(z, Hd)
 end
 
@@ -164,7 +164,7 @@ function update_state!(
     state.m_diff = cg_steihaug!(objective, state, method)
     @assert state.m_diff <= 0
 
-    state.f_diff = value(objective, state.x .+ state.s) - value(objective)
+    state.f_diff = value(objective, state.x .+ state.s) - state.f_x
     state.rho = state.f_diff / state.m_diff
 
     state.interior = norm(state.s) < 0.9 * state.radius
@@ -183,11 +183,13 @@
 end
 
-function update_g!(objective, state::KrylovTrustRegionState, method::KrylovTrustRegion)
+function update_fgh!(objective, state::KrylovTrustRegionState, ::KrylovTrustRegion)
     if state.accept_step
         # Update the function value and gradient
-        state.f_x_previous = value(objective)
-        value_gradient!(objective, state.x)
+        state.f_x_previous = state.f_x
+        f_x, g_x = value_gradient!(objective, state.x)
+        state.f_x = f_x
+        copyto!(state.g_x, g_x)
     end
 end
 
@@ -206,13 +208,13 @@ function assess_convergence(state::KrylovTrustRegionState, d, options::Options)
     # if abs(f_x - f_x_previous) < f_tol # Relative Tolerance
     if abs(state.f_diff) < max(
-        options.f_reltol * (abs(value(d)) + options.f_reltol),
-        eps(abs(value(d)) + abs(state.f_x_previous)),
+        options.f_reltol * (abs(state.f_x) + options.f_reltol),
+        eps(abs(state.f_x) + abs(state.f_x_previous)),
     )
         f_converged = true
     end
 
-    if norm(gradient(d), Inf) < options.g_abstol
+    if norm(state.g_x, Inf) < options.g_abstol
         g_converged = true
     end
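(Aside — the value returned by cg_steihaug! above is the local quadratic model at the accepted step, computed from Hessian-vector products only:

    using LinearAlgebra: dot

    # m(z) ≈ f(x + z) - f(x), with Hz obtained from hv_product! (illustrative sketch)
    model_value(g, z, Hz) = dot(g, z) + 0.5 * dot(z, Hz)

update_state! then forms state.rho as the ratio of the actual decrease f_diff to this predicted decrease m_diff, which drives the trust-region radius update.)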
diff --git a/src/multivariate/solvers/second_order/newton.jl b/src/multivariate/solvers/second_order/newton.jl
index 776928247..20a4824ae 100644
--- a/src/multivariate/solvers/second_order/newton.jl
+++ b/src/multivariate/solvers/second_order/newton.jl
@@ -29,8 +29,11 @@ end
 
 Base.summary(io::IO, ::Newton) = print(io, "Newton's Method")
 
-mutable struct NewtonState{Tx,T,F<:Cholesky} <: AbstractOptimizerState
+mutable struct NewtonState{Tx,Tg,TH,T,F<:Cholesky} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    H_x::TH
+    f_x::T
     x_previous::Tx
     f_x_previous::T
     F::F
@@ -39,20 +42,18 @@ end
 
 function initial_state(method::Newton, options, d, initial_x)
-    T = eltype(initial_x)
-    n = length(initial_x)
-    # Maintain current gradient in gr
-    s = similar(initial_x)
-
-    value_gradient!!(d, initial_x)
-    hessian!!(d, initial_x)
+    # TODO: Switch to `value_gradient_hessian!`
+    f_x, g_x, H_x = NLSolversBase.value_gradient_hessian!!(d, initial_x)
 
     NewtonState(
         copy(initial_x), # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        T(NaN), # Store previous f in state.f_x_previous
-        Cholesky(similar(d.H, T, 0, 0), :U, 0),
-        similar(initial_x), # Maintain current search direction in state.s
+        copy(g_x), # Maintain current gradient in state.g_x
+        copy(H_x), # Maintain current Hessian in state.H_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        Cholesky(similar(H_x, 0, 0), :U, 0),
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
         @initial_linesearch()...,
     )
 end
@@ -63,19 +64,19 @@ function update_state!(d, state::NewtonState, method::Newton)
     # represented by H. It deviates from the usual "add a scaled
     # identity matrix" version of the modified Newton method. More
     # information can be found in the discussion at issue #153.
-    T = eltype(state.x)
-    if typeof(NLSolversBase.hessian(d)) <: AbstractSparseMatrix
-        state.s .= .-(NLSolversBase.hessian(d) \ convert(Vector{T}, gradient(d)))
+    if state.H_x isa AbstractSparseMatrix
+        state.s .= .-(state.H_x \ convert(Vector, state.g_x))
     else
-        state.F = cholesky!(Positive, NLSolversBase.hessian(d))
-        if typeof(gradient(d)) <: Array
+        state.F = cholesky!(Positive, state.H_x)
+        if state.g_x isa Array
             # is this actually StridedArray?
-            ldiv!(state.s, state.F, -gradient(d))
+            ldiv!(state.s, state.F, state.g_x)
+            state.s .= .-state.s
         else
             # not Array, we can't do inplace ldiv
-            gv = Vector{T}(undef, length(gradient(d)))
-            copyto!(gv, -gradient(d))
+            gv = Vector{eltype(state.g_x)}(undef, length(state.g_x))
+            gv .= .-state.g_x
             copyto!(state.s, state.F \ gv)
         end
     end
@@ -87,21 +88,20 @@ function update_state!(d, state::NewtonState, method::Newton)
     return !lssuccess # break on linesearch error
 end
 
-function trace!(tr, d, state::NewtonState, iteration::Integer, method::Newton, options::Options, curr_time = time())
+function trace!(tr, d, state::NewtonState, iteration::Integer, ::Newton, options::Options, curr_time = time())
     dt = Dict()
     dt["time"] = curr_time
     if options.extended_trace
         dt["x"] = copy(state.x)
-        dt["g(x)"] = copy(gradient(d))
-        dt["h(x)"] = copy(NLSolversBase.hessian(d))
+        dt["g(x)"] = copy(state.g_x)
+        dt["h(x)"] = copy(state.H_x)
         dt["Current step size"] = state.alpha
     end
-    g_norm = norm(gradient(d), Inf)
     update!(
         tr,
         iteration,
-        value(d),
-        g_norm,
+        state.f_x,
+        g_residual(state),
         dt,
         options.store_trace,
         options.show_trace,
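(Aside — the dense branch above in compact form; PositiveFactorizations is what the patch itself uses, while the helper name is an illustrative sketch:

    using LinearAlgebra
    using PositiveFactorizations: Positive

    # Newton direction from a positive-definite-forced Cholesky factorization.
    function newton_direction(H, g)
        F = cholesky(Positive, H)   # PD modification of H (see the issue #153 discussion)
        return -(F \ g)
    end
)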
diff --git a/src/multivariate/solvers/second_order/newton_trust_region.jl b/src/multivariate/solvers/second_order/newton_trust_region.jl
index f7daccde6..348d647af 100644
--- a/src/multivariate/solvers/second_order/newton_trust_region.jl
+++ b/src/multivariate/solvers/second_order/newton_trust_region.jl
@@ -302,14 +302,17 @@ end
 
 Base.summary(io::IO, ::NewtonTrustRegion) = print(io, "Newton's Method (Trust Region)")
 
-mutable struct NewtonTrustRegionState{Tx,T,G} <: AbstractOptimizerState
+mutable struct NewtonTrustRegionState{Tx,T,Tg,TH} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    H_x::TH
+    f_x::T
     x_previous::Tx
-    g_previous::G
+    g_x_previous::Tg
     f_x_previous::T
     s::Tx
     x_cache::Tx
-    g_cache::G
+    g_cache::Tg
     hard_case::Bool
     reached_subproblem_solution::Bool
     interior::Bool
@@ -321,7 +324,6 @@ end
 
 function initial_state(method::NewtonTrustRegion, options, d, initial_x)
     T = eltype(initial_x)
-    n = length(initial_x)
 
     # Keep track of trust region sizes
     delta = copy(method.initial_delta)
@@ -331,16 +333,20 @@ function initial_state(method::NewtonTrustRegion, options, d, initial_x)
     interior = true
     lambda = NaN
 
-    NLSolversBase.value_gradient_hessian!!(d, initial_x)
+    # TODO: Switch to `value_gradient_hessian!`
+    f_x, g_x, H_x = NLSolversBase.value_gradient_hessian!!(d, initial_x)
 
     NewtonTrustRegionState(
         copy(initial_x), # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        copy(gradient(d)), # Store previous gradient in state.g_previous
-        T(NaN), # Store previous f in state.f_x_previous
-        similar(initial_x), # Maintain current search direction in state.s
-        fill!(similar(initial_x), NaN), # For resetting the state if a step is rejected
-        fill!(method.use_fg ? similar(gradient(d)) : empty(gradient(d)), NaN), # For resetting the gradient if a step is rejected
+        copy(g_x), # Maintain current gradient in state.g_x
+        copy(H_x), # Maintain current Hessian in state.H_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        fill!(similar(g_x), NaN), # Store previous gradient in state.g_x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
+        fill!(similar(initial_x), NaN), # Cache to be able to reset state.x
+        fill!(method.use_fg ? similar(g_x) : empty(g_x), NaN), # Cache to be able to reset state.g_x
         hard_case,
         reached_subproblem_solution,
         interior,
@@ -353,28 +359,31 @@ end
 
 function update_state!(d, state::NewtonTrustRegionState, method::NewtonTrustRegion)
-    T = eltype(state.x)
     # Find the next step direction.
     m, state.interior, state.lambda, state.hard_case, state.reached_subproblem_solution =
-        solve_tr_subproblem!(gradient(d), NLSolversBase.hessian(d), state.delta, state.s)
+        solve_tr_subproblem!(state.g_x, state.H_x, state.delta, state.s)
 
-    # Maintain a record of previous position
+    # Maintain a record of current position, to be able to reset it below
     copyto!(state.x_cache, state.x)
-    f_cache = value(d)
+    f_cache = state.f_x
 
     # Update current position
     state.x .+= state.s
 
+    # Update the function value and gradient
     if method.use_fg
-        copyto!(state.g_cache, gradient(d))
-        value_gradient!(d, state.x)
+        copyto!(state.g_cache, state.g_x)
+        f_x, g_x = value_gradient!(d, state.x)
+        copyto!(state.g_x, g_x)
+        state.f_x = f_x
     else
-        value!(d, state.x)
+        f_x = value!(d, state.x)
+        state.f_x = f_x
     end
 
     # Update the trust region size based on the discrepancy between
     # the predicted and actual function values. (Algorithm 4.1 in N&W (2006))
-    f_x_diff = f_cache - value(d)
-    if abs(m) <= eps(T)
+    f_x_diff = f_cache - f_x
+    if abs(m) <= eps(typeof(m))
         # This should only happen when the step is very small, in which case
         # we should accept the step and assess_convergence().
         state.rho = 1.0
@@ -408,20 +417,24 @@ function update_state!(d, state::NewtonTrustRegionState, method::NewtonTrustRegi
     # Update/reset gradients and function values
     if accept_step
         if method.use_fg
-            hessian!(d, state.x)
+            copyto!(state.H_x, hessian!(d, state.x))
         else
-            NLSolversBase.gradient_hessian!!(d, state.x)
+            # TODO: Switch to `gradient_hessian!`
+            g_x, H_x = NLSolversBase.gradient_hessian!!(d, state.x)
+            copyto!(state.g_x, g_x)
+            copyto!(state.H_x, H_x)
         end
+
+        # Update history
+        copyto!(state.x_previous, state.x_cache)
+        copyto!(state.g_x_previous, state.g_cache)
+        state.f_x_previous = f_cache
     else
         # Reset state
         copyto!(state.x, state.x_cache)
-
-        # Reset objective function
-        copyto!(d.x_f, state.x_cache)
-        d.F = f_cache
+        state.f_x = f_cache
         if method.use_fg
-            copyto!(d.x_df, state.x_cache)
-            copyto!(d.DF, state.g_cache)
+            copyto!(state.g_x, state.g_cache)
        end
     end
@@ -429,30 +442,32 @@ end
 
 function assess_convergence(state::NewtonTrustRegionState, d, options::Options)
-    x_converged, f_converged, g_converged, converged, f_increased =
-        false, false, false, false, false
     if state.rho > state.eta
         # Accept the point and check convergence
-        x_converged, f_converged, g_converged, f_increased = assess_convergence(
+        return assess_convergence(
             state.x,
             state.x_previous,
-            value(d),
+            state.f_x,
             state.f_x_previous,
-            gradient(d),
+            state.g_x,
             options.x_abstol,
             options.f_reltol,
             options.g_abstol,
         )
+    else
+        return false, false, false, false
     end
-    x_converged, f_converged, g_converged, f_increased
 end
 
+# Function value, gradient and Hessian matrix are already updated in update_state!
+update_fgh!(d, state, ::NewtonTrustRegion) = nothing
+
 function trace!(
     tr,
     d,
     state::NewtonTrustRegionState,
     iteration::Integer,
-    method::NewtonTrustRegion,
+    ::NewtonTrustRegion,
     options::Options,
     curr_time = time(),
 )
@@ -460,20 +475,19 @@ function trace!(
     dt["time"] = curr_time
     if options.extended_trace
         dt["x"] = copy(state.x)
-        dt["g(x)"] = copy(gradient(d))
-        dt["h(x)"] = copy(NLSolversBase.hessian(d))
+        dt["g(x)"] = copy(state.g_x)
+        dt["h(x)"] = copy(state.H_x)
         dt["delta"] = copy(state.delta)
         dt["interior"] = state.interior
         dt["hard case"] = state.hard_case
         dt["reached_subproblem_solution"] = state.reached_subproblem_solution
         dt["lambda"] = state.lambda
     end
-    g_norm = norm(gradient(d), Inf)
     update!(
         tr,
         iteration,
-        value(d),
-        g_norm,
+        state.f_x,
+        g_residual(state),
         dt,
         options.store_trace,
         options.show_trace,
diff --git a/src/multivariate/solvers/zeroth_order/nelder_mead.jl b/src/multivariate/solvers/zeroth_order/nelder_mead.jl
index 6f66db505..47c0e49f5 100644
--- a/src/multivariate/solvers/zeroth_order/nelder_mead.jl
+++ b/src/multivariate/solvers/zeroth_order/nelder_mead.jl
@@ -130,6 +130,7 @@ end
 
 mutable struct NelderMeadState{Tx,T,Tfs} <: ZerothOrderState
     x::Tx
+    f_x::T
     m::Int
     simplex::Vector{Tx}
     x_centroid::Tx
@@ -150,27 +151,31 @@ mutable struct NelderMeadState{Tx,T,Tfs} <: ZerothOrderState
 end
 function reset!(method::NelderMead, state::NelderMeadState, obj, x)
     state.simplex = simplexer(method.initial_simplex, x)
+    @assert state.simplex[1] == x
 
-    value!(obj, first(state.simplex))
-    state.f_simplex[1] = value(obj)
+    state.f_simplex[1] = state.f_x = value!(obj, state.simplex[1])
     for i = 2:length(state.simplex)
         state.f_simplex[i] = value(obj, state.simplex[i])
     end
+
     # Get the indices that correspond to the ordering of the f values
     # at the vertices. i_order[1] is the index in the simplex of the vertex
     # with the lowest function value, and i_order[end] is the index in the
     # simplex of the vertex with the highest function value
     state.i_order = sortperm(state.f_simplex)
+
+    return nothing
 end
 function initial_state(method::NelderMead, options, d, initial_x)
     T = eltype(initial_x)
     n = length(initial_x)
     m = n + 1
     simplex = simplexer(method.initial_simplex, initial_x)
-    f_simplex = zeros(T, m)
+    @assert simplex[1] == initial_x
 
-    value!!(d, first(simplex))
-    f_simplex[1] = value(d)
+    f_x = value!(d, simplex[1])
+    f_simplex = Vector{typeof(f_x)}(undef, m)
+    f_simplex[1] = f_x
     for i = 2:length(simplex)
         f_simplex[i] = value(d, simplex[i])
     end
@@ -184,6 +189,7 @@ function initial_state(method::NelderMead, options, d, initial_x)
     NelderMeadState(
         copy(initial_x), # Variable to hold final minimizer value for MultivariateOptimizationResults
+        f_x, # Variable to hold final objective function value
         m, # Number of vertices in the simplex
         simplex, # Maintain simplex in state.simplex
         centroid(simplex, i_order[m]), # Maintain centroid in state.centroid
@@ -309,26 +315,22 @@ function after_while!(f, state, method::NelderMead, options)
         f.F = f_min
     end
     state.x .= x_min
+    state.f_x = f_min
+    return nothing
 end
 
-# We don't have an f_x_previous in NelderMeadState, so we need to special case these
-pick_best_x(f_increased, state::NelderMeadState) = state.x
-pick_best_f(f_increased, state::NelderMeadState, d) = value(d)
+
+# We don't have an x_previous and f_x_previous in NelderMeadState, so we need to special case these
+pick_best_x(f_increased::Bool, state::NelderMeadState) = state.x
+pick_best_f(f_increased::Bool, state::NelderMeadState) = state.f_x
 
 function assess_convergence(state::NelderMeadState, d, options::Options)
     g_converged = state.nm_x <= options.g_abstol # Hijact g_converged for NM stopping criterior
     return false, false, g_converged, false
 end
 
-function initial_convergence(
-    d,
-    state::NelderMeadState,
-    method::NelderMead,
-    initial_x,
-    options,
-)
-    nmo = nmobjective(state.f_simplex, state.m, length(initial_x))
-
-    nmo <= options.g_abstol, !isfinite(value(d)) || !isfinite(nmo)
+function initial_convergence(state::NelderMeadState, options::Options)
+    nmo = nmobjective(state.f_simplex, state.m, length(state.x))
+    nmo <= options.g_abstol, !isfinite(state.f_x) || !isfinite(nmo)
 end
 
 function trace!(
diff --git a/src/multivariate/solvers/zeroth_order/particle_swarm.jl b/src/multivariate/solvers/zeroth_order/particle_swarm.jl
index 083b2a7cd..022533a7f 100644
--- a/src/multivariate/solvers/zeroth_order/particle_swarm.jl
+++ b/src/multivariate/solvers/zeroth_order/particle_swarm.jl
@@ -40,6 +40,7 @@ Base.summary(io::IO, ::ParticleSwarm) = print(io, "Particle Swarm")
 
 mutable struct ParticleSwarmState{Tx,T} <: ZerothOrderState
     x::Tx
+    f_x::T
     iteration::Int
     lower::Tx
     upper::Tx
@@ -127,8 +128,8 @@ function initial_state(
 
     current_state = 0
 
-    value!!(d, initial_x)
-    score[1] = value(d)
+    f_x = value!(d, initial_x)
+    score[1] = f_x
 
     # if search space is limited, spread the initial population
     # uniformly over the whole search space
@@ -169,6 +170,7 @@ function initial_state(
 
     ParticleSwarmState(
         x,
+        f_x,
         0,
         lower,
         upper,
@@ -193,15 +195,15 @@ function update_state!(f, state::ParticleSwarmState{T}, method::ParticleSwarm) w
     if state.iteration == 0
         copyto!(state.best_score, state.score)
-        f.F = Base.minimum(state.score) # Base.minimum !== minimum
+        # state.f_x = Base.minimum(state.score) # Base.minimum !== minimum
     end
 
-    f.F = housekeeping!(
+    state.f_x = housekeeping!(
         state.score,
         state.best_score,
         state.X,
         state.X_best,
         state.x,
-        value(f),
+        state.f_x,
         state.n_particles,
     )
     # Elitist Learning:
@@ -238,14 +240,13 @@ function update_state!(f, state::ParticleSwarmState{T}, method::ParticleSwarm) w
         end
     end
 
-    score_learn = value(f, state.x_learn)
-    if score_learn < f.F
-        f.F = score_learn * 1.0
-        for j = 1:n
-            state.X_best[j, i_worst] = state.x_learn[j]
-            state.X[j, i_worst] = state.x_learn[j]
-            state.x[j] = state.x_learn[j]
-        end
+    score_learn = value!(f, state.x_learn)
+    if score_learn < state.f_x
+        copyto!(state.x, state.x_learn)
+        copyto!(view(state.X_best, :, i_worst), state.x_learn)
+        copyto!(view(state.X, :, i_worst), state.x_learn)
+
+        state.f_x = score_learn
         state.score[i_worst] = score_learn
         state.best_score[i_worst] = score_learn
     end
diff --git a/src/multivariate/solvers/zeroth_order/particle_swarm.jl b/src/multivariate/solvers/zeroth_order/particle_swarm.jl
index 083b2a7cd..022533a7f 100644
--- a/src/multivariate/solvers/zeroth_order/particle_swarm.jl
+++ b/src/multivariate/solvers/zeroth_order/particle_swarm.jl
@@ -40,6 +40,7 @@ Base.summary(io::IO, ::ParticleSwarm) = print(io, "Particle Swarm")

 mutable struct ParticleSwarmState{Tx,T} <: ZerothOrderState
     x::Tx
+    f_x::T
     iteration::Int
     lower::Tx
     upper::Tx
@@ -127,8 +128,8 @@ function initial_state(

     current_state = 0

-    value!!(d, initial_x)
-    score[1] = value(d)
+    f_x = value!(d, initial_x)
+    score[1] = f_x

     # if search space is limited, spread the initial population
     # uniformly over the whole search space
@@ -169,6 +170,7 @@ function initial_state(

     ParticleSwarmState(
         x,
+        f_x,
         0,
         lower,
         upper,
@@ -193,15 +195,15 @@ function update_state!(f, state::ParticleSwarmState{T}, method::ParticleSwarm) w

     if state.iteration == 0
         copyto!(state.best_score, state.score)
-        f.F = Base.minimum(state.score) # Base.minimum !== minimum
+        # state.f_x = Base.minimum(state.score) # Base.minimum !== minimum
     end

-    f.F = housekeeping!(
+    state.f_x = housekeeping!(
         state.score,
         state.best_score,
         state.X,
         state.X_best,
         state.x,
-        value(f),
+        state.f_x,
         state.n_particles,
     )
     # Elitist Learning:
@@ -238,14 +240,13 @@
             end
         end

-        score_learn = value(f, state.x_learn)
-        if score_learn < f.F
-            f.F = score_learn * 1.0
-            for j = 1:n
-                state.X_best[j, i_worst] = state.x_learn[j]
-                state.X[j, i_worst] = state.x_learn[j]
-                state.x[j] = state.x_learn[j]
-            end
+        score_learn = value!(f, state.x_learn)
+        if score_learn < state.f_x
+            copyto!(state.x, state.x_learn)
+            copyto!(view(state.X_best, :, i_worst), state.x_learn)
+            copyto!(view(state.X, :, i_worst), state.x_learn)
+
+            state.f_x = score_learn
             state.score[i_worst] = score_learn
             state.best_score[i_worst] = score_learn
         end
diff --git a/src/multivariate/solvers/zeroth_order/simulated_annealing.jl b/src/multivariate/solvers/zeroth_order/simulated_annealing.jl
index a0e3ba5da..6388e9109 100644
--- a/src/multivariate/solvers/zeroth_order/simulated_annealing.jl
+++ b/src/multivariate/solvers/zeroth_order/simulated_annealing.jl
@@ -46,29 +46,38 @@ SimulatedAnnealing(;
 Base.summary(io::IO, ::SimulatedAnnealing) = print(io, "Simulated Annealing")

 mutable struct SimulatedAnnealingState{Tx,T} <: ZerothOrderState
-    x::Tx
-    iteration::Int
-    x_current::Tx
-    x_proposal::Tx
-    f_x_current::T
-    f_proposal::T
+    x::Tx # Best state ever visited
+    f_x::T # Function value of the best state ever visited
+    iteration::Int # Iteration number
+    x_current::Tx # Current state
+    f_x_current::T # Function value of current state
+    x_proposal::Tx # Proposed state
+    f_proposal::T # Function value of proposed state
 end
-# We don't have an f_x_previous in SimulatedAnnealing, so we need to special case these
-pick_best_x(f_increased, state::SimulatedAnnealingState) = state.x
-pick_best_f(f_increased, state::SimulatedAnnealingState, d) = value(d)
+
+# We don't have x_previous and f_x_previous in SimulatedAnnealing, so we need to special-case these
+# The best state ever visited is stored in x, and its function value in f_x
+pick_best_x(::Bool, state::SimulatedAnnealingState) = state.x
+pick_best_f(::Bool, state::SimulatedAnnealingState) = state.f_x

 function initial_state(
-    method::SimulatedAnnealing,
-    options,
+    ::SimulatedAnnealing,
+    ::Options,
     d,
-    initial_x::AbstractArray{T},
-) where {T}
-
-    value!!(d, initial_x)
-
-    # Store the best state ever visited
-    best_x = copy(initial_x)
-    SimulatedAnnealingState(copy(best_x), 1, best_x, copy(initial_x), value(d), value(d))
+    initial_x::AbstractArray,
+)
+    # Compute function value
+    f_x = value!(d, initial_x)
+
+    return SimulatedAnnealingState(
+        copy(initial_x), # best state ever visited
+        f_x, # function value of the best state ever visited
+        1, # iteration
+        copy(initial_x), # current state
+        f_x, # function value of the current state
+        fill!(similar(initial_x), NaN), # proposed state
+        oftype(f_x, NaN), # function value of the proposed state
+    )
 end

 function update_state!(
@@ -76,7 +85,6 @@ function update_state!(
     state::SimulatedAnnealingState{Tx,T},
     method::SimulatedAnnealing,
 ) where {Tx,T}
-
     # Determine the temperature for current iteration
     t = method.temperature(state.iteration)

@@ -84,7 +92,7 @@
     method.neighbor!(state.x_current, state.x_proposal)

     # Evaluate the cost function at the proposed state
-    state.f_proposal = value(nd, state.x_proposal)
+    state.f_proposal = value!(nd, state.x_proposal)

     if state.f_proposal <= state.f_x_current
         # If proposal is superior, we always move to it
@@ -92,9 +100,9 @@
         state.f_x_current = state.f_proposal

         # If the new state is the best state yet, keep a record of it
-        if state.f_proposal < value(nd)
-            nd.F = state.f_proposal
+        if state.f_proposal < state.f_x
             copyto!(state.x, state.x_proposal)
+            state.f_x = state.f_proposal
         end
     else
         # If proposal is inferior, we move to it with probability p
@@ -106,5 +114,6 @@
     end

     state.iteration += 1
+
     false
 end
diff --git a/src/multivariate/solvers/zeroth_order/zeroth_utils.jl b/src/multivariate/solvers/zeroth_order/zeroth_utils.jl
index c6fd9bc3a..857be8a5b 100644
--- a/src/multivariate/solvers/zeroth_order/zeroth_utils.jl
+++ b/src/multivariate/solvers/zeroth_order/zeroth_utils.jl
@@ -15,7 +15,7 @@ function trace!(
     update!(
         tr,
         state.iteration,
-        d.F,
+        state.f_x,
         NaN,
         dt,
         options.store_trace,
@@ -29,7 +29,7 @@ function assess_convergence(state::ZerothOrderState, d, options::Options)
     false, false, false, false
 end

-f_abschange(d::AbstractObjective, state::ZerothOrderState) = convert(typeof(value(d)), NaN)
-f_relchange(d::AbstractObjective, state::ZerothOrderState) = convert(typeof(value(d)), NaN)
+f_abschange(state::ZerothOrderState) = oftype(state.f_x, NaN)
+f_relchange(state::ZerothOrderState) = oftype(state.f_x, NaN)
 x_abschange(state::ZerothOrderState) = convert(real(eltype(state.x)), NaN)
 x_relchange(state::ZerothOrderState) = convert(real(eltype(state.x)), NaN)
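[Editor's note] The acceptance logic refactored above is the standard Metropolis-style simulated-annealing rule; a self-contained sketch (the function name is illustrative, not Optim API):

    # Always accept an improvement; accept a worse proposal with
    # probability exp(-(f_proposal - f_current) / t) at temperature t.
    function accept_proposal(f_current, f_proposal, t)
        f_proposal <= f_current && return true
        return rand() < exp(-(f_proposal - f_current) / t)
    end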
diff --git a/src/types.jl b/src/types.jl
index 3e98def7c..e18e57318 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -308,8 +308,8 @@ termination_code(mvr::MultivariateOptimizationResults) = mvr.termination_code

 # pick_best_x and pick_best_f are used to pick the minimizer if we stopped because
 # f increased and we didn't allow it
-pick_best_x(f_increased, state) = f_increased ? state.x_previous : state.x
-pick_best_f(f_increased, state, d) = f_increased ? state.f_x_previous : value(d)
+pick_best_x(f_increased::Bool, state::AbstractOptimizerState) = f_increased ? state.x_previous : state.x
+pick_best_f(f_increased::Bool, state::AbstractOptimizerState) = f_increased ? state.f_x_previous : state.f_x

 function Base.show(io::IO, t::OptimizationState)
     @printf io "%6d %14e %14e\n" t.iteration t.value t.g_norm
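[Editor's note] The two one-liners above encode the rule described in their comment; a free-standing illustration (hypothetical helper, not Optim API):

    # If the last step increased f and increases were not allowed, report the
    # previous iterate as the minimizer; otherwise report the current one.
    pick_best(f_increased, x, x_previous, f_x, f_x_previous) =
        f_increased ? (x_previous, f_x_previous) : (x, f_x)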
diff --git a/src/utilities/assess_convergence.jl b/src/utilities/assess_convergence.jl
index 5fcd3ed47..35e87ff3e 100644
--- a/src/utilities/assess_convergence.jl
+++ b/src/utilities/assess_convergence.jl
@@ -1,21 +1,20 @@
-f_abschange(d::AbstractObjective, state) = f_abschange(value(d), state.f_x_previous)
-f_abschange(f_x::T, f_x_previous) where {T} = abs(f_x - f_x_previous)
-f_relchange(d::AbstractObjective, state) = f_relchange(value(d), state.f_x_previous)
-f_relchange(f_x::T, f_x_previous) where {T} = abs(f_x - f_x_previous) / abs(f_x)
+f_abschange(state::AbstractOptimizerState) = f_abschange(state.f_x, state.f_x_previous)
+f_abschange(f_x, f_x_previous) = abs(f_x - f_x_previous)
+f_relchange(state::AbstractOptimizerState) = f_relchange(state.f_x, state.f_x_previous)
+f_relchange(f_x, f_x_previous) = abs(f_x - f_x_previous) / abs(f_x)

-x_abschange(state) = x_abschange(state.x, state.x_previous)
+x_abschange(state::AbstractOptimizerState) = x_abschange(state.x, state.x_previous)
 x_abschange(x, x_previous) = maxdiff(x, x_previous)
-x_relchange(state) = x_relchange(state.x, state.x_previous)
+x_relchange(state::AbstractOptimizerState) = x_relchange(state.x, state.x_previous)
 x_relchange(x, x_previous) = maxdiff(x, x_previous) / Base.maximum(abs, x) # Base.maximum !== maximum

-g_residual(d, state) = g_residual(d)
-g_residual(d, state::NelderMeadState) = state.nm_x
-g_residual(d::AbstractObjective) = g_residual(gradient(d))
-g_residual(d::NonDifferentiable) = convert(typeof(value(d)), NaN)
-g_residual(g) = Base.maximum(abs, g) # Base.maximum !== maximum
-gradient_convergence_assessment(state::AbstractOptimizerState, d, options) =
-    g_residual(gradient(d)) ≤ options.g_abstol
-gradient_convergence_assessment(state::ZerothOrderState, d, options) = false
+g_residual(state::NelderMeadState) = state.nm_x
+g_residual(state::ZerothOrderState) = oftype(state.f_x, NaN)
+g_residual(state::AbstractOptimizerState) = g_residual(state.g_x)
+g_residual(g_x::AbstractArray) = Base.maximum(abs, g_x) # Base.maximum !== maximum
+gradient_convergence_assessment(state::AbstractOptimizerState, options::Options) =
+    g_residual(state) ≤ options.g_abstol
+gradient_convergence_assessment(state::ZerothOrderState, options::Options) = false

 # Default function for convergence assessment used by
 # AcceleratedGradientDescentState, BFGSState, ConjugateGradientState,
@@ -24,9 +23,9 @@ function assess_convergence(state::AbstractOptimizerState, d, options::Options)
     assess_convergence(
         state.x,
         state.x_previous,
-        value(d),
+        state.f_x,
         state.f_x_previous,
-        gradient(d),
+        state.g_x,
         options.x_abstol,
         options.x_reltol,
         options.f_abstol,
@@ -39,65 +38,41 @@ function assess_convergence(
     x_previous,
     f_x,
     f_x_previous,
-    gx,
+    g_x,
     x_abstol,
     x_reltol,
     f_abstol,
     f_reltol,
     g_abstol,
 )
-    x_converged, f_converged, f_increased, g_converged = false, false, false, false
-    # TODO: Create function for x_convergence_assessment
-    if x_abschange(x, x_previous) ≤ x_abstol
-        x_converged = true
-    end
-    if x_abschange(x, x_previous) ≤ x_reltol * Base.maximum(abs, x) # Base.maximum !== maximum
-        x_converged = true
-    end
+    x_converged = x_abschange(x, x_previous) ≤ x_abstol ||
+        x_abschange(x, x_previous) ≤ x_reltol * Base.maximum(abs, x) # Base.maximum !== maximum

     # Relative Tolerance
     # TODO: Create function for f_convergence_assessment
-    if f_abschange(f_x, f_x_previous) ≤ f_abstol
-        f_converged = true
-    end
-
-    if f_abschange(f_x, f_x_previous) ≤ f_reltol * abs(f_x)
-        f_converged = true
-    end
+    f_converged = f_abschange(f_x, f_x_previous) ≤ f_abstol ||
+        f_abschange(f_x, f_x_previous) ≤ f_reltol * abs(f_x)

-    if f_x > f_x_previous
-        f_increased = true
-    end
+    f_increased = f_x > f_x_previous

-    g_converged = g_residual(gx) ≤ g_abstol
+    g_converged = g_residual(g_x) ≤ g_abstol # Base.maximum !== maximum

     return x_converged, f_converged, g_converged, f_increased
 end

 # Used by Fminbox and IPNewton
-function assess_convergence(x, x_previous, f_x, f_x_previous, g, x_tol, f_tol, g_tol)
-
-    x_converged, f_converged, f_increased, g_converged = false, false, false, false
-
-    if x_abschange(x, x_previous) ≤ x_tol
-        x_converged = true
-    end
+function assess_convergence(x, x_previous, f_x, f_x_previous, g_x, x_tol, f_tol, g_tol)
+    x_converged = x_abschange(x, x_previous) ≤ x_tol

     # Absolute Tolerance
     # if abs(f_x - f_x_previous) < f_tol
     # Relative Tolerance
-    if f_abschange(f_x, f_x_previous) ≤ f_tol * abs(f_x)
-        f_converged = true
-    end
+    f_converged = f_abschange(f_x, f_x_previous) ≤ f_tol * abs(f_x)

-    if f_x > f_x_previous
-        f_increased = true
-    end
+    f_increased = f_x > f_x_previous

-    if g_residual(g) ≤ g_tol
-        g_converged = true
-    end
+    g_converged = g_residual(g_x) ≤ g_tol # Base.maximum !== maximum

     return x_converged, f_converged, g_converged, f_increased
 end
diff --git a/src/utilities/generic.jl b/src/utilities/generic.jl
index fe26689d6..125b71673 100644
--- a/src/utilities/generic.jl
+++ b/src/utilities/generic.jl
@@ -13,7 +13,7 @@ end
 @def initial_linesearch begin
     (
-        similar(initial_x), # Buffer of x for line search in state.x_ls
-        real(one(T)),
+        fill!(similar(initial_x), NaN), # Buffer of x for line search in state.x_ls
+        real(oneunit(eltype(initial_x))),
     ) # Keep track of step size in state.alpha
 end
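[Editor's note] After this refactor each convergence flag is a single comparison; a worked example with illustrative numbers and tolerances:

    x, x_previous = [1.0, 2.0], [1.0, 2.1]
    f_x, f_x_previous = 3.0, 3.0005
    maximum(abs.(x .- x_previous)) <= 1e-8       # x_converged: false (0.1 > 1e-8)
    abs(f_x - f_x_previous) <= 1e-3 * abs(f_x)   # f_converged: true (5.0e-4 <= 3.0e-3)
    f_x > f_x_previous                           # f_increased: false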
diff --git a/src/utilities/perform_linesearch.jl b/src/utilities/perform_linesearch.jl
index fe2e223de..b944a21f8 100644
--- a/src/utilities/perform_linesearch.jl
+++ b/src/utilities/perform_linesearch.jl
@@ -5,11 +5,8 @@
 reset_search_direction!(state, d, method) = false # no-op
 _alphaguess(a) = a
 _alphaguess(a::Number) = LineSearches.InitialStatic(alpha = a)

-# Note that for these resets we're using `gradient(d)` but we don't need to use
-# project_tangent! here, because we already did that inplace on gradient(d) after
-# the last evaluation (we basically just always do it)
-function reset_search_direction!(state, d, method::BFGS)
-    gx = gradient!(d, state.x)
+function reset_search_direction!(state::BFGSState, method::BFGS)
+    gx = state.g_x
     if method.initial_invH === nothing
         n = length(state.x)
         T = typeof(state.invH)
@@ -27,27 +24,25 @@
     return true
 end

-function reset_search_direction!(state, d, method::LBFGS)
+function reset_search_direction!(state::LBFGSState, ::LBFGS)
     state.pseudo_iteration = 1
-    state.s .= .-gradient!(d, state.x)
+    state.s .= .-state.g_x
     return true
 end

-function reset_search_direction!(state, d, method::ConjugateGradient)
+function reset_search_direction!(state::ConjugateGradientState, ::ConjugateGradient)
     state.s .= .-state.pg
     return true
 end

 function perform_linesearch!(state, method, d)
     # Calculate search direction dphi0
-    fx = value_gradient!(d, state.x)
-    gx = gradient(d)
-    dphi_0 = real(dot(gx, state.s))
+    dphi_0 = real(dot(state.g_x, state.s))

     # reset the direction if it becomes corrupted
-    if dphi_0 >= zero(dphi_0) && reset_search_direction!(state, d, method)
-        dphi_0 = real(dot(gx, state.s)) # update after direction reset
+    if dphi_0 >= zero(dphi_0) && reset_search_direction!(state, method)
+        dphi_0 = real(dot(state.g_x, state.s)) # update after direction reset
     end

-    phi_0 = value!(d, state.x)
+    phi_0 = state.f_x

     # Guess an alpha
     method.alphaguess!(method.linesearch!, state, phi_0, dphi_0, d)
@@ -55,6 +50,9 @@ function perform_linesearch!(state, method, d)
     # Store current x and f(x) for next iteration
     state.f_x_previous = phi_0
     copyto!(state.x_previous, state.x)
+    if hasproperty(state, :g_x_previous)
+        copyto!(state.g_x_previous, state.g_x)
+    end

     # Perform line search; catch LineSearchException to allow graceful exit
     try
diff --git a/src/utilities/trace.jl b/src/utilities/trace.jl
index 7a46a2916..34e8d6160 100644
--- a/src/utilities/trace.jl
+++ b/src/utilities/trace.jl
@@ -5,7 +5,7 @@ function common_trace!(
     d,
     state,
     iteration::Integer,
-    method::FirstOrderOptimizer,
+    ::FirstOrderOptimizer,
     options::Options,
     curr_time = time(),
 )
@@ -13,14 +13,14 @@ function common_trace!(
     dt["time"] = curr_time
     if options.extended_trace
         dt["x"] = copy(state.x)
-        dt["g(x)"] = copy(gradient(d))
+        dt["g(x)"] = copy(state.g_x)
         dt["Current step size"] = state.alpha
     end
-    g_norm = Base.maximum(abs, gradient(d)) # Base.maximum !== maximum
+    g_norm = Base.maximum(abs, state.g_x) # Base.maximum !== maximum
     update!(
         tr,
         iteration,
-        value(d),
+        state.f_x,
         g_norm,
         dt,
         options.store_trace,
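[Editor's note] `perform_linesearch!` above now reads φ(0) and φ′(0) from the cached state; the reset guards against non-descent directions. A minimal sketch of that guard (hypothetical helper, not Optim API):

    using LinearAlgebra

    # s is a descent direction iff dphi(0) = real(dot(g, s)) < 0;
    # otherwise fall back to steepest descent.
    function ensure_descent!(s, g)
        dphi_0 = real(dot(g, s))
        if dphi_0 >= 0
            s .= .-g                 # steepest-descent reset
            dphi_0 = real(dot(g, s))
        end
        return dphi_0
    end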
diff --git a/test/general/convergence.jl b/test/general/convergence.jl
index 701a624a9..4785040e4 100644
--- a/test/general/convergence.jl
+++ b/test/general/convergence.jl
@@ -3,7 +3,7 @@ mutable struct DummyState <: Optim.AbstractOptimizerState
     x_previous::Any
     f_x::Any
     f_x_previous::Any
-    g::Any
+    g_x::Any
 end

 mutable struct DummyStateZeroth <: Optim.ZerothOrderState
     x_previous::Any
     f_x::Any
     f_x_previous::Any
-    g::Any
+    g_x::Any
 end

 mutable struct DummyOptions
@@ -63,26 +63,32 @@ mutable struct DummyMethodZeroth <: Optim.ZerothOrderOptimizer end

     ## initial_convergence and gradient_convergence_assessment
     ds = DummyState(x1, x0, f1, f0, g)
-    dOpt = DummyOptions(x_tol, f_tol, g_tol, g_abstol)
+    opt = Optim.Options(; x_abstol = x_tol, f_abstol = f_tol, g_tol, g_abstol)
     dm = DummyMethod()

     # >= First Order
     d = Optim.OnceDifferentiable(x -> sum(abs2.(x)), zeros(2))
-    Optim.gradient!(d, ones(2))
-    @test !Optim.gradient_convergence_assessment(ds, d, dOpt)
-    Optim.gradient!(d, zeros(2))
-    @test Optim.gradient_convergence_assessment(ds, d, dOpt)
-
-    @test Optim.initial_convergence(d, ds, dm, ones(2), dOpt) == (false, false)
-    @test Optim.initial_convergence(d, ds, dm, zeros(2), dOpt) == (true, false)
+    x = ones(2)
+    f_x, g_x = Optim.value_gradient!(d, x)
+    ds = DummyState(x, x0, f_x, f0, g_x)
+    @test !Optim.gradient_convergence_assessment(ds, opt)
+    @test Optim.initial_convergence(ds, opt) == (false, false)
+    x = zeros(2)
+    f_x, g_x = Optim.value_gradient!(d, x)
+    ds = DummyState(x, x0, f_x, f0, g_x)
+    @test Optim.gradient_convergence_assessment(ds, opt)
+    @test Optim.initial_convergence(ds, opt) == (true, false)

     # Zeroth order methods have no gradient -> returns false by default
     ds = DummyStateZeroth(x1, x0, f1, f0, g)
     dm = DummyMethodZeroth()
-    @test !Optim.gradient_convergence_assessment(ds, d, dOpt)
-    @test Optim.initial_convergence(d, ds, dm, ones(2), dOpt) == (false, false)
+    x = ones(2)
+    f_x, g_x = Optim.value_gradient!(d, x)
+    ds = DummyStateZeroth(x, x0, f_x, f0, g_x)
+    @test !Optim.gradient_convergence_assessment(ds, opt)
+    @test Optim.initial_convergence(ds, opt) == (false, false)

     # should check all other methods as well
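[Editor's note] The tests above rely on `NLSolversBase.value_gradient!` returning the `(value, gradient)` pair and on the `f_calls`/`g_calls` counters; a usage sketch mirroring them (assumes the NLSolversBase version targeted by this patch, with ADTypes-style `autodiff` as used in these tests):

    using ADTypes, NLSolversBase
    d = OnceDifferentiable(x -> sum(abs2, x), zeros(2); autodiff = AutoForwardDiff())
    f_x, g_x = NLSolversBase.value_gradient!(d, [1.0, 2.0])   # (5.0, [2.0, 4.0])
    NLSolversBase.f_calls(d), NLSolversBase.g_calls(d)        # (1, 1)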
diff --git a/test/general/objective_types.jl b/test/general/objective_types.jl
index f197c515b..0d4474666 100644
--- a/test/general/objective_types.jl
+++ b/test/general/objective_types.jl
@@ -7,12 +7,12 @@
         odad1 = T(x -> 5.0, rand(1); autodiff = AutoFiniteDiff(; fdtype = Val(:central)))
         odad2 = T(x -> 5.0, rand(1); autodiff = AutoForwardDiff())
         odad3 = T(x -> 5.0, rand(1); autodiff = AutoReverseDiff())
-        Optim.gradient!(odad1, rand(1))
-        Optim.gradient!(odad2, rand(1))
-        Optim.gradient!(odad3, rand(1))
-        @test Optim.gradient(odad1) == [0.0]
-        @test Optim.gradient(odad2) == [0.0]
-        @test Optim.gradient(odad3) == [0.0]
+        NLSolversBase.gradient!(odad1, rand(1))
+        NLSolversBase.gradient!(odad2, rand(1))
+        NLSolversBase.gradient!(odad3, rand(1))
+        @test NLSolversBase.gradient(odad1) == [0.0]
+        @test NLSolversBase.gradient(odad2) == [0.0]
+        @test NLSolversBase.gradient(odad3) == [0.0]
     end

     for a in (1.0, 5.0)
@@ -20,33 +20,33 @@
        odad1 = OnceDifferentiable(x -> a * x[1], xa; autodiff = AutoFiniteDiff(; fdtype = Val(:central)))
         odad2 = OnceDifferentiable(x -> a * x[1], xa; autodiff = AutoForwardDiff())
         odad3 = OnceDifferentiable(x -> a * x[1], xa; autodiff = AutoReverseDiff())
-        Optim.gradient!(odad1, xa)
-        Optim.gradient!(odad2, xa)
-        Optim.gradient!(odad3, xa)
-        @test Optim.gradient(odad1) ≈ [a]
-        @test Optim.gradient(odad2) == [a]
-        @test Optim.gradient(odad3) == [a]
+        NLSolversBase.gradient!(odad1, xa)
+        NLSolversBase.gradient!(odad2, xa)
+        NLSolversBase.gradient!(odad3, xa)
+        @test NLSolversBase.gradient(odad1) ≈ [a]
+        @test NLSolversBase.gradient(odad2) == [a]
+        @test NLSolversBase.gradient(odad3) == [a]
     end
     for a in (1.0, 5.0)
         xa = rand(1)
         odad1 = OnceDifferentiable(x -> a * x[1]^2, xa; autodiff = AutoFiniteDiff(; fdtype = Val(:central)))
         odad2 = OnceDifferentiable(x -> a * x[1]^2, xa; autodiff = AutoForwardDiff())
         odad3 = OnceDifferentiable(x -> a * x[1]^2, xa; autodiff = AutoReverseDiff())
-        Optim.gradient!(odad1, xa)
-        Optim.gradient!(odad2, xa)
-        Optim.gradient!(odad3, xa)
-        @test Optim.gradient(odad1) ≈ 2.0 * a * xa
-        @test Optim.gradient(odad2) == 2.0 * a * xa
-        @test Optim.gradient(odad3) == 2.0 * a * xa
+        NLSolversBase.gradient!(odad1, xa)
+        NLSolversBase.gradient!(odad2, xa)
+        NLSolversBase.gradient!(odad3, xa)
+        @test NLSolversBase.gradient(odad1) ≈ 2.0 * a * xa
+        @test NLSolversBase.gradient(odad2) == 2.0 * a * xa
+        @test NLSolversBase.gradient(odad3) == 2.0 * a * xa
     end
     for dtype in (OnceDifferentiable, TwiceDifferentiable)
         for autodiff in (AutoFiniteDiff(; fdtype = Val(:central)), AutoForwardDiff(), AutoReverseDiff())
             differentiable = dtype(x -> sum(x), rand(2); autodiff = autodiff)
-            Optim.value(differentiable)
-            Optim.value!(differentiable, rand(2))
-            Optim.value_gradient!(differentiable, rand(2))
-            Optim.gradient!(differentiable, rand(2))
-            dtype == TwiceDifferentiable && Optim.hessian!(differentiable, rand(2))
+            NLSolversBase.value(differentiable)
+            NLSolversBase.value!(differentiable, rand(2))
+            NLSolversBase.value_gradient!(differentiable, rand(2))
+            NLSolversBase.gradient!(differentiable, rand(2))
+            dtype == TwiceDifferentiable && NLSolversBase.hessian!(differentiable, rand(2))
         end
     end
 end
@@ -54,18 +54,18 @@
     a = 3.0
     x_seed = rand(1)
     odad1 = OnceDifferentiable(x -> a * x[1]^2, x_seed)
-    Optim.value_gradient!(odad1, x_seed)
-    @test Optim.gradient(odad1) ≈ 2 .* a .* (x_seed)
+    NLSolversBase.value_gradient!(odad1, x_seed)
+    @test NLSolversBase.gradient(odad1) ≈ 2 .* a .* (x_seed)
     @testset "call counters" begin
-        @test Optim.f_calls(odad1) == 1
-        @test Optim.g_calls(odad1) == 1
-        @test Optim.h_calls(odad1) == 0
-        Optim.value_gradient!(odad1, x_seed .+ 1.0)
-        @test Optim.f_calls(odad1) == 2
-        @test Optim.g_calls(odad1) == 2
-        @test Optim.h_calls(odad1) == 0
+        @test NLSolversBase.f_calls(odad1) == 1
+        @test NLSolversBase.g_calls(odad1) == 1
+        @test NLSolversBase.h_calls(odad1) == 0
+        NLSolversBase.value_gradient!(odad1, x_seed .+ 1.0)
+        @test NLSolversBase.f_calls(odad1) == 2
+        @test NLSolversBase.g_calls(odad1) == 2
+        @test NLSolversBase.h_calls(odad1) == 0
     end
-    @test Optim.gradient(odad1) ≈ 2 .* a .* (x_seed .+ 1.0)
+    @test NLSolversBase.gradient(odad1) ≈ 2 .* a .* (x_seed .+ 1.0)
 end
 end
diff --git a/test/multivariate/solvers/constrained/ipnewton/constraints.jl b/test/multivariate/solvers/constrained/ipnewton/constraints.jl
index 917128529..a440855dd 100644
--- a/test/multivariate/solvers/constrained/ipnewton/constraints.jl
+++ b/test/multivariate/solvers/constrained/ipnewton/constraints.jl
@@ -2,8 +2,7 @@
 # Utility function for hand-setting the μ parameter
 function setstate!(state, μ, d, constraints, method)
     state.μ = μ
-    Optim.update_fg!(d, constraints, state, method)
-    Optim.update_h!(d, constraints, state, method)
+    Optim.update_fgh!(d, constraints, state, method)
 end

 @testset "Bounds parsing" begin
@@ -141,19 +140,22 @@
         bounds = Optim.ConstraintBounds(Float64[], Float64[], Float64[], Float64[])
         bstate = Optim.BarrierStateVars(bounds, x)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(
+        f_x = NLSolversBase.value(dg, x)
+        g_x = NLSolversBase.gradient(dg, x)
+        L, ev = Optim.lagrangian_fg!(
             gx,
             bgrad,
-            dg,
             bounds,
             x,
+            f_x,
+            g_x,
             Float64[],
             Array{Float64}(undef, 0, 0),
             bstate,
             μ,
         )
         @test f_x == L == dg.f(x)
-        @test gx == H * x
+        @test gx == g_x == H * x
         constraints = TwiceDifferentiableConstraints(
             (c, x) -> nothing,
             (J, x) -> nothing,
@@ -172,7 +174,9 @@
         bstate = Optim.BarrierStateVars(bounds)
         rand!(bstate.λxE)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(gx, bgrad, d0, bounds, x, c, J, bstate, μ)
+        f_x = NLSolversBase.value(d0, x)
+        g_x = NLSolversBase.gradient(d0, x)
+        L, ev = Optim.lagrangian_fg!(gx, bgrad, bounds, x, f_x, g_x, c, J, bstate, μ)
         @test f_x == 0
         @test L ≈ dot(bstate.λxE, xbar - x)
         @test gx == -bstate.λxE
@@ -213,12 +217,15 @@
         bstate = Optim.BarrierStateVars(bounds, y)
         rand!(bstate.λx)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(
+        f_y = NLSolversBase.value(d0, y)
+        g_y = NLSolversBase.gradient(d0, y)
+        L, ev = Optim.lagrangian_fg!(
             gx,
             bgrad,
-            d0,
             bounds,
             y,
+            f_y,
+            g_y,
             Float64[],
             Array{Float64}(undef, 0, 0),
             bstate,
@@ -254,12 +261,15 @@
         rand!(bstate.slack_x) # intentionally displace from the correct value
         rand!(bstate.λx)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(
+        f_x = NLSolversBase.value(d0, x)
+        g_x = NLSolversBase.gradient(d0, x)
+        L, ev = Optim.lagrangian_fg!(
             gx,
             bgrad,
-            d0,
             bounds,
             x,
+            f_x,
+            g_x,
             Float64[],
             Array{Float64}(undef, 0, 0),
             bstate,
@@ -336,7 +346,9 @@
         bstate = Optim.BarrierStateVars(bounds, x, c)
         rand!(bstate.λcE)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(gx, bgrad, d0, bounds, x, c, J, bstate, μ)
+        f_x = NLSolversBase.value(d0, x)
+        g_x = NLSolversBase.gradient(d0, x)
+        L, ev = Optim.lagrangian_fg!(gx, bgrad, bounds, x, f_x, g_x, c, J, bstate, μ)
         @test f_x == 0
         @test L ≈ dot(bstate.λcE, cbar - c)
         @test gx ≈ -J' * bstate.λcE
@@ -361,7 +373,9 @@
         rand!(bstate.slack_c) # intentionally displace from the correct value
         rand!(bstate.λc)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(gx, bgrad, d0, bounds, x, c, J, bstate, μ)
+        f_x = NLSolversBase.value(d0, x)
+        g_x = NLSolversBase.gradient(d0, x)
+        L, ev = Optim.lagrangian_fg!(gx, bgrad, bounds, x, f_x, g_x, c, J, bstate, μ)
         @test f_x == 0
         Ltarget =
             -μ * sum(log, bstate.slack_c) +
@@ -415,8 +429,8 @@
     constraints = TwiceDifferentiableConstraints([0.5, 0.0, -Inf, -Inf], [Inf, Inf, 1.0, 0.8])
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
-    @test norm(f_g - state.g) ≈ 0.01 * norm(f_g)
+    Optim.update_fgh!(d, constraints, state, method)
+    @test norm(f_g - state.g_L_x) ≈ 0.01 * norm(f_g)
     # Nonlinear inequalities
     constraints = TwiceDifferentiableConstraints(
         (c, x) -> (c[1] = x[1] * x[2]; c[2] = 3 * x[3] + x[4]^2),
@@ -429,8 +443,8 @@
     )
     @test Optim.isinterior(constraints, x)
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
-    @test norm(f_g - state.g) ≈ 0.01 * norm(f_g)
+    Optim.update_fgh!(d, constraints, state, method)
+    @test norm(f_g - state.g_L_x) ≈ 0.01 * norm(f_g)
     # Mixed equalities and inequalities
     constraints = TwiceDifferentiableConstraints(
         (c, x) -> (c[1] = x[1] * x[2]; c[2] = 3 * x[3] + x[4]^2),
@@ -443,14 +457,14 @@
     )
     @test Optim.isfeasible(constraints, x)
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
+    Optim.update_fgh!(d, constraints, state, method)
     J = zeros(2, 4)
     constraints.jacobian!(J, x)
     eqnormal = vec(J[1, :])
     eqnormal = eqnormal / norm(eqnormal)
-    @test abs(dot(state.g, eqnormal)) < 1e-12 # orthogonal to equality constraint
+    @test abs(dot(state.g_L_x, eqnormal)) < 1e-12 # orthogonal to equality constraint
     Pfg = f_g - dot(f_g, eqnormal) * eqnormal
-    Pg = state.g - dot(state.g, eqnormal) * eqnormal
+    Pg = state.g_L_x - dot(state.g_L_x, eqnormal) * eqnormal
     @test norm(Pfg - Pg) ≈ 0.01 * norm(Pfg)
     ## An objective function with a nonzero hessian
     hd = [1.0, 100.0, 0.01, 2.0] # diagonal terms of hessian
@@ -467,10 +481,10 @@
     constraints = TwiceDifferentiableConstraints([0.5, 0.0, -Inf, -Inf], [Inf, Inf, 1.0, 0.8])
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
-    @test abs(dot(gx, state.g) / dot(gx, gx) - 1) <= 0.011
-    Optim.update_h!(d, constraints, state, method)
-    @test abs(dot(gx, state.H * gx) / dot(gx, hx * gx) - 1) <= 0.011
+    Optim.update_fgh!(d, constraints, state, method)
+    @test abs(dot(gx, state.g_L_x) / dot(gx, gx) - 1) <= 0.011
+    Optim.update_fgh!(d, constraints, state, method)
+    @test abs(dot(gx, state.H_L_x * gx) / dot(gx, hx * gx) - 1) <= 0.011
     # Nonlinear inequalities
     constraints = TwiceDifferentiableConstraints(
         (c, x) -> (c[1] = x[1] * x[2]; c[2] = 3 * x[3] + x[4]^2),
@@ -483,10 +497,10 @@
     )
     @test Optim.isinterior(constraints, x)
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
-    @test abs(dot(gx, state.g) / dot(gx, gx) - 1) <= 0.011
-    Optim.update_h!(d, constraints, state, method)
-    @test abs(dot(gx, state.H * gx) / dot(gx, hx * gx) - 1) <= 0.011
+    Optim.update_fgh!(d, constraints, state, method)
+    @test abs(dot(gx, state.g_L_x) / dot(gx, gx) - 1) <= 0.011
+    Optim.update_fgh!(d, constraints, state, method)
+    @test abs(dot(gx, state.H_L_x * gx) / dot(gx, hx * gx) - 1) <= 0.011
     # Mixed equalities and inequalities
     constraints = TwiceDifferentiableConstraints(
         (c, x) -> (c[1] = x[1] * x[2]; c[2] = 3 * x[3] + x[4]^2),
@@ -499,16 +513,16 @@
     )
     @test Optim.isfeasible(constraints, x)
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
+    Optim.update_fgh!(d, constraints, state, method)
     J = zeros(2, 4)
     constraints.jacobian!(J, x)
     eqnormal = vec(J[1, :])
     eqnormal = eqnormal / norm(eqnormal)
-    @test abs(dot(state.g, eqnormal)) < 1e-12 # orthogonal to equality constraint
+    @test abs(dot(state.g_L_x, eqnormal)) < 1e-12 # orthogonal to equality constraint
     Pgx = gx - dot(gx, eqnormal) * eqnormal
-    @test abs(dot(Pgx, state.g) / dot(Pgx, Pgx) - 1) <= 0.011
-    Optim.update_h!(d, constraints, state, method)
-    @test abs(dot(Pgx, state.H * Pgx) / dot(Pgx, hx * Pgx) - 1) <= 0.011
+    @test abs(dot(Pgx, state.g_L_x) / dot(Pgx, Pgx) - 1) <= 0.011
+    Optim.update_fgh!(d, constraints, state, method)
+    @test abs(dot(Pgx, state.H_L_x * Pgx) / dot(Pgx, hx * Pgx) - 1) <= 0.011
 end

 @testset "IPNewton step" begin
@@ -609,8 +623,7 @@
     for i = 1:10
         Optim.update_state!(d, constraints, state, method, options)
         state.μ = μ
-        Optim.update_fg!(d, constraints, state, method)
-        Optim.update_h!(d, constraints, state, method)
+        Optim.update_fgh!(d, constraints, state, method)
     end
     @test isapprox(first(state.x), μ / F, rtol = 1e-4)
     # |x| ≥ 1, and check that we get slack precision better than eps(1.0)
@@ -626,8 +639,7 @@
     for i = 1:10
         Optim.update_state!(d, constraints, state, method, options)
         state.μ = μ
-        Optim.update_fg!(d, constraints, state, method)
-        Optim.update_h!(d, constraints, state, method)
+        Optim.update_fgh!(d, constraints, state, method)
     end
     @test state.x[1] == σ
     @test state.bstate.slack_x[1] < eps(float(σ))
@@ -650,8 +662,7 @@
         Optim.initial_state(method, options, d, constraints, [(1 + eps(1.0)) * σ])
     for i = 1:10
         Optim.update_state!(d, constraints, state, method, options)
-        Optim.update_fg!(d, constraints, state, method)
-        Optim.update_h!(d, constraints, state, method)
+        Optim.update_fgh!(d, constraints, state, method)
     end
     @test state.x[1] ≈ σ
     @test state.bstate.slack_c[1] < eps(float(σ))
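[Editor's note] For orientation, the quantity these tests reconstruct by hand is a barrier Lagrangian. A schematic sketch for a single inequality c(x) >= 0 with slack s and multiplier λ (sign conventions follow the Ltarget expressions above; the helper name is illustrative, not Optim's internals):

    # Schematic barrier-Lagrangian value for one slack/multiplier pair:
    #   L = f(x) - μ * log(s) + λ * (s - c(x))
    barrier_lagrangian(f_x, c_x, s, λ, μ) = f_x - μ * log(s) + λ * (s - c_x)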
diff --git a/test/multivariate/solvers/first_order/ngmres.jl b/test/multivariate/solvers/first_order/ngmres.jl
index 301d7b094..93efcc31d 100644
--- a/test/multivariate/solvers/first_order/ngmres.jl
+++ b/test/multivariate/solvers/first_order/ngmres.jl
@@ -52,8 +52,8 @@ using Optim, Test
     # The bounds are due to different systems behaving differently
     # TODO: is it a bad idea to hardcode these?
     @test 64 < Optim.iterations(res) < 100
-    @test 234 < Optim.f_calls(res) < 310
-    @test 234 < Optim.g_calls(res) < 310
+    @test 234 <= Optim.f_calls(res) < 310
+    @test 234 <= Optim.g_calls(res) < 310
     @test Optim.minimum(res) < 1e-10
     @test_throws AssertionError method(