
Commit 9290b55

Improve NewtonTrustRegion (#1215)

* Fix implementation of NLSolversBase API (#1213) (cherry picked from commit 2f1e619)
* Improve `NewtonTrustRegion`

1 parent 9580452

1 file changed: +82 −52 lines

src/multivariate/solvers/second_order/newton_trust_region.jl (82 additions, 52 deletions)
@@ -40,7 +40,7 @@ end
 function calc_p!(lambda::T, min_i, n, qg, H_eig, p) where {T}
     fill!(p, zero(T))
     for i = min_i:n
-        p[:] -= qg[i] / (H_eig.values[i] + lambda) * H_eig.vectors[:, i]
+        LinearAlgebra.axpy!(-qg[i] / (H_eig.values[i] + lambda), view(H_eig.vectors, :, i), p)
     end
     return nothing
 end
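
For reference (not part of the diff): `LinearAlgebra.axpy!(a, x, y)` overwrites `y` with `a*x + y` in place, so the rewritten loop adds each eigenvector contribution without the temporaries that the broadcast form allocates. A minimal sketch:

```julia
using LinearAlgebra

# axpy!(a, x, y) computes y .= a .* x .+ y in place, allocating nothing;
# the old broadcast form materialized both the column slice and the
# scaled copy on every loop iteration.
p = zeros(3)
v = [1.0, 2.0, 3.0]
axpy!(-2.0, v, p)
p == [-2.0, -4.0, -6.0]  # true
```
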
@@ -59,7 +59,7 @@ function initial_safeguards(H, gr, delta, lambda)
     λL = max(T(0), λS, gr_norm / delta - Hnorm)
     λU = gr_norm / delta + Hnorm
     # p. 558
-    lambda = min(max(lambda, λL), λU)
+    lambda = clamp(lambda, λL, λU)
     if lambda ≤ λS
         lambda = max(T(1) / 1000 * λU, sqrt(λL * λU))
     end
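
`clamp(x, lo, hi)` is the standard-library spelling of the nested `min`/`max` it replaces here:

```julia
# clamp(x, lo, hi) == min(max(x, lo), hi)
clamp(0.5, 1.0, 2.0)  # 1.0 (below the interval)
clamp(1.5, 1.0, 2.0)  # 1.5 (inside)
clamp(2.5, 1.0, 2.0)  # 2.0 (above)
```
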
@@ -160,7 +160,7 @@ function solve_tr_subproblem!(gr, H, delta, s; tolerance = 1e-10, max_iters = 5)
         # I don't think it matters which eigenvector we pick so take
         # the first.
         calc_p!(lambda, min_i, n, qg, H_eig, s)
-        s[:] = -s + tau * H_eig.vectors[:, 1]
+        LinearAlgebra.axpby!(tau, view(H_eig.vectors, :, 1), -1, s)
     end
 end
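
Similarly, `LinearAlgebra.axpby!(a, x, b, y)` overwrites `y` with `a*x + b*y`, so with `b = -1` it computes `tau * v - s` in place of the allocating `s[:] = -s + tau * ...`. A minimal sketch:

```julia
using LinearAlgebra

s = [1.0, 2.0]
v1 = [1.0, 0.0]
axpby!(3.0, v1, -1, s)  # s = 3 * v1 - s, updated in place
s == [2.0, -2.0]        # true
```
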

@@ -174,8 +174,8 @@ function solve_tr_subproblem!(gr, H, delta, s; tolerance = 1e-10, max_iters = 5)
     for iter = 1:max_iters
         lambda_previous = lambda

-        for i = 1:n
-            H_ridged[i, i] = H[i, i] + lambda
+        for i in diagind(H_ridged)
+            H_ridged[i] = H[i] + lambda
         end

         F = cholesky(Hermitian(H_ridged), check = false)
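
`diagind` returns the linear indices of the main diagonal, so a single index `i` addresses `H[i, i]` and the ridge update no longer needs the `(i, i)` pairs. A minimal sketch:

```julia
using LinearAlgebra

H = [1.0 2.0; 3.0 4.0]
collect(diagind(H))    # [1, 4]: linear indices of the main diagonal
H[diagind(H)] .+= 0.5  # equivalent ridge update in one expression
```
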
@@ -196,7 +196,7 @@ function solve_tr_subproblem!(gr, H, delta, s; tolerance = 1e-10, max_iters = 5)
         # Check that lambda is not less than lambda_lb, and if so, go
         # half the way to lambda_lb.
         if lambda < lambda_lb
-            lambda = 0.5 * (lambda_previous - lambda_lb) + lambda_lb
+            lambda = (lambda_previous + lambda_lb) / 2
         end

         if abs(lambda - lambda_previous) < tolerance
@@ -207,7 +207,7 @@ function solve_tr_subproblem!(gr, H, delta, s; tolerance = 1e-10, max_iters = 5)
         end
     end

-    m = dot(gr, s) + 0.5 * dot(s, H * s)
+    m = dot(gr, s) + dot(s, H, s) / 2

     return m, interior, lambda, hard_case, reached_solution
 end
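
The three-argument `dot(x, A, y)` evaluates `x' * A * y` without materializing the intermediate vector `A * y`:

```julia
using LinearAlgebra

s = [1.0, 2.0]
H = [2.0 0.0; 0.0 3.0]
dot(s, H, s)                   # 14.0, no temporary H * s allocated
dot(s, H, s) == dot(s, H * s)  # true
```
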
@@ -220,6 +220,37 @@ struct NewtonTrustRegion{T<:Real} <: SecondOrderOptimizer
     rho_lower::T
     rho_upper::T
     use_fg::Bool
+
+    function NewtonTrustRegion(
+        initial_delta::T,
+        delta_hat::T,
+        delta_min::T,
+        eta::T,
+        rho_lower::T,
+        rho_upper::T,
+        use_fg::Bool,
+    ) where {T<:Real}
+        if !(delta_hat > 0)
+            throw(DomainError(delta_hat, "maximum trust region radius must be positive"))
+        end
+        if !(0 < initial_delta < delta_hat)
+            throw(DomainError(initial_delta, LazyString("initial trust region radius must be positive and below the maximum trust region radius (", delta_hat, ")")))
+        end
+        if !(delta_min >= 0)
+            throw(DomainError(delta_min, "smallest allowable trust region radius must be non-negative"))
+        end
+        if !(eta >= 0)
+            throw(DomainError(eta, "minimum threshold of actual and predicted reduction for accepting a step must be non-negative"))
+        end
+        if !(rho_lower > eta)
+            throw(DomainError(rho_lower, LazyString("maximum threshold of actual and predicted reduction for shrinking the trust region must be greater than the minimum threshold for accepting a step (", eta, ")")))
+        end
+        if !(rho_upper > rho_lower)
+            throw(DomainError(rho_upper, LazyString("minimum threshold of actual and predicted reduction for growing the trust region must be greater than the minimum threshold for shrinking it (", rho_lower, ")")))
+        end
+
+        return new{T}(initial_delta, delta_hat, delta_min, eta, rho_lower, rho_upper, use_fg)
+    end
 end

 """
@@ -235,13 +266,13 @@ NewtonTrustRegion(; initial_delta = 1.0,
                   use_fg = true)
 ```

-The constructor has 5 keywords:
-* `initial_delta`, the starting trust region radius. Defaults to `1.0`.
+The constructor has 7 keywords:
+* `initial_delta`, the initial trust region radius. Defaults to `1.0`.
 * `delta_hat`, the largest allowable trust region radius. Defaults to `100.0`.
-* `delta_min`, the smallest alowable trust region radius. Optimization halts if the updated radius is smaller than this value. Defaults to `sqrt(eps(Float64))`.
-* `eta`, when `rho` is at least `eta`, accept the step. Defaults to `0.1`.
-* `rho_lower`, when `rho` is less than `rho_lower`, shrink the trust region. Defaults to `0.25`.
-* `rho_upper`, when `rho` is greater than `rho_upper`, grow the trust region. Defaults to `0.75`.
+* `delta_min`, the smallest allowable trust region radius. Optimization halts if the updated radius is smaller than this value. Defaults to `sqrt(eps(Float64))`.
+* `eta`, when the ratio of actual and predicted reduction is greater than `eta`, accept the step. Defaults to `0.1`.
+* `rho_lower`, when the ratio of actual and predicted reduction is less than `rho_lower`, shrink the trust region. Defaults to `0.25`.
+* `rho_upper`, when the ratio of actual and predicted reduction is greater than `rho_upper` and the proposed step is at the boundary of the trust region, grow the trust region. Defaults to `0.75`.
 * `use_fg`, when true always evaluate the gradient with the value after solving the subproblem. This is more efficient if f and g share expensive computations. Defaults to `true`.

 ## Description
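
A hypothetical usage sketch of the keyword constructor (objective and starting point made up for illustration; assumes the usual `Optim.optimize` entry point):

```julia
using Optim

# Accept steps earlier and require closer model agreement before growing:
method = NewtonTrustRegion(initial_delta = 0.5, eta = 0.05, rho_upper = 0.9)
result = optimize(x -> sum(abs2, x), [1.0, 2.0], method)
```
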
@@ -257,23 +288,17 @@ trust-region methods in practice.
 ## References
 - Nocedal, J., & Wright, S. (2006). Numerical optimization. Springer Science & Business Media.
 """
-NewtonTrustRegion(;
+function NewtonTrustRegion(;
     initial_delta::Real = 1.0,
     delta_hat::Real = 100.0,
     delta_min::Real = sqrt(eps(Float64)),
     eta::Real = 0.1,
     rho_lower::Real = 0.25,
     rho_upper::Real = 0.75,
-    use_fg = true,
-) = NewtonTrustRegion(
-    initial_delta,
-    delta_hat,
-    delta_min,
-    eta,
-    rho_lower,
-    rho_upper,
-    use_fg,
+    use_fg::Bool = true,
 )
+    NewtonTrustRegion(promote(initial_delta, delta_hat, delta_min, eta, rho_lower, rho_upper)..., use_fg)
+end

 Base.summary(io::IO, ::NewtonTrustRegion) = print(io, "Newton's Method (Trust Region)")
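
The keyword constructor now funnels through `promote`, so mixed `Real` argument types are unified before they reach the typed inner constructor:

```julia
# promote unifies the numeric arguments to a common type:
promote(1, 100.0, 1e-8, 0.1, 0.25, 0.75)  # six Float64 values

# so an Int keyword still yields a Float64-parameterized method:
NewtonTrustRegion(initial_delta = 1) isa NewtonTrustRegion{Float64}  # true
```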

@@ -283,6 +308,8 @@ mutable struct NewtonTrustRegionState{Tx,T,G} <: AbstractOptimizerState
     g_previous::G
     f_x_previous::T
     s::Tx
+    x_cache::Tx
+    g_cache::G
     hard_case::Bool
     reached_subproblem_solution::Bool
     interior::Bool
@@ -295,12 +322,6 @@ end
 function initial_state(method::NewtonTrustRegion, options, d, initial_x)
     T = eltype(initial_x)
     n = length(initial_x)
-    # Maintain current gradient in gr
-    @assert(method.delta_hat > 0, "delta_hat must be strictly positive")
-    @assert(0 < method.initial_delta < method.delta_hat, "delta must be in (0, delta_hat)")
-    @assert(0 <= method.eta < method.rho_lower, "eta must be in [0, rho_lower)")
-    @assert(method.rho_lower < method.rho_upper, "must have rho_lower < rho_upper")
-    @assert(method.rho_lower >= 0.0)
     # Keep track of trust region sizes
     delta = copy(method.initial_delta)

@@ -318,6 +339,8 @@ function initial_state(method::NewtonTrustRegion, options, d, initial_x)
         copy(gradient(d)), # Store previous gradient in state.g_previous
         T(NaN), # Store previous f in state.f_x_previous
         similar(initial_x), # Maintain current search direction in state.s
+        fill!(similar(initial_x), NaN), # For resetting the state if a step is rejected
+        fill!(method.use_fg ? similar(gradient(d)) : empty(gradient(d)), NaN), # For resetting the gradient if a step is rejected
         hard_case,
         reached_subproblem_solution,
         interior,
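
The two cache slots start as `NaN` sentinels; when `use_fg` is false the gradient cache is a zero-length placeholder, since it is never read in that case. A sketch of the two initializers:

```julia
x = [1.0, 2.0]
x_cache = fill!(similar(x), NaN)  # NaN sentinels until the first step
g_cache = empty(x)                # Float64[]: unused when use_fg is false
```
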
@@ -336,21 +359,21 @@ function update_state!(d, state::NewtonTrustRegionState, method::NewtonTrustRegi
     solve_tr_subproblem!(gradient(d), NLSolversBase.hessian(d), state.delta, state.s)

     # Maintain a record of previous position
-    copyto!(state.x_previous, state.x)
-    state.f_x_previous = value(d)
+    copyto!(state.x_cache, state.x)
+    f_cache = value(d)

     # Update current position
     state.x .+= state.s
     # Update the function value and gradient
     if method.use_fg
-        state.g_previous .= gradient(d)
+        copyto!(state.g_cache, gradient(d))
         value_gradient!(d, state.x)
     else
         value!(d, state.x)
     end
     # Update the trust region size based on the discrepancy between
     # the predicted and actual function values. (Algorithm 4.1 in N&W (2006))
-    f_x_diff = state.f_x_previous - value(d)
+    f_x_diff = f_cache - value(d)
     if abs(m) <= eps(T)
         # This should only happen when the step is very small, in which case
         # we should accept the step and assess_convergence().
@@ -361,38 +384,45 @@ function update_state!(d, state::NewtonTrustRegionState, method::NewtonTrustRegi
         # region.
         state.rho = -1.0
     else
-        state.rho = f_x_diff / (0 - m)
+        state.rho = f_x_diff / (-m)
     end

-    if state.rho < method.rho_lower
-        state.delta *= 0.25
-    elseif (state.rho > method.rho_upper) && (!state.interior)
-        state.delta = min(2 * state.delta, method.delta_hat)
-    else
-        # else leave delta unchanged.
-    end
-    if state.rho <= state.eta
+    # The step is accepted if the ratio is greater than eta
+    accept_step = state.rho > state.eta
+
+    # Update trust region radius
+    if !accept_step
         # The improvement is too small and we won't take it.
         # If you reject an interior solution, make sure that the next
-        # delta is smaller than the current step. Otherwise you waste
+        # delta is smaller than the current step (state.s). Otherwise you waste
         # steps reducing delta by constant factors while each solution
         # will be the same. If this keeps on happening it could be a sign of
         # errors in the gradient or a non-differentiability at the optimum.
-        x_diff = state.x - state.x_previous
-        state.delta = 0.25 * norm(x_diff)
+        state.delta = norm(state.s) / 4
+    elseif state.rho < method.rho_lower
+        state.delta /= 4
+    elseif (state.rho > method.rho_upper) && !state.interior
+        state.delta = min(2 * state.delta, method.delta_hat)
+    end

-        d.F = state.f_x_previous
-        copyto!(state.x, state.x_previous)
-        if method.use_fg
-            copyto!(d.DF, state.g_previous)
-            copyto!(d.x_df, state.x_previous)
-        end
-    else
+    # Update/reset gradients and function values
+    if accept_step
         if method.use_fg
             hessian!(d, state.x)
         else
             NLSolversBase.gradient_hessian!!(d, state.x)
         end
+    else
+        # Reset state
+        copyto!(state.x, state.x_cache)
+
+        # Reset objective function
+        copyto!(d.x_f, state.x_cache)
+        d.F = f_cache
+        if method.use_fg
+            copyto!(d.x_df, state.x_cache)
+            copyto!(d.DF, state.g_cache)
+        end
     end

     false
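
For intuition (not part of the diff): `state.rho` compares the actual reduction `f_cache - value(d)` with the reduction `-m` predicted by the quadratic model, per Algorithm 4.1 in Nocedal & Wright (2006). A standalone sketch with made-up numbers:

```julia
f_cache = 10.0  # f(x) before the step
f_new = 9.2     # f(x + s) after the step
m = -1.0        # predicted model change: dot(g, s) + dot(s, H, s) / 2

rho = (f_cache - f_new) / (-m)  # 0.8

eta, rho_lower, rho_upper = 0.1, 0.25, 0.75
accept_step = rho > eta  # true: keep the new point
# rho > rho_upper as well, so if the step lay on the trust region
# boundary, the radius would be doubled (capped at delta_hat).
```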
