
Commit 9290b55

Improve NewtonTrustRegion (#1215)

* Fix implementation of NLSolversBase API (#1213) (cherry picked from commit 2f1e619)
* Improve `NewtonTrustRegion`

1 parent 9580452

1 file changed: +82 −52 lines

src/multivariate/solvers/second_order/newton_trust_region.jl (82 additions, 52 deletions)
@@ -40,7 +40,7 @@ end
 function calc_p!(lambda::T, min_i, n, qg, H_eig, p) where {T}
     fill!(p, zero(T))
     for i = min_i:n
-        p[:] -= qg[i] / (H_eig.values[i] + lambda) * H_eig.vectors[:, i]
+        LinearAlgebra.axpy!(-qg[i] / (H_eig.values[i] + lambda), view(H_eig.vectors, :, i), p)
     end
     return nothing
 end
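
For reference (not part of the diff): `LinearAlgebra.axpy!(a, x, y)` overwrites `y` with `a*x + y` in place, so the rewritten loop adds each eigenvector contribution without the temporaries that the broadcast form allocates. A minimal sketch:

```julia
using LinearAlgebra

# axpy!(a, x, y) computes y .= a .* x .+ y in place, allocating nothing;
# the old broadcast form materialized both the column slice and the
# scaled copy on every loop iteration.
p = zeros(3)
v = [1.0, 2.0, 3.0]
axpy!(-2.0, v, p)
p == [-2.0, -4.0, -6.0]  # true
```
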
@@ -59,7 +59,7 @@ function initial_safeguards(H, gr, delta, lambda)
     λL = max(T(0), λS, gr_norm / delta - Hnorm)
     λU = gr_norm / delta + Hnorm
     # p. 558
-    lambda = min(max(lambda, λL), λU)
+    lambda = clamp(lambda, λL, λU)
     if lambda ≤ λS
         lambda = max(T(1) / 1000 * λU, sqrt(λL * λU))
     end
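
`clamp(x, lo, hi)` is the standard-library spelling of the nested `min`/`max` it replaces here:

```julia
# clamp(x, lo, hi) == min(max(x, lo), hi)
clamp(0.5, 1.0, 2.0)  # 1.0 (below the interval)
clamp(1.5, 1.0, 2.0)  # 1.5 (inside)
clamp(2.5, 1.0, 2.0)  # 2.0 (above)
```
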
@@ -160,7 +160,7 @@ function solve_tr_subproblem!(gr, H, delta, s; tolerance = 1e-10, max_iters = 5)
         # I don't think it matters which eigenvector we pick so take
         # the first.
         calc_p!(lambda, min_i, n, qg, H_eig, s)
-        s[:] = -s + tau * H_eig.vectors[:, 1]
+        LinearAlgebra.axpby!(tau, view(H_eig.vectors, :, 1), -1, s)
     end
 end
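
Similarly, `LinearAlgebra.axpby!(a, x, b, y)` overwrites `y` with `a*x + b*y`, so with `b = -1` it computes `tau * v - s` in place of the allocating `s[:] = -s + tau * ...`. A minimal sketch:

```julia
using LinearAlgebra

s = [1.0, 2.0]
v1 = [1.0, 0.0]
axpby!(3.0, v1, -1, s)  # s = 3 * v1 - s, updated in place
s == [2.0, -2.0]        # true
```
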

@@ -174,8 +174,8 @@ function solve_tr_subproblem!(gr, H, delta, s; tolerance = 1e-10, max_iters = 5)
     for iter = 1:max_iters
         lambda_previous = lambda

-        for i = 1:n
-            H_ridged[i, i] = H[i, i] + lambda
+        for i in diagind(H_ridged)
+            H_ridged[i] = H[i] + lambda
         end

         F = cholesky(Hermitian(H_ridged), check = false)
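
`diagind` returns the linear indices of the main diagonal, so a single index `i` addresses `H[i, i]` and the ridge update no longer needs the `(i, i)` pairs. A minimal sketch:

```julia
using LinearAlgebra

H = [1.0 2.0; 3.0 4.0]
collect(diagind(H))    # [1, 4]: linear indices of the main diagonal
H[diagind(H)] .+= 0.5  # equivalent ridge update in one expression
```
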
@@ -196,7 +196,7 @@ function solve_tr_subproblem!(gr, H, delta, s; tolerance = 1e-10, max_iters = 5)
         # Check that lambda is not less than lambda_lb, and if so, go
         # half the way to lambda_lb.
         if lambda < lambda_lb
-            lambda = 0.5 * (lambda_previous - lambda_lb) + lambda_lb
+            lambda = (lambda_previous + lambda_lb) / 2
         end

         if abs(lambda - lambda_previous) < tolerance
@@ -207,7 +207,7 @@ function solve_tr_subproblem!(gr, H, delta, s; tolerance = 1e-10, max_iters = 5)
         end
     end

-    m = dot(gr, s) + 0.5 * dot(s, H * s)
+    m = dot(gr, s) + dot(s, H, s) / 2

     return m, interior, lambda, hard_case, reached_solution
 end
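
The three-argument `dot(x, A, y)` evaluates `x' * A * y` without materializing the intermediate vector `A * y`:

```julia
using LinearAlgebra

s = [1.0, 2.0]
H = [2.0 0.0; 0.0 3.0]
dot(s, H, s)                   # 14.0, no temporary H * s allocated
dot(s, H, s) == dot(s, H * s)  # true
```
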
@@ -220,6 +220,37 @@ struct NewtonTrustRegion{T<:Real} <: SecondOrderOptimizer
     rho_lower::T
     rho_upper::T
     use_fg::Bool
+
+    function NewtonTrustRegion(
+        initial_delta::T,
+        delta_hat::T,
+        delta_min::T,
+        eta::T,
+        rho_lower::T,
+        rho_upper::T,
+        use_fg::Bool,
+    ) where {T<:Real}
+        if !(delta_hat > 0)
+            throw(DomainError(delta_hat, "maximum trust region radius must be positive"))
+        end
+        if !(0 < initial_delta < delta_hat)
+            throw(DomainError(initial_delta, LazyString("initial trust region radius must be positive and below the maximum trust region radius (", delta_hat, ")")))
+        end
+        if !(delta_min >= 0)
+            throw(DomainError(delta_min, "smallest allowable trust region radius must be non-negative"))
+        end
+        if !(eta >= 0)
+            throw(DomainError(eta, "minimum threshold of actual and predicted reduction for accepting a step must be non-negative"))
+        end
+        if !(rho_lower > eta)
+            throw(DomainError(rho_lower, LazyString("maximum threshold of actual and predicted reduction for shrinking the trust region must be greater than the minimum threshold for accepting a step (", eta, ")")))
+        end
+        if !(rho_upper > rho_lower)
+            throw(DomainError(rho_upper, LazyString("minimum threshold of actual and predicted reduction for growing the trust region must be greater than the minimum threshold for shrinking it (", rho_lower, ")")))
+        end
+
+        return new{T}(initial_delta, delta_hat, delta_min, eta, rho_lower, rho_upper, use_fg)
+    end
 end

 """
@@ -235,13 +266,13 @@ NewtonTrustRegion(; initial_delta = 1.0,
                   use_fg = true)
 ```

-The constructor has 5 keywords:
-* `initial_delta`, the starting trust region radius. Defaults to `1.0`.
+The constructor has 7 keywords:
+* `initial_delta`, the initial trust region radius. Defaults to `1.0`.
 * `delta_hat`, the largest allowable trust region radius. Defaults to `100.0`.
-* `delta_min`, the smallest alowable trust region radius. Optimization halts if the updated radius is smaller than this value. Defaults to `sqrt(eps(Float64))`.
-* `eta`, when `rho` is at least `eta`, accept the step. Defaults to `0.1`.
-* `rho_lower`, when `rho` is less than `rho_lower`, shrink the trust region. Defaults to `0.25`.
-* `rho_upper`, when `rho` is greater than `rho_upper`, grow the trust region. Defaults to `0.75`.
+* `delta_min`, the smallest allowable trust region radius. Optimization halts if the updated radius is smaller than this value. Defaults to `sqrt(eps(Float64))`.
+* `eta`, when the ratio of actual and predicted reduction is greater than `eta`, accept the step. Defaults to `0.1`.
+* `rho_lower`, when the ratio of actual and predicted reduction is less than `rho_lower`, shrink the trust region. Defaults to `0.25`.
+* `rho_upper`, when the ratio of actual and predicted reduction is greater than `rho_upper` and the proposed step is at the boundary of the trust region, grow the trust region. Defaults to `0.75`.
 * `use_fg`, when true always evaluate the gradient with the value after solving the subproblem. This is more efficient if f and g share expensive computations. Defaults to `true`.

 ## Description
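
A hypothetical usage sketch of the keyword constructor (objective and starting point made up for illustration; assumes the usual `Optim.optimize` entry point):

```julia
using Optim

# Accept steps earlier and require closer model agreement before growing:
method = NewtonTrustRegion(initial_delta = 0.5, eta = 0.05, rho_upper = 0.9)
result = optimize(x -> sum(abs2, x), [1.0, 2.0], method)
```
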
@@ -257,23 +288,17 @@ trust-region methods in practice.
 ## References
 - Nocedal, J., & Wright, S. (2006). Numerical optimization. Springer Science & Business Media.
 """
-NewtonTrustRegion(;
+function NewtonTrustRegion(;
     initial_delta::Real = 1.0,
     delta_hat::Real = 100.0,
     delta_min::Real = sqrt(eps(Float64)),
     eta::Real = 0.1,
     rho_lower::Real = 0.25,
     rho_upper::Real = 0.75,
-    use_fg = true,
-) = NewtonTrustRegion(
-    initial_delta,
-    delta_hat,
-    delta_min,
-    eta,
-    rho_lower,
-    rho_upper,
-    use_fg,
+    use_fg::Bool = true,
 )
+    NewtonTrustRegion(promote(initial_delta, delta_hat, delta_min, eta, rho_lower, rho_upper)..., use_fg)
+end

 Base.summary(io::IO, ::NewtonTrustRegion) = print(io, "Newton's Method (Trust Region)")
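
The keyword constructor now funnels through `promote`, so mixed `Real` argument types are unified before they reach the typed inner constructor:

```julia
# promote unifies the numeric arguments to a common type:
promote(1, 100.0, 1e-8, 0.1, 0.25, 0.75)  # six Float64 values

# so an Int keyword still yields a Float64-parameterized method:
NewtonTrustRegion(initial_delta = 1) isa NewtonTrustRegion{Float64}  # true
```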

@@ -283,6 +308,8 @@ mutable struct NewtonTrustRegionState{Tx,T,G} <: AbstractOptimizerState
     g_previous::G
     f_x_previous::T
     s::Tx
+    x_cache::Tx
+    g_cache::G
     hard_case::Bool
     reached_subproblem_solution::Bool
     interior::Bool
@@ -295,12 +322,6 @@ end
 function initial_state(method::NewtonTrustRegion, options, d, initial_x)
     T = eltype(initial_x)
     n = length(initial_x)
-    # Maintain current gradient in gr
-    @assert(method.delta_hat > 0, "delta_hat must be strictly positive")
-    @assert(0 < method.initial_delta < method.delta_hat, "delta must be in (0, delta_hat)")
-    @assert(0 <= method.eta < method.rho_lower, "eta must be in [0, rho_lower)")
-    @assert(method.rho_lower < method.rho_upper, "must have rho_lower < rho_upper")
-    @assert(method.rho_lower >= 0.0)
     # Keep track of trust region sizes
     delta = copy(method.initial_delta)

@@ -318,6 +339,8 @@ function initial_state(method::NewtonTrustRegion, options, d, initial_x)
         copy(gradient(d)), # Store previous gradient in state.g_previous
         T(NaN), # Store previous f in state.f_x_previous
         similar(initial_x), # Maintain current search direction in state.s
+        fill!(similar(initial_x), NaN), # For resetting the state if a step is rejected
+        fill!(method.use_fg ? similar(gradient(d)) : empty(gradient(d)), NaN), # For resetting the gradient if a step is rejected
         hard_case,
         reached_subproblem_solution,
         interior,
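
The two cache slots start as `NaN` sentinels; when `use_fg` is false the gradient cache is a zero-length placeholder, since it is never read in that case. A sketch of the two initializers:

```julia
x = [1.0, 2.0]
x_cache = fill!(similar(x), NaN)  # NaN sentinels until the first step
g_cache = empty(x)                # Float64[]: unused when use_fg is false
```
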
@@ -336,21 +359,21 @@ function update_state!(d, state::NewtonTrustRegionState, method::NewtonTrustRegi
     solve_tr_subproblem!(gradient(d), NLSolversBase.hessian(d), state.delta, state.s)

     # Maintain a record of previous position
-    copyto!(state.x_previous, state.x)
-    state.f_x_previous = value(d)
+    copyto!(state.x_cache, state.x)
+    f_cache = value(d)

     # Update current position
     state.x .+= state.s
     # Update the function value and gradient
     if method.use_fg
-        state.g_previous .= gradient(d)
+        copyto!(state.g_cache, gradient(d))
         value_gradient!(d, state.x)
     else
         value!(d, state.x)
     end
     # Update the trust region size based on the discrepancy between
     # the predicted and actual function values. (Algorithm 4.1 in N&W (2006))
-    f_x_diff = state.f_x_previous - value(d)
+    f_x_diff = f_cache - value(d)
     if abs(m) <= eps(T)
         # This should only happen when the step is very small, in which case
         # we should accept the step and assess_convergence().
@@ -361,38 +384,45 @@ function update_state!(d, state::NewtonTrustRegionState, method::NewtonTrustRegi
         # region.
         state.rho = -1.0
     else
-        state.rho = f_x_diff / (0 - m)
+        state.rho = f_x_diff / (-m)
     end

-    if state.rho < method.rho_lower
-        state.delta *= 0.25
-    elseif (state.rho > method.rho_upper) && (!state.interior)
-        state.delta = min(2 * state.delta, method.delta_hat)
-    else
-        # else leave delta unchanged.
-    end
-    if state.rho <= state.eta
+    # The step is accepted if the ratio is greater than eta
+    accept_step = state.rho > state.eta
+
+    # Update trust region radius
+    if !accept_step
         # The improvement is too small and we won't take it.
         # If you reject an interior solution, make sure that the next
-        # delta is smaller than the current step. Otherwise you waste
+        # delta is smaller than the current step (state.s). Otherwise you waste
         # steps reducing delta by constant factors while each solution
         # will be the same. If this keeps on happening it could be a sign of
         # errors in the gradient or a non-differentiability at the optimum.
-        x_diff = state.x - state.x_previous
-        state.delta = 0.25 * norm(x_diff)
+        state.delta = norm(state.s) / 4
+    elseif state.rho < method.rho_lower
+        state.delta /= 4
+    elseif (state.rho > method.rho_upper) && !state.interior
+        state.delta = min(2 * state.delta, method.delta_hat)
+    end

-        d.F = state.f_x_previous
-        copyto!(state.x, state.x_previous)
-        if method.use_fg
-            copyto!(d.DF, state.g_previous)
-            copyto!(d.x_df, state.x_previous)
-        end
-    else
+    # Update/reset gradients and function values
+    if accept_step
         if method.use_fg
             hessian!(d, state.x)
         else
             NLSolversBase.gradient_hessian!!(d, state.x)
         end
+    else
+        # Reset state
+        copyto!(state.x, state.x_cache)
+
+        # Reset objective function
+        copyto!(d.x_f, state.x_cache)
+        d.F = f_cache
+        if method.use_fg
+            copyto!(d.x_df, state.x_cache)
+            copyto!(d.DF, state.g_cache)
+        end
     end

     false
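
For intuition (not part of the diff): `state.rho` compares the actual reduction `f_cache - value(d)` with the reduction `-m` predicted by the quadratic model, per Algorithm 4.1 in Nocedal & Wright (2006). A standalone sketch with made-up numbers:

```julia
f_cache = 10.0  # f(x) before the step
f_new = 9.2     # f(x + s) after the step
m = -1.0        # predicted model change: dot(g, s) + dot(s, H, s) / 2

rho = (f_cache - f_new) / (-m)  # 0.8

eta, rho_lower, rho_upper = 0.1, 0.25, 0.75
accept_step = rho > eta  # true: keep the new point
# rho > rho_upper as well, so if the step lay on the trust region
# boundary, the radius would be doubled (capped at delta_hat).
```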
