From 618263266cd1677c7d2f021a3e9ea000b77c951f Mon Sep 17 00:00:00 2001 From: David Widmann Date: Wed, 26 Nov 2025 11:20:02 +0100 Subject: [PATCH] Do not (mis)use objective as state --- Project.toml | 2 +- src/Manifolds.jl | 11 +- src/Optim.jl | 3 - src/api.jl | 9 +- src/multivariate/optimize/optimize.jl | 110 ++++---- .../solvers/constrained/fminbox.jl | 237 ++++++++---------- .../solvers/constrained/ipnewton/interior.jl | 130 +++++----- .../solvers/constrained/ipnewton/ipnewton.jl | 159 ++++++------ .../ipnewton/utilities/assess_convergence.jl | 6 +- .../constrained/ipnewton/utilities/trace.jl | 10 +- src/multivariate/solvers/constrained/samin.jl | 84 +++---- .../accelerated_gradient_descent.jl | 33 +-- src/multivariate/solvers/first_order/adam.jl | 101 ++++---- .../solvers/first_order/adamax.jl | 103 ++++---- src/multivariate/solvers/first_order/bfgs.jl | 84 ++++--- src/multivariate/solvers/first_order/cg.jl | 66 +++-- .../solvers/first_order/gradient_descent.jl | 45 ++-- .../solvers/first_order/l_bfgs.jl | 67 ++--- .../first_order/momentum_gradient_descent.jl | 29 ++- .../solvers/first_order/ngmres.jl | 110 ++++---- .../second_order/krylov_trust_region.jl | 44 ++-- .../solvers/second_order/newton.jl | 52 ++-- .../second_order/newton_trust_region.jl | 94 ++++--- .../solvers/zeroth_order/nelder_mead.jl | 38 +-- .../solvers/zeroth_order/particle_swarm.jl | 27 +- .../zeroth_order/simulated_annealing.jl | 55 ++-- .../solvers/zeroth_order/zeroth_utils.jl | 6 +- src/types.jl | 4 +- src/utilities/assess_convergence.jl | 79 ++---- src/utilities/generic.jl | 4 +- src/utilities/perform_linesearch.jl | 26 +- src/utilities/trace.jl | 8 +- test/general/convergence.jl | 30 ++- test/general/objective_types.jl | 66 ++--- .../constrained/ipnewton/constraints.jl | 87 ++++--- .../solvers/first_order/ngmres.jl | 4 +- 36 files changed, 1037 insertions(+), 986 deletions(-) diff --git a/Project.toml b/Project.toml index 5bac458c1..33ff5a79a 100644 --- a/Project.toml +++ b/Project.toml @@ -30,7 +30,7 @@ ExplicitImports = "1.13.2" FillArrays = "0.6.2, 0.7, 0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1" ForwardDiff = "0.10, 1" JET = "0.9, 0.10" -LineSearches = "7.4.0" +LineSearches = "7.5.1" LinearAlgebra = "<0.0.1, 1.6" MathOptInterface = "1.17" Measurements = "2.14.1" diff --git a/src/Manifolds.jl b/src/Manifolds.jl index 8d02b553a..b016dd629 100644 --- a/src/Manifolds.jl +++ b/src/Manifolds.jl @@ -28,16 +28,7 @@ end # TODO: is it safe here to call retract! and change x? function NLSolversBase.value!(obj::ManifoldObjective, x) xin = retract(obj.manifold, x) - value!(obj.inner_obj, xin) -end -function NLSolversBase.value(obj::ManifoldObjective) - value(obj.inner_obj) -end -function NLSolversBase.gradient(obj::ManifoldObjective) - gradient(obj.inner_obj) -end -function NLSolversBase.gradient(obj::ManifoldObjective, i::Int) - gradient(obj.inner_obj, i) + return value!(obj.inner_obj, xin) end function NLSolversBase.gradient!(obj::ManifoldObjective, x) xin = retract(obj.manifold, x) diff --git a/src/Optim.jl b/src/Optim.jl index a311e1af9..e5be7198c 100644 --- a/src/Optim.jl +++ b/src/Optim.jl @@ -41,10 +41,7 @@ using NLSolversBase: TwiceDifferentiableConstraints, nconstraints, nconstraints_x, - hessian, hessian!, - hessian!!, - hv_product, hv_product! 
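Stepping back to the Manifolds.jl change above: the removed no-argument accessors (`value(obj)`, `gradient(obj)`) read cached values out of the objective, which is exactly the objective-as-state pattern this patch retires; the retained mutating methods all follow the same retract-then-delegate shape. A minimal, self-contained sketch of that shape (`UnitSphere`, `ToyObjective`, and `ToyManifoldObjective` are illustrative stand-ins, not Optim's actual types):

    using LinearAlgebra

    # Toy manifold: the retraction projects an ambient point onto the unit sphere
    struct UnitSphere end
    retract(::UnitSphere, x) = x ./ norm(x)

    # Toy objective that caches its last value, as NLSolversBase objectives do
    mutable struct ToyObjective{F}
        f::F
        F_cache::Float64
    end
    value!(obj::ToyObjective, x) = (obj.F_cache = obj.f(x); obj.F_cache)

    # Same shape as NLSolversBase.value!(obj::ManifoldObjective, x) above:
    # retract first, then delegate to the inner objective
    struct ToyManifoldObjective{M,O}
        manifold::M
        inner_obj::O
    end
    function value!(obj::ToyManifoldObjective, x)
        xin = retract(obj.manifold, x)
        return value!(obj.inner_obj, xin)
    end

    obj = ToyManifoldObjective(UnitSphere(), ToyObjective(sum, NaN))
    value!(obj, [3.0, 4.0])  # evaluates sum at the retracted point [0.6, 0.8]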
# var for NelderMead diff --git a/src/api.jl b/src/api.jl index 72e0eca7b..467488ff4 100644 --- a/src/api.jl +++ b/src/api.jl @@ -100,18 +100,15 @@ g_norm_trace(r::OptimizationResults) = g_norm_trace(r::MultivariateOptimizationResults) = [state.g_norm for state in trace(r)] f_calls(r::OptimizationResults) = r.f_calls -f_calls(d) = first(d.f_calls) +f_calls(d::AbstractObjective) = NLSolversBase.f_calls(d) g_calls(r::OptimizationResults) = error("g_calls is not implemented for $(summary(r)).") g_calls(r::MultivariateOptimizationResults) = r.g_calls -g_calls(d::NonDifferentiable) = 0 -g_calls(d) = first(d.df_calls) +g_calls(d::AbstractObjective) = NLSolversBase.g_calls(d) h_calls(r::OptimizationResults) = error("h_calls is not implemented for $(summary(r)).") h_calls(r::MultivariateOptimizationResults) = r.h_calls -h_calls(d::Union{NonDifferentiable,OnceDifferentiable}) = 0 -h_calls(d) = first(d.h_calls) -h_calls(d::TwiceDifferentiableHV) = first(d.hv_calls) +h_calls(d::AbstractObjective) = NLSolversBase.h_calls(d) + NLSolversBase.hv_calls(d) converged(r::UnivariateOptimizationResults) = r.stopped_by.converged function converged(r::MultivariateOptimizationResults) diff --git a/src/multivariate/optimize/optimize.jl b/src/multivariate/optimize/optimize.jl index 73498df30..6af2533c6 100644 --- a/src/multivariate/optimize/optimize.jl +++ b/src/multivariate/optimize/optimize.jl @@ -1,37 +1,42 @@ -update_g!(d, state, method) = nothing -function update_g!(d, state, method::FirstOrderOptimizer) - # Update the function value and gradient - value_gradient!(d, state.x) - project_tangent!(method.manifold, gradient(d), state.x) +# Update function value, gradient and Hessian +function update_fgh!(d, state, ::ZerothOrderOptimizer) + f_x = value!(d, state.x) + state.f_x = f_x + return nothing end -function update_g!(d, state, method::Newton) - # Update the function value and gradient - value_gradient!(d, state.x) -end -update_fg!(d, state, method) = nothing -update_fg!(d, state, method::ZerothOrderOptimizer) = value!(d, state.x) -function update_fg!(d, state, method::FirstOrderOptimizer) - value_gradient!(d, state.x) - project_tangent!(method.manifold, gradient(d), state.x) +function update_fgh!(d, state, method::FirstOrderOptimizer) + f_x, g_x = value_gradient!(d, state.x) + if hasproperty(method, :manifold) + project_tangent!(method.manifold, g_x, state.x) + end + state.f_x = f_x + copyto!(state.g_x, g_x) + return nothing end -function update_fg!(d, state, method::Newton) - value_gradient!(d, state.x) +function update_fgh!(d, state, method::SecondOrderOptimizer) + # Manifold optimization is currently not supported for second order optimization algorithms + @assert !hasproperty(method, :manifold) + + # TODO: Switch to `value_gradient_hessian!` when it becomes available + f_x, g_x = value_gradient!(d, state.x) + H_x = hessian!(d, state.x) + state.f_x = f_x + copyto!(state.g_x, g_x) + copyto!(state.H_x, H_x) + + return nothing end -# Update the Hessian -update_h!(d, state, method) = nothing -update_h!(d, state, method::SecondOrderOptimizer) = hessian!(d, state.x) - after_while!(d, state, method, options) = nothing -function initial_convergence(d, state, method::AbstractOptimizer, initial_x, options) - gradient!(d, initial_x) - stopped = !isfinite(value(d)) || any(!isfinite, gradient(d)) - g_residual(d, state) <= options.g_abstol, stopped +function initial_convergence(state::AbstractOptimizerState, options::Options) + stopped = !isfinite(state.f_x) || any(!isfinite, state.g_x) + return g_residual(state) <= 
options.g_abstol, stopped end -function initial_convergence(d, state, method::ZerothOrderOptimizer, initial_x, options) +function initial_convergence(::ZerothOrderState, ::Options) false, false end + function optimize( d::D, initial_x::Tx, @@ -41,7 +46,7 @@ function optimize( ) where {D<:AbstractObjective,M<:AbstractOptimizer,Tx<:AbstractArray,T,TCallback} t0 = time() # Initial time stamp used to control early stopping by options.time_limit - tr = OptimizationTrace{typeof(value(d)),typeof(method)}() + tr = OptimizationTrace{typeof(state.f_x),typeof(method)}() tracing = options.store_trace || options.show_trace || @@ -51,7 +56,7 @@ function optimize( f_limit_reached, g_limit_reached, h_limit_reached = false, false, false x_converged, f_converged, f_increased, counter_f_tol = false, false, false, 0 - g_converged, stopped = initial_convergence(d, state, method, initial_x, options) + g_converged, stopped = initial_convergence(state, options) converged = g_converged || stopped # prepare iteration counter (used to make "initial state" trace entry) iteration = 0 @@ -62,22 +67,29 @@ function optimize( ls_success::Bool = true while !converged && !stopped && iteration < options.iterations iteration += 1 + + # Convention: When `update_state!` is called, then `state` satisfies: + # - `state.x`: Current state + # - `state.f_x`: Objective function value of the current state, i.e. `d(state.x)` + # - `state.g_x` (if available): Gradient of the objective function at the current state, i.e. `gradient(d, state.x)` + # - `state.H_x` (if available): Hessian of the objective function at the current state, i.e. `hessian(d, state.x)` ls_success = !update_state!(d, state, method) if !ls_success break # it returns true if it's forced by something in update! to stop (e.g. dx_dg == 0.0 in BFGS, or linesearch errors) end - if !(method isa NewtonTrustRegion) - update_g!(d, state, method) # TODO: Should this be `update_fg!`? - end + + # Update function value, gradient and Hessian matrix (skipped by some methods that already update those in `update_state!`) + # TODO: Already perform in `update_state!`? + update_fgh!(d, state, method) + + # Check convergence x_converged, f_converged, g_converged, f_increased = assess_convergence(state, d, options) # For some problems it may be useful to require `f_converged` to be hit multiple times # TODO: Do the same for x_tol? counter_f_tol = f_converged ? counter_f_tol + 1 : 0 converged = x_converged || g_converged || (counter_f_tol > options.successive_f_tol) - if !(converged && method isa Newton) && !(method isa NewtonTrustRegion) - update_h!(d, state, method) # only relevant if not converged - end + if tracing # update trace; callbacks can stop routine early by returning true stopped_by_callback = @@ -113,11 +125,11 @@ function optimize( end end - if g_calls(d) > 0 && !all(isfinite, gradient(d)) + if hasproperty(state, :g_x) && !all(isfinite, state.g_x) options.show_warnings && @warn "Terminated early due to NaN in gradient." break end - if h_calls(d) > 0 && !(d isa TwiceDifferentiableHV) && !all(isfinite, hessian(d)) + if hasproperty(state, :H_x) && !all(isfinite, state.H_x) options.show_warnings && @warn "Terminated early due to NaN in Hessian."
break end @@ -127,7 +139,7 @@ function optimize( # we can just check minimum, as we've earlier enforced same types/eltypes # in variables besides the option settings - Tf = typeof(value(d)) + Tf = typeof(state.f_x) f_incr_pick = f_increased && !options.allow_f_increases stopped_by = (x_converged, f_converged, g_converged, f_limit_reached = f_limit_reached, @@ -141,7 +153,7 @@ function optimize( ) termination_code = - _termination_code(d, g_residual(d, state), state, stopped_by, options) + _termination_code(d, g_residual(state), state, stopped_by, options) return MultivariateOptimizationResults{ typeof(method), @@ -154,7 +166,7 @@ function optimize( method, initial_x, pick_best_x(f_incr_pick, state), - pick_best_f(f_incr_pick, state, d), + pick_best_f(f_incr_pick, state), iteration, Tf(options.x_abstol), Tf(options.x_reltol), @@ -162,10 +174,10 @@ function optimize( x_relchange(state), Tf(options.f_abstol), Tf(options.f_reltol), - f_abschange(d, state), - f_relchange(d, state), + f_abschange(state), + f_relchange(state), Tf(options.g_abstol), - g_residual(d, state), + g_residual(state), tr, f_calls(d), g_calls(d), @@ -186,13 +198,13 @@ function _termination_code(d, gres, state, stopped_by, options) elseif (iszero(options.x_abstol) && x_abschange(state) <= options.x_abstol) || (iszero(options.x_reltol) && x_relchange(state) <= options.x_reltol) TerminationCode.NoXChange - elseif (iszero(options.f_abstol) && f_abschange(d, state) <= options.f_abstol) || - (iszero(options.f_reltol) && f_relchange(d, state) <= options.f_reltol) + elseif (iszero(options.f_abstol) && f_abschange(state) <= options.f_abstol) || + (iszero(options.f_reltol) && f_relchange(state) <= options.f_reltol) TerminationCode.NoObjectiveChange elseif x_abschange(state) <= options.x_abstol || x_relchange(state) <= options.x_reltol TerminationCode.SmallXChange - elseif f_abschange(d, state) <= options.f_abstol || - f_relchange(d, state) <= options.f_reltol + elseif f_abschange(state) <= options.f_abstol || + f_relchange(state) <= options.f_reltol TerminationCode.SmallObjectiveChange elseif stopped_by.ls_failed TerminationCode.FailedLinesearch @@ -210,11 +222,11 @@ function _termination_code(d, gres, state, stopped_by, options) TerminationCode.HessianCalls elseif stopped_by.f_increased TerminationCode.ObjectiveIncreased - elseif f_calls(d) > 0 && !isfinite(value(d)) - TerminationCode.GradientNotFinite - elseif g_calls(d) > 0 && !all(isfinite, gradient(d)) + elseif !isfinite(state.f_x) + TerminationCode.ObjectiveNotFinite + elseif hasproperty(state, :g_x) && !all(isfinite, state.g_x) TerminationCode.GradientNotFinite - elseif h_calls(d) > 0 && !(d isa TwiceDifferentiableHV) && !all(isfinite, hessian(d)) + elseif hasproperty(state, :H_x) && !all(isfinite, state.H_x) TerminationCode.HessianNotFinite else TerminationCode.NotImplemented diff --git a/src/multivariate/solvers/constrained/fminbox.jl b/src/multivariate/solvers/constrained/fminbox.jl index 5ac0c0e54..9837ba79f 100644 --- a/src/multivariate/solvers/constrained/fminbox.jl +++ b/src/multivariate/solvers/constrained/fminbox.jl @@ -1,5 +1,5 @@ using NLSolversBase: - value, value!, value!!, gradient, gradient!, value_gradient!, value_gradient!! + value, value!, gradient!, value_gradient! 
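A recurring idiom in the new `optimize` loop above is interrogating the state object itself (`hasproperty(state, :g_x)`, `hasproperty(state, :H_x)`) rather than consulting call counters on the objective. A small self-contained sketch of that dispatch-by-field idiom; the toy state structs are assumptions for illustration, not Optim's actual types:

    # Zeroth-order states carry no gradient field; first-order states do
    struct ToyZerothState
        x::Vector{Float64}
        f_x::Float64
    end
    struct ToyFirstState
        x::Vector{Float64}
        f_x::Float64
        g_x::Vector{Float64}
    end

    # Same shape as the non-finite-gradient check in optimize.jl above
    bad_gradient(state) = hasproperty(state, :g_x) && !all(isfinite, state.g_x)

    bad_gradient(ToyZerothState([0.0], 1.0))        # false: no gradient stored
    bad_gradient(ToyFirstState([0.0], 1.0, [NaN]))  # true: non-finite gradient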
####### FIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIX THE MIDDLE OF BOX CASE THAT WAS THERE mutable struct BarrierWrapper{TO,TB,Tm,TF,TDF} <: AbstractObjective obj::TO @@ -10,13 +10,16 @@ mutable struct BarrierWrapper{TO,TB,Tm,TF,TDF} <: AbstractObjective DFb::TDF DFtotal::TDF end -f_calls(obj::BarrierWrapper) = f_calls(obj.obj) -g_calls(obj::BarrierWrapper) = g_calls(obj.obj) -h_calls(obj::BarrierWrapper) = h_calls(obj.obj) + +NLSolversBase.f_calls(obj::BarrierWrapper) = NLSolversBase.f_calls(obj.obj) +NLSolversBase.g_calls(obj::BarrierWrapper) = NLSolversBase.g_calls(obj.obj) +NLSolversBase.h_calls(obj::BarrierWrapper) = NLSolversBase.h_calls(obj.obj) +NLSolversBase.hv_calls(obj::BarrierWrapper) = NLSolversBase.hv_calls(obj.obj) + function BarrierWrapper(obj::NonDifferentiable, mu, lower, upper) barrier_term = BoxBarrier(lower, upper) - BarrierWrapper(obj, barrier_term, mu, copy(obj.F), copy(obj.F), nothing, nothing) + BarrierWrapper(obj, barrier_term, mu, oftype(obj.F, NaN), oftype(obj.F, NaN), nothing, nothing) end function BarrierWrapper(obj::OnceDifferentiable, mu, lower, upper) barrier_term = BoxBarrier(lower, upper) @@ -25,10 +28,10 @@ function BarrierWrapper(obj::OnceDifferentiable, mu, lower, upper) obj, barrier_term, mu, - copy(obj.F), - copy(obj.F), - copy(obj.DF), - copy(obj.DF), + oftype(obj.F, NaN), + oftype(obj.F, NaN), + fill!(copy(obj.DF), NaN), + fill!(copy(obj.DF), NaN), ) end @@ -69,29 +72,6 @@ function _barrier_term_gradient(x::T, l, u) where {T} end # Wrappers -function NLSolversBase.value!!(bw::BarrierWrapper, x) - bw.Fb = _barrier_value(bw.b, x) - if in_box(bw, x) - F = value!!(bw.obj, x) - bw.Ftotal = muladd(bw.mu, bw.Fb, F) - else - bw.Ftotal = bw.mu * bw.Fb - end - return bw.Ftotal -end -function NLSolversBase.value_gradient!!(bw::BarrierWrapper, x) - bw.Fb = _barrier_value(bw.b, x) - bw.DFb .= _barrier_term_gradient.(x, bw.b.lower, bw.b.upper) - if in_box(bw, x) - F, DF = value_gradient!!(bw.obj, x) - bw.Ftotal = muladd(bw.mu, bw.Fb, F) - bw.DFtotal .= muladd.(bw.mu, bw.DFb, DF) - else - bw.Ftotal = bw.mu * bw.Fb - bw.DFtotal .= bw.mu .* bw.DFb - end - return bw.Ftotal, bw.DFtotal -end function NLSolversBase.value_gradient!(bb::BarrierWrapper, x) bb.DFb .= _barrier_term_gradient.(x, bb.b.lower, bb.b.upper) bb.Fb = _barrier_value(bb.b, x) @@ -115,7 +95,6 @@ function NLSolversBase.value!(obj::BarrierWrapper, x) end return obj.Ftotal end -NLSolversBase.value(obj::BarrierWrapper) = obj.Ftotal function NLSolversBase.value(obj::BarrierWrapper, x) Fb = _barrier_value(obj.b, x) if in_box(obj, x) @@ -134,45 +113,6 @@ function NLSolversBase.gradient!(obj::BarrierWrapper, x) end return obj.DFtotal end -NLSolversBase.gradient(obj::BarrierWrapper) = obj.DFtotal - -# this mutates mu but not the gradients -# Super unsafe in that it depends on x_df being correct! -function initial_mu(obj::BarrierWrapper, F) - T = typeof(obj.Fb) # this will not work if F is real, G is complex - gbarrier = map( - x -> - (isfinite.(x[2]) ? one(T) / (x[1] - x[2]) : zero(T)) + - (isfinite(x[3]) ? one(T) / (x[3] - x[1]) : zero(T)), - zip(obj.obj.x_f, obj.b.lower, obj.b.upper), - ) - - # obj.mu = initial_mu(gradient(obj.obj), gradient(obj.b, obj.DFb, obj.obj.x_df), T(F.mufactor), T(F.mu0)) - obj.mu = initial_mu(gradient(obj.obj), gbarrier, T(F.mufactor), T(F.mu0)) -end -# Attempt to compute a reasonable default mu: at the starting -# position, the gradient of the input function should dominate the -# gradient of the barrier. 
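The comment above describes the default-μ heuristic that `initial_mu` implements both before and after this patch: μ is scaled so that the barrier contributes only a small fraction (`mufactor`) of the objective's gradient 1-norm at the starting point. A worked toy example with made-up numbers:

    gfunc = [4.0, -2.0]       # gradient of the objective at the starting point
    gbarrier = [0.5, 0.25]    # gradient of the barrier at the starting point
    mufactor = 1 / 1000
    # mu = mufactor * ||gfunc||_1 / ||gbarrier||_1 = 0.001 * 6.0 / 0.75 = 0.008
    mu = mufactor * sum(abs, gfunc) / sum(abs, gbarrier)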
-function initial_mu( - gfunc::AbstractArray{T}, - gbarrier::AbstractArray{T}, - mu0factor::T = T(1) / 1000, - mu0::T = convert(T, NaN), -) where {T} - - if isnan(mu0) - gbarriernorm = sum(abs, gbarrier) - if gbarriernorm > 0 - mu = mu0factor * sum(abs, gfunc) / gbarriernorm - else - # Presumably, there is no barrier function - mu = zero(T) - end - else - mu = mu0 - end - return mu -end function limits_box( x::AbstractArray{T}, @@ -279,6 +219,36 @@ barrier_method( precondprep, ) = m # use `m` as is +struct BoxState{T,Tx} <: ZerothOrderState + x::Tx + f_x::T + x_previous::Tx + f_x_previous::T +end + +# Attempt to compute a reasonable default mu: at the starting # position, the gradient of the input function should dominate the # gradient of the barrier. +function initial_mu(box::BoxBarrier, x::AbstractArray, g_x::AbstractArray, F::Fminbox) + # Compute the 1-norms of the gradient of the input function and the gradient of the barrier + _gnorm = sum(abs, g_x) + _gbarrier_norm = sum(Broadcast.instantiate(Broadcast.broadcasted((xi, li, ui) -> abs(_barrier_term_gradient(xi, li, ui)), x, box.lower, box.upper))) + + gnorm, gbarrier_norm, mufactor, mu0 = promote(_gnorm, _gbarrier_norm, F.mufactor, F.mu0) + mu = if isnan(mu0) + if gbarrier_norm > 0 + mufactor * gnorm / gbarrier_norm + else + # Presumably, there is no barrier function + zero(gnorm) + end + else + mu0 + end + + return mu +end + function optimize( f, l::AbstractArray, @@ -483,22 +453,18 @@ function optimize( # barrier-aware optimization method instance (precondition relevance) _optimizer = barrier_method(F.method, P, (P, x) -> F.precondprep(P, x, l, u, dfbox)) - state = initial_state(_optimizer, options, dfbox, x) - # we wait until state has been initialized to set the initial mu because - # we need the gradient of the objective and initial_state will value_gradient!! - # the objective, so that forces an evaluation - if F.method isa NelderMead - gradient!(dfbox, x) - end - dfbox.mu = initial_mu(dfbox, F) - if F.method isa NelderMead - for i = 1:length(state.f_simplex) - x = state.simplex[i] - boxval = _barrier_value(dfbox.b, x) - state.f_simplex[i] += boxval - end - state.i_order = sortperm(state.f_simplex) + # we wait until state has been initialized to set the initial mu because we need the gradient of the objective + state = initial_state(_optimizer, options, df, x) + @assert state.x == x + f_x = state.f_x + g_x = if hasproperty(state, :g_x) + copy(state.g_x) + else + copy(gradient!(dfbox, x)) end + box = BoxBarrier(l, u) + mu = initial_mu(box, x, g_x, F) + if show_trace > 0 println("Fminbox") println("-------") @@ -507,17 +473,9 @@ function optimize( println("\n") end - g = copy(x) - fval_all = Vector{Vector{T}}() - - # Count the total number of outer iterations + # First iteration iteration = 1 - - # define the function (dfbox) to optimize by the inner optimizer - - xold = copy(x) _time = time() - fval0 = dfbox.obj.F # Optimize with current setting of mu if show_trace > 0 @@ -530,15 +488,15 @@ function optimize( println("(numbers below include barrier contribution)") end - # we need to update the +mu*barrier_grad part. Since we're using the - # value_gradient! not !!
as in initial_state, we won't make a superfluous - # evaluation - if !(F.method isa NelderMead) - value_gradient!(dfbox, x) - reset!(_optimizer, state, dfbox, x) - else - value!(dfbox, x) - end + # We add the barrier term to the objective function + # Since this changes the objective of the inner optimizer, we have to reset its state + dfbox = BarrierWrapper(df, mu, l, u) + reset!(_optimizer, state, dfbox, x) + + # Store current state + x_previous = copy(x) + f_x_previous = f_x + results = optimize(dfbox, x, _optimizer, options, state) stopped_by_callback = results.stopped_by.callback dfbox.obj.f_calls[1] = 0 @@ -548,7 +506,12 @@ function optimize( if hasfield(typeof(dfbox.obj), :h_calls) dfbox.obj.h_calls[1] = 0 end + + # Compute function value and gradient (without barrier term) copyto!(x, minimizer(results)) + f_x, _g_x = value_gradient!(df, x) + copyto!(g_x, _g_x) + boxdist = Base.minimum(((xi, li, ui),) -> min(xi - li, ui - xi), zip(x, l, u)) # Base.minimum !== minimum if show_trace > 0 println() @@ -559,16 +522,14 @@ function optimize( println("Decreasing barrier term μ.\n") end - # Decrease mu - dfbox.mu *= T(F.mufactor) # Test for convergence - g = x .- min.(max.(x .- gradient(dfbox.obj), l), u) + g = x .- clamp.(x .- g_x, l, u) _x_converged, _f_converged, _g_converged, f_increased = assess_convergence( x, - xold, - minimum(results), - fval0, + x_previous, + f_x, + f_x_previous, g, options.outer_x_abstol, options.outer_x_reltol, @@ -589,11 +550,12 @@ function optimize( @warn("f(x) increased: stopping optimization") else while !converged && !stopped && iteration < outer_iterations - fval0 = dfbox.obj.F # Increment the number of steps we've had to perform iteration += 1 - copyto!(xold, x) + # Decrease mu + mu *= T(F.mufactor) + # Optimize with current setting of mu if show_trace > 0 header_string = "Fminbox iteration $iteration" @@ -605,16 +567,15 @@ function optimize( println("(numbers below include barrier contribution)") end - # we need to update the +mu*barrier_grad part. Since we're using the - # value_gradient! not !! as in initial_state, we won't make a superfluous - # evaluation - - if !(F.method isa NelderMead) - value_gradient!(dfbox, x) - else - value!(dfbox, x) - end + # We need to update the barrier term of the objective function. 
+ # Since this changes the objective of the inner optimizer, we have to reset its state + dfbox = BarrierWrapper(df, mu, l, u) reset!(_optimizer, state, dfbox, x) + + # Store current state + copyto!(x_previous, x) + f_x_previous = f_x + resultsnew = optimize(dfbox, x, _optimizer, options, state) stopped_by_callback = resultsnew.stopped_by.callback append!(results, resultsnew) @@ -625,7 +586,12 @@ function optimize( if hasfield(typeof(dfbox.obj), :h_calls) dfbox.obj.h_calls[1] = 0 end + + # Compute function value and gradient (without barrier term) copyto!(x, minimizer(results)) + f_x, _g_x = value_gradient!(df, x) + copyto!(g_x, _g_x) + boxdist = Base.minimum(((xi, li, ui),) -> min(xi - li, ui - xi), zip(x, l, u)) # Base.minimum !== minimum if show_trace > 0 println() @@ -636,16 +602,14 @@ function optimize( println("Decreasing barrier term μ.\n") end - # Decrease mu - dfbox.mu *= T(F.mufactor) # Test for convergence - g = x .- min.(max.(x .- gradient(dfbox.obj), l), u) + g .= x .- clamp.(x .- g_x, l, u) _x_converged, _f_converged, _g_converged, f_increased = assess_convergence( x, - xold, - minimum(results), - fval0, + x_previous, + f_x, + f_x_previous, g, options.outer_x_abstol, options.outer_x_reltol, @@ -681,25 +645,24 @@ function optimize( f_converged = _f_converged, g_converged = _g_converged, ) - box_state = (; x, x_previous = xold, f_x_previous = fval0) - termination_code = _termination_code(df, g_residual(g), box_state, stopped_by, options) + termination_code = _termination_code(df, g_residual(g), BoxState(x, f_x, x_previous, f_x_previous), stopped_by, options) return MultivariateOptimizationResults( F, initial_x, - minimizer(results), - df.f(minimizer(results)), + x, + f_x, iteration, results.x_abstol, results.x_reltol, - norm(x - xold, Inf), - norm(x - xold, Inf) / norm(x, Inf), + x_abschange(x, x_previous), + x_relchange(x, x_previous), results.f_abstol, results.f_reltol, - f_abschange(minimum(results), fval0), - f_relchange(minimum(results), fval0), + f_abschange(f_x, f_x_previous), + f_relchange(f_x, f_x_previous), results.g_abstol, - g_residual(g, Inf), + g_residual(g), results.trace, results.f_calls, results.g_calls, diff --git a/src/multivariate/solvers/constrained/ipnewton/interior.jl b/src/multivariate/solvers/constrained/ipnewton/interior.jl index 9c197e019..df9bfcaf6 100644 --- a/src/multivariate/solvers/constrained/ipnewton/interior.jl +++ b/src/multivariate/solvers/constrained/ipnewton/interior.jl @@ -1,7 +1,7 @@ # TODO: when Optim supports sparse arrays, make a SparseMatrixCSC version of jacobianx -abstract type AbstractBarrierState end +abstract type AbstractBarrierState <: AbstractOptimizerState end # These are used not only for the current state, but also for the step and the gradient struct BarrierStateVars{T} @@ -209,9 +209,8 @@ ls_update!( function initial_convergence(d, state, method::ConstrainedOptimizer, initial_x, options) # TODO: Make sure state.bgrad has been evaluated at initial_x # state.bgrad normally comes from constraints.c!(..., initial_x) in initial_state - gradient!(d, initial_x) - stopped = !isfinite(value(d)) || any(!isfinite, gradient(d)) - g_residual(d, state) + norm(state.bgrad, Inf) < options.g_abstol, stopped + stopped = !isfinite(state.f_x) || any(!isfinite, state.g_x) + norm(state.g_x, Inf) + norm(state.bgrad, Inf) < options.g_abstol, stopped end function optimize( @@ -253,7 +252,7 @@ function optimize( t0 = time() # Initial time stamp used to control early stopping by options.time_limit _time = t0 - tr =
OptimizationTrace{typeof(value(d)),typeof(method)}() + tr = OptimizationTrace{typeof(state.f_x),typeof(method)}() tracing = options.store_trace || options.show_trace || @@ -275,9 +274,20 @@ function optimize( while !converged && !stopped && iteration < options.iterations iteration += 1 - update_state!(d, constraints, state, method, options) && break # it returns true if it's forced by something in update! to stop (eg dx_dg == 0.0 in BFGS or linesearch errors) + # Convention: When `update_state!` is called, then `state` satisfies: + # - `state.x`: Current state + # - `state.f_x`: Objective function value of the current state, i.e. `d(state.x)` + # - `state.g_x` (if available): Gradient of the objective function at the current state, i.e. `gradient(d, state.x)` + # - `state.H_x` (if available): Hessian of the objective function at the current state, i.e. `hessian(d, state.x)` + fail = update_state!(d, constraints, state, method, options) + if fail + # `fail = true` e.g. if it's forced by something in update! to stop (e.g. dx_dg == 0.0 in BFGS or linesearch errors) + break + end - update_fg!(d, constraints, state, method) + # Update function value, gradient and Hessian matrix (skipped by some methods that already update those in `update_state!`) + # TODO: Already perform in `update_state!`? + update_fgh!(d, constraints, state, method) # TODO: Do we need to rethink f_increased for `ConstrainedOptimizer`s? x_converged, f_converged, g_converged, f_increased = @@ -292,10 +302,6 @@ function optimize( counter_f_tol = f_converged ? counter_f_tol + 1 : 0 converged = x_converged || g_converged || (counter_f_tol > options.successive_f_tol) - # We don't use the Hessian for anything if we have declared convergence, - # so we might as well not make the (expensive) update if converged == true - !converged && update_h!(d, constraints, state, method) - if tracing # update trace; callbacks can stop routine early by returning true stopped_by_callback = trace!(tr, d, state, iteration, method, options) @@ -327,14 +333,13 @@ function optimize( # we can just check minimum, as we've earlier enforced same types/eltypes # in variables besides the option settings T = typeof(options.f_reltol) - Tf = typeof(value(d)) f_incr_pick = f_increased && !options.allow_f_increases termination_code = TerminationCode.NotImplemented return MultivariateOptimizationResults( method, initial_x, pick_best_x(f_incr_pick, state), - pick_best_f(f_incr_pick, state, d), + pick_best_f(f_incr_pick, state), iteration, T(options.x_abstol), T(options.x_reltol), x_abschange(state), x_relchange(state), T(options.f_abstol), T(options.f_reltol), - f_abschange(d, state), - f_relchange(d, state), + f_abschange(state), + f_relchange(state), T(options.g_abstol), - g_residual(d, state), + g_residual(state), tr, f_calls(d), g_calls(d), @@ -357,13 +362,11 @@ function optimize( ) end -# Fallbacks (for methods that don't need these) +# Fallback (for methods that don't need this) after_while!(d, constraints::AbstractConstraints, state, method, options) = nothing -update_h!(d, constraints::AbstractConstraints, state, method) = nothing """ - initialize_μ_λ!(state, bounds, μ0=:auto, β=0.01) - initialize_μ_λ!(state, bounds, (Hobj,HcI), μ0=:auto, β=0.01) + initialize_μ_λ!(state, bounds, HcI, μ0=:auto, β=0.01) Pick μ and λ to ensure that the equality constraints are satisfied locally (at the current `state.x`), and that the initial gradient including the barrier would be a descent direction for the problem without the barrier (μ
= 0). This ensures that the search isn't pushed out of the basin of the user-supplied initial guess. -Upon entry, the objective function gradient, constraint values, and -constraint jacobian must be set in `state.g`, `state.c`, and `state.J` -respectively. If you also wish to ensure that the projection of -Hessian is minimally-perturbed along the initial gradient, supply the -hessian of the objective (`Hobj`) and +Upon entry, the objective function gradient and Hessian, constraint values, and +constraint jacobian must be set in `state.g_x`, `state.H_x`, `state.c`, and `state.J`, +respectively. To ensure that the projection of Hessian is minimally-perturbed +along the initial gradient, additionally supply HcI = ∑_i (σ_i/s_i)∇∇ c_{Ii} @@ -390,7 +392,7 @@ values of `λ` are set using the chosen `μ`. function initialize_μ_λ!( state, bounds::ConstraintBounds, - Hinfo, + HcI::AbstractMatrix{<:Real}, μ0::Union{Symbol,Number}, β::Number = 1 // 100, ) @@ -399,7 +401,8 @@ function initialize_μ_λ!( fill!(state.bstate, 0) return state end - gf = state.g # must be pre-set to ∇f + copyto!(state.g_L_x, state.g_x) + gf = state.g_L_x # Calculate projection of ∇f into the subspace spanned by the # equality constraint Jacobian JE = jacobianE(state, bounds) @@ -420,7 +423,7 @@ function initialize_μ_λ!( Δg = JI' * σdivs PperpΔg = Δg - JE' * (Cc \ (JE * Δg)) DI = dot(PperpΔg, PperpΔg) - κperp, κI = hessian_projections(Hinfo, Pperpg, (JI * Pperpg) ./ s) + κperp, κI = hessian_projections((state.H_x, HcI), Pperpg, (JI * Pperpg) ./ s) # Calculate μ and λI μ = β * (κperp == 0 ? sqrt(Dperp / DI) : min(sqrt(Dperp / DI), abs(κperp / κI))) if !isfinite(μ) @@ -449,14 +452,6 @@ function initialize_μ_λ!( k == length(λE) || error("Something is wrong when initializing μ and λ.") state end -function initialize_μ_λ!( - state, - bounds::ConstraintBounds, - μ0::Union{Number,Symbol}, - β::Number = 1 // 100, -) - initialize_μ_λ!(state, bounds, nothing, μ0, β) -end function hessian_projections(Hinfo::Tuple{AbstractMatrix,AbstractMatrix}, Pperpg, y) κperp = dot(Hinfo[1] * Pperpg, Pperpg) @@ -545,32 +540,38 @@ userλ(λcI, constraints) = userλ(λcI, constraints.bounds) ## Computation of the Lagrangian and its gradient # This is in a parametrization that is also useful during linesearch # TODO: `lagrangian` does not seem to be used (IPNewton)? 
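As a reading aid for this section: the merit value assembled below is the objective plus the μ-weighted barrier plus the equality violation. A scalar toy illustration with made-up numbers (`barrier_value` and `equality_violation` stand for the real implementations above):

    f_x = 2.0                  # objective value at x
    μ = 0.1
    slack = 0.5                # a single slack variable, illustratively
    barrier = -μ * log(slack)  # stands in for barrier_value(bounds, x, bstate, μ)
    ev = 0.05                  # stands in for equality_violation(bounds, x, c, bstate)
    L_x = f_x + barrier + ev   # mirrors `L_x = f_x + barrier_value(...) + ev` below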
-function lagrangian(d, bounds::ConstraintBounds, x, c, bstate::BarrierStateVars, μ) - f_x = NLSolversBase.value!(d, x) +function lagrangian(bounds::ConstraintBounds, x, f_x, c, bstate::BarrierStateVars, μ) ev = equality_violation(bounds, x, c, bstate) - L_xsλ = f_x + barrier_value(bounds, x, bstate, μ) + ev - f_x, L_xsλ, ev + L_x = f_x + barrier_value(bounds, x, bstate, μ) + ev + return L_x, ev end function lagrangian_fg!( - gx, + g_L_x, bgrad, - d, bounds::ConstraintBounds, x, + f_x, + g_x, c, J, bstate::BarrierStateVars, μ, ) - fill!(bgrad, 0) - f_x, g_x = NLSolversBase.value_gradient!(d, x) - gx .= g_x + # Compute the value of the Lagrangian at x ev = equality_violation(bounds, x, c, bstate) - L_xsλ = f_x + barrier_value(bounds, x, bstate, μ) + ev + L_x = f_x + barrier_value(bounds, x, bstate, μ) + ev + + # Compute gradient of the Lagrangian at x: + # - store the gradient of the barrier at x in `bgrad` + # - store the gradient of the Lagrangian at x in `g_L_x` + copyto!(g_L_x, g_x) + fill!(bgrad, 0) barrier_grad!(bgrad, bounds, x, bstate, μ) - equality_grad!(gx, bgrad, bounds, x, c, J, bstate) - f_x, L_xsλ, ev + equality_grad!(g_L_x, bgrad, bounds, x, c, J, bstate) + + + return L_x, ev end # TODO: do we need lagrangian_vec? Maybe for automatic differentiation? @@ -642,11 +643,13 @@ function lagrangian_linefunc(αs, d, constraints, state) end function _lagrangian_linefunc(αs, d, constraints, state) - b_ls, bounds = state.b_ls, constraints.bounds + b_ls = state.b_ls ls_update!(state.x_ls, state.x, state.s, alphax(αs)) ls_update!(b_ls.bstate, state.bstate, state.bstep, αs) constraints.c!(b_ls.c, state.x_ls) - lagrangian(d, constraints.bounds, state.x_ls, b_ls.c, b_ls.bstate, state.μ) + f_x_ls = NLSolversBase.value!(d, state.x_ls) + L_x_ls, ev_ls = lagrangian(constraints.bounds, state.x_ls, f_x_ls, b_ls.c, b_ls.bstate, state.μ) + return f_x_ls, L_x_ls, ev_ls end alphax(α::Number) = α alphax(αs::Union{Tuple,AbstractVector}) = αs[1] @@ -658,10 +661,8 @@ function lagrangian_linefunc!( state, method::IPOptimizer{typeof(backtrack_constrained)}, ) - # For backtrack_constrained, the last evaluation is the one we - # keep, so it's safe to store the results in state - state.f_x, state.L, state.ev = _lagrangian_linefunc(α, d, constraints, state) - state.L + _, L_x_ls, _ = _lagrangian_linefunc(α, d, constraints, state) + return L_x_ls end lagrangian_linefunc!(α, d, constraints, state, method) = lagrangian_linefunc(α, d, constraints, state) @@ -669,8 +670,8 @@ lagrangian_linefunc!(α, d, constraints, state, method) = ## for line searches that do use the gradient along the line function lagrangian_lineslope(αs, d, constraints, state) - f_x, L, ev, slope = _lagrangian_lineslope(αs, d, constraints, state) - L, slope + _, L_x_ls, _, slope = _lagrangian_lineslope(αs, d, constraints, state) + return L_x_ls, slope end function _lagrangian_lineslope(αs, d, constraints, state) @@ -680,19 +681,22 @@ function _lagrangian_lineslope(αs, d, constraints, state) ls_update!(b_ls.bstate, state.bstate, bstep, αs) constraints.c!(b_ls.c, state.x_ls) constraints.jacobian!(b_ls.J, state.x_ls) - f_x, L, ev = lagrangian_fg!( - state.g, + f_x_ls, g_x_ls = value_gradient!(d, state.x_ls) + g_L_x_ls = copy(state.g_L_x) + L_x_ls, ev_ls = lagrangian_fg!( + g_L_x_ls, bgrad, - d, bounds, state.x_ls, + f_x_ls, + g_x_ls, b_ls.c, b_ls.J, b_ls.bstate, state.μ, ) - slopeα = slopealpha(state.s, state.g, bstep, bgrad) - f_x, L, ev, slopeα + slopeα = slopealpha(state.s, g_L_x_ls, bstep, bgrad) + f_x_ls, L_x_ls, ev_ls, slopeα end function 
lagrangian_lineslope!( @@ -704,8 +708,8 @@ function lagrangian_lineslope!( ) # For backtrack_constrained, the last evaluation is the one we # keep, so it's safe to store the results in state - state.f_x, state.L, state.ev, slope = _lagrangian_lineslope(αs, d, constraints, state) - state.L, slope + state.f_x, state.L_x, state.ev, slope = _lagrangian_lineslope(αs, d, constraints, state) + state.L_x, slope end lagrangian_lineslope!(αs, d, constraints, state, method) = lagrangian_lineslope(αs, d, constraints, state) diff --git a/src/multivariate/solvers/constrained/ipnewton/ipnewton.jl b/src/multivariate/solvers/constrained/ipnewton/ipnewton.jl index cc8a20914..189f6a8c9 100644 --- a/src/multivariate/solvers/constrained/ipnewton/ipnewton.jl +++ b/src/multivariate/solvers/constrained/ipnewton/ipnewton.jl @@ -59,21 +59,21 @@ IPNewton(; show_linesearch::Bool = false, ) = IPNewton(linesearch, μ0, show_linesearch) -mutable struct IPNewtonState{T,Tx} <: AbstractBarrierState - x::Tx - f_x::T - x_previous::Tx - g::Tx - f_x_previous::T - H::Matrix{T} # Hessian of the Lagrangian? - HP::Any # TODO: remove HP? It's not used - Hd::Vector{Int8} # TODO: remove Hd? It's not used - s::Tx # step for x +mutable struct IPNewtonState{T,Tx,Tg} <: AbstractBarrierState + x::Tx # Current state + g_x::Tg # Gradient of the objective function at x + H_x::Matrix{T} # Hessian of the objective function at x + f_x::T # Value of objective function at x + x_previous::Tx # Previous state + f_x_previous::T # Value of the objective function at x_previous + s::Tx # Step for x # Barrier penalty fields - μ::T # coefficient of the barrier penalty - μnext::T # μ for the next iteration - L::T # value of the Lagrangian (objective + barrier + equality) - L_previous::T + μ::T # coefficient of the barrier penalty + μnext::T # μ for the next iteration + L_x::T # value of the Lagrangian (objective + barrier + equality) at x + g_L_x::Tg # gradient of the Lagrangian at x + H_L_x::Matrix{T} # Hessian of the Lagrangian at x + L_x_previous::T # value of the Lagrangian at x_previous bstate::BarrierStateVars{T} # value of slack and λ variables (current "position") bgrad::BarrierStateVars{T} # gradient of slack and λ variables at current "position" bstep::BarrierStateVars{T} # search direction for slack and λ @@ -86,39 +86,6 @@ mutable struct IPNewtonState{T,Tx} <: AbstractBarrierState Htilde::Any # Positive Cholesky factorization of H from PositiveFactorizations.jl end -# TODO: Do we need this convert thing? 
(It seems to be used with `show(IPNewtonState)`) -function Base.convert( - ::Type{IPNewtonState{T,Tx}}, - state::IPNewtonState{S,Sx}, -) where {T,Tx,S,Sx} - IPNewtonState( - convert(Tx, state.x), - T(state.f_x), - convert(Tx, state.x_previous), - convert(Tx, state.g), - T(state.f_x_previous), - convert(Matrix{T}, state.H), - state.HP, - state.Hd, - convert(Tx, state.s), - T(state.μ), - T(state.μnext), - T(state.L), - T(state.L_previous), - convert(BarrierStateVars{T}, state.bstate), - convert(BarrierStateVars{T}, state.bgrad), - convert(BarrierStateVars{T}, state.bstep), - convert(Vector{T}, state.constr_c), - convert(Matrix{T}, state.constr_J), - T(state.ev), - convert(Tx, state.x_ls), - T(state.alpha), - convert(BarrierLineSearchGrad{T}, state.b_ls), - convert(Tx, state.gtilde), - state.Htilde, - ) -end - function initial_state( method::IPNewton, options::Options, @@ -139,18 +106,12 @@ function initial_state( end # Allocate fields for the objective function n = length(initial_x) - g = similar(initial_x) - s = similar(initial_x) - f_x_previous = NaN - f_x, g_x = value_gradient!(d, initial_x) - g .= g_x # needs to be a separate copy of g_x - hessian!(d, initial_x) - H = collect(T, hessian(d)) - Hd = zeros(Int8, n) + + # TODO: Switch to `value_gradient_hessian!` + f_x, g_x, H_x = NLSolversBase.value_gradient_hessian!!(d, initial_x) # More constraints constr_J = fill!(Matrix{T}(undef, mc, n), NaN) - gtilde = copy(g) constraints.jacobian!(constr_J, initial_x) μ = T(1) bstate = BarrierStateVars(constraints.bounds, initial_x, constr_c) @@ -161,18 +122,18 @@ function initial_state( state = IPNewtonState( copy(initial_x), # Maintain current state in state.x + copy(g_x), # Store current gradient in state.g_x + copy(H_x), # Store current Hessian in state.H_x f_x, # Store current f in state.f_x - copy(initial_x), # Maintain previous state in state.x_previous - g, # Store current gradient in state.g (TODO: includes Lagrangian calculation?) 
- T(NaN), # Store previous f in state.f_x_previous - H, - 0, # will be replaced - Hd, - similar(initial_x), # Maintain current x-search direction in state.s + fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous + oftype(f_x, NaN), # Store previous f in state.f_x_previous + fill!(similar(initial_x), NaN), # Maintain current x-search direction in state.s μ, μ, - T(NaN), - T(NaN), + oftype(f_x, NaN), # Store Lagrangian at x + fill!(similar(g_x), NaN), # Store gradient of Lagrangian at x + fill!(similar(H_x), NaN), # Store Hessian of Lagrangian at x + oftype(f_x, NaN), # Store Lagrangian at x_previous bstate, bgrad, bstep, @@ -181,31 +142,48 @@ function initial_state( T(NaN), @initial_linesearch()..., # Maintain a cache for line search results in state.lsr b_ls, - gtilde, + fill!(similar(g_x), NaN), # Modified gradient at x 0, ) - Hinfo = (state.H, hessianI(initial_x, constraints, 1 ./ bstate.slack_c, 1)) - initialize_μ_λ!(state, constraints.bounds, Hinfo, method.μ0) - update_fg!(d, constraints, state, method) - update_h!(d, constraints, state, method) + HcI = hessianI(initial_x, constraints, 1 ./ bstate.slack_c, 1) + initialize_μ_λ!(state, constraints.bounds, HcI, method.μ0) + + # Update function value, gradient and Hessian matrix + update_fgh!(d, constraints, state, method) state end -function update_fg!(d, constraints::TwiceDifferentiableConstraints, state, method::IPNewton) - state.f_x, state.L, state.ev = lagrangian_fg!( - state.g, +function update_fgh!(d, constraints::TwiceDifferentiableConstraints, state, method::IPNewton) + # Compute objective function, gradient and Hessian matrix + # TODO: Switch to `value_gradient_hessian!` when it becomes available + f_x, g_x, H_x = NLSolversBase.value_gradient_hessian!!(d, state.x) + copyto!(state.g_x, g_x) + copyto!(state.H_x, H_x) + state.f_x = f_x + + # Compute value and gradient of the Lagrangian + state.L_x, state.ev = lagrangian_fg!( + state.g_L_x, state.bgrad, - d, constraints.bounds, state.x, + state.f_x, + state.g_x, state.constr_c, state.constr_J, state.bstate, state.μ, ) + + # Calculate the modified gradient (see below) update_gtilde!(d, constraints, state, method) + + # Compute the Hessian of the Lagrangian + update_lagrangian_h!(d, constraints, state, method) + + return nothing end function update_gtilde!( d, constraints::TwiceDifferentiableConstraints, state, method::IPNewton, ) # Calculate the modified x-gradient for the block-eliminated problem # gtilde is the gradient for the affine-scaling problem, i.e., # with μ=0, used in the adaptive setting of μ.
Once we calculate μ we'll correct it - gtilde, bstate, bgrad = state.gtilde, state.bstate, state.bgrad + (; gtilde, bstate, bgrad) = state bounds = constraints.bounds - copyto!(gtilde, state.g) + copyto!(gtilde, state.g_L_x) JIc = view(state.constr_J, bounds.ineqc, :) if !isempty(JIc) Hssc = Diagonal(bstate.λc ./ bstate.slack_c) @@ -234,13 +212,14 @@ function update_gtilde!( state end -function update_h!(d, constraints::TwiceDifferentiableConstraints, state, method::IPNewton) - x, μ, Hxx, J = state.x, state.μ, state.H, state.constr_J +function update_lagrangian_h!(d, constraints::TwiceDifferentiableConstraints, state, method::IPNewton) + x, μ, Hxx, J = state.x, state.μ, state.H_L_x, state.constr_J bstate, bgrad, bounds = state.bstate, state.bgrad, constraints.bounds m, n = size(J, 1), size(J, 2) - hessian!(d, state.x) # objective's Hessian - copyto!(Hxx, hessian(d)) # objective's Hessian + # Initialize the Hessian of the Lagrangian with the Hessian of the objective function + copyto!(Hxx, state.H_x) # objective's Hessian + # accumulate the constraint second derivatives λ = userλ(bstate.λc, constraints) λ[bounds.eqc] = -bstate.λcE # the negative sign is from the Hessian @@ -265,18 +244,24 @@ function update_h!(d, constraints::TwiceDifferentiableConstraints, state, method state end +# TODO: This only works for method.linesearch = backtracking_constrained_grad +# TODO: How are we meant to implement backtracking_constrained? +# It requires both an alpha and an alphaI (αmax and αImax) ... function update_state!( d, constraints::TwiceDifferentiableConstraints, state::IPNewtonState{T}, - method::IPNewton, + method::IPNewton{typeof(backtrack_constrained_grad)}, options::Options, ) where {T} - state.f_x_previous, state.L_previous = state.f_x, state.L + state.f_x_previous, state.L_x_previous = state.f_x, state.L_x bstate, bstep, bounds = state.bstate, state.bstep, constraints.bounds qp = solve_step!(state, constraints, options, method.show_linesearch) # If a step α=1 will not change any of the parameters, we can quit now. # This prevents a futile linesearch. + if !(qp isa NTuple{3,Any}) + return false + end if is_smaller_eps(state.x, state.s) && is_smaller_eps(bstate.slack_x, bstep.slack_x) && is_smaller_eps(bstate.slack_c, bstep.slack_c) && @@ -294,9 +279,7 @@ function update_state!( # Determine the actual distance of movement along the search line ϕ = linesearch_anon(d, constraints, state, method) - # TODO: This only works for method.linesearch = backtracking_constrained_grad - # TODO: How are we meant to implement backtracking_constrained?. - # It requires both an alpha and an alphaI (αmax and αImax) ...
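As an aside (not part of the patch): `qp` packs the value, slope, and curvature of the one-dimensional model of the Lagrangian along the step, which the backtracking line search below interpolates. A toy version with made-up numbers:

    L0, slope, curv = 1.0, -2.0, 4.0        # the three entries solve_step! packs into qp
    q(α) = L0 + slope * α + curv * α^2 / 2  # quadratic model of the Lagrangian along the step
    α_model = -slope / curv                 # its minimizer, 0.5 here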
+ state.alpha = method.linesearch!(ϕ, T(1), αmax, qp; show_linesearch = method.show_linesearch) @@ -395,9 +378,9 @@ function solve_step!( k == length(ΔλE) || error("exhausted targets before ΔλE") solve_slack!(bstep, Δx, bounds, bstate, bgrad, J, μ) # Solve for the quadratic parameters (use the real H, not the posdef H) - Hstepx, HstepλE = state.H * Δx - JE' * ΔλE, -JE * Δx - qp = state.L, - slopealpha(state.s, state.g, bstep, bgrad), + Hstepx, HstepλE = state.H_L_x * Δx - JE' * ΔλE, -JE * Δx + qp = state.L_x, + slopealpha(state.s, state.g_L_x, bstep, bgrad), dot(Δx, Hstepx) + dot(ΔλE, HstepλE) qp end diff --git a/src/multivariate/solvers/constrained/ipnewton/utilities/assess_convergence.jl b/src/multivariate/solvers/constrained/ipnewton/utilities/assess_convergence.jl index f928adba2..1d754d104 100644 --- a/src/multivariate/solvers/constrained/ipnewton/utilities/assess_convergence.jl +++ b/src/multivariate/solvers/constrained/ipnewton/utilities/assess_convergence.jl @@ -4,9 +4,9 @@ function assess_convergence(state::IPNewtonState, d, options::Options) assess_convergence( state.x, state.x_previous, - state.L, - state.L_previous, - [state.g; bgrad.slack_x; bgrad.slack_c; bgrad.λx; bgrad.λc; bgrad.λxE; bgrad.λcE], + state.L_x, + state.L_x_previous, + [state.g_x; bgrad.slack_x; bgrad.slack_c; bgrad.λx; bgrad.λc; bgrad.λxE; bgrad.λcE], options.x_abstol, options.f_reltol, options.g_abstol, diff --git a/src/multivariate/solvers/constrained/ipnewton/utilities/trace.jl b/src/multivariate/solvers/constrained/ipnewton/utilities/trace.jl index 05de5f4ef..35ebd1b9a 100644 --- a/src/multivariate/solvers/constrained/ipnewton/utilities/trace.jl +++ b/src/multivariate/solvers/constrained/ipnewton/utilities/trace.jl @@ -25,15 +25,15 @@ end function trace!(tr, d, state, iteration, method::IPOptimizer, options, curr_time = time()) dt = Dict() - dt["Lagrangian"] = state.L + dt["Lagrangian"] = state.L_x dt["μ"] = state.μ dt["ev"] = abs(state.ev) dt["time"] = curr_time if options.extended_trace dt["α"] = state.alpha dt["x"] = copy(state.x) - dt["g(x)"] = copy(state.g) - dt["h(x)"] = copy(state.H) + dt["g(x)"] = copy(state.g_x) + dt["h(x)"] = copy(state.H_x) if !isempty(state.bstate) dt["gtilde(x)"] = copy(state.gtilde) dt["bstate"] = copy(state.bstate) @@ -41,11 +41,11 @@ function trace!(tr, d, state, iteration, method::IPOptimizer, options, curr_time dt["c"] = copy(state.constr_c) end end - g_norm = norm(state.g, Inf) + norm(state.bgrad, Inf) + g_norm = norm(state.g_x, Inf) + norm(state.bgrad, Inf) update!( tr, iteration, - value(d), + state.f_x, g_norm, dt, options.store_trace, diff --git a/src/multivariate/solvers/constrained/samin.jl b/src/multivariate/solvers/constrained/samin.jl index a8a9f7f33..a7ec24040 100644 --- a/src/multivariate/solvers/constrained/samin.jl +++ b/src/multivariate/solvers/constrained/samin.jl @@ -62,7 +62,7 @@ function optimize( obj_fn, lb::AbstractArray, ub::AbstractArray, - x::AbstractArray{Tx}, + initial_x::AbstractArray{Tx}, method::SAMIN, options::Options = Options(), ) where {Tx} @@ -70,9 +70,11 @@ function optimize( time0 = time() # Initial time stamp used to control early stopping by options.time_limit hline = "="^80 + x = copy(initial_x) d = NonDifferentiable(obj_fn, x) + f_x = value!(d, x) - tr = OptimizationTrace{typeof(value(d)),typeof(method)}() + tr = OptimizationTrace{typeof(f_x),typeof(method)}() tracing = options.store_trace || options.show_trace || @@ -84,7 +86,6 @@ function optimize( x_tol, f_tol = options.x_abstol, options.f_abstol - x0 = copy(x) n = size(x, 1) # 
dimension of parameter # Set initial values nacc = 0 # total accepted trials t = 2.0 # temperature - will initially rise or fall to cover parameter space. Then it will fall converge = 0 # convergence indicator 0 (failure), 1 (normal success), or 2 (convergence but near bounds) f_absΔ = Inf # most recent values, to compare to when checking convergence fstar = typemax(Float64) * ones(neps) - # Initial obj_value - xopt = copy(x) - f_old = value!(d, x) - fopt = copy(f_old) # give it something to compare to - details = [f_calls(d) t fopt xopt'] + # Best values + f_opt = f_x + x_opt = copy(x) + details = [f_calls(d) t f_opt x_opt'] bounds = ub - lb # check for out-of-bounds starting values for i = 1:n @@ -114,7 +114,7 @@ function optimize( trace!( tr, d, - (x = xopt, iteration = iteration), + (; x, f_x, iteration = iteration), iteration, method, options, ) stopped_by_callback = false # main loop, first increase temperature until parameter space covered, then reduce until convergence + xp = copy(x) # proposal while converge == 0 # statistics to report at each temp change, set back to zero nup = 0 @@ -141,37 +142,35 @@ function optimize( # new Sept 2011, if bounds are same, skip the search for that vbl. # Allows restrictions without complicated programming if (lb[h] != ub[h]) - xp = copy(x) + copyto!(xp, x) xp[h] += (Tx(2.0) * rand(Tx) - Tx(1.0)) * bounds[h] if (xp[h] < lb[h]) || (xp[h] > ub[h]) xp[h] = lb[h] + (ub[h] - lb[h]) * rand(Tx) lnobds += 1 end # Evaluate function at new point - f_proposal = value(d, xp) + f_proposal = value!(d, xp) # Accept the new point if the function value decreases - if (f_proposal <= f_old) - x = copy(xp) - f_old = f_proposal + if (f_proposal <= f_x) + copyto!(x, xp) + f_x = f_proposal nacc += 1 # total number of acceptances nacp[h] += 1 # acceptances for this parameter nup += 1 # If lower than any other point, record as new optimum - if f_proposal < fopt - xopt = copy(xp) - fopt = f_proposal - d.F = f_proposal + if f_proposal < f_opt + copyto!(x_opt, xp) + f_opt = f_proposal nnew += 1 details = [details; [f_calls(d) t f_proposal xp']] end # If the point is higher, use the Metropolis criterion to decide on # acceptance or rejection. else - p = exp(-(f_proposal - f_old) / t) + p = exp(-(f_proposal - f_x) / t) if rand(Tx) < p - x = copy(xp) - f_old = copy(f_proposal) - d.F = f_proposal + copyto!(x, xp) + f_x = f_proposal nacc += 1 nacp[h] += 1 ndown += 1 @@ -186,7 +185,7 @@ function optimize( stopped_by_callback = trace!( tr, d, - (x = xopt, iteration = iteration), + (; x = x_opt, f_x = f_opt, iteration = iteration), iteration, method, options, @@ -204,10 +203,10 @@ function optimize( println(hline) println("SAMIN results") println("NO CONVERGENCE: MAXEVALS exceeded") - @printf("\n Obj. value: %16.5f\n\n", fopt) + @printf("\n Obj.
value: %16.5f\n\n", f_opt) println(" parameter search width") for i = 1:n - @printf("%16.5f %16.5f \n", xopt[i], bounds[i]) + @printf("%16.5f %16.5f \n", x_opt[i], bounds[i]) end println(hline) end @@ -215,9 +214,9 @@ function optimize( termination_code = TerminationCode.NotImplemented return MultivariateOptimizationResults( method, - x0,# initial_x, - xopt, #pick_best_x(f_incr_pick, state), - fopt, # pick_best_f(f_incr_pick, state, d), + initial_x, + x_opt, #pick_best_x(f_incr_pick, state), + f_opt, # pick_best_f(f_incr_pick, state), f_calls(d), #iteration, x_tol,#T(options.x_tol), 0.0,#T(options.x_tol), @@ -225,10 +224,10 @@ function optimize( NaN,# x_abschange(state), f_tol,#T(options.f_tol), 0.0,#T(options.f_tol), - f_absΔ,#f_abschange(d, state), - NaN,#f_abschange(d, state), + f_absΔ,#f_abschange(state), + NaN,#f_abschange(state), 0.0,#T(options.g_tol), - NaN,#g_residual(d), + NaN,#g_residual(state), tr, f_calls(d), g_calls(d), @@ -274,7 +273,7 @@ function optimize( println(hline) println("samin: intermediate results before next temperature change") println("temperature: ", round(t, digits = 5)) - println("current best function value: ", round(fopt, digits = 5)) + println("current best function value: ", round(f_opt, digits = 5)) println("total evaluations so far: ", f_calls(d)) println("total moves since last temperature reduction: ", nup + ndown + nrej) println("downhill: ", nup) @@ -285,16 +284,16 @@ function optimize( println() println(" parameter search width") for i = 1:n - @printf("%16.5f %16.5f \n", xopt[i], bounds[i]) + @printf("%16.5f %16.5f \n", x_opt[i], bounds[i]) end println(hline * "\n") end # Check for convergence, if we have covered the parameter space if coverage_ok # last value close enough to last neps values? - fstar[1] = f_old - f_absΔ = abs.(fopt - f_old) # close enough to best so far? - if all((abs.(fopt .- fstar)) .< f_tol) # within to for last neps trials? + fstar[1] = f_x + f_absΔ = abs.(f_opt - f_x) # close enough to best so far? + if all((abs.(f_opt .- fstar)) .< f_tol) # within tol for last neps trials? f_converged = true # check for bounds narrow enough for parameter convergence if any(bounds .> x_tol) converge = 0 # no convergence @@ -336,10 +335,10 @@ function optimize( ) end println("total number of objective function evaluations: ", f_calls(d)) - @printf("\n Obj. value: %16.10f\n\n", fopt) + @printf("\n Obj.
diff --git a/src/multivariate/solvers/first_order/accelerated_gradient_descent.jl b/src/multivariate/solvers/first_order/accelerated_gradient_descent.jl
index 27896487c..7e1ca1d53 100644
--- a/src/multivariate/solvers/first_order/accelerated_gradient_descent.jl
+++ b/src/multivariate/solvers/first_order/accelerated_gradient_descent.jl
@@ -22,8 +22,10 @@ function AcceleratedGradientDescent(;
     AcceleratedGradientDescent(_alphaguess(alphaguess), linesearch, manifold)
 end
 
-mutable struct AcceleratedGradientDescentState{T,Tx} <: AbstractOptimizerState
+mutable struct AcceleratedGradientDescentState{T,Tx,Tg} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    f_x::T
     x_previous::Tx
     f_x_previous::T
     iteration::Int
@@ -35,25 +37,25 @@ end
 
 function initial_state(
     method::AcceleratedGradientDescent,
-    options,
+    ::Options,
     d,
-    initial_x::AbstractArray{T},
-) where {T}
+    initial_x::AbstractArray,
+)
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
-
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     AcceleratedGradientDescentState(
-        copy(initial_x), # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        real(T)(NaN), # Store previous f in state.f_x_previous
+        initial_x, # Maintain current state in state.x
+        copy(g_x), # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
         0, # Iteration
         copy(initial_x), # Maintain intermediary current state in state.y
-        similar(initial_x), # Maintain intermediary state in state.y_previous
-        similar(initial_x), # Maintain current search direction in state.s
+        fill!(similar(initial_x), NaN), # Maintain intermediary state in state.y_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
         @initial_linesearch()...,
     )
 end
@@ -63,11 +65,10 @@ function update_state!(
     state::AcceleratedGradientDescentState,
     method::AcceleratedGradientDescent,
 )
-    value_gradient!(d, state.x)
     state.iteration += 1
-    project_tangent!(method.manifold, gradient(d), state.x)
+
     # Search direction is always the negative gradient
-    state.s .= .-gradient(d)
+    state.s .= .-state.g_x
 
     # Determine the distance of movement along the search line
     lssuccess = perform_linesearch!(state, method, ManifoldObjective(method.manifold, d))
diff --git a/src/multivariate/solvers/first_order/adam.jl b/src/multivariate/solvers/first_order/adam.jl
index b8fef5b09..6eb608210 100644
--- a/src/multivariate/solvers/first_order/adam.jl
+++ b/src/multivariate/solvers/first_order/adam.jl
@@ -45,81 +45,96 @@ function default_options(method::Adam)
     (; allow_f_increases = true, iterations = 10_000)
 end
 
-mutable struct AdamState{Tx,T,Tm,Tu,Ti} <: AbstractOptimizerState
+mutable struct AdamState{Tx,T,Tg,Tu,Ti} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    f_x::T
     x_previous::Tx
     f_x_previous::T
     s::Tx
-    m::Tm
+    m::Tg
     u::Tu
     alpha::T
     iter::Ti
 end
-function reset!(method, state::AdamState, obj, x)
-    value_gradient!!(obj, x)
-end
-function _get_init_params(method::Adam{T}) where {T<:Real}
-    method.α, method.β₁, method.β₂
+function reset!(method::Adam, state::AdamState, obj, x)
+    # Update function value and gradient
+    copyto!(state.x, x)
+    retract!(method.manifold, state.x)
+    f_x, g_x = value_gradient!(obj, state.x)
+    copyto!(state.g_x, g_x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_x = f_x
+
+    # Reset history
+    fill!(state.x_previous, NaN)
+    state.f_x_previous = oftype(state.f_x_previous, NaN)
+    fill!(state.s, NaN)
+
+    # Reset momentum
+    copyto!(state.m, state.g_x)
+    fill!(state.u, false)
+
+    return nothing
 end
-function _get_init_params(method::Adam)
-    method.α(1), method.β₁, method.β₂
+function _init_alpha(method::Adam)
+    (; α) = method
+    return α isa Real ? α : α(1)
 end
 
-function initial_state(method::Adam, options::Options, d, initial_x::AbstractArray{T}) where {T}
+function initial_state(method::Adam, ::Options, d, initial_x::AbstractArray)
+    # Compute function value and gradient
     initial_x = copy(initial_x)
-
-    value_gradient!!(d, initial_x)
-    α, β₁, β₂ = _get_init_params(method)
-
-    m = copy(gradient(d))
-    u = zero(m)
-    iter = 0
+    retract!(method.manifold, initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    g_x = copy(g_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     AdamState(
         initial_x, # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        real(T(NaN)), # Store previous f in state.f_x_previous
-        similar(initial_x), # Maintain current search direction in state.s
-        m,
-        u,
-        α,
-        iter,
+        g_x, # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
+        copy(g_x), # m
+        zero(g_x), # u
+        _init_alpha(method), # alpha
+        0, # iter
     )
 end
 
-function _update_iter_alpha_in_state!(state::AdamState, method::Adam{T}) where {T<:Real}
+function update_state!(d, state::AdamState, method::Adam)
+    state.iter += 1
 
-    state.iter = state.iter + 1
-end
-
-function _update_iter_alpha_in_state!(state::AdamState, method::Adam)
+    # Update α parameter if it is not constant
+    if !(method.α isa Real)
+        state.alpha = method.α(state.iter)
+    end
 
-    state.iter = state.iter + 1
-    state.alpha = method.α(state.iter)
-end
-
-function update_state!(d, state::AdamState{T}, method::Adam) where {T}
-
-    _update_iter_alpha_in_state!(state, method)
-    value_gradient!(d, state.x)
-
-    α, β₁, β₂, ϵ = state.alpha, method.β₁, method.β₂, method.ϵ
+    # Unpack parameters
+    α = state.alpha
+    (; β₁, β₂, ϵ) = method
     a = 1 - β₁
     b = 1 - β₂
     m, u = state.m, state.u
     v = u
 
-    m .= β₁ .* m .+ a .* gradient(d)
-    v .= β₂ .* v .+ b .* gradient(d) .^ 2
+    m .= β₁ .* m .+ a .* state.g_x
+    v .= β₂ .* v .+ b .* state.g_x .^ 2
     # m̂ = m./(1-β₁^state.iter)
     # v̂ = v./(1-β₂^state.iter)
     #@. z = z - α*m̂/(sqrt(v̂+ϵ))
     αₜ = α * sqrt(1 - β₂^state.iter) / (1 - β₁^state.iter)
+
+    # Update current state
+    copyto!(state.x_previous, state.x)
+    state.f_x_previous = state.f_x
     @. state.x = state.x - αₜ * m / (sqrt(v) + ϵ)
-    # Update current position # x = x + alpha * s
-    false # break on linesearch error
+
+    false # no error
 end
 
 function trace!(tr, d, state::AdamState, iteration::Integer, method::Adam, options::Options, curr_time = time())
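(Aside — the update above folds both of Adam's bias corrections into the single step size αₜ. A self-contained, illustrative sketch of the same step; the helper name and defaults are assumptions, not the package API:

    # One Adam step: m, v are first/second moment buffers, g the gradient at x.
    function adam_step!(x, m, v, g; α = 0.001, β₁ = 0.9, β₂ = 0.999, ϵ = 1e-8, t = 1)
        @. m = β₁ * m + (1 - β₁) * g
        @. v = β₂ * v + (1 - β₂) * g^2
        αₜ = α * sqrt(1 - β₂^t) / (1 - β₁^t)    # bias-corrected step size
        @. x -= αₜ * m / (sqrt(v) + ϵ)
        return x
    end
)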
diff --git a/src/multivariate/solvers/first_order/adamax.jl b/src/multivariate/solvers/first_order/adamax.jl
index 782c3d7e5..678a683d4 100644
--- a/src/multivariate/solvers/first_order/adamax.jl
+++ b/src/multivariate/solvers/first_order/adamax.jl
@@ -44,75 +44,90 @@ function default_options(method::AdaMax)
     (; allow_f_increases = true, iterations = 10_000)
 end
 
-
-mutable struct AdaMaxState{Tx,T,Tm,Tu,Ti} <: AbstractOptimizerState
+mutable struct AdaMaxState{Tx,T,Tg} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    f_x::T
     x_previous::Tx
     f_x_previous::T
     s::Tx
-    m::Tm
-    u::Tu
+    m::Tg
+    u::Tg
     alpha::T
-    iter::Ti
-end
-function reset!(method, state::AdaMaxState, obj, x)
-    value_gradient!!(obj, x)
+    iter::Int
 end
-function _get_init_params(method::AdaMax{T}) where {T<:Real}
-    method.α, method.β₁, method.β₂
+function reset!(method::AdaMax, state::AdaMaxState, obj, x)
+    # Update function value and gradient
+    copyto!(state.x, x)
+    retract!(method.manifold, state.x)
+    f_x, g_x = value_gradient!(obj, state.x)
+    copyto!(state.g_x, g_x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_x = f_x
+
+    # Delete history
+    fill!(state.x_previous, NaN)
+    state.f_x_previous = oftype(state.f_x_previous, NaN)
+    fill!(state.s, NaN)
+
+    # Update momentum
+    copyto!(state.m, state.g_x)
+    fill!(state.u, false)
+
+    return nothing
 end
-function _get_init_params(method::AdaMax)
-    method.α(1), method.β₁, method.β₂
+function _init_alpha(method::AdaMax)
+    (; α) = method
+    return α isa Real ? α : α(1)
 end
 
 function initial_state(method::AdaMax, options::Options, d, initial_x::AbstractArray{T}) where {T}
+    # Compute function value and gradient
     initial_x = copy(initial_x)
-
-    value_gradient!!(d, initial_x)
-    α, β₁, β₂ = _get_init_params(method)
-
-    m = copy(gradient(d))
-    u = zero(m)
-    iter = 0
+    retract!(method.manifold, initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    g_x = copy(g_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     AdaMaxState(
         initial_x, # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        real(T(NaN)), # Store previous f in state.f_x_previous
-        similar(initial_x), # Maintain current search direction in state.s
-        m,
-        u,
-        α,
-        iter,
+        g_x, # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
+        copy(g_x), # m
+        zero(g_x), # u
+        _init_alpha(method), # alpha
+        0, # iter
     )
 end
 
-function _update_iter_alpha_in_state!(state::AdaMaxState, method::AdaMax{T}) where {T<:Real}
+function update_state!(d, state::AdaMaxState, method::AdaMax)
+    state.iter += 1
 
-    state.iter = state.iter + 1
-end
+    # Update step size alpha if it is not constant
+    if !(method.α isa Real)
+        state.alpha = method.α(state.iter)
+    end
 
-function _update_iter_alpha_in_state!(state::AdaMaxState, method::AdaMax)
-
-    state.iter = state.iter + 1
-    state.alpha = method.α(state.iter)
-end
-
-function update_state!(d, state::AdaMaxState{T}, method::AdaMax) where {T}
-    _update_iter_alpha_in_state!(state, method)
-    value_gradient!(d, state.x)
-    α, β₁, β₂, ϵ = state.alpha, method.β₁, method.β₂, method.ϵ
+    # Unpack parameters
+    α = state.alpha
+    (; β₁, β₂, ϵ) = method
     a = 1 - β₁
-    m, u = state.m, state.u
-    m .= β₁ .* m .+ a .* gradient(d)
-    u .= max.(ϵ, max.(β₂ .* u, abs.(gradient(d)))) # I know it's not there in the paper but if m and u start at 0 for some element... NaN occurs next
+    (; g_x, m, u) = state
+    m .= β₁ .* m .+ a .* g_x
+    u .= max.(ϵ, max.(β₂ .* u, abs.(g_x))) # I know it's not there in the paper but if m and u start at 0 for some element... NaN occurs next
 
+    # Update current state
+    copyto!(state.x_previous, state.x)
+    state.f_x_previous = state.f_x
     @. state.x = state.x - (α / (1 - β₁^state.iter)) * m / u
-    # Update current position # x = x + alpha * s
-    false # break on linesearch error
+
+    false # no error
 end
 
 function trace!(tr, d, state::AdaMaxState, iteration::Integer, method::AdaMax, options::Options, curr_time = time())
diff --git a/src/multivariate/solvers/first_order/bfgs.jl b/src/multivariate/solvers/first_order/bfgs.jl
index ed1c3b455..9e6041b51 100644
--- a/src/multivariate/solvers/first_order/bfgs.jl
+++ b/src/multivariate/solvers/first_order/bfgs.jl
@@ -49,8 +49,10 @@ end
 
 mutable struct BFGSState{Tx,Tm,T,G} <: AbstractOptimizerState
     x::Tx
+    g_x::G
+    f_x::T
     x_previous::Tx
-    g_previous::G
+    g_x_previous::G
     f_x_previous::T
     dx::Tx
     dg::Tx
@@ -69,74 +71,82 @@ function _init_identity_matrix(x::AbstractArray{T}, scale::T = T(1)) where {T}
 end
 
 function reset!(method, state::BFGSState, obj, x)
-    n = length(x)
-    T = eltype(x)
+    # Update function value and gradient
     retract!(method.manifold, x)
-    value_gradient!(obj, x)
-    project_tangent!(method.manifold, gradient(obj), x)
+    f_x, g_x = value_gradient!(obj, x)
+    project_tangent!(method.manifold, g_x, x)
+    state.f_x = f_x
+    copyto!(state.g_x, g_x)
 
+    # Delete history
+    fill!(state.x_previous, NaN)
+    fill!(state.g_x_previous, NaN)
+    state.f_x_previous = oftype(state.f_x_previous, NaN)
+
+    # Update approximation of inverse Hessian
     if method.initial_invH === nothing
         if method.initial_stepnorm === nothing
             # Identity matrix of size n x n
             state.invH = _init_identity_matrix(x)
         else
-            initial_scale = T(method.initial_stepnorm) * inv(norm(gradient(obj), Inf))
+            T = eltype(state.invH)
+            initial_scale = T(method.initial_stepnorm) * inv(norm(g_x, Inf))
             state.invH = _init_identity_matrix(x, initial_scale)
         end
     else
         state.invH .= method.initial_invH(x)
    end
+
+    return nothing
 end
 
-function initial_state(method::BFGS, options::Options, d, initial_x::AbstractArray{T}) where {T}
-    n = length(initial_x)
+function initial_state(method::BFGS, ::Options, d, initial_x::AbstractArray)
+    # Compute function value and gradient
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
-
+    # Initialize approximation of inverse Hessian
     if method.initial_invH === nothing
         if method.initial_stepnorm === nothing
             # Identity matrix of size n x n
             invH0 = _init_identity_matrix(initial_x)
         else
-            initial_scale = T(method.initial_stepnorm) * inv(norm(gradient(d), Inf))
+            T = eltype(g_x)
+            initial_scale = T(method.initial_stepnorm) * inv(norm(g_x, Inf))
             invH0 = _init_identity_matrix(initial_x, initial_scale)
         end
     else
         invH0 = method.initial_invH(initial_x)
     end
+
     # Maintain a cache for line search results
     # Trace the history of states visited
     BFGSState(
         initial_x, # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        copy(gradient(d)), # Store previous gradient in state.g_previous
-        real(T)(NaN), # Store previous f in state.f_x_previous
-        similar(initial_x), # Store changes in position in state.dx
-        similar(initial_x), # Store changes in gradient in state.dg
-        similar(initial_x), # Buffer stored in state.u
+        copy(g_x), # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        fill!(similar(g_x), NaN), # Store previous gradient in state.g_x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Store changes in position in state.dx
+        fill!(similar(initial_x), NaN), # Store changes in gradient in state.dg
+        fill!(similar(initial_x), NaN), # Buffer stored in state.u
         invH0, # Store current invH in state.invH
-        similar(initial_x), # Store current search direction in state.s
+        fill!(similar(initial_x), NaN), # Store current search direction in state.s
         @initial_linesearch()...,
     )
 end
 
 function update_state!(d, state::BFGSState, method::BFGS)
-    n = length(state.x)
-    T = eltype(state.s)
     # Set the search direction
     # Search direction is the negative gradient divided by the approximate Hessian
-    mul!(vec(state.s), state.invH, vec(gradient(d)))
-    rmul!(state.s, T(-1))
+    mul!(vec(state.s), state.invH, vec(state.g_x))
+    rmul!(state.s, eltype(state.s)(-1))
     project_tangent!(method.manifold, state.s, state.x)
 
-    # Maintain a record of the previous gradient
-    copyto!(state.g_previous, gradient(d))
-
     # Determine the distance of movement along the search line
     # This call resets invH to initial_invH if the former is not positive
     # semi-definite
@@ -150,11 +160,17 @@ function update_state!(d, state::BFGSState, method::BFGS)
     return !lssuccess # break on linesearch error
 end
 
-function update_h!(d, state, method::BFGS)
-    n = length(state.x)
+function update_fgh!(d, state::BFGSState, method::BFGS)
     (; invH, dx, dg, u) = state
+
+    # Update function value and gradient
+    f_x, g_x = value_gradient!(d, state.x)
+    copyto!(state.g_x, g_x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_x = f_x
+
     # Measure the change in the gradient
-    dg .= gradient(d) .- state.g_previous
+    dg .= state.g_x .- state.g_x_previous
 
     # Update the inverse Hessian approximation using Sherman-Morrison
     dx_dg = real(dot(dx, dg))
@@ -166,6 +182,7 @@
     # invH = invH + c1 * (s * s') - c2 * (u * s' + s * u')
     if (invH isa Array) # i.e. not a CuArray
+        n = length(dx)
         @inbounds for j = 1:n
             c1dxj = c1 * dx[j]'
             c2dxj = c2 * dx[j]'
@@ -191,16 +208,15 @@ function trace!(tr, d, state::BFGSState, iteration::Integer, method::BFGS, optio
     dt["time"] = curr_time
     if options.extended_trace
         dt["x"] = copy(state.x)
-        dt["g(x)"] = copy(gradient(d))
+        dt["g(x)"] = copy(state.g_x)
         dt["~inv(H)"] = copy(state.invH)
         dt["Current step size"] = state.alpha
     end
-    g_norm = norm(gradient(d), Inf)
     update!(
         tr,
         iteration,
-        value(d),
-        g_norm,
+        state.f_x,
+        g_residual(state),
         dt,
         options.store_trace,
         options.show_trace,
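(Aside — the @inbounds loop in the bfgs.jl hunk above realizes, without allocations, the rank-two update written in its own comment, `invH = invH + c1 * (s * s') - c2 * (u * s' + s * u')`. An equivalent dense formulation, as an illustrative sketch only:

    using LinearAlgebra: dot

    # Dense BFGS inverse-Hessian update (sketch; u = invH * dg).
    function bfgs_update(invH, dx, dg)
        dx_dg = dot(dx, dg)
        dx_dg > 0 || return invH                 # skip if curvature condition fails
        u = invH * dg
        c1 = (dx_dg + dot(dg, u)) / dx_dg^2
        c2 = 1 / dx_dg
        return invH + c1 * (dx * dx') - c2 * (u * dx' + dx * u')
    end
)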
diff --git a/src/multivariate/solvers/first_order/cg.jl b/src/multivariate/solvers/first_order/cg.jl
index fdc777f68..99fd9d724 100644
--- a/src/multivariate/solvers/first_order/cg.jl
+++ b/src/multivariate/solvers/first_order/cg.jl
@@ -93,8 +93,10 @@ end
 
 mutable struct ConjugateGradientState{Tx,T,G} <: AbstractOptimizerState
     x::Tx
+    g_x::G
+    f_x::T
     x_previous::Tx
-    g_previous::G
+    g_x_previous::G
     f_x_previous::T
     y::Tx
     py::Tx
@@ -104,24 +106,30 @@ mutable struct ConjugateGradientState{Tx,T,G} <: AbstractOptimizerState
 
 function reset!(cg::ConjugateGradient, cgs::ConjugateGradientState, obj, x)
-    cgs.x .= x
-    _precondition!(cgs.pg, cg, x, gradient(obj))
-
+    copyto!(cgs.x, x)
+    retract!(cg.manifold, cgs.x)
+    f_x, g_x = value_gradient!(obj, cgs.x)
+    project_tangent!(cg.manifold, g_x, cgs.x)
+    cgs.f_x = f_x
+    copyto!(cgs.g_x, g_x)
+
+    fill!(cgs.x_previous, NaN)
+    cgs.f_x_previous = oftype(cgs.f_x_previous, NaN)
+    fill!(cgs.g_x_previous, NaN)
+
+    _precondition!(cgs.pg, cg, x, g_x)
     if cg.P !== nothing
         project_tangent!(cg.manifold, cgs.pg, x)
     end
     cgs.s .= .-cgs.pg
-    cgs.f_x_previous = typeof(cgs.f_x_previous)(NaN)
+
+    return nothing
 end
 
-function initial_state(method::ConjugateGradient, options::Options, d, initial_x)
-    T = eltype(initial_x)
+function initial_state(method::ConjugateGradient, ::Options, d, initial_x)
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
-
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
-    pg = copy(gradient(d))
+    f_x, g_x = value_gradient!(d, initial_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     # Could move this out? as a general check?
     #=
@@ -136,16 +144,19 @@ function initial_state(method::ConjugateGradient, options::Options, d, initial_x
     # Determine the intial search direction
     # if we don't precondition, then this is an extra superfluous copy
     # TODO: consider allowing a reference for pg instead of a copy
-    _precondition!(pg, method, initial_x, gradient(d))
+    pg = copy(g_x)
+    _precondition!(pg, method, initial_x, g_x)
     if method.P !== nothing
         project_tangent!(method.manifold, pg, initial_x)
     end
 
     ConjugateGradientState(
         initial_x, # Maintain current state in state.x
-        0 .* (initial_x), # Maintain previous state in state.x_previous
-        0 .* (gradient(d)), # Store previous gradient in state.g_previous
-        real(T)(NaN), # Store previous f in state.f_x_previous
+        copy(g_x), # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        fill!(similar(g_x), NaN), # Store previous gradient in state.g_x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
         0 .* (initial_x), # Intermediate value in CG calculation
         0 .* (initial_x), # Preconditioned intermediate value in CG calculation
         pg, # Maintain the preconditioned gradient in pg
@@ -158,21 +169,23 @@ function update_state!(d, state::ConjugateGradientState, method::ConjugateGradie
     # Search direction is predetermined
 
     # Maintain a record of the previous gradient
-    copyto!(state.g_previous, gradient(d))
+    copyto!(state.g_x_previous, state.g_x)
 
     # Determine the distance of movement along the search line
     lssuccess = perform_linesearch!(state, method, ManifoldObjective(method.manifold, d))
 
     # Update current position
     # x = x + alpha * s
-    @. state.x = state.x + state.alpha * state.s
+    state.x .= muladd.(state.alpha, state.s, state.x)
     retract!(method.manifold, state.x)
 
     # Update the function value and gradient
-    value_gradient!(d, state.x)
-    project_tangent!(method.manifold, gradient(d), state.x)
+    f_x, g_x = value_gradient!(d, state.x)
+    project_tangent!(method.manifold, g_x, state.x)
+    state.f_x = f_x
+    copyto!(state.g_x, g_x)
 
     # Check sanity of function and gradient
-    isfinite(value(d)) || error("Non-finite f(x) while optimizing ($(value(d)))")
+    isfinite(f_x) || error(LazyString("Non-finite f(x) while optimizing (", f_x, ")"))
 
     # Determine the next search direction using HZ's CG rule
     # Calculate the beta factor (HZ2013)
@@ -186,19 +199,19 @@ function update_state!(d, state::ConjugateGradientState, method::ConjugateGradie
     # also updates P for the preconditioning step below
     _apply_precondprep(method, state.x)
     dPd = _inverse_precondition(method, state)
-    etak = method.eta * real(dot(state.s, state.g_previous)) / dPd # New in HZ2013
-    state.y .= gradient(d) .- state.g_previous
+    etak = method.eta * real(dot(state.s, state.g_x_previous)) / dPd # New in HZ2013
+    state.y .= g_x .- state.g_x_previous
     ydots = real(dot(state.y, state.s))
     copyto!(state.py, state.pg) # below, store pg - pg_previous in py
     # P already updated in _apply_precondprep above
-    __precondition!(state.pg, method.P, gradient(d))
+    __precondition!(state.pg, method.P, g_x)
     state.py .= state.pg .- state.py
     # ydots may be zero if f is not strongly convex or the line search does not satisfy Wolfe
     betak = (
         real(dot(state.y, state.pg)) -
-        real(dot(state.y, state.py)) * real(dot(gradient(d), state.s)) / ydots
+        real(dot(state.y, state.py)) * real(dot(g_x, state.s)) / ydots
     ) / ydots
     # betak may be undefined if ydots is zero (may due to f not strongly convex or non-Wolfe linesearch)
     beta = NaNMath.max(betak, etak) # TODO: Set to zero if betak is NaN?
@@ -207,7 +220,8 @@ function update_state!(d, state::ConjugateGradientState, method::ConjugateGradie
     return !lssuccess # break on linesearch error
 end
 
-update_g!(d, state, method::ConjugateGradient) = nothing
+# Function value, gradient and Hessian are already updated in `update_state!`
+update_fgh!(d, state, ::ConjugateGradient) = nothing
 
 function trace!(
     tr,
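(Aside — written out, the β computed above is the Hager–Zhang rule with the η safeguard noted as "New in HZ2013". An unpreconditioned, illustrative sketch (P = I); the `eta = 0.4` default here is only a placeholder:

    using LinearAlgebra: dot

    # Hager–Zhang direction update: s ← -g + max(βₖ, ηₖ) s
    function hz_direction(s, g, g_prev, eta = 0.4)
        y = g .- g_prev
        ydots = dot(y, s)                         # can be ≈ 0 without a Wolfe line search
        betak = (dot(y, g) - dot(y, y) * dot(g, s) / ydots) / ydots
        etak = eta * dot(s, g_prev) / dot(s, s)   # lower bound on β
        return -g .+ max(betak, etak) .* s        # the patch uses NaNMath.max here
    end
)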
diff --git a/src/multivariate/solvers/first_order/gradient_descent.jl b/src/multivariate/solvers/first_order/gradient_descent.jl
index b6abe20ec..b999f2096 100644
--- a/src/multivariate/solvers/first_order/gradient_descent.jl
+++ b/src/multivariate/solvers/first_order/gradient_descent.jl
@@ -39,47 +39,59 @@ function GradientDescent(;
     GradientDescent(_alphaguess(alphaguess), linesearch, P, precondprep, manifold)
 end
 
-mutable struct GradientDescentState{Tx,T} <: AbstractOptimizerState
+mutable struct GradientDescentState{Tx,Tg,T} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    f_x::T
     x_previous::Tx
     f_x_previous::T
     s::Tx
     @add_linesearch_fields()
 end
 
-function reset!(method, state::GradientDescentState, obj, x)
-    retract!(method.manifold, x)
-    value_gradient!!(obj, x)
+function reset!(method::GradientDescent, state::GradientDescentState, obj, x)
+    # Update function value and gradient
+    copyto!(state.x, x)
+    retract!(method.manifold, state.x)
+    f_x, g_x = value_gradient!(obj, state.x)
+    copyto!(state.g_x, g_x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_x = f_x
+
+    # Delete history
+    fill!(state.x_previous, NaN)
+    state.f_x_previous = oftype(state.f_x_previous, NaN)
+    fill!(state.s, NaN)
 
-    project_tangent!(method.manifold, gradient(obj), x)
+    return nothing
 end
 
 function initial_state(
     method::GradientDescent,
-    options,
+    ::Options,
     d,
     initial_x::AbstractArray{T},
 ) where {T}
+    # Compute function value and gradient
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
-
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    g_x = copy(g_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     GradientDescentState(
         initial_x, # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        real(T(NaN)), # Store previous f in state.f_x_previous
-        similar(initial_x), # Maintain current search direction in state.s
+        g_x, # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
        @initial_linesearch()...,
     )
 end
 
 function update_state!(d, state::GradientDescentState{T}, method::GradientDescent) where {T}
-    value_gradient!(d, state.x)
     # Search direction is always the negative preconditioned gradient
-    project_tangent!(method.manifold, gradient(d), state.x)
-    _precondition!(state.s, method, state.x, gradient(d))
+    _precondition!(state.s, method, state.x, state.g_x)
     rmul!(state.s, eltype(state.s)(-1))
     if method.P !== nothing
         project_tangent!(method.manifold, state.s, state.x)
@@ -91,6 +103,7 @@ function update_state!(d, state::GradientDescentState{T}, method::GradientDescen
     # Update current position
     # x = x + alpha * s
     @. state.x = state.x + state.alpha * state.s
     retract!(method.manifold, state.x)
+
     return !lssuccess # break on linesearch error
 end
diff --git a/src/multivariate/solvers/first_order/l_bfgs.jl b/src/multivariate/solvers/first_order/l_bfgs.jl
index 2d074a7c6..23aa175c7 100644
--- a/src/multivariate/solvers/first_order/l_bfgs.jl
+++ b/src/multivariate/solvers/first_order/l_bfgs.jl
@@ -137,8 +137,10 @@ Base.summary(io::IO, ::LBFGS) = print(io, "L-BFGS")
 
 mutable struct LBFGSState{Tx,Tdx,Tdg,T,G} <: AbstractOptimizerState
     x::Tx
+    g_x::G
+    f_x::T
     x_previous::Tx
-    g_previous::G
+    g_x_previous::G
     rho::Vector{T}
     dx_history::Tdx
     dg_history::Tdg
@@ -154,45 +156,52 @@ mutable struct LBFGSState{Tx,Tdx,Tdg,T,G} <: AbstractOptimizerState
 end
 function reset!(method::LBFGS, state::LBFGSState, obj, x::AbstractArray)
     retract!(method.manifold, x)
-    value_gradient!(obj, x)
-    project_tangent!(method.manifold, gradient(obj), x)
+    f_x, g_x = value_gradient!(obj, x)
+    project_tangent!(method.manifold, g_x, x)
+    state.f_x = f_x
+    copyto!(state.g_x, g_x)
+
+    fill!(state.x_previous, NaN)
+    fill!(state.g_x_previous, NaN)
+    state.f_x_previous = oftype(state.f_x_previous, NaN)
     state.pseudo_iteration = 0
+
+    return nothing
 end
 
 function initial_state(method::LBFGS, options::Options, d, initial_x::AbstractArray)
     T = real(eltype(initial_x))
-    n = length(initial_x)
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
-
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     LBFGSState(
         initial_x, # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        copy(gradient(d)), # Store previous gradient in state.g_previous
+        copy(g_x), # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        fill!(similar(g_x), NaN), # Store previous gradient in state.g_x_previous
         fill!(Vector{T}(undef, method.m), NaN), # state.rho
-        [similar(initial_x) for i = 1:method.m], # Store changes in position in state.dx_history
-        [eltype(gradient(d))(NaN) .* gradient(d) for i = 1:method.m], # Store changes in position in state.dg_history
-        T(NaN) * initial_x, # Buffer for new entry in state.dx_history
-        T(NaN) * initial_x, # Buffer for new entry in state.dg_history
-        T(NaN) * initial_x, # Buffer stored in state.u
-        real(T)(NaN), # Store previous f in state.f_x_previous
-        similar(initial_x), #Buffer for use by twoloop
-        Vector{T}(undef, method.m), #Buffer for use by twoloop
+        [fill!(similar(initial_x), NaN) for i = 1:method.m], # Store changes in position in state.dx_history
+        [fill!(similar(g_x), NaN) for i = 1:method.m], # Store changes in gradient in state.dg_history
+        fill!(similar(initial_x), NaN), # Buffer for new entry in state.dx_history
+        fill!(similar(initial_x), NaN), # Buffer for new entry in state.dg_history
+        fill!(similar(initial_x), NaN), # Buffer stored in state.u
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), #Buffer for use by twoloop
+        fill!(Vector{T}(undef, method.m), NaN),#Buffer for use by twoloop
         0,
-        eltype(gradient(d))(NaN) .* gradient(d), # Store current search direction in state.s
+        fill!(similar(g_x), NaN), # Store current search direction in state.s
        @initial_linesearch()...,
     )
 end
 
 function update_state!(d, state::LBFGSState, method::LBFGS)
-    n = length(state.x)
     # Increment the number of steps we've had to perform
     state.pseudo_iteration += 1
 
-    project_tangent!(method.manifold, gradient(d), state.x)
+    # TODO: Can this be removed?
+    project_tangent!(method.manifold, state.g_x, state.x)
 
     # update the preconditioner
     _apply_precondprep(method, state.x)
@@ -200,7 +209,7 @@ function update_state!(d, state::LBFGSState, method::LBFGS)
     # Determine the L-BFGS search direction # FIXME just pass state and method?
     twoloop!(
         state.s,
-        gradient(d),
+        state.g_x,
         state.rho,
         state.dx_history,
         state.dg_history,
@@ -213,9 +222,6 @@ function update_state!(d, state::LBFGSState, method::LBFGS)
     )
     project_tangent!(method.manifold, state.s, state.x)
 
-    # Save g value to prepare for update_g! call
-    copyto!(state.g_previous, gradient(d))
-
     # Determine the distance of movement along the search line
     lssuccess = perform_linesearch!(state, method, ManifoldObjective(method.manifold, d))
@@ -228,10 +234,15 @@
 end
 
-function update_h!(d, state::LBFGSState, method::LBFGS)
-    n = length(state.x)
+function update_fgh!(d, state::LBFGSState, method::LBFGS)
+    # Update function value and gradient
+    f_x, g_x = value_gradient!(d, state.x)
+    copyto!(state.g_x, g_x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_x = f_x
+
     # Measure the change in the gradient
-    state.dg .= gradient(d) .- state.g_previous
+    state.dg .= state.g_x .- state.g_x_previous
 
     # Update the L-BFGS history of positions and gradients
     rho_iteration = one(eltype(state.dx)) / real(dot(state.dx, state.dg))
diff --git a/src/multivariate/solvers/first_order/momentum_gradient_descent.jl b/src/multivariate/solvers/first_order/momentum_gradient_descent.jl
index b9eca6af1..7b074e17c 100644
--- a/src/multivariate/solvers/first_order/momentum_gradient_descent.jl
+++ b/src/multivariate/solvers/first_order/momentum_gradient_descent.jl
@@ -19,8 +19,10 @@ function MomentumGradientDescent(;
     MomentumGradientDescent(mu, _alphaguess(alphaguess), linesearch, manifold)
 end
 
-mutable struct MomentumGradientDescentState{Tx,T} <: AbstractOptimizerState
+mutable struct MomentumGradientDescentState{Tx,Tg,T} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    f_x::T
     x_previous::Tx
     x_momentum::Tx
     f_x_previous::T
@@ -28,21 +30,21 @@ mutable struct MomentumGradientDescentState{Tx,T} <: AbstractOptimizerState
     @add_linesearch_fields()
 end
 
-function initial_state(method::MomentumGradientDescent, options::Options, d, initial_x::AbstractArray)
-    T = eltype(initial_x)
+function initial_state(method::MomentumGradientDescent, ::Options, d, initial_x::AbstractArray)
+    # Compute function value and gradient
     initial_x = copy(initial_x)
     retract!(method.manifold, initial_x)
-
-    value_gradient!!(d, initial_x)
-
-    project_tangent!(method.manifold, gradient(d), initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
+    project_tangent!(method.manifold, g_x, initial_x)
 
     MomentumGradientDescentState(
         initial_x, # Maintain current state in state.x
+        copy(g_x), # Maintain current gradient in state.g_x
+        f_x, # Maintain current f in state.f_x
         copy(initial_x), # Maintain previous state in state.x_previous
-        similar(initial_x), # Record momentum correction direction in state.x_momentum
-        real(T)(NaN), # Store previous f in state.f_x_previous
-        similar(initial_x), # Maintain current search direction in state.s
+        fill!(similar(initial_x), NaN), # Record momentum correction direction in state.x_momentum
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
         @initial_linesearch()...,
     )
 end
@@ -52,18 +54,19 @@ function update_state!(
     state::MomentumGradientDescentState,
     method::MomentumGradientDescent,
 )
-    project_tangent!(method.manifold, gradient(d), state.x)
     # Search direction is always the negative gradient
-    state.s .= .-gradient(d)
+    state.s .= .-state.g_x
 
-    # Update position, and backup current one
+    # Update momentum
     state.x_momentum .= state.x .- state.x_previous
 
     # Determine the distance of movement along the search line
     lssuccess = perform_linesearch!(state, method, ManifoldObjective(method.manifold, d))
 
+    # Update state
     state.x .+= state.alpha .* state.s .+ method.mu .* state.x_momentum
     retract!(method.manifold, state.x)
+
     return !lssuccess # break on linesearch error
 end
diff --git a/src/multivariate/solvers/first_order/ngmres.jl b/src/multivariate/solvers/first_order/ngmres.jl
index 8dd2ee32f..9c74b4695 100644
--- a/src/multivariate/solvers/first_order/ngmres.jl
+++ b/src/multivariate/solvers/first_order/ngmres.jl
@@ -136,20 +136,22 @@ function OACCEL(;
 end
 
-mutable struct NGMRESState{P,Tx,Te,T,eTx} <:
-               AbstractOptimizerState where {P<:AbstractOptimizerState}
+mutable struct NGMRESState{P<:AbstractOptimizerState,eTx,Tx<:AbstractArray{eTx},eTg,Tg<:AbstractArray{eTg},T,Te<:VecOrMat{T}} <:
+               AbstractOptimizerState
     # eTx is the eltype of Tx
-    x::Tx # Reference to nlpreconstate.x
-    x_previous::Tx # Reference to nlpreconstate.x_previous
+    x::Tx # Current state. Reference to nlpreconstate.x
+    g_x::Tg # Gradient at x. Reference to nlpreconstate.g_x
+    f_x::T # Function value of x
+    x_previous::Tx # Previous state. Reference to nlpreconstate.x_previous
     x_previous_0::Tx # Used to deal with assess_convergence of NGMRES
-    f_x_previous::T
+    f_x_previous::T # Function value of previous state.
     f_x_previous_0::T # Used to deal with assess_convergence of NGMRES
     f_xP::T # For tracing purposes
     grnorm_xP::T # For tracing purposes
     s::Tx # Search direction for linesearch between xP and xA
     nlpreconstate::P # Nonlinear preconditioner state
     X::Array{eTx,2} # Solution vectors in the window
-    R::Array{eTx,2} # Gradient vectors in the window
+    R::Array{eTg,2} # Gradient vectors in the window
     Q::Array{T,2} # Storage to create linear system (TODO: make Symmetric?)
     ξ::Te # Storage to create linear system
     curw::Int # Counter for current window size
@@ -158,7 +160,7 @@ mutable struct NGMRESState{P,Tx,Te,T,eTx} <:
     xA::Vector{eTx} # Container for accelerated step
     k::Int # Used for indexing where to put values in the Storage containers
     restart::Bool # Restart flag
-    g_abstol::T # Exit tolerance to be checked after nonlinear preconditioner apply
+    g_abstol::T # Exit tolerance to be checked after nonlinear preconditioner apply
     subspacealpha::Vector{T} # Storage for coefficients in the subspace for the acceleration step
     @add_linesearch_fields()
 end
@@ -225,8 +227,8 @@ function initial_state(
     method::AbstractNGMRES,
     options::Options,
     d,
-    initial_x::AbstractArray{eTx},
-) where {eTx}
+    initial_x::AbstractArray,
+)
     if !(typeof(method.nlprecon) <: Union{GradientDescent,LBFGS})
         if !ngmres_oaccel_warned[]
             @warn "Use caution. N-GMRES/O-ACCEL has only been tested with Gradient Descent and L-BFGS preconditioning."
@@ -235,36 +237,38 @@ end
 
     nlpreconstate = initial_state(method.nlprecon, method.nlpreconopts, d, initial_x)
+    (; f_x, x, g_x) = nlpreconstate
     # Manifold comment:
     # We assume nlprecon calls retract! and project_tangent! on
-    # nlpreconstate.x and gradient(d)
-    T = real(eTx)
-
-    n = length(nlpreconstate.x)
+    # nlpreconstate.x and nlpreconstate.g_x
+
+    T = typeof(f_x)
+    n = length(x)
     wmax = method.wmax
-    X = Array{eTx}(undef, n, wmax)
-    R = Array{eTx}(undef, n, wmax)
-    Q = Array{T}(undef, wmax, wmax)
+    X = fill!(Array{eltype(x)}(undef, n, wmax), NaN)
+    R = fill!(Array{eltype(g_x)}(undef, n, wmax), NaN)
+    Q = fill!(Array{typeof(f_x)}(undef, wmax, wmax), NaN)
     ξ = if typeof(method) <: OACCEL
-        Array{T}(undef, wmax, 2)
+        fill!(Array{T}(undef, wmax, 2), NaN)
     else
-        Array{T}(undef, wmax)
+        fill!(Array{T}(undef, wmax), NaN)
     end
 
-    copyto!(view(X, :, 1), nlpreconstate.x)
-    copyto!(view(R, :, 1), gradient(d))
-
+    copyto!(view(X, :, 1), x)
+    copyto!(view(R, :, 1), g_x)
     _updateQ!(Q, 1, 1, X, R, method)
 
     NGMRESState(
-        nlpreconstate.x, # Maintain current state in state.x. Use same vector as preconditioner.
-        nlpreconstate.x_previous, # Maintain in state.x_previous. Use same vector as preconditioner.
-        copy(nlpreconstate.x), # Maintain state at the beginning of an iteration in state.x_previous_0. Used for convergence asessment.
+        nlpreconstate.x, # Maintain current state in state.x. Uses same vector as preconditioner.
+        nlpreconstate.g_x, # Maintain current gradient in state.g_x. Uses same vector as preconditioner.
+        nlpreconstate.f_x, # Maintain current f in state.f_x.
+        nlpreconstate.x_previous, # Maintain in state.x_previous. Uses same vector as preconditioner.
+        fill!(similar(nlpreconstate.x), NaN), # Maintain state at the beginning of an iteration in state.x_previous_0. Used for convergence assessment.
         T(NaN), # Store previous f in state.f_x_previous
         T(NaN), # Store f value from the beginning of an iteration in state.f_x_previous_0. Used for convergence asessment.
         T(NaN), # Store value f_xP of f(x^P) for tracing purposes
         T(NaN), # Store value grnorm_xP of |g(x^P)| for tracing purposes
-        similar(initial_x), # Maintain current search direction in state.s
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
         nlpreconstate, # State storage for preconditioner
         X,
         R,
@@ -282,17 +286,15 @@ function initial_state(
     )
 end
 
-nlprecon_post_optimize!(d, state, method) = update_h!(d, state.nlpreconstate, method)
-
-nlprecon_post_accelerate!(d, state, method) = update_h!(d, state.nlpreconstate, method)
-
+nlprecon_post_optimize!(d, state, method) = update_fgh!(d, state.nlpreconstate, method)
+nlprecon_post_accelerate!(d, state, method) = update_fgh!(d, state.nlpreconstate, method)
 function nlprecon_post_accelerate!(
     d,
     state::NGMRESState{X,T},
     method::LBFGS,
 ) where {X} where {T}
     state.nlpreconstate.pseudo_iteration += 1
-    update_h!(d, state.nlpreconstate, method)
+    update_fgh!(d, state.nlpreconstate, method)
 end
 
@@ -303,32 +305,26 @@ function update_state!(
 ) where {X} where {T}
     # Maintain a record of previous position, for convergence assessment
     copyto!(state.x_previous_0, state.x)
-    state.f_x_previous_0 = value(d)
+    state.f_x_previous_0 = state.f_x
 
     state.k += 1
     curw = state.curw
 
     # Step 1: Call preconditioner to get x^P
-    res = optimize(d, state.x, method.nlprecon, method.nlpreconopts, state.nlpreconstate)
+    optimize(d, state.x, method.nlprecon, method.nlpreconopts, state.nlpreconstate)
     # TODO: Is project_tangent! necessary, or is it called by nlprecon before exit?
-    project_tangent!(method.manifold, gradient(d), state.x)
+    project_tangent!(method.manifold, state.g_x, state.x)
+    state.f_xP = state.nlpreconstate.f_x
+    state.f_x = state.nlpreconstate.f_x
 
-    if any(.!isfinite.(state.x)) || any(.!isfinite.(gradient(d))) || !isfinite(value(d))
+    if !all(isfinite, state.x) || !all(isfinite, state.g_x) || !isfinite(state.f_xP)
         @warn("Non-finite values attained from preconditioner $(summary(method.nlprecon)).")
         return true
     end
+    state.grnorm_xP = g_residual(state.g_x)
 
-    # Calling value_gradient! in normally done on state.x in optimize or update_g! above,
-    # but there are corner cases where we need this.
-    state.f_xP, _g = value_gradient!(d, state.x)
-    # Manifold start
-    project_tangent!(method.manifold, gradient(d), state.x)
-    # Manifold stop
-    gP = gradient(d)
-    state.grnorm_xP = g_residual(gP)
-
-    if g_residual(gP) ≤ state.g_abstol
+    if state.grnorm_xP ≤ state.g_abstol
         return false # Exit on gradient norm convergence
     end
@@ -336,11 +332,11 @@ function update_state!(
     nlprecon_post_optimize!(d, state, method.nlprecon)
 
     # Step 2: Do acceleration calculation
-    η = _updateη(state.x, gP, method)
+    η = _updateη(state.x, state.g_x, method)
 
     for i = 1:curw
         # Update storage vectors according to method {NGMRES, OACCEL}
-        _updateξ!(state.ξ, i, state.X, state.x, state.R, gP, method)
+        _updateξ!(state.ξ, i, state.X, state.x, state.R, state.g_x, method)
         _updateb!(state.b, i, state.ξ, η, method)
     end
@@ -377,7 +373,8 @@ function update_state!(
     end
 
     # 3: Perform condition checks
-    if real(dot(state.s, gP)) ≥ 0 || !isfinite(real(dot(state.s, gP)))
+    sTg_x = real(dot(state.s, state.g_x))
+    if sTg_x ≥ 0 || !isfinite(sTg_x)
         # Moving from xP to xA is *not* a descent direction
         # Discard xA
         state.restart = true # TODO: expand restart heuristics
@@ -428,11 +425,13 @@ function update_state!(
     return !lssuccess # Break on linesearch error
 end
 
-function update_g!(d, state, method::AbstractNGMRES)
+function update_fgh!(d, state, method::AbstractNGMRES)
     # Update the function value and gradient
     # TODO: do we need a retract! on state.x here?
-    value_gradient!(d, state.x)
-    project_tangent!(method.manifold, gradient(d), state.x)
+    f_x, g_x = value_gradient!(d, state.x)
+    project_tangent!(method.manifold, g_x, state.x)
+    state.f_x = f_x
+    copyto!(state.g_x, g_x)
 
     if state.restart
         state.k = 0
@@ -443,11 +442,13 @@
 
     j = mod(state.k, method.wmax) + 1
     copyto!(view(state.X, :, j), vec(state.x))
-    copyto!(view(state.R, :, j), vec(gradient(d)))
+    copyto!(view(state.R, :, j), vec(g_x))
 
     for i = 1:state.curw
         _updateQ!(state.Q, i, j, state.X, state.R, method)
     end
+
+    return nothing
 end
 
 function trace!(
@@ -463,7 +464,7 @@ function trace!(
     dt["time"] = curr_time
     if options.extended_trace
         dt["x"] = copy(state.x)
-        dt["g(x)"] = copy(gradient(d))
+        dt["g(x)"] = copy(state.g_x)
         dt["subspace-α"] = state.subspacealpha[1:state.curw-1]
         if state.restart
             dt["Current step size"] = NaN
@@ -481,12 +482,11 @@ function trace!(
             dt["|g(x^P)|"] = state.grnorm_xP
         end
     end
-    g_norm = g_residual(d)
     update!(
         tr,
         iteration,
-        value(d),
-        g_norm,
+        state.f_x,
+        g_residual(state),
         dt,
         options.store_trace,
         options.show_trace,
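(Aside — schematically, the acceleration step assembled from X, R, Q and ξ above proposes xᴬ = xᴾ + Σᵢ αᵢ (xᵢ − xᴾ), with coefficients α solved from a small least-squares system over the moving window; a line search between xᴾ and xᴬ then picks the next iterate. An illustrative sketch of the combination step only, not the package's exact code:

    # Combine window iterates X[:, 1:curw] around the preconditioned point xP.
    function accelerate(xP, X, α, curw)
        xA = copy(xP)
        for i = 1:curw
            xA .+= α[i] .* (view(X, :, i) .- xP)
        end
        return xA
    end
)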
diff --git a/src/multivariate/solvers/second_order/krylov_trust_region.jl b/src/multivariate/solvers/second_order/krylov_trust_region.jl
index 476b49239..30b1c164a 100644
--- a/src/multivariate/solvers/second_order/krylov_trust_region.jl
+++ b/src/multivariate/solvers/second_order/krylov_trust_region.jl
@@ -17,11 +17,11 @@ KrylovTrustRegion(;
     cg_tol::Real = 0.01,
 ) = KrylovTrustRegion(initial_radius, max_radius, eta, rho_lower, rho_upper, cg_tol)
 
-update_h!(d, state, method::KrylovTrustRegion) = nothing
-
 # TODO: support x::Array{T,N} et al.?
 mutable struct KrylovTrustRegionState{T} <: AbstractOptimizerState
     x::Vector{T}
+    f_x::T
+    g_x::Vector{T}
     x_previous::Vector{T}
     f_x_previous::T
     s::Vector{T}
@@ -45,10 +45,12 @@ function initial_state(method::KrylovTrustRegion, options::Options, d, initial_x
     @assert(method.rho_lower < method.rho_upper)
     @assert(method.rho_lower >= 0)
 
-    value_gradient!!(d, initial_x)
+    f_x, g_x = value_gradient!(d, initial_x)
 
     KrylovTrustRegionState(
         copy(initial_x), # Maintain current state in state.x
+        f_x, # Maintain f of current state in state.f_x
+        g_x, # Maintain gradient of current state in state.g_x
         copy(initial_x), # x_previous
         zero(T), # f_x_previous
         similar(initial_x), # Maintain current search direction in state.s
@@ -87,12 +89,11 @@ function trace!(
         dt["f_diff"] = state.f_diff
         dt["cg_iters"] = state.cg_iters
     end
-    g_norm = norm(gradient(d), Inf)
     update!(
         tr,
         iteration,
-        value(d),
-        g_norm,
+        state.f_x,
+        g_residual(state),
         dt,
         options.store_trace,
         options.show_trace,
@@ -107,19 +108,18 @@ function cg_steihaug!(
     state::KrylovTrustRegionState{T},
     method::KrylovTrustRegion,
 ) where {T}
-    n = length(state.x)
-    x, g, d, r, z, Hd =
-        state.x, gradient(objective), state.d, state.r, state.s, hv_product(objective)
+    (; x, g_x, d, r) = state
+    z = state.s
 
     fill!(z, 0.0) # the search direction is initialized to the 0 vector,
-    r .= g # so at first the whole gradient is the residual.
+    copyto!(r, g_x) # so at first the whole gradient is the residual.
     d .= .-r # the first direction is the direction of steepest descent.
     rho0 = 1e100 # just a big number
 
     state.cg_iters = 0
-    for i = 1:n
+    for i = 1:length(x)
         state.cg_iters += 1
-        hv_product!(objective, x, d)
+        Hd = hv_product!(objective, x, d)
         dHd = dot(d, Hd)
         if -1e-15 < dHd < 1e-15
             break
@@ -151,8 +151,8 @@
         end
     end
 
-    hv_product!(objective, x, z)
-    return dot(g, z) + 0.5 * dot(z, Hd)
+    Hd = hv_product!(objective, x, z)
+    return dot(g_x, z) + 0.5 * dot(z, Hd)
 end
 
@@ -164,7 +164,7 @@ function update_state!(
     state.m_diff = cg_steihaug!(objective, state, method)
     @assert state.m_diff <= 0
 
-    state.f_diff = value(objective, state.x .+ state.s) - value(objective)
+    state.f_diff = value(objective, state.x .+ state.s) - state.f_x
     state.rho = state.f_diff / state.m_diff
 
     state.interior = norm(state.s) < 0.9 * state.radius
@@ -183,11 +183,13 @@
 end
 
-function update_g!(objective, state::KrylovTrustRegionState, method::KrylovTrustRegion)
+function update_fgh!(objective, state::KrylovTrustRegionState, ::KrylovTrustRegion)
     if state.accept_step
         # Update the function value and gradient
-        state.f_x_previous = value(objective)
-        value_gradient!(objective, state.x)
+        state.f_x_previous = state.f_x
+        f_x, g_x = value_gradient!(objective, state.x)
+        state.f_x = f_x
+        copyto!(state.g_x, g_x)
     end
 end
 
@@ -206,13 +208,13 @@ function assess_convergence(state::KrylovTrustRegionState, d, options::Options)
     # if abs(f_x - f_x_previous) < f_tol # Relative Tolerance
     if abs(state.f_diff) < max(
-        options.f_reltol * (abs(value(d)) + options.f_reltol),
-        eps(abs(value(d)) + abs(state.f_x_previous)),
+        options.f_reltol * (abs(state.f_x) + options.f_reltol),
+        eps(abs(state.f_x) + abs(state.f_x_previous)),
     )
         f_converged = true
     end
 
-    if norm(gradient(d), Inf) < options.g_abstol
+    if norm(state.g_x, Inf) < options.g_abstol
         g_converged = true
     end
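(Aside — the value returned by cg_steihaug! above is the local quadratic model at the accepted step, computed from Hessian-vector products only:

    using LinearAlgebra: dot

    # m(z) ≈ f(x + z) - f(x), with Hz obtained from hv_product! (illustrative sketch)
    model_value(g, z, Hz) = dot(g, z) + 0.5 * dot(z, Hz)

update_state! then forms state.rho as the ratio of the actual decrease f_diff to this predicted decrease m_diff, which drives the trust-region radius update.)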
diff --git a/src/multivariate/solvers/second_order/newton.jl b/src/multivariate/solvers/second_order/newton.jl
index 776928247..20a4824ae 100644
--- a/src/multivariate/solvers/second_order/newton.jl
+++ b/src/multivariate/solvers/second_order/newton.jl
@@ -29,8 +29,11 @@ end
 
 Base.summary(io::IO, ::Newton) = print(io, "Newton's Method")
 
-mutable struct NewtonState{Tx,T,F<:Cholesky} <: AbstractOptimizerState
+mutable struct NewtonState{Tx,Tg,TH,T,F<:Cholesky} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    H_x::TH
+    f_x::T
     x_previous::Tx
     f_x_previous::T
     F::F
@@ -39,20 +42,18 @@ end
 
 function initial_state(method::Newton, options, d, initial_x)
-    T = eltype(initial_x)
-    n = length(initial_x)
-    # Maintain current gradient in gr
-    s = similar(initial_x)
-
-    value_gradient!!(d, initial_x)
-    hessian!!(d, initial_x)
+    # TODO: Switch to `value_gradient_hessian!`
+    f_x, g_x, H_x = NLSolversBase.value_gradient_hessian!!(d, initial_x)
 
     NewtonState(
         copy(initial_x), # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        T(NaN), # Store previous f in state.f_x_previous
-        Cholesky(similar(d.H, T, 0, 0), :U, 0),
-        similar(initial_x), # Maintain current search direction in state.s
+        copy(g_x), # Maintain current gradient in state.g_x
+        copy(H_x), # Maintain current Hessian in state.H_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        Cholesky(similar(H_x, 0, 0), :U, 0),
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
         @initial_linesearch()...,
     )
 end
@@ -63,19 +64,19 @@ function update_state!(d, state::NewtonState, method::Newton)
     # represented by H. It deviates from the usual "add a scaled
     # identity matrix" version of the modified Newton method. More
     # information can be found in the discussion at issue #153.
-    T = eltype(state.x)
-    if typeof(NLSolversBase.hessian(d)) <: AbstractSparseMatrix
-        state.s .= .-(NLSolversBase.hessian(d) \ convert(Vector{T}, gradient(d)))
+    if state.H_x isa AbstractSparseMatrix
+        state.s .= .-(state.H_x \ convert(Vector, state.g_x))
     else
-        state.F = cholesky!(Positive, NLSolversBase.hessian(d))
-        if typeof(gradient(d)) <: Array
+        state.F = cholesky!(Positive, state.H_x)
+        if state.g_x isa Array
             # is this actually StridedArray?
-            ldiv!(state.s, state.F, -gradient(d))
+            ldiv!(state.s, state.F, state.g_x)
+            state.s .= .-state.s
         else
             # not Array, we can't do inplace ldiv
-            gv = Vector{T}(undef, length(gradient(d)))
-            copyto!(gv, -gradient(d))
+            gv = Vector{eltype(state.g_x)}(undef, length(state.g_x))
+            gv .= .-state.g_x
             copyto!(state.s, state.F \ gv)
         end
     end
@@ -87,21 +88,20 @@ function update_state!(d, state::NewtonState, method::Newton)
     return !lssuccess # break on linesearch error
 end
 
-function trace!(tr, d, state::NewtonState, iteration::Integer, method::Newton, options::Options, curr_time = time())
+function trace!(tr, d, state::NewtonState, iteration::Integer, ::Newton, options::Options, curr_time = time())
     dt = Dict()
     dt["time"] = curr_time
     if options.extended_trace
         dt["x"] = copy(state.x)
-        dt["g(x)"] = copy(gradient(d))
-        dt["h(x)"] = copy(NLSolversBase.hessian(d))
+        dt["g(x)"] = copy(state.g_x)
+        dt["h(x)"] = copy(state.H_x)
         dt["Current step size"] = state.alpha
     end
-    g_norm = norm(gradient(d), Inf)
     update!(
         tr,
         iteration,
-        value(d),
-        g_norm,
+        state.f_x,
+        g_residual(state),
         dt,
         options.store_trace,
         options.show_trace,
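(Aside — the dense branch above in compact form; PositiveFactorizations is what the patch itself uses, while the helper name is an illustrative sketch:

    using LinearAlgebra
    using PositiveFactorizations: Positive

    # Newton direction from a positive-definite-forced Cholesky factorization.
    function newton_direction(H, g)
        F = cholesky(Positive, H)   # PD modification of H (see the issue #153 discussion)
        return -(F \ g)
    end
)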
diff --git a/src/multivariate/solvers/second_order/newton_trust_region.jl b/src/multivariate/solvers/second_order/newton_trust_region.jl
index f7daccde6..348d647af 100644
--- a/src/multivariate/solvers/second_order/newton_trust_region.jl
+++ b/src/multivariate/solvers/second_order/newton_trust_region.jl
@@ -302,14 +302,17 @@ end
 
 Base.summary(io::IO, ::NewtonTrustRegion) = print(io, "Newton's Method (Trust Region)")
 
-mutable struct NewtonTrustRegionState{Tx,T,G} <: AbstractOptimizerState
+mutable struct NewtonTrustRegionState{Tx,T,Tg,TH} <: AbstractOptimizerState
     x::Tx
+    g_x::Tg
+    H_x::TH
+    f_x::T
     x_previous::Tx
-    g_previous::G
+    g_x_previous::Tg
     f_x_previous::T
     s::Tx
     x_cache::Tx
-    g_cache::G
+    g_cache::Tg
     hard_case::Bool
     reached_subproblem_solution::Bool
     interior::Bool
@@ -321,7 +324,6 @@ end
 
 function initial_state(method::NewtonTrustRegion, options, d, initial_x)
     T = eltype(initial_x)
-    n = length(initial_x)
 
     # Keep track of trust region sizes
     delta = copy(method.initial_delta)
@@ -331,16 +333,20 @@ function initial_state(method::NewtonTrustRegion, options, d, initial_x)
     interior = true
     lambda = NaN
 
-    NLSolversBase.value_gradient_hessian!!(d, initial_x)
+    # TODO: Switch to `value_gradient_hessian!`
+    f_x, g_x, H_x = NLSolversBase.value_gradient_hessian!!(d, initial_x)
 
     NewtonTrustRegionState(
         copy(initial_x), # Maintain current state in state.x
-        copy(initial_x), # Maintain previous state in state.x_previous
-        copy(gradient(d)), # Store previous gradient in state.g_previous
-        T(NaN), # Store previous f in state.f_x_previous
-        similar(initial_x), # Maintain current search direction in state.s
-        fill!(similar(initial_x), NaN), # For resetting the state if a step is rejected
-        fill!(method.use_fg ? similar(gradient(d)) : empty(gradient(d)), NaN), # For resetting the gradient if a step is rejected
+        copy(g_x), # Maintain current gradient in state.g_x
+        copy(H_x), # Maintain current Hessian in state.H_x
+        f_x, # Maintain current f in state.f_x
+        fill!(similar(initial_x), NaN), # Maintain previous state in state.x_previous
+        fill!(similar(g_x), NaN), # Store previous gradient in state.g_x_previous
+        oftype(f_x, NaN), # Store previous f in state.f_x_previous
+        fill!(similar(initial_x), NaN), # Maintain current search direction in state.s
+        fill!(similar(initial_x), NaN), # Cache to be able to reset state.x
+        fill!(method.use_fg ? similar(g_x) : empty(g_x), NaN), # Cache to be able to reset state.g_x
         hard_case,
         reached_subproblem_solution,
         interior,
@@ -353,28 +359,31 @@ end
 
 function update_state!(d, state::NewtonTrustRegionState, method::NewtonTrustRegion)
-    T = eltype(state.x)
     # Find the next step direction.
     m, state.interior, state.lambda, state.hard_case, state.reached_subproblem_solution =
-        solve_tr_subproblem!(gradient(d), NLSolversBase.hessian(d), state.delta, state.s)
+        solve_tr_subproblem!(state.g_x, state.H_x, state.delta, state.s)
 
-    # Maintain a record of previous position
+    # Maintain a record of current position, to be able to reset it below
     copyto!(state.x_cache, state.x)
-    f_cache = value(d)
+    f_cache = state.f_x
 
     # Update current position
     state.x .+= state.s
 
+    # Update the function value and gradient
     if method.use_fg
-        copyto!(state.g_cache, gradient(d))
-        value_gradient!(d, state.x)
+        copyto!(state.g_cache, state.g_x)
+        f_x, g_x = value_gradient!(d, state.x)
+        copyto!(state.g_x, g_x)
+        state.f_x = f_x
     else
-        value!(d, state.x)
+        f_x = value!(d, state.x)
+        state.f_x = f_x
     end
 
     # Update the trust region size based on the discrepancy between
     # the predicted and actual function values. (Algorithm 4.1 in N&W (2006))
-    f_x_diff = f_cache - value(d)
-    if abs(m) <= eps(T)
+    f_x_diff = f_cache - f_x
+    if abs(m) <= eps(typeof(m))
         # This should only happen when the step is very small, in which case
         # we should accept the step and assess_convergence().
         state.rho = 1.0
@@ -408,20 +417,24 @@ function update_state!(d, state::NewtonTrustRegionState, method::NewtonTrustRegi
     # Update/reset gradients and function values
     if accept_step
         if method.use_fg
-            hessian!(d, state.x)
+            copyto!(state.H_x, hessian!(d, state.x))
         else
-            NLSolversBase.gradient_hessian!!(d, state.x)
+            # TODO: Switch to `gradient_hessian!`
+            g_x, H_x = NLSolversBase.gradient_hessian!!(d, state.x)
+            copyto!(state.g_x, g_x)
+            copyto!(state.H_x, H_x)
         end
+
+        # Update history
+        copyto!(state.x_previous, state.x_cache)
+        copyto!(state.g_x_previous, state.g_cache)
+        state.f_x_previous = f_cache
     else
         # Reset state
         copyto!(state.x, state.x_cache)
-
-        # Reset objective function
-        copyto!(d.x_f, state.x_cache)
-        d.F = f_cache
+        state.f_x = f_cache
         if method.use_fg
-            copyto!(d.x_df, state.x_cache)
-            copyto!(d.DF, state.g_cache)
+            copyto!(state.g_x, state.g_cache)
        end
     end
@@ -429,30 +442,32 @@ end
 
 function assess_convergence(state::NewtonTrustRegionState, d, options::Options)
-    x_converged, f_converged, g_converged, converged, f_increased =
-        false, false, false, false, false
     if state.rho > state.eta
         # Accept the point and check convergence
-        x_converged, f_converged, g_converged, f_increased = assess_convergence(
+        return assess_convergence(
             state.x,
             state.x_previous,
-            value(d),
+            state.f_x,
             state.f_x_previous,
-            gradient(d),
+            state.g_x,
             options.x_abstol,
             options.f_reltol,
             options.g_abstol,
         )
+    else
+        return false, false, false, false
     end
-    x_converged, f_converged, g_converged, f_increased
 end
 
+# Function value, gradient and Hessian matrix are already updated in update_state!
+update_fgh!(d, state, ::NewtonTrustRegion) = nothing
+
 function trace!(
     tr,
     d,
     state::NewtonTrustRegionState,
     iteration::Integer,
-    method::NewtonTrustRegion,
+    ::NewtonTrustRegion,
     options::Options,
     curr_time = time(),
 )
@@ -460,20 +475,19 @@ function trace!(
     dt["time"] = curr_time
     if options.extended_trace
         dt["x"] = copy(state.x)
-        dt["g(x)"] = copy(gradient(d))
-        dt["h(x)"] = copy(NLSolversBase.hessian(d))
+        dt["g(x)"] = copy(state.g_x)
+        dt["h(x)"] = copy(state.H_x)
         dt["delta"] = copy(state.delta)
         dt["interior"] = state.interior
         dt["hard case"] = state.hard_case
         dt["reached_subproblem_solution"] = state.reached_subproblem_solution
         dt["lambda"] = state.lambda
     end
-    g_norm = norm(gradient(d), Inf)
     update!(
         tr,
         iteration,
-        value(d),
-        g_norm,
+        state.f_x,
+        g_residual(state),
         dt,
         options.store_trace,
         options.show_trace,
diff --git a/src/multivariate/solvers/zeroth_order/nelder_mead.jl b/src/multivariate/solvers/zeroth_order/nelder_mead.jl
index 6f66db505..47c0e49f5 100644
--- a/src/multivariate/solvers/zeroth_order/nelder_mead.jl
+++ b/src/multivariate/solvers/zeroth_order/nelder_mead.jl
@@ -130,6 +130,7 @@ end
 
 mutable struct NelderMeadState{Tx,T,Tfs} <: ZerothOrderState
     x::Tx
+    f_x::T
     m::Int
     simplex::Vector{Tx}
     x_centroid::Tx
@@ -150,27 +151,31 @@ mutable struct NelderMeadState{Tx,T,Tfs} <: ZerothOrderState
 end
 function reset!(method::NelderMead, state::NelderMeadState, obj, x)
     state.simplex = simplexer(method.initial_simplex, x)
+    @assert state.simplex[1] == x
 
-    value!(obj, first(state.simplex))
-    state.f_simplex[1] = value(obj)
+    state.f_simplex[1] = state.f_x = value!(obj, state.simplex[1])
     for i = 2:length(state.simplex)
         state.f_simplex[i] = value(obj, state.simplex[i])
     end
+
     # Get the indices that correspond to the ordering of the f values
     # at the vertices. i_order[1] is the index in the simplex of the vertex
     # with the lowest function value, and i_order[end] is the index in the
     # simplex of the vertex with the highest function value
     state.i_order = sortperm(state.f_simplex)
+
+    return nothing
 end
 function initial_state(method::NelderMead, options, d, initial_x)
     T = eltype(initial_x)
     n = length(initial_x)
     m = n + 1
     simplex = simplexer(method.initial_simplex, initial_x)
-    f_simplex = zeros(T, m)
+    @assert simplex[1] == initial_x
 
-    value!!(d, first(simplex))
-    f_simplex[1] = value(d)
+    f_x = value!(d, simplex[1])
+    f_simplex = Vector{typeof(f_x)}(undef, m)
+    f_simplex[1] = f_x
     for i = 2:length(simplex)
         f_simplex[i] = value(d, simplex[i])
     end
@@ -184,6 +189,7 @@ function initial_state(method::NelderMead, options, d, initial_x)
     NelderMeadState(
         copy(initial_x), # Variable to hold final minimizer value for MultivariateOptimizationResults
+        f_x, # Variable to hold final objective function value
         m, # Number of vertices in the simplex
         simplex, # Maintain simplex in state.simplex
         centroid(simplex, i_order[m]), # Maintain centroid in state.centroid
@@ -309,26 +315,22 @@ function after_while!(f, state, method::NelderMead, options)
         f.F = f_min
     end
     state.x .= x_min
+    state.f_x = f_min
+    return nothing
 end
 
-# We don't have an f_x_previous in NelderMeadState, so we need to special case these
-pick_best_x(f_increased, state::NelderMeadState) = state.x
-pick_best_f(f_increased, state::NelderMeadState, d) = value(d)
+
+# We don't have an x_previous and f_x_previous in NelderMeadState, so we need to special case these
+pick_best_x(f_increased::Bool, state::NelderMeadState) = state.x
+pick_best_f(f_increased::Bool, state::NelderMeadState) = state.f_x
 
 function assess_convergence(state::NelderMeadState, d, options::Options)
     g_converged = state.nm_x <= options.g_abstol # Hijact g_converged for NM stopping criterior
     return false, false, g_converged, false
 end
 
-function initial_convergence(
-    d,
-    state::NelderMeadState,
-    method::NelderMead,
-    initial_x,
-    options,
-)
-    nmo = nmobjective(state.f_simplex, state.m, length(initial_x))
-
-    nmo <= options.g_abstol, !isfinite(value(d)) || !isfinite(nmo)
+function initial_convergence(state::NelderMeadState, options::Options)
+    nmo = nmobjective(state.f_simplex, state.m, length(state.x))
+    nmo <= options.g_abstol, !isfinite(state.f_x) || !isfinite(nmo)
 end
 
 function trace!(
diff --git a/src/multivariate/solvers/zeroth_order/particle_swarm.jl b/src/multivariate/solvers/zeroth_order/particle_swarm.jl
index 083b2a7cd..022533a7f 100644
--- a/src/multivariate/solvers/zeroth_order/particle_swarm.jl
+++ b/src/multivariate/solvers/zeroth_order/particle_swarm.jl
@@ -40,6 +40,7 @@ Base.summary(io::IO, ::ParticleSwarm) = print(io, "Particle Swarm")
 
 mutable struct ParticleSwarmState{Tx,T} <: ZerothOrderState
     x::Tx
+    f_x::T
     iteration::Int
     lower::Tx
     upper::Tx
@@ -127,8 +128,8 @@ function initial_state(
 
     current_state = 0
 
-    value!!(d, initial_x)
-    score[1] = value(d)
+    f_x = value!(d, initial_x)
+    score[1] = f_x
 
     # if search space is limited, spread the initial population
     # uniformly over the whole search space
@@ -169,6 +170,7 @@ function initial_state(
 
     ParticleSwarmState(
         x,
+        f_x,
         0,
         lower,
         upper,
@@ -193,15 +195,15 @@ function update_state!(f, state::ParticleSwarmState{T}, method::ParticleSwarm) w
     if state.iteration == 0
         copyto!(state.best_score, state.score)
-        f.F = Base.minimum(state.score) # Base.minimum !== minimum
+        # state.f_x = Base.minimum(state.score) # Base.minimum !== minimum
     end
 
-    f.F = housekeeping!(
+    state.f_x = housekeeping!(
         state.score,
         state.best_score,
         state.X,
         state.X_best,
         state.x,
-        value(f),
+        state.f_x,
         state.n_particles,
     )
     # Elitist Learning:
@@ -238,14 +240,13 @@ function update_state!(f, state::ParticleSwarmState{T}, method::ParticleSwarm) w
         end
     end
 
-    score_learn = value(f, state.x_learn)
-    if score_learn < f.F
-        f.F = score_learn * 1.0
-        for j = 1:n
-            state.X_best[j, i_worst] = state.x_learn[j]
-            state.X[j, i_worst] = state.x_learn[j]
-            state.x[j] = state.x_learn[j]
-        end
+    score_learn = value!(f, state.x_learn)
+    if score_learn < state.f_x
+        copyto!(state.x, state.x_learn)
+        copyto!(view(state.X_best, :, i_worst), state.x_learn)
+        copyto!(view(state.X, :, i_worst), state.x_learn)
+
+        state.f_x = score_learn
         state.score[i_worst] = score_learn
         state.best_score[i_worst] = score_learn
     end
diff --git a/src/multivariate/solvers/zeroth_order/particle_swarm.jl b/src/multivariate/solvers/zeroth_order/particle_swarm.jl
index 083b2a7cd..022533a7f 100644
--- a/src/multivariate/solvers/zeroth_order/particle_swarm.jl
+++ b/src/multivariate/solvers/zeroth_order/particle_swarm.jl
@@ -40,6 +40,7 @@ Base.summary(io::IO, ::ParticleSwarm) = print(io, "Particle Swarm")

 mutable struct ParticleSwarmState{Tx,T} <: ZerothOrderState
     x::Tx
+    f_x::T
     iteration::Int
     lower::Tx
     upper::Tx
@@ -127,8 +128,8 @@ function initial_state(

     current_state = 0

-    value!!(d, initial_x)
-    score[1] = value(d)
+    f_x = value!(d, initial_x)
+    score[1] = f_x

     # if search space is limited, spread the initial population
     # uniformly over the whole search space
@@ -169,6 +170,7 @@ function initial_state(

     ParticleSwarmState(
         x,
+        f_x,
         0,
         lower,
         upper,
@@ -193,15 +195,15 @@ function update_state!(f, state::ParticleSwarmState{T}, method::ParticleSwarm) w

     if state.iteration == 0
         copyto!(state.best_score, state.score)
-        f.F = Base.minimum(state.score) # Base.minimum !== minimum
+        # state.f_x = Base.minimum(state.score) # Base.minimum !== minimum
     end

-    f.F = housekeeping!(
+    state.f_x = housekeeping!(
         state.score,
         state.best_score,
         state.X,
         state.X_best,
         state.x,
-        value(f),
+        state.f_x,
         state.n_particles,
     )
     # Elitist Learning:
@@ -238,14 +240,13 @@
             end
         end

-        score_learn = value(f, state.x_learn)
-        if score_learn < f.F
-            f.F = score_learn * 1.0
-            for j = 1:n
-                state.X_best[j, i_worst] = state.x_learn[j]
-                state.X[j, i_worst] = state.x_learn[j]
-                state.x[j] = state.x_learn[j]
-            end
+        score_learn = value!(f, state.x_learn)
+        if score_learn < state.f_x
+            copyto!(state.x, state.x_learn)
+            copyto!(view(state.X_best, :, i_worst), state.x_learn)
+            copyto!(view(state.X, :, i_worst), state.x_learn)
+
+            state.f_x = score_learn
             state.score[i_worst] = score_learn
             state.best_score[i_worst] = score_learn
         end
diff --git a/src/multivariate/solvers/zeroth_order/simulated_annealing.jl b/src/multivariate/solvers/zeroth_order/simulated_annealing.jl
index a0e3ba5da..6388e9109 100644
--- a/src/multivariate/solvers/zeroth_order/simulated_annealing.jl
+++ b/src/multivariate/solvers/zeroth_order/simulated_annealing.jl
@@ -46,29 +46,38 @@ SimulatedAnnealing(;
 Base.summary(io::IO, ::SimulatedAnnealing) = print(io, "Simulated Annealing")

 mutable struct SimulatedAnnealingState{Tx,T} <: ZerothOrderState
-    x::Tx
-    iteration::Int
-    x_current::Tx
-    x_proposal::Tx
-    f_x_current::T
-    f_proposal::T
+    x::Tx # Best state ever visited
+    f_x::T # Function value of the best state ever visited
+    iteration::Int # Iteration number
+    x_current::Tx # Current state
+    f_x_current::T # Function value of current state
+    x_proposal::Tx # Proposed state
+    f_proposal::T # Function value of proposed state
 end
-# We don't have an f_x_previous in SimulatedAnnealing, so we need to special case these
-pick_best_x(f_increased, state::SimulatedAnnealingState) = state.x
-pick_best_f(f_increased, state::SimulatedAnnealingState, d) = value(d)
+
+# We don't have x_previous and f_x_previous in SimulatedAnnealing, so we need to special-case these
+# The best state ever visited is stored in x, and its function value in f_x
+pick_best_x(::Bool, state::SimulatedAnnealingState) = state.x
+pick_best_f(::Bool, state::SimulatedAnnealingState) = state.f_x

 function initial_state(
-    method::SimulatedAnnealing,
-    options,
+    ::SimulatedAnnealing,
+    ::Options,
     d,
-    initial_x::AbstractArray{T},
-) where {T}
-
-    value!!(d, initial_x)
-
-    # Store the best state ever visited
-    best_x = copy(initial_x)
-    SimulatedAnnealingState(copy(best_x), 1, best_x, copy(initial_x), value(d), value(d))
+    initial_x::AbstractArray,
+)
+    # Compute function value
+    f_x = value!(d, initial_x)
+
+    return SimulatedAnnealingState(
+        copy(initial_x), # best state ever visited
+        f_x, # function value of the best state ever visited
+        1, # iteration
+        copy(initial_x), # current state
+        f_x, # function value of the current state
+        fill!(similar(initial_x), NaN), # proposed state
+        oftype(f_x, NaN), # function value of the proposed state
+    )
 end

 function update_state!(
@@ -76,7 +85,6 @@ function update_state!(
     state::SimulatedAnnealingState{Tx,T},
     method::SimulatedAnnealing,
 ) where {Tx,T}
-
     # Determine the temperature for current iteration
     t = method.temperature(state.iteration)

@@ -84,7 +92,7 @@
     method.neighbor!(state.x_current, state.x_proposal)

     # Evaluate the cost function at the proposed state
-    state.f_proposal = value(nd, state.x_proposal)
+    state.f_proposal = value!(nd, state.x_proposal)

     if state.f_proposal <= state.f_x_current
         # If proposal is superior, we always move to it
@@ -92,9 +100,9 @@
         state.f_x_current = state.f_proposal

         # If the new state is the best state yet, keep a record of it
-        if state.f_proposal < value(nd)
-            nd.F = state.f_proposal
+        if state.f_proposal < state.f_x
             copyto!(state.x, state.x_proposal)
+            state.f_x = state.f_proposal
         end
     else
         # If proposal is inferior, we move to it with probability p
@@ -106,5 +114,6 @@
     end

     state.iteration += 1
+
     false
 end
diff --git a/src/multivariate/solvers/zeroth_order/zeroth_utils.jl b/src/multivariate/solvers/zeroth_order/zeroth_utils.jl
index c6fd9bc3a..857be8a5b 100644
--- a/src/multivariate/solvers/zeroth_order/zeroth_utils.jl
+++ b/src/multivariate/solvers/zeroth_order/zeroth_utils.jl
@@ -15,7 +15,7 @@ function trace!(
     update!(
         tr,
         state.iteration,
-        d.F,
+        state.f_x,
         NaN,
         dt,
         options.store_trace,
@@ -29,7 +29,7 @@ function assess_convergence(state::ZerothOrderState, d, options::Options)
     false, false, false, false
 end

-f_abschange(d::AbstractObjective, state::ZerothOrderState) = convert(typeof(value(d)), NaN)
-f_relchange(d::AbstractObjective, state::ZerothOrderState) = convert(typeof(value(d)), NaN)
+f_abschange(state::ZerothOrderState) = oftype(state.f_x, NaN)
+f_relchange(state::ZerothOrderState) = oftype(state.f_x, NaN)
 x_abschange(state::ZerothOrderState) = convert(real(eltype(state.x)), NaN)
 x_relchange(state::ZerothOrderState) = convert(real(eltype(state.x)), NaN)
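[Editor's note] The acceptance logic refactored above is the standard Metropolis-style simulated-annealing rule; a self-contained sketch (the function name is illustrative, not Optim API):

    # Always accept an improvement; accept a worse proposal with
    # probability exp(-(f_proposal - f_current) / t) at temperature t.
    function accept_proposal(f_current, f_proposal, t)
        f_proposal <= f_current && return true
        return rand() < exp(-(f_proposal - f_current) / t)
    end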
diff --git a/src/types.jl b/src/types.jl
index 3e98def7c..e18e57318 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -308,8 +308,8 @@ termination_code(mvr::MultivariateOptimizationResults) = mvr.termination_code

 # pick_best_x and pick_best_f are used to pick the minimizer if we stopped because
 # f increased and we didn't allow it
-pick_best_x(f_increased, state) = f_increased ? state.x_previous : state.x
-pick_best_f(f_increased, state, d) = f_increased ? state.f_x_previous : value(d)
+pick_best_x(f_increased::Bool, state::AbstractOptimizerState) = f_increased ? state.x_previous : state.x
+pick_best_f(f_increased::Bool, state::AbstractOptimizerState) = f_increased ? state.f_x_previous : state.f_x

 function Base.show(io::IO, t::OptimizationState)
     @printf io "%6d %14e %14e\n" t.iteration t.value t.g_norm
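[Editor's note] The two one-liners above encode the rule described in their comment; a free-standing illustration (hypothetical helper, not Optim API):

    # If the last step increased f and increases were not allowed, report the
    # previous iterate as the minimizer; otherwise report the current one.
    pick_best(f_increased, x, x_previous, f_x, f_x_previous) =
        f_increased ? (x_previous, f_x_previous) : (x, f_x)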
diff --git a/src/utilities/assess_convergence.jl b/src/utilities/assess_convergence.jl
index 5fcd3ed47..35e87ff3e 100644
--- a/src/utilities/assess_convergence.jl
+++ b/src/utilities/assess_convergence.jl
@@ -1,21 +1,20 @@
-f_abschange(d::AbstractObjective, state) = f_abschange(value(d), state.f_x_previous)
-f_abschange(f_x::T, f_x_previous) where {T} = abs(f_x - f_x_previous)
-f_relchange(d::AbstractObjective, state) = f_relchange(value(d), state.f_x_previous)
-f_relchange(f_x::T, f_x_previous) where {T} = abs(f_x - f_x_previous) / abs(f_x)
+f_abschange(state::AbstractOptimizerState) = f_abschange(state.f_x, state.f_x_previous)
+f_abschange(f_x, f_x_previous) = abs(f_x - f_x_previous)
+f_relchange(state::AbstractOptimizerState) = f_relchange(state.f_x, state.f_x_previous)
+f_relchange(f_x, f_x_previous) = abs(f_x - f_x_previous) / abs(f_x)

-x_abschange(state) = x_abschange(state.x, state.x_previous)
+x_abschange(state::AbstractOptimizerState) = x_abschange(state.x, state.x_previous)
 x_abschange(x, x_previous) = maxdiff(x, x_previous)
-x_relchange(state) = x_relchange(state.x, state.x_previous)
+x_relchange(state::AbstractOptimizerState) = x_relchange(state.x, state.x_previous)
 x_relchange(x, x_previous) = maxdiff(x, x_previous) / Base.maximum(abs, x) # Base.maximum !== maximum

-g_residual(d, state) = g_residual(d)
-g_residual(d, state::NelderMeadState) = state.nm_x
-g_residual(d::AbstractObjective) = g_residual(gradient(d))
-g_residual(d::NonDifferentiable) = convert(typeof(value(d)), NaN)
-g_residual(g) = Base.maximum(abs, g) # Base.maximum !== maximum
-gradient_convergence_assessment(state::AbstractOptimizerState, d, options) =
-    g_residual(gradient(d)) ≤ options.g_abstol
-gradient_convergence_assessment(state::ZerothOrderState, d, options) = false
+g_residual(state::NelderMeadState) = state.nm_x
+g_residual(state::ZerothOrderState) = oftype(state.f_x, NaN)
+g_residual(state::AbstractOptimizerState) = g_residual(state.g_x)
+g_residual(g_x::AbstractArray) = Base.maximum(abs, g_x) # Base.maximum !== maximum
+gradient_convergence_assessment(state::AbstractOptimizerState, options::Options) =
+    g_residual(state) ≤ options.g_abstol
+gradient_convergence_assessment(state::ZerothOrderState, options::Options) = false

 # Default function for convergence assessment used by
 # AcceleratedGradientDescentState, BFGSState, ConjugateGradientState,
@@ -24,9 +23,9 @@ function assess_convergence(state::AbstractOptimizerState, d, options::Options)
     assess_convergence(
         state.x,
         state.x_previous,
-        value(d),
+        state.f_x,
         state.f_x_previous,
-        gradient(d),
+        state.g_x,
         options.x_abstol,
         options.x_reltol,
         options.f_abstol,
@@ -39,65 +38,41 @@ function assess_convergence(
     x_previous,
     f_x,
     f_x_previous,
-    gx,
+    g_x,
     x_abstol,
     x_reltol,
     f_abstol,
     f_reltol,
     g_abstol,
 )
-    x_converged, f_converged, f_increased, g_converged = false, false, false, false
-    # TODO: Create function for x_convergence_assessment
-    if x_abschange(x, x_previous) ≤ x_abstol
-        x_converged = true
-    end
-    if x_abschange(x, x_previous) ≤ x_reltol * Base.maximum(abs, x) # Base.maximum !== maximum
-        x_converged = true
-    end
+    x_converged = x_abschange(x, x_previous) ≤ x_abstol ||
+        x_abschange(x, x_previous) ≤ x_reltol * Base.maximum(abs, x) # Base.maximum !== maximum

     # Relative Tolerance
     # TODO: Create function for f_convergence_assessment
-    if f_abschange(f_x, f_x_previous) ≤ f_abstol
-        f_converged = true
-    end
-
-    if f_abschange(f_x, f_x_previous) ≤ f_reltol * abs(f_x)
-        f_converged = true
-    end
+    f_converged = f_abschange(f_x, f_x_previous) ≤ f_abstol ||
+        f_abschange(f_x, f_x_previous) ≤ f_reltol * abs(f_x)

-    if f_x > f_x_previous
-        f_increased = true
-    end
+    f_increased = f_x > f_x_previous

-    g_converged = g_residual(gx) ≤ g_abstol
+    g_converged = g_residual(g_x) ≤ g_abstol # Base.maximum !== maximum

     return x_converged, f_converged, g_converged, f_increased
 end

 # Used by Fminbox and IPNewton
-function assess_convergence(x, x_previous, f_x, f_x_previous, g, x_tol, f_tol, g_tol)
-
-    x_converged, f_converged, f_increased, g_converged = false, false, false, false
-
-    if x_abschange(x, x_previous) ≤ x_tol
-        x_converged = true
-    end
+function assess_convergence(x, x_previous, f_x, f_x_previous, g_x, x_tol, f_tol, g_tol)
+    x_converged = x_abschange(x, x_previous) ≤ x_tol

     # Absolute Tolerance
     # if abs(f_x - f_x_previous) < f_tol
     # Relative Tolerance
-    if f_abschange(f_x, f_x_previous) ≤ f_tol * abs(f_x)
-        f_converged = true
-    end
+    f_converged = f_abschange(f_x, f_x_previous) ≤ f_tol * abs(f_x)

-    if f_x > f_x_previous
-        f_increased = true
-    end
+    f_increased = f_x > f_x_previous

-    if g_residual(g) ≤ g_tol
-        g_converged = true
-    end
+    g_converged = g_residual(g_x) ≤ g_tol # Base.maximum !== maximum

     return x_converged, f_converged, g_converged, f_increased
 end
diff --git a/src/utilities/generic.jl b/src/utilities/generic.jl
index fe26689d6..125b71673 100644
--- a/src/utilities/generic.jl
+++ b/src/utilities/generic.jl
@@ -13,7 +13,7 @@ end
 @def initial_linesearch begin
     (
-        similar(initial_x), # Buffer of x for line search in state.x_ls
-        real(one(T)),
+        fill!(similar(initial_x), NaN), # Buffer of x for line search in state.x_ls
+        real(oneunit(eltype(initial_x))),
     ) # Keep track of step size in state.alpha
 end
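[Editor's note] After this refactor each convergence flag is a single comparison; a worked example with illustrative numbers and tolerances:

    x, x_previous = [1.0, 2.0], [1.0, 2.1]
    f_x, f_x_previous = 3.0, 3.0005
    maximum(abs.(x .- x_previous)) <= 1e-8       # x_converged: false (0.1 > 1e-8)
    abs(f_x - f_x_previous) <= 1e-3 * abs(f_x)   # f_converged: true (5.0e-4 <= 3.0e-3)
    f_x > f_x_previous                           # f_increased: false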
diff --git a/src/utilities/perform_linesearch.jl b/src/utilities/perform_linesearch.jl
index fe2e223de..b944a21f8 100644
--- a/src/utilities/perform_linesearch.jl
+++ b/src/utilities/perform_linesearch.jl
@@ -5,11 +5,8 @@
 reset_search_direction!(state, d, method) = false # no-op
 _alphaguess(a) = a
 _alphaguess(a::Number) = LineSearches.InitialStatic(alpha = a)

-# Note that for these resets we're using `gradient(d)` but we don't need to use
-# project_tangent! here, because we already did that inplace on gradient(d) after
-# the last evaluation (we basically just always do it)
-function reset_search_direction!(state, d, method::BFGS)
-    gx = gradient!(d, state.x)
+function reset_search_direction!(state::BFGSState, method::BFGS)
+    gx = state.g_x
     if method.initial_invH === nothing
         n = length(state.x)
         T = typeof(state.invH)
@@ -27,27 +24,25 @@
     return true
 end

-function reset_search_direction!(state, d, method::LBFGS)
+function reset_search_direction!(state::LBFGSState, ::LBFGS)
     state.pseudo_iteration = 1
-    state.s .= .-gradient!(d, state.x)
+    state.s .= .-state.g_x
     return true
 end

-function reset_search_direction!(state, d, method::ConjugateGradient)
+function reset_search_direction!(state::ConjugateGradientState, ::ConjugateGradient)
     state.s .= .-state.pg
     return true
 end

 function perform_linesearch!(state, method, d)
     # Calculate search direction dphi0
-    fx = value_gradient!(d, state.x)
-    gx = gradient(d)
-    dphi_0 = real(dot(gx, state.s))
+    dphi_0 = real(dot(state.g_x, state.s))

     # reset the direction if it becomes corrupted
-    if dphi_0 >= zero(dphi_0) && reset_search_direction!(state, d, method)
-        dphi_0 = real(dot(gx, state.s)) # update after direction reset
+    if dphi_0 >= zero(dphi_0) && reset_search_direction!(state, method)
+        dphi_0 = real(dot(state.g_x, state.s)) # update after direction reset
     end

-    phi_0 = value!(d, state.x)
+    phi_0 = state.f_x

     # Guess an alpha
     method.alphaguess!(method.linesearch!, state, phi_0, dphi_0, d)
@@ -55,6 +50,9 @@ function perform_linesearch!(state, method, d)
     # Store current x and f(x) for next iteration
     state.f_x_previous = phi_0
     copyto!(state.x_previous, state.x)
+    if hasproperty(state, :g_x_previous)
+        copyto!(state.g_x_previous, state.g_x)
+    end

     # Perform line search; catch LineSearchException to allow graceful exit
     try
diff --git a/src/utilities/trace.jl b/src/utilities/trace.jl
index 7a46a2916..34e8d6160 100644
--- a/src/utilities/trace.jl
+++ b/src/utilities/trace.jl
@@ -5,7 +5,7 @@ function common_trace!(
     d,
     state,
     iteration::Integer,
-    method::FirstOrderOptimizer,
+    ::FirstOrderOptimizer,
     options::Options,
     curr_time = time(),
 )
@@ -13,14 +13,14 @@ function common_trace!(
     dt["time"] = curr_time
     if options.extended_trace
         dt["x"] = copy(state.x)
-        dt["g(x)"] = copy(gradient(d))
+        dt["g(x)"] = copy(state.g_x)
         dt["Current step size"] = state.alpha
     end
-    g_norm = Base.maximum(abs, gradient(d)) # Base.maximum !== maximum
+    g_norm = Base.maximum(abs, state.g_x) # Base.maximum !== maximum
     update!(
         tr,
         iteration,
-        value(d),
+        state.f_x,
         g_norm,
         dt,
         options.store_trace,
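[Editor's note] `perform_linesearch!` above now reads φ(0) and φ′(0) from the cached state; the reset guards against non-descent directions. A minimal sketch of that guard (hypothetical helper, not Optim API):

    using LinearAlgebra

    # s is a descent direction iff dphi(0) = real(dot(g, s)) < 0;
    # otherwise fall back to steepest descent.
    function ensure_descent!(s, g)
        dphi_0 = real(dot(g, s))
        if dphi_0 >= 0
            s .= .-g                 # steepest-descent reset
            dphi_0 = real(dot(g, s))
        end
        return dphi_0
    end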
diff --git a/test/general/convergence.jl b/test/general/convergence.jl
index 701a624a9..4785040e4 100644
--- a/test/general/convergence.jl
+++ b/test/general/convergence.jl
@@ -3,7 +3,7 @@ mutable struct DummyState <: Optim.AbstractOptimizerState
     x_previous::Any
     f_x::Any
     f_x_previous::Any
-    g::Any
+    g_x::Any
 end

 mutable struct DummyStateZeroth <: Optim.ZerothOrderState
     x_previous::Any
     f_x::Any
     f_x_previous::Any
-    g::Any
+    g_x::Any
 end

 mutable struct DummyOptions
@@ -63,26 +63,32 @@ mutable struct DummyMethodZeroth <: Optim.ZerothOrderOptimizer end

     ## initial_convergence and gradient_convergence_assessment
     ds = DummyState(x1, x0, f1, f0, g)
-    dOpt = DummyOptions(x_tol, f_tol, g_tol, g_abstol)
+    opt = Optim.Options(; x_abstol = x_tol, f_abstol = f_tol, g_tol, g_abstol)
     dm = DummyMethod()

     # >= First Order
     d = Optim.OnceDifferentiable(x -> sum(abs2.(x)), zeros(2))
-    Optim.gradient!(d, ones(2))
-    @test !Optim.gradient_convergence_assessment(ds, d, dOpt)
-    Optim.gradient!(d, zeros(2))
-    @test Optim.gradient_convergence_assessment(ds, d, dOpt)
-
-    @test Optim.initial_convergence(d, ds, dm, ones(2), dOpt) == (false, false)
-    @test Optim.initial_convergence(d, ds, dm, zeros(2), dOpt) == (true, false)
+    x = ones(2)
+    f_x, g_x = Optim.value_gradient!(d, x)
+    ds = DummyState(x, x0, f_x, f0, g_x)
+    @test !Optim.gradient_convergence_assessment(ds, opt)
+    @test Optim.initial_convergence(ds, opt) == (false, false)
+    x = zeros(2)
+    f_x, g_x = Optim.value_gradient!(d, x)
+    ds = DummyState(x, x0, f_x, f0, g_x)
+    @test Optim.gradient_convergence_assessment(ds, opt)
+    @test Optim.initial_convergence(ds, opt) == (true, false)

     # Zeroth order methods have no gradient -> returns false by default
     ds = DummyStateZeroth(x1, x0, f1, f0, g)
     dm = DummyMethodZeroth()
-    @test !Optim.gradient_convergence_assessment(ds, d, dOpt)
-    @test Optim.initial_convergence(d, ds, dm, ones(2), dOpt) == (false, false)
+    x = ones(2)
+    f_x, g_x = Optim.value_gradient!(d, x)
+    ds = DummyStateZeroth(x, x0, f_x, f0, g_x)
+    @test !Optim.gradient_convergence_assessment(ds, opt)
+    @test Optim.initial_convergence(ds, opt) == (false, false)

     # should check all other methods as well
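[Editor's note] The tests above rely on `NLSolversBase.value_gradient!` returning the `(value, gradient)` pair and on the `f_calls`/`g_calls` counters; a usage sketch mirroring them (assumes the NLSolversBase version targeted by this patch, with ADTypes-style `autodiff` as used in these tests):

    using ADTypes, NLSolversBase
    d = OnceDifferentiable(x -> sum(abs2, x), zeros(2); autodiff = AutoForwardDiff())
    f_x, g_x = NLSolversBase.value_gradient!(d, [1.0, 2.0])   # (5.0, [2.0, 4.0])
    NLSolversBase.f_calls(d), NLSolversBase.g_calls(d)        # (1, 1)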
diff --git a/test/general/objective_types.jl b/test/general/objective_types.jl
index f197c515b..0d4474666 100644
--- a/test/general/objective_types.jl
+++ b/test/general/objective_types.jl
@@ -7,12 +7,12 @@
         odad1 = T(x -> 5.0, rand(1); autodiff = AutoFiniteDiff(; fdtype = Val(:central)))
         odad2 = T(x -> 5.0, rand(1); autodiff = AutoForwardDiff())
         odad3 = T(x -> 5.0, rand(1); autodiff = AutoReverseDiff())
-        Optim.gradient!(odad1, rand(1))
-        Optim.gradient!(odad2, rand(1))
-        Optim.gradient!(odad3, rand(1))
-        @test Optim.gradient(odad1) == [0.0]
-        @test Optim.gradient(odad2) == [0.0]
-        @test Optim.gradient(odad3) == [0.0]
+        NLSolversBase.gradient!(odad1, rand(1))
+        NLSolversBase.gradient!(odad2, rand(1))
+        NLSolversBase.gradient!(odad3, rand(1))
+        @test NLSolversBase.gradient(odad1) == [0.0]
+        @test NLSolversBase.gradient(odad2) == [0.0]
+        @test NLSolversBase.gradient(odad3) == [0.0]
     end

     for a in (1.0, 5.0)
@@ -20,33 +20,33 @@
        odad1 = OnceDifferentiable(x -> a * x[1], xa; autodiff = AutoFiniteDiff(; fdtype = Val(:central)))
         odad2 = OnceDifferentiable(x -> a * x[1], xa; autodiff = AutoForwardDiff())
         odad3 = OnceDifferentiable(x -> a * x[1], xa; autodiff = AutoReverseDiff())
-        Optim.gradient!(odad1, xa)
-        Optim.gradient!(odad2, xa)
-        Optim.gradient!(odad3, xa)
-        @test Optim.gradient(odad1) ≈ [a]
-        @test Optim.gradient(odad2) == [a]
-        @test Optim.gradient(odad3) == [a]
+        NLSolversBase.gradient!(odad1, xa)
+        NLSolversBase.gradient!(odad2, xa)
+        NLSolversBase.gradient!(odad3, xa)
+        @test NLSolversBase.gradient(odad1) ≈ [a]
+        @test NLSolversBase.gradient(odad2) == [a]
+        @test NLSolversBase.gradient(odad3) == [a]
     end
     for a in (1.0, 5.0)
         xa = rand(1)
         odad1 = OnceDifferentiable(x -> a * x[1]^2, xa; autodiff = AutoFiniteDiff(; fdtype = Val(:central)))
         odad2 = OnceDifferentiable(x -> a * x[1]^2, xa; autodiff = AutoForwardDiff())
         odad3 = OnceDifferentiable(x -> a * x[1]^2, xa; autodiff = AutoReverseDiff())
-        Optim.gradient!(odad1, xa)
-        Optim.gradient!(odad2, xa)
-        Optim.gradient!(odad3, xa)
-        @test Optim.gradient(odad1) ≈ 2.0 * a * xa
-        @test Optim.gradient(odad2) == 2.0 * a * xa
-        @test Optim.gradient(odad3) == 2.0 * a * xa
+        NLSolversBase.gradient!(odad1, xa)
+        NLSolversBase.gradient!(odad2, xa)
+        NLSolversBase.gradient!(odad3, xa)
+        @test NLSolversBase.gradient(odad1) ≈ 2.0 * a * xa
+        @test NLSolversBase.gradient(odad2) == 2.0 * a * xa
+        @test NLSolversBase.gradient(odad3) == 2.0 * a * xa
     end
     for dtype in (OnceDifferentiable, TwiceDifferentiable)
         for autodiff in (AutoFiniteDiff(; fdtype = Val(:central)), AutoForwardDiff(), AutoReverseDiff())
             differentiable = dtype(x -> sum(x), rand(2); autodiff = autodiff)
-            Optim.value(differentiable)
-            Optim.value!(differentiable, rand(2))
-            Optim.value_gradient!(differentiable, rand(2))
-            Optim.gradient!(differentiable, rand(2))
-            dtype == TwiceDifferentiable && Optim.hessian!(differentiable, rand(2))
+            NLSolversBase.value(differentiable)
+            NLSolversBase.value!(differentiable, rand(2))
+            NLSolversBase.value_gradient!(differentiable, rand(2))
+            NLSolversBase.gradient!(differentiable, rand(2))
+            dtype == TwiceDifferentiable && NLSolversBase.hessian!(differentiable, rand(2))
         end
     end
 end
@@ -54,18 +54,18 @@
     a = 3.0
     x_seed = rand(1)
     odad1 = OnceDifferentiable(x -> a * x[1]^2, x_seed)
-    Optim.value_gradient!(odad1, x_seed)
-    @test Optim.gradient(odad1) ≈ 2 .* a .* (x_seed)
+    NLSolversBase.value_gradient!(odad1, x_seed)
+    @test NLSolversBase.gradient(odad1) ≈ 2 .* a .* (x_seed)
     @testset "call counters" begin
-        @test Optim.f_calls(odad1) == 1
-        @test Optim.g_calls(odad1) == 1
-        @test Optim.h_calls(odad1) == 0
-        Optim.value_gradient!(odad1, x_seed .+ 1.0)
-        @test Optim.f_calls(odad1) == 2
-        @test Optim.g_calls(odad1) == 2
-        @test Optim.h_calls(odad1) == 0
+        @test NLSolversBase.f_calls(odad1) == 1
+        @test NLSolversBase.g_calls(odad1) == 1
+        @test NLSolversBase.h_calls(odad1) == 0
+        NLSolversBase.value_gradient!(odad1, x_seed .+ 1.0)
+        @test NLSolversBase.f_calls(odad1) == 2
+        @test NLSolversBase.g_calls(odad1) == 2
+        @test NLSolversBase.h_calls(odad1) == 0
     end
-    @test Optim.gradient(odad1) ≈ 2 .* a .* (x_seed .+ 1.0)
+    @test NLSolversBase.gradient(odad1) ≈ 2 .* a .* (x_seed .+ 1.0)
 end
 end
diff --git a/test/multivariate/solvers/constrained/ipnewton/constraints.jl b/test/multivariate/solvers/constrained/ipnewton/constraints.jl
index 917128529..a440855dd 100644
--- a/test/multivariate/solvers/constrained/ipnewton/constraints.jl
+++ b/test/multivariate/solvers/constrained/ipnewton/constraints.jl
@@ -2,8 +2,7 @@
 # Utility function for hand-setting the μ parameter
 function setstate!(state, μ, d, constraints, method)
     state.μ = μ
-    Optim.update_fg!(d, constraints, state, method)
-    Optim.update_h!(d, constraints, state, method)
+    Optim.update_fgh!(d, constraints, state, method)
 end

 @testset "Bounds parsing" begin
@@ -141,19 +140,22 @@
         bounds = Optim.ConstraintBounds(Float64[], Float64[], Float64[], Float64[])
         bstate = Optim.BarrierStateVars(bounds, x)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(
+        f_x = NLSolversBase.value(dg, x)
+        g_x = NLSolversBase.gradient(dg, x)
+        L, ev = Optim.lagrangian_fg!(
             gx,
             bgrad,
-            dg,
             bounds,
             x,
+            f_x,
+            g_x,
             Float64[],
             Array{Float64}(undef, 0, 0),
             bstate,
             μ,
         )
         @test f_x == L == dg.f(x)
-        @test gx == H * x
+        @test gx == g_x == H * x
         constraints = TwiceDifferentiableConstraints(
             (c, x) -> nothing,
             (J, x) -> nothing,
@@ -172,7 +174,9 @@
         bstate = Optim.BarrierStateVars(bounds)
         rand!(bstate.λxE)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(gx, bgrad, d0, bounds, x, c, J, bstate, μ)
+        f_x = NLSolversBase.value(d0, x)
+        g_x = NLSolversBase.gradient(d0, x)
+        L, ev = Optim.lagrangian_fg!(gx, bgrad, bounds, x, f_x, g_x, c, J, bstate, μ)
         @test f_x == 0
         @test L ≈ dot(bstate.λxE, xbar - x)
         @test gx == -bstate.λxE
@@ -213,12 +217,15 @@
         bstate = Optim.BarrierStateVars(bounds, y)
         rand!(bstate.λx)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(
+        f_y = NLSolversBase.value(d0, y)
+        g_y = NLSolversBase.gradient(d0, y)
+        L, ev = Optim.lagrangian_fg!(
             gx,
             bgrad,
-            d0,
             bounds,
             y,
+            f_y,
+            g_y,
             Float64[],
             Array{Float64}(undef, 0, 0),
             bstate,
@@ -254,12 +261,15 @@
         rand!(bstate.slack_x) # intentionally displace from the correct value
         rand!(bstate.λx)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(
+        f_x = NLSolversBase.value(d0, x)
+        g_x = NLSolversBase.gradient(d0, x)
+        L, ev = Optim.lagrangian_fg!(
             gx,
             bgrad,
-            d0,
             bounds,
             x,
+            f_x,
+            g_x,
             Float64[],
             Array{Float64}(undef, 0, 0),
             bstate,
@@ -336,7 +346,9 @@
         bstate = Optim.BarrierStateVars(bounds, x, c)
         rand!(bstate.λcE)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(gx, bgrad, d0, bounds, x, c, J, bstate, μ)
+        f_x = NLSolversBase.value(d0, x)
+        g_x = NLSolversBase.gradient(d0, x)
+        L, ev = Optim.lagrangian_fg!(gx, bgrad, bounds, x, f_x, g_x, c, J, bstate, μ)
         @test f_x == 0
         @test L ≈ dot(bstate.λcE, cbar - c)
         @test gx ≈ -J' * bstate.λcE
@@ -361,7 +373,9 @@
         rand!(bstate.slack_c) # intentionally displace from the correct value
         rand!(bstate.λc)
         bgrad = similar(bstate)
-        f_x, L = Optim.lagrangian_fg!(gx, bgrad, d0, bounds, x, c, J, bstate, μ)
+        f_x = NLSolversBase.value(d0, x)
+        g_x = NLSolversBase.gradient(d0, x)
+        L, ev = Optim.lagrangian_fg!(gx, bgrad, bounds, x, f_x, g_x, c, J, bstate, μ)
         @test f_x == 0
         Ltarget =
             -μ * sum(log, bstate.slack_c) +
@@ -415,8 +429,8 @@
     constraints = TwiceDifferentiableConstraints([0.5, 0.0, -Inf, -Inf], [Inf, Inf, 1.0, 0.8])
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
-    @test norm(f_g - state.g) ≈ 0.01 * norm(f_g)
+    Optim.update_fgh!(d, constraints, state, method)
+    @test norm(f_g - state.g_L_x) ≈ 0.01 * norm(f_g)
     # Nonlinear inequalities
     constraints = TwiceDifferentiableConstraints(
         (c, x) -> (c[1] = x[1] * x[2]; c[2] = 3 * x[3] + x[4]^2),
@@ -429,8 +443,8 @@
     )
     @test Optim.isinterior(constraints, x)
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
-    @test norm(f_g - state.g) ≈ 0.01 * norm(f_g)
+    Optim.update_fgh!(d, constraints, state, method)
+    @test norm(f_g - state.g_L_x) ≈ 0.01 * norm(f_g)
     # Mixed equalities and inequalities
     constraints = TwiceDifferentiableConstraints(
         (c, x) -> (c[1] = x[1] * x[2]; c[2] = 3 * x[3] + x[4]^2),
@@ -443,14 +457,14 @@
     )
     @test Optim.isfeasible(constraints, x)
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
+    Optim.update_fgh!(d, constraints, state, method)
     J = zeros(2, 4)
     constraints.jacobian!(J, x)
     eqnormal = vec(J[1, :])
     eqnormal = eqnormal / norm(eqnormal)
-    @test abs(dot(state.g, eqnormal)) < 1e-12 # orthogonal to equality constraint
+    @test abs(dot(state.g_L_x, eqnormal)) < 1e-12 # orthogonal to equality constraint
     Pfg = f_g - dot(f_g, eqnormal) * eqnormal
-    Pg = state.g - dot(state.g, eqnormal) * eqnormal
+    Pg = state.g_L_x - dot(state.g_L_x, eqnormal) * eqnormal
     @test norm(Pfg - Pg) ≈ 0.01 * norm(Pfg)
     ## An objective function with a nonzero hessian
     hd = [1.0, 100.0, 0.01, 2.0] # diagonal terms of hessian
@@ -467,10 +481,10 @@
     constraints = TwiceDifferentiableConstraints([0.5, 0.0, -Inf, -Inf], [Inf, Inf, 1.0, 0.8])
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
-    @test abs(dot(gx, state.g) / dot(gx, gx) - 1) <= 0.011
-    Optim.update_h!(d, constraints, state, method)
-    @test abs(dot(gx, state.H * gx) / dot(gx, hx * gx) - 1) <= 0.011
+    Optim.update_fgh!(d, constraints, state, method)
+    @test abs(dot(gx, state.g_L_x) / dot(gx, gx) - 1) <= 0.011
+    Optim.update_fgh!(d, constraints, state, method)
+    @test abs(dot(gx, state.H_L_x * gx) / dot(gx, hx * gx) - 1) <= 0.011
     # Nonlinear inequalities
     constraints = TwiceDifferentiableConstraints(
         (c, x) -> (c[1] = x[1] * x[2]; c[2] = 3 * x[3] + x[4]^2),
@@ -483,10 +497,10 @@
     )
     @test Optim.isinterior(constraints, x)
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
-    @test abs(dot(gx, state.g) / dot(gx, gx) - 1) <= 0.011
-    Optim.update_h!(d, constraints, state, method)
-    @test abs(dot(gx, state.H * gx) / dot(gx, hx * gx) - 1) <= 0.011
+    Optim.update_fgh!(d, constraints, state, method)
+    @test abs(dot(gx, state.g_L_x) / dot(gx, gx) - 1) <= 0.011
+    Optim.update_fgh!(d, constraints, state, method)
+    @test abs(dot(gx, state.H_L_x * gx) / dot(gx, hx * gx) - 1) <= 0.011
     # Mixed equalities and inequalities
     constraints = TwiceDifferentiableConstraints(
         (c, x) -> (c[1] = x[1] * x[2]; c[2] = 3 * x[3] + x[4]^2),
@@ -499,16 +513,16 @@
     )
     @test Optim.isfeasible(constraints, x)
     state = Optim.initial_state(method, options, d, constraints, x)
-    Optim.update_fg!(d, constraints, state, method)
+    Optim.update_fgh!(d, constraints, state, method)
     J = zeros(2, 4)
     constraints.jacobian!(J, x)
     eqnormal = vec(J[1, :])
     eqnormal = eqnormal / norm(eqnormal)
-    @test abs(dot(state.g, eqnormal)) < 1e-12 # orthogonal to equality constraint
+    @test abs(dot(state.g_L_x, eqnormal)) < 1e-12 # orthogonal to equality constraint
     Pgx = gx - dot(gx, eqnormal) * eqnormal
-    @test abs(dot(Pgx, state.g) / dot(Pgx, Pgx) - 1) <= 0.011
-    Optim.update_h!(d, constraints, state, method)
-    @test abs(dot(Pgx, state.H * Pgx) / dot(Pgx, hx * Pgx) - 1) <= 0.011
+    @test abs(dot(Pgx, state.g_L_x) / dot(Pgx, Pgx) - 1) <= 0.011
+    Optim.update_fgh!(d, constraints, state, method)
+    @test abs(dot(Pgx, state.H_L_x * Pgx) / dot(Pgx, hx * Pgx) - 1) <= 0.011
 end

 @testset "IPNewton step" begin
@@ -609,8 +623,7 @@
     for i = 1:10
         Optim.update_state!(d, constraints, state, method, options)
         state.μ = μ
-        Optim.update_fg!(d, constraints, state, method)
-        Optim.update_h!(d, constraints, state, method)
+        Optim.update_fgh!(d, constraints, state, method)
     end
     @test isapprox(first(state.x), μ / F, rtol = 1e-4)
     # |x| ≥ 1, and check that we get slack precision better than eps(1.0)
@@ -626,8 +639,7 @@
     for i = 1:10
         Optim.update_state!(d, constraints, state, method, options)
         state.μ = μ
-        Optim.update_fg!(d, constraints, state, method)
-        Optim.update_h!(d, constraints, state, method)
+        Optim.update_fgh!(d, constraints, state, method)
     end
     @test state.x[1] == σ
     @test state.bstate.slack_x[1] < eps(float(σ))
@@ -650,8 +662,7 @@
         Optim.initial_state(method, options, d, constraints, [(1 + eps(1.0)) * σ])
     for i = 1:10
         Optim.update_state!(d, constraints, state, method, options)
-        Optim.update_fg!(d, constraints, state, method)
-        Optim.update_h!(d, constraints, state, method)
+        Optim.update_fgh!(d, constraints, state, method)
     end
     @test state.x[1] ≈ σ
     @test state.bstate.slack_c[1] < eps(float(σ))
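[Editor's note] For orientation, the quantity these tests reconstruct by hand is a barrier Lagrangian. A schematic sketch for a single inequality c(x) >= 0 with slack s and multiplier λ (sign conventions follow the Ltarget expressions above; the helper name is illustrative, not Optim's internals):

    # Schematic barrier-Lagrangian value for one slack/multiplier pair:
    #   L = f(x) - μ * log(s) + λ * (s - c(x))
    barrier_lagrangian(f_x, c_x, s, λ, μ) = f_x - μ * log(s) + λ * (s - c_x)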
diff --git a/test/multivariate/solvers/first_order/ngmres.jl b/test/multivariate/solvers/first_order/ngmres.jl
index 301d7b094..93efcc31d 100644
--- a/test/multivariate/solvers/first_order/ngmres.jl
+++ b/test/multivariate/solvers/first_order/ngmres.jl
@@ -52,8 +52,8 @@ using Optim, Test
     # The bounds are due to different systems behaving differently
     # TODO: is it a bad idea to hardcode these?
     @test 64 < Optim.iterations(res) < 100
-    @test 234 < Optim.f_calls(res) < 310
-    @test 234 < Optim.g_calls(res) < 310
+    @test 234 <= Optim.f_calls(res) < 310
+    @test 234 <= Optim.g_calls(res) < 310
     @test Optim.minimum(res) < 1e-10
     @test_throws AssertionError method(