diff --git a/src/cg.jl b/src/cg.jl
index 2aa8c3f..d41a933 100644
--- a/src/cg.jl
+++ b/src/cg.jl
@@ -11,7 +11,7 @@ struct ConjugateGradient{F<:CGFlavor,T<:Real,L<:AbstractLineSearch} <: Optimizat
 end
 ConjugateGradient(; flavor = HagerZhang(), maxiter = typemax(Int), gradtol::Real = 1e-8,
                     restart = typemax(Int), verbosity::Int = 0,
-                    linesearch::AbstractLineSearch = HagerZhangLineSearch(;verbosity = verbosity - 2)) =
+                    linesearch::AbstractLineSearch = HagerZhangLineSearch()) =
     ConjugateGradient(flavor, maxiter, gradtol, linesearch, restart, verbosity)

 function optimize(fg, x, alg::ConjugateGradient;
@@ -29,8 +29,12 @@ function optimize(fg, x, alg::ConjugateGradient;
     normgradhistory = [normgrad]

     # compute here once to define initial value of α in scale-invariant way
-    Pg = precondition(x, g)
-    normPg = sqrt(inner(x, Pg, Pg))
+    if precondition === _precondition
+        Pg = g
+    else
+        Pg = precondition(x, deepcopy(g))
+    end
+    normPg = sqrt(abs(inner(x, g, Pg)))
     α = 1/(normPg) # initial guess: scale invariant
     # α = one(normgrad)

@@ -66,7 +70,7 @@ function optimize(fg, x, alg::ConjugateGradient;
         _glast[] = g
         _dlast[] = η
         x, f, g, ξ, α, nfg = alg.linesearch(fg, x, η, (f, g);
-                                            initialguess = α, retract = retract, inner = inner)
+                                            initialguess = α, retract = retract, inner = inner, verbosity = verbosity - 2)
         numfg += nfg
         numiter += 1
         x, f, g = finalize!(x, f, g, numiter)
diff --git a/src/gd.jl b/src/gd.jl
index eefd265..7caf926 100644
--- a/src/gd.jl
+++ b/src/gd.jl
@@ -6,7 +6,7 @@ struct GradientDescent{T<:Real,L<:AbstractLineSearch} <: OptimizationAlgorithm
 end
 GradientDescent(; maxiter = typemax(Int), gradtol::Real = 1e-8, verbosity::Int = 0,
-                    linesearch::AbstractLineSearch = HagerZhangLineSearch(;verbosity = verbosity - 2)) =
+                    linesearch::AbstractLineSearch = HagerZhangLineSearch()) =
     GradientDescent(maxiter, gradtol, linesearch, verbosity)

 function optimize(fg, x, alg::GradientDescent;
@@ -41,7 +41,7 @@ function optimize(fg, x, alg::GradientDescent;
         _glast[] = g
         _dlast[] = η
         x, f, g, ξ, α, nfg = alg.linesearch(fg, x, η, (f, g);
-                                            initialguess = α, retract = retract, inner = inner)
+                                            initialguess = α, retract = retract, inner = inner, verbosity = verbosity - 2)
         numfg += nfg
         numiter += 1
         x, f, g = finalize!(x, f, g, numiter)
diff --git a/src/lbfgs.jl b/src/lbfgs.jl
index 0c6d377..663eca0 100644
--- a/src/lbfgs.jl
+++ b/src/lbfgs.jl
@@ -1,3 +1,28 @@
+"""
+    struct LBFGS{T<:Real,L<:AbstractLineSearch} <: OptimizationAlgorithm
+    LBFGS(m::Int = 8; maxiter = typemax(Int), gradtol::Real = 1e-8,
+        acceptfirst::Bool = true, verbosity::Int = 0,
+        linesearch::AbstractLineSearch = HagerZhangLineSearch())
+
+LBFGS optimization algorithm.
+
+## Fields
+- `m::Int`: The number of previous iterations to store for the limited-memory BFGS approximation.
+- `maxiter::Int`: The maximum number of iterations.
+- `gradtol::T`: The tolerance for the norm of the gradient.
+- `acceptfirst::Bool`: Whether the initial guess of the line search may be accepted immediately when it satisfies the (approximate) Wolfe conditions.
+- `linesearch::L`: The line search algorithm to use.
+- `verbosity::Int`: The verbosity level.
+
+## Constructors
+- `LBFGS(m::Int = 8; maxiter = typemax(Int), gradtol::Real = 1e-8, acceptfirst::Bool = true,
+    verbosity::Int = 0,
+    linesearch::AbstractLineSearch = HagerZhangLineSearch())`: Construct an LBFGS object with the specified parameters.
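+
+## Example
+
+A minimal illustrative sketch (the quadratic objective and starting point below are
+made up for demonstration and are not part of this package):
+
+    using OptimKit, LinearAlgebra
+    fg(x) = (norm(x)^2 / 2, x)   # returns the pair (value, gradient)
+    x₀ = randn(10)
+    # optimize returns the final point, value, gradient, the number of function
+    # evaluations, and the history of gradient norms
+    x, f, g, numfg, normgradhistory = optimize(fg, x₀, LBFGS(8; gradtol = 1e-10))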
+ +""" struct LBFGS{T<:Real,L<:AbstractLineSearch} <: OptimizationAlgorithm m::Int maxiter::Int @@ -6,9 +31,29 @@ struct LBFGS{T<:Real,L<:AbstractLineSearch} <: OptimizationAlgorithm linesearch::L verbosity::Int end + +""" + LBFGS(m::Int = 8; maxiter = typemax(Int), gradtol::Real = 1e-8, acceptfirst::Bool = true, + verbosity::Int = 0, + linesearch::AbstractLineSearch = HagerZhangLineSearch()) + +Construct an LBFGS object with the specified parameters. + +## Arguments +- `m::Int = 8`: The number of previous iterations to store for the limited memory BFGS approximation. +- `maxiter::Int = typemax(Int)`: The maximum number of iterations. +- `gradtol::Real = 1e-8`: The tolerance for the norm of the gradient. +- `acceptfirst::Bool = true`: Whether to accept the first step of the line search. +- `verbosity::Int = 0`: The verbosity level. +- `linesearch::AbstractLineSearch = HagerZhangLineSearch()`: The line search algorithm to use. + +## Returns +- `LBFGS`: The LBFGS object. + +""" LBFGS(m::Int = 8; maxiter = typemax(Int), gradtol::Real = 1e-8, acceptfirst::Bool = true, verbosity::Int = 0, - linesearch::AbstractLineSearch = HagerZhangLineSearch(;verbosity = verbosity - 2)) = + linesearch::AbstractLineSearch = HagerZhangLineSearch()) = LBFGS(m, maxiter, gradtol, acceptfirst, linesearch, verbosity) function optimize(fg, x, alg::LBFGS; @@ -44,7 +89,7 @@ function optimize(fg, x, alg::LBFGS; else Pg = precondition(x, deepcopy(g)) normPg = sqrt(inner(x, Pg, Pg)) - η = scale!(Pg, -1/normPg) # initial guess: scale invariant + η = scale!(Pg, -0.01/normPg) # initial guess: scale invariant end # store current quantities as previous quantities @@ -59,7 +104,7 @@ function optimize(fg, x, alg::LBFGS; x, f, g, ξ, α, nfg = alg.linesearch(fg, x, η, (f, g); initialguess = one(f), acceptfirst = alg.acceptfirst, # for some reason, line search seems to converge to solution alpha = 2 in most cases if acceptfirst = false. If acceptfirst = true, the initial value of alpha can immediately be accepted. This typically leads to a more erratic convergence of normgrad, but to less function evaluations in the end. 
-                                            retract = retract, inner = inner)
+                                            retract = retract, inner = inner, verbosity = verbosity - 2)
         numfg += nfg
         numiter += 1
         x, f, g = finalize!(x, f, g, numiter)
diff --git a/src/linesearches.jl b/src/linesearches.jl
index 5afc5da..02f6483 100644
--- a/src/linesearches.jl
+++ b/src/linesearches.jl
@@ -19,29 +19,235 @@ function checkexactwolfe(x::LineSearchPoint, x₀::LineSearchPoint, c₁, c₂)
     return (x.ϕ <= x₀.ϕ + c₁*x.α*x₀.dϕ )&& (x.dϕ > c₂*x₀.dϕ)
 end

+# Hager-Zhang Line Search Algorithm
+# Algorithm 851: CG_DESCENT
+# (Hager & Zhang, ACM Transactions on Mathematical Software, Vol 32 (2006))
+
 struct HagerZhangLineSearch{T<:Real} <: AbstractLineSearch
-    c₁::T # parameter for (approximate) first wolfe condition: c₁ < 1/2 < c₂
-    c₂::T # parameter for second wolf condition: c₁ < 1/2 < c₂
-    ϵ::T # parameter for approximate Wolfe termination
-    θ::T # used in update rules for bracketing interval
-    γ::T # determines when a bisection step is performed
-    ρ::T # used in determining initial bracketing interval
-    maxiter::Int
-    verbosity::Int
+    c₁::T # parameter for the (approximate) first Wolfe condition (Armijo rule), controlling sufficient decay of function value: c₁ < 1/2 < c₂
+    c₂::T # parameter for the second Wolfe condition (curvature condition), controlling sufficient decay of slope: c₁ < 1/2 < c₂
+    ϵ::T # parameter for expected accuracy of objective function, controlling maximal allowed increase in function value
+    θ::T # parameter regulating the bisection step
+    γ::T # parameter triggering the bisection step, namely if bracket reduction rate is slower than `γ`
+    ρ::T # parameter controlling the initial bracket expansion rate
 end
+"""
+    struct HagerZhangLineSearch{T<:Real} <: AbstractLineSearch
+    HagerZhangLineSearch(; c₁::Real = 1//10, c₂::Real = 9//10, ϵ::Real = 1//10^6,
+                            θ::Real = 1//2, γ::Real = 2//3, ρ::Real = 5//1)
+
+Constructs a Hager-Zhang line search object with the specified parameters.
+
+## Arguments:
+- `c₁::Real`: Parameter for the (approximate) first Wolfe condition (Armijo rule), controlling sufficient decay of function value: c₁ < 1/2 < c₂. Default is `1//10`.
+- `c₂::Real`: Parameter for the second Wolfe condition (curvature condition), controlling sufficient decay of slope: c₁ < 1/2 < c₂. Default is `9//10`.
+- `ϵ::Real`: Parameter for expected accuracy of objective function, controlling maximal allowed increase in function value. Default is `1//10^6`.
+- `θ::Real`: Parameter regulating the bisection step. Default is `1//2` (should probably not be changed).
+- `γ::Real`: Parameter triggering the bisection step, namely if bracket reduction rate is slower than `γ`. Default is `2//3`.
+- `ρ::Real`: Parameter controlling the initial bracket expansion rate. Default is `5//1`.
+
+## Returns:
+This method returns a `HagerZhangLineSearch` object `ls`, which can then be applied as `ls(fg, x₀, η₀; kwargs...)`
+to perform a line search for a function `fg` that computes the objective function and its gradient at a given point, starting
+from `x₀` in direction `η₀`.
+"""
+HagerZhangLineSearch(; c₁::Real = 1//10, c₂::Real = 9//10, ϵ::Real = 1//10^6,
+                        θ::Real = 1//2, γ::Real = 2//3, ρ::Real = 5//1) =
+    HagerZhangLineSearch(promote(c₁, c₂, ϵ, θ, γ, ρ)...)
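+
+# Example (illustrative): keyword arguments of mixed `Real` types are promoted to a
+# common type `T` by the constructor above, so e.g.
+#     HagerZhangLineSearch(; c₁ = 0.1, c₂ = 9//10, ϵ = 1e-6)
+# constructs a `HagerZhangLineSearch{Float64}`.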
+ +# implementation as function +""" + (ls::HagerZhangLineSearch)(fg, x₀, η₀, fg₀ = fg(x₀); + retract = _retract, inner = _inner, + initialguess = one(fg₀[1]), acceptfirst = false, + maxiter = 50, maxfuneval = 100, verbosity = 0) + +Perform a Hager-Zhang line search to find a step length that satisfies the (approximate) Wolfe conditions. + +## Arguments: +- `ls::HagerZhangLineSearch`: The HagerZhangLineSearch object. +- `fg`: Function that computes the objective function and its gradient. +- `x₀`: Starting point. +- `η₀`: Search direction. +- `fg₀`: Objective function and gradient evaluated at `x₀`. Defaults to `fg(x₀)`, but can be supplied if this information has already been calculated. + +## Keyword Arguments: +- `retract`: Function that performs the retraction step, i.e. the generalisation of `x₀ + α * η₀`. Defaults to `_retract`. +- `inner`: Function that computes the inner product between search direction and gradient. Defaults to `_inner`. +- `initialguess::Real`: Initial guess for the step length. Defaults to `one(fg₀[1])`. +- `acceptfirst::Bool`: Parameter that controls whether the initial guess can be accepted if it satisfies the strong Wolfe conditions. Defaults to `false`, thus requiring + at least one line search iteration and one extra function evaluation. +- `maxiter::Int`: Hard limit on the number of iterations. Default is `50`. +- `maxfuneval::Int`: Soft limit on the number of function evaluations. Default is `100`. +- `verbosity::Int`: The verbosity level (see below). Default is `0`. + +### Verbosity Levels +- `0`: No output. +- `1`: Single output about convergence when the linesearch has terminated. +- `2`: Output after the start and every individual iteration step of the Hager-Zhang linesearch. +- `3`: Additional output about the initial bracketing and further bracket update and bisection steps. +- `4`: Output after every function evaluation in the bracketing, updating, and bisection steps. + +## Returns: +- `x`: The point `retract(x₀, η₀, α)` where the (approximate) Wolfe conditions are satisfied. +- `f`: Function value at `x`. +- `g`: Gradient at `x`. +- `ξ`: Tangent at `x` to the line search path. +- `α`: Step length that satisfies the (approximate) Wolfe conditions. +- `numfg`: Number of function evaluations performed. 
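+
+## Example:
+
+An illustrative call on a simple quadratic objective (the objective, starting point
+and search direction below are made up for demonstration and are not part of this package):
+
+    fg(x) = (sum(abs2, x)/2, x)      # objective value and its gradient
+    x₀ = randn(4); f₀, g₀ = fg(x₀)
+    ls = HagerZhangLineSearch()
+    x, f, g, ξ, α, numfg = ls(fg, x₀, -g₀, (f₀, g₀); initialguess = 1.0, verbosity = 1)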
+""" +function (ls::HagerZhangLineSearch)(fg, x₀, η₀, fg₀ = fg(x₀); + retract = _retract, inner = _inner, + initialguess::Real = one(fg₀[1]), acceptfirst::Bool = false, + maxiter::Int = 50, maxfuneval::Int = 100, verbosity::Int = 0) + (f₀, g₀) = fg₀ + ϕ₀ = f₀ + dϕ₀ = inner(x₀, g₀, η₀) + if dϕ₀ >= zero(dϕ₀) + @warn "Linesearch was not given a descent direction: returning zero step length" + return x₀, f₀, g₀, η₀, zero(one(f₀)), 0 + end + + p₀ = LineSearchPoint(zero(ϕ₀), ϕ₀, dϕ₀, x₀, f₀, g₀, η₀) + iter = HagerZhangLineSearchIterator(fg, retract, inner, p₀, η₀, initialguess, acceptfirst, verbosity, ls) + verbosity >= 2 && + @info @sprintf("Linesearch start: dϕ₀ = %.2e, ϕ₀ = %.2e", dϕ₀, ϕ₀) + next = iterate(iter) + @assert next !== nothing + + k = 1 + while true + (x, f, g, ξ, α, dϕ), state = next + a, b, numfg, done = state + verbosity >= 2 && + @info @sprintf("Linesearch iteration step %d, function evaluation count %d:\n[a,b] = [%.2e, %.2e], dϕᵃ = %.2e, dϕᵇ = %.2e, ϕᵃ - ϕ₀ = %.2e, ϕᵇ - ϕ₀ = %.2e", k, numfg, a.α, b.α, a.dϕ, b.dϕ, a.ϕ - ϕ₀, b.ϕ - ϕ₀) + if done + verbosity >= 1 && + @info @sprintf("Linesearch converged after %d iterations and %d function evaluations:\nα = %.2e, dϕ = %.2e, ϕ - ϕ₀ = %.2e", k, numfg, α, dϕ, f - ϕ₀) + return x, f, g, ξ, α, numfg + elseif k == maxiter || numfg >= maxfuneval + verbosity >= 1 && + @info @sprintf("Linesearch not converged after %d iterations and %d function evaluations:\nα = %.2e, dϕ = %.2e, ϕ - ϕ₀ = %.2e", k, numfg, α, dϕ, f - ϕ₀) + return x, f, g, ξ, α, numfg + else + next = iterate(iter, state) + @assert next !== nothing + k += 1 + end + end +end + +# Hager-Zhang Line Search Algorithm implemented as iterator struct HagerZhangLineSearchIterator{T₁<:Real,F₁,F₂,F₃,X,G,T₂<:Real} - fdf::F₁ # computes function value and gradient for a given x, i.e. f, g, extra = f(x, oldextra...) + fdf::F₁ # computes function value and gradient for a given x, i.e. f, g = f(x) retract::F₂ # function used to step in direction η₀ with step size α, i.e. x, ξ = retract(x₀, η₀, α) where x = Rₓ₀(α*η₀) is the new position and ξ = D Rₓ₀(α*η₀)[η₀] is the derivative or tangent of x to α at the position x inner::F₃ # function used to compute inner product between gradient and direction, i.e. dϕ = inner(x, g, d); can depend on x (i.e. metric on a manifold) p₀::LineSearchPoint{T₁,X,G} # initial position, containing x₀, f₀, g₀ η₀::G # search direction α₀::T₁ # initial guess for step size acceptfirst::Bool # whether or not the initial guess can be accepted (e.g. 
LBFGS) + verbosity::Int # verbosity level parameters::HagerZhangLineSearch{T₂} end +function Base.iterate(iter::HagerZhangLineSearchIterator) + c₁ = iter.parameters.c₁ + c₂ = iter.parameters.c₂ + ϵ = iter.parameters.ϵ + verbosity = iter.verbosity + p₀ = iter.p₀ + + # L0 in the Line Search Algorithm: take initial step + c = takestep(iter, iter.α₀) + numfg = 1 + if iter.acceptfirst + ewolfe = checkexactwolfe(c, p₀, c₁, c₂) + awolfe = checkapproxwolfe(c, p₀, c₁, c₂, ϵ) + verbosity >= 3 && + @info @sprintf("Linesearch initial step:\nc = %.2e, dϕᶜ = %.2e, ϕᶜ - ϕ₀ = %.2e, exact wolfe = %d, approx wolfe = %d", c.α, c.dϕ, c.ϕ - p₀.ϕ, ewolfe, awolfe) + if ewolfe || awolfe + return (c.x, c.f, c.∇f, c.ξ, c.α, c.dϕ), (c, c, numfg, true) + end + else + verbosity >= 3 && + @info @sprintf("Linesearch initial step (cannot be accepted):\nc = %.2e, dϕᶜ = %.2e, ϕᶜ - ϕ₀ = %.2e", c.α, c.dϕ, c.ϕ - p₀.ϕ) + end + + # L0 in the Line Search Algorithm: find initial bracketing interval + a, b, nfg = bracket(iter, c) + verbosity >= 3 && + @info @sprintf("Linesearch initial bracket:\n[a,b] = [%.2e, %.2e], dϕᵃ = %.2e, dϕᵇ = %.2e, ϕᵃ - ϕ₀ = %.2e, ϕᵇ - ϕ₀ = %.2e", a.α, b.α, a.dϕ, b.dϕ, a.ϕ - p₀.ϕ, b.ϕ - p₀.ϕ) + + numfg += nfg + if a.α == b.α + return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, true) + elseif (b.α - a.α) < eps(one(a.α)) + @warn "Linesearch bracket converged to a point without satisfying Wolfe conditions?" + return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, true) + else + return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, false) + end +end + +function Base.iterate(iter::HagerZhangLineSearchIterator, state::Tuple{LineSearchPoint,LineSearchPoint,Int,Bool}) + c₁ = iter.parameters.c₁ + c₂ = iter.parameters.c₂ + ϵ = iter.parameters.ϵ + verbosity = iter.verbosity + p₀ = iter.p₀ + + a, b, numfg, done = state + if done + return nothing + end + dα = b.α - a.α + + # L1 in the Line Search Algorithm: secant2 step + αc = secant(a.α, b.α, a.dϕ, b.dϕ) + A, B, nfg = update(iter, a, b, αc) + numfg += nfg + if A.α == B.α + return (A.x, A.f, A.∇f, A.ξ, A.α, A.dϕ), (A, B, numfg, true) + end + if αc == B.α + αc = secant(b.α, B.α, b.dϕ, B.dϕ) + a, b, nfg = update(iter, A, B, αc) + numfg += nfg + elseif αc == A.α + αc = secant(a.α, A.α, a.dϕ, A.dϕ) + a, b, nfg = update(iter, A, B, αc) + numfg += nfg + else + a, b = A, B + end + verbosity >= 3 && + @info @sprintf("Linesearch updated bracket (secant2):\n[a,b] = [%.2e, %.2e], dϕᵃ = %.2e, dϕᵇ = %.2e, ϕᵃ - ϕ₀ = %.2e, ϕᵇ - ϕ₀ = %.2e", a.α, b.α, a.dϕ, b.dϕ, a.ϕ - p₀.ϕ, b.ϕ - p₀.ϕ) + if a.α == b.α + return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, true) + end + + # L2 in the Line Search Algorithm: bisection step if secant2 convergence too slow + if b.α - a.α > iter.parameters.γ * dα + a, b, nfg = update(iter, a, b, (a.α + b.α)/2) + numfg += nfg + verbosity >= 3 && + @info @sprintf("Linesearch updated bracket (bisection):\n[a,b] = [%.2e, %.2e], dϕᵃ = %.2e, dϕᵇ = %.2e, ϕᵃ - ϕ₀ = %.2e, ϕᵇ - ϕ₀ = %.2e", a.α, b.α, a.dϕ, b.dϕ, a.ϕ - p₀.ϕ, b.ϕ - p₀.ϕ) + end + if a.α == b.α + return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, true) + elseif (b.α - a.α) < eps(one(a.α)) + @warn "Linesearch bracket converged to a point without satisfying Wolfe conditions?" 
+        return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, true)
+    else
+        return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, false)
+    end
+end
+
+# auxiliary methods
+
+# main function for taking a step and computing the necessary line search quantities
 function takestep(iter, α)
     x, ξ = iter.retract(iter.p₀.x, iter.η₀, α)
     f, ∇f = iter.fdf(x)
@@ -50,36 +256,32 @@ function takestep(iter, α)
     return LineSearchPoint(α, ϕ, dϕ, x, f, ∇f, ξ)
 end

+# secant method for finding the next guess
 secant(a, b, fa, fb) = (a*fb-b*fa)/(fb-fa)
-# function secant2(iter::HagerZhangLineSearchIterator, a::LineSearchPoint, b::LineSearchPoint)
-#     # interval has (a.dϕ < 0, a.ϕ <= f₀+ϵ), (b.dϕ >= 0)
-#     αc = secant(a.α, b.α, a.dϕ, b.dϕ)
-#     A, B = update(iter, a, b, αc)
-#     if αc == B.α
-#         αc = secant(b.α, B.α, b.dϕ, B.dϕ)
-#         return update(iter, A, B, αc)
-#     elseif αc == A.α
-#         αc = secant(a.α, A.α, a.dϕ, A.dϕ)
-#         return update(iter, A, B, αc)
-#     else
-#         return A, B
-#     end
-# end
+# update bracketing interval: interval has (a.dϕ < 0, a.ϕ <= f₀+ϵ), (b.dϕ >= 0)
+# the bisection step is separately implemented in the bisect function
 function update(iter::HagerZhangLineSearchIterator, a::LineSearchPoint, b::LineSearchPoint, αc)
-    # interval has (a.dϕ < 0, a.ϕ <= f₀+ϵ), (b.dϕ >= 0)
+    p₀ = iter.p₀
     c₁ = iter.parameters.c₁
     c₂ = iter.parameters.c₂
     ϵ = iter.parameters.ϵ
+    verbosity = iter.verbosity
     ϕ₀ = iter.p₀.f
     fmax = ϕ₀ + iter.parameters.ϵ
-    !(a.α < αc < b.α) && return a, b, 0 # U0
-    c = takestep(iter, αc)
-    @assert isfinite(c.ϕ)
-    iter.parameters.verbosity > 2 &&
-        @info @sprintf("Linesearch update: try c = %.2e, dϕᶜ = %.2e, ϕᶜ - ϕ₀ = %.2e", c.α, c.dϕ, c.ϕ - ϕ₀)
-    if checkexactwolfe(c, p₀, c₁, c₂) || checkapproxwolfe(c, p₀, c₁, c₂, ϵ)
+
+    !(a.α < αc < b.α) && return a, b, 0 # line U0: secant point not in interval, rely on L2
+
+    # take step
+    c = takestep(iter, αc) # try secant point
+    !isfinite(c.ϕ) && error("Linesearch bracket update: invalid function value for step length $αc")
+
+    ewolfe = checkexactwolfe(c, p₀, c₁, c₂)
+    awolfe = checkapproxwolfe(c, p₀, c₁, c₂, ϵ)
+    verbosity >= 4 &&
+        @info @sprintf("Linesearch update step (U1-U2): c = %.2e, dϕᶜ = %.2e, ϕᶜ - ϕ₀ = %.2e, exact wolfe = %d, approx wolfe = %d", c.α, c.dϕ, c.ϕ - ϕ₀, ewolfe, awolfe)
+    if ewolfe || awolfe
         return c, c, 1
     end
     if c.dϕ >= zero(c.dϕ) # U1
@@ -91,7 +293,6 @@ function update(iter::HagerZhangLineSearchIterator, a::LineSearchPoint, b::LineS
         return a, b, nfg + 1
     end
 end
-
 function bisect(iter::HagerZhangLineSearchIterator, a::LineSearchPoint, b::LineSearchPoint)
     # applied when (a.dϕ < 0, a.ϕ <= f₀+ϵ), (b.dϕ < 0, b.ϕ > f₀+ϵ)
     θ = iter.parameters.θ
@@ -99,58 +300,55 @@ function bisect(iter::HagerZhangLineSearchIterator, a::LineSearchPoint, b::LineS
     c₁ = iter.parameters.c₁
     c₂ = iter.parameters.c₂
     ϵ = iter.parameters.ϵ
+    verbosity = iter.verbosity
     fmax = p₀.f + ϵ
     numfg = 0
     while true
-        if (b.α - a.α) <= eps(max(b.α, one(b.α)))
-            if iter.parameters.verbosity > 0
-                @warn @sprintf("Linesearch bisection failure: [a, b] = [%.2e, %.2e], b-a = %.2e, dϕᵃ = %.2e, dϕᵇ = %.2e, (ϕᵇ - ϕᵃ)/(b-a) = %.2e", a.α, b.α, b.α - a.α, a.dϕ, b.dϕ, (b.ϕ - a.ϕ)/(b.α - a.α))
+        if (b.α - a.α) <= eps(one(a.α))
+            if verbosity > 0
+                @warn @sprintf("Linesearch bisection failure:\n[a, b] = [%.2e, %.2e], b-a = %.2e, dϕᵃ = %.2e, dϕᵇ = %.2e, (ϕᵇ - ϕᵃ)/(b-a) = %.2e", a.α, b.α, b.α - a.α, a.dϕ, b.dϕ, (b.ϕ - a.ϕ)/(b.α - a.α))
             end
             return a, b, numfg
         end
-        αc = (1 - θ) * a.α + θ * b.α
-        c = takestep(iter, αc)
+        αd = (1 - θ) * a.α + θ * b.α
+        d = takestep(iter, αd)
         numfg += 1
-        if
iter.parameters.verbosity > 2 - @info @sprintf( - """Linesearch bisect: [a, b] = [%.2e, %.2e], b-a = %.2e, dϕᵃ = %.2e, dϕᵇ = %.2e, (ϕᵇ - ϕᵃ) = %.2e - ↪︎ c = %.2e, dϕᶜ = %.2e, ϕᶜ - ϕᵃ = %.2e, wolfe = %d, approxwolfe = %d""", - a.α, b.α, b.α-a.α, a.dϕ, b.dϕ, (b.ϕ - a.ϕ), c.α, c.dϕ, c.ϕ - a.ϕ, checkexactwolfe(c, p₀, c₁, c₂), checkapproxwolfe(c, p₀, c₁, c₂, ϵ)) - end - if checkexactwolfe(c, p₀, c₁, c₂) || checkapproxwolfe(c, p₀, c₁, c₂, ϵ) - return c, c, numfg + ewolfe = checkexactwolfe(d, p₀, c₁, c₂) + awolfe = checkapproxwolfe(d, p₀, c₁, c₂, ϵ) + verbosity >= 4 && + @info @sprintf("Linesearch bisection update (U3):\nd = %.2e, dϕᵈ = %.2e, ϕᵈ - ϕ₀ = %.2e, exact wolfe = %d, approx wolfe = %d", d.α, d.dϕ, d.ϕ - p₀.ϕ, ewolfe, awolfe) + if ewolfe || awolfe + return d, d, numfg end - if c.dϕ >= 0 # U3.a - return a, c, numfg - elseif c.ϕ <= fmax # U3.b - a = c - else # U3.c - # a′ = takestep(iter, a.α) - # @info @sprintf("""ϕᵃ = %.2e, ϕᵃ′ = %.2e""", a.ϕ, a′.ϕ) - b = c + if d.dϕ >= 0 # Line U3.a + return a, d, numfg + elseif d.ϕ <= fmax # Line U3.b + a = d + else # Line U3.c + b = d end end end +# bracket function for finding an initial bracketing interval from a given initial step function bracket(iter::HagerZhangLineSearchIterator{T}, c::LineSearchPoint) where {T} numfg = 0 p₀ = iter.p₀ c₁ = iter.parameters.c₁ c₂ = iter.parameters.c₂ ϵ = iter.parameters.ϵ + verbosity = iter.verbosity a = p₀ fmax = a.f + ϵ - iter.parameters.verbosity > 2 && - @info @sprintf("Linesearch start: dϕ₀ = %.2e, ϕ₀ = %.2e", a.dϕ, a.ϕ) + α = c.α while true while !(isfinite(c.ϕ) && isfinite(c.dϕ)) α = (a.α + α)/2 c = takestep(iter, α) numfg += 1 - end - if iter.parameters.verbosity > 2 - @info @sprintf("Linesearch bracket: try c = %.2e, dϕᶜ = %.2e, ϕᶜ - ϕ₀ = %.2e, wolfe = %d, approxwolfe = %d", c.α, c.dϕ, c.ϕ - p₀.ϕ, checkexactwolfe(c, p₀, c₁, c₂), checkapproxwolfe(c, p₀, c₁, c₂, ϵ)) + verbosity >= 4 && + @info @sprintf("Linesearch bracket step: c = %.2e, dϕᶜ = %.2e, ϕᶜ - ϕ₀ = %.2e", c.α, c.dϕ, c.ϕ - p₀.ϕ) end c.dϕ >= 0 && return a, c, numfg# B1 # from here: c.dϕ < 0 @@ -162,120 +360,11 @@ function bracket(iter::HagerZhangLineSearchIterator{T}, c::LineSearchPoint) wher α *= iter.parameters.ρ c = takestep(iter, α) numfg += 1 + verbosity >= 4 && + @info @sprintf("Linesearch bracket step: c = %.2e, dϕᶜ = %.2e, ϕᶜ - ϕ₀ = %.2e", c.α, c.dϕ, c.ϕ - p₀.ϕ) # if checkexactwolfe(c, p₀, c₁, c₂) || checkapproxwolfe(c, p₀, c₁, c₂, ϵ) # return c, c, numfg # end end end end - - -function Base.iterate(iter::HagerZhangLineSearchIterator) - c₁ = iter.parameters.c₁ - c₂ = iter.parameters.c₂ - ϵ = iter.parameters.ϵ - p₀ = iter.p₀ - a = takestep(iter, iter.α₀) - if iter.acceptfirst - if checkexactwolfe(a, p₀, c₁, c₂) || checkapproxwolfe(a, p₀, c₁, c₂, ϵ) - return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, a, 1, true) - end - end - a, b, numfg = bracket(iter, a) - numfg += 1 # from takestep few lines above - if a.α == b.α - return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, true) - elseif (b.α - a.α) < eps(one(a.α)) - @warn "Linesearch bracket converged to a point without satisfying Wolfe conditions?" 
- return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, true) - else - return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, false) - end -end - -function Base.iterate(iter::HagerZhangLineSearchIterator, state::Tuple{LineSearchPoint,LineSearchPoint,Int,Bool}) - c₁ = iter.parameters.c₁ - c₂ = iter.parameters.c₂ - ϵ = iter.parameters.ϵ - p₀ = iter.p₀ - - a, b, numfg, done = state - if done - return nothing - end - dα = b.α - a.α - # secant2 step - αc = secant(a.α, b.α, a.dϕ, b.dϕ) - A, B, nfg = update(iter, a, b, αc) - numfg += nfg - if A.α == B.α - return (A.x, A.f, A.∇f, A.ξ, A.α, A.dϕ), (A, B, numfg, true) - end - if αc == B.α - αc = secant(b.α, B.α, b.dϕ, B.dϕ) - a, b, nfg = update(iter, A, B, αc) - numfg += nfg - elseif αc == A.α - αc = secant(a.α, A.α, a.dϕ, A.dϕ) - a, b, nfg = update(iter, A, B, αc) - numfg += nfg - else - a, b = A, B - end - if a.α == b.α - return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, true) - end - # end secant2 - if b.α - a.α > iter.parameters.γ * dα - a, b, nfg = update(iter, a, b, (a.α + b.α)/2) - numfg += nfg - end - if a.α == b.α - return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, true) - elseif (b.α - a.α) < eps(one(a.α)) - @warn "Linesearch bracket converged to a point without satisfying Wolfe conditions?" - return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, true) - else - return (a.x, a.f, a.∇f, a.ξ, a.α, a.dϕ), (a, b, numfg, false) - end -end - -HagerZhangLineSearch(; c₁::Real = 1//10, c₂::Real = 9//10, ϵ::Real = 1//10^6, - θ::Real = 1//2, γ::Real = 2//3, ρ::Real = 5//1, - maxiter = typemax(Int), verbosity::Int = 0) = - HagerZhangLineSearch(promote(c₁, c₂, ϵ, θ, γ, ρ)..., maxiter, verbosity) - -function (ls::HagerZhangLineSearch)(fg, x₀, η₀, fg₀ = fg(x₀); - retract = _retract, inner = _inner, - initialguess = one(fg₀[1]), acceptfirst = false) - - (f₀, g₀) = fg₀ - df₀ = inner(x₀, g₀, η₀) - if df₀ >= zero(df₀) - error("linesearch was not given a descent direction!") - end - p₀ = LineSearchPoint(zero(f₀), f₀, df₀, x₀, f₀, g₀, η₀) - iter = HagerZhangLineSearchIterator(fg, retract, inner, p₀, η₀, initialguess, acceptfirst, ls) - next = iterate(iter) - @assert next !== nothing - k = 1 - while true - (x, f, g, ξ, α, dϕ), state = next - a, b, numfg, done = state - if done - ls.verbosity >= 1 && - @info @sprintf("Linesearch converged after %2d iterations: α = %.2e, dϕ = %.2e, ϕ - ϕ₀ = %.2e", k, α, dϕ, f - f₀) - return x, f, g, ξ, α, numfg - elseif k == ls.maxiter - ls.verbosity >= 1 && - @info @sprintf("Linesearch not converged after %2d iterations: α = %.2e, dϕ = %.2e, ϕ - ϕ₀ = %.2e", k, α, dϕ, f - f₀) - return x, f, g, ξ, α, numfg - else - ls.verbosity >= 2 && - @info @sprintf("Linesearch step %d: [a,b] = [%.2e, %.2e], dϕᵃ = %.2e, dϕᵇ = %.2e, ϕᵃ - ϕ₀ = %.2e, ϕᵇ - ϕ₀ = %.2e", k, a.α, b.α, a.dϕ, b.dϕ, a.ϕ - f₀, b.ϕ - f₀) - next = iterate(iter, state) - @assert next !== nothing - k += 1 - end - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 6873c8b..654ce52 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,8 +14,8 @@ using LinearAlgebra global c₁ = 0.5*rand() global c₂ = 0.5 + 0.5*rand() - ls = HagerZhangLineSearch(; c₁ = c₁, c₂ = c₂, ϵ = 0, ρ = 1.5, verbosity = 0) - x, f, g, ξ, α, numfg = ls(fg, x₀, -g₀) + ls = HagerZhangLineSearch(; c₁ = c₁, c₂ = c₂, ϵ = 0, ρ = 1.5) + x, f, g, ξ, α, numfg = ls(fg, x₀, -g₀; verbosity = 4) @test f ≈ fg(x)[1] @test g ≈ fg(x)[2] @@ -23,7 +23,7 @@ using LinearAlgebra @test dot(ξ,g) >= c₂*dot(ξ,g₀) @test f <= f₀ + α * c₁ * dot(ξ, g₀) || (2*c₁ - 1)*dot(ξ,g₀) > dot(ξ,g) - x, f, g, ξ, α, numfg 
= ls(fg, x₀, -g₀; initialguess = 1e-4) # test extrapolation phase + x, f, g, ξ, α, numfg = ls(fg, x₀, -g₀; initialguess = 1e-4, verbosity = 2) # test extrapolation phase @test f ≈ fg(x)[1] @test g ≈ fg(x)[2] @@ -31,7 +31,7 @@ using LinearAlgebra @test dot(ξ,g) >= c₂*dot(ξ,g₀) @test f <= f₀ + α * c₁ * dot(ξ, g₀) || (2*c₁ - 1)*dot(ξ,g₀) > dot(ξ,g) - x, f, g, ξ, α, numfg = ls(fg, x₀, -g₀; initialguess = 1e4) # test infinities + x, f, g, ξ, α, numfg = ls(fg, x₀, -g₀; initialguess = 1e4, verbosity = 0) # test infinities @test f ≈ fg(x)[1] @test g ≈ fg(x)[2]
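For reference, the Wolfe tests exercised above can be summarised as two scalar predicates. The following is an illustrative sketch (not code from this package) in terms of ϕ(α) and dϕ(α), the objective value and slope along the search path:

    # Exact (first-order) Wolfe conditions, cf. checkexactwolfe:
    # sufficient decrease (Armijo) plus the curvature condition.
    exactwolfe(ϕα, dϕα, ϕ₀, dϕ₀, α, c₁, c₂) =
        (ϕα <= ϕ₀ + c₁ * α * dϕ₀) && (dϕα >= c₂ * dϕ₀)

    # Approximate Wolfe conditions (Hager & Zhang, 2006), cf. checkapproxwolfe:
    # the Armijo test is replaced by a slope test, applicable while ϕα <= ϕ₀ + ϵ.
    approxwolfe(ϕα, dϕα, ϕ₀, dϕ₀, c₁, c₂, ϵ) =
        ((2 * c₁ - 1) * dϕ₀ >= dϕα) && (dϕα >= c₂ * dϕ₀) && (ϕα <= ϕ₀ + ϵ)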