jiahao/Optimizer.py

## Optimizer.py
#!/usr/bin/env python

'''
Pure Python implementation of some numerical optimizers

Created on Jan 21, 2011

@author Jiahao Chen
'''

from scipy import dot, eye, finfo, asarray_chkfinite, sqrt, \
    outer, zeros, isinf, arccos
from scipy.linalg import inv, norm, pinv, solve, LinAlgError
from copy import deepcopy

_epsilon = sqrt(finfo(float).eps)

#################
# Line searches #
#################
def MinpackLineSearch(f, df, x, p, df_x = None, f_x = None,
                args = (), c1 = 1e-4, c2 = 0.4, amax = 50, xtol = _epsilon):
    "Modified shamelessly from scipy.optimize"

    if df_x == None: df_x = df(x)
    if f_x == None: f_x = f(x)
    from scipy.optimize import line_search
    stp, _, _, _, _, _ = line_search(f, df, x, p, df_x, f_x, f_x, args, c1, c2, amax)
    if stp == None: #Fall back to backtracking linesearch
        return BacktrackingLineSearch(f, df, x, p, df_x, f_x, args)
    else:
        return stp


def MoreThuenteLinesearchIterate(delta, alpha_l, alpha_u, alpha_t, alpha_min,
            alpha_max, f_l, g_l, f_t, g_t, f_u, g_u, isBracketed, Verbose):
    #Section 4 of More and Thuente: Pick new trial alpha
    if Verbose: print 'Section 4 of More and Thuente'
    assert not (isBracketed and not min(alpha_l, alpha_u) < alpha_t < max(alpha_l, alpha_u)), 'Bracketing fail'
    #if g_t * (alpha_t - alpha_l) < 0.0:
    #    if Verbose: print 'Going uphill; reverting to last good point'
    #    return isBracketed, alpha_l, alpha_l, alpha_t

    if alpha_min == alpha_max:
        print 'Squeezed!'
        return isBracketed, alpha_min, alpha_min, alpha_min

    assert alpha_min < alpha_max

    sgnd = dot(g_t, g_l / abs(g_l))
    isBounded = False
    if f_t > f_l:
        if Verbose:
            print """Case 1: a higher function value. \
The minimum is bracketed by [a_l, a_t]. If the cubic step is closer to \
lower bound then take it, else take average of quadratic and cubic steps."""
        #Calculate cubic and quadratic steps
        da = alpha_t - alpha_l
        d1 = g_l + g_t + 3 * (f_l - f_t) / da
        s = max(g_l, g_t, d1)
        d2 = s * ((d1 / s) ** 2 - (g_l / s * g_t / s)) ** 0.5
        if alpha_t < alpha_l:
            d2 = -d2
        cubp = d2 - g_l + d1
        cubq = ((d2 - g_l) + d2) + g_t
        if abs(cubq) > _epsilon:
            r = cubp / cubq
            #Minimizer of cubic that interpolates f_l, f_t, g_l, g_t
            alpha_c = alpha_l + r * da
            #Minimizer of quadratic that interpolates f_l, f_t, g_l
            alpha_q = alpha_l + ((g_l / ((f_l - f_t) / da + g_l)) * 0.5 * da)
        else:
            #print 'WARNING: VERY SMALL INTERVAL'
            alpha_c = alpha_q = 0.5 * (alpha_l + alpha_t) #Invalid, choose midpoint
        #Calculate new trial
        if Verbose:
            print 'Interpolating guesses:', alpha_c, alpha_q
        if abs(alpha_c - alpha_l) < abs(alpha_q - alpha_l):
            alpha_tt = alpha_c
        else:
            alpha_tt = 0.5 * (alpha_c + alpha_q)
        isBracketed = True
        isBounded = True

    elif sgnd < 0: #f_t <= f_l assumed by here
        if Verbose:
            print """Case 2: lower function value and derivatives are of opposite sign. \
The minimum is bracketed by [a_l, a_t]. If the cubic step is closer to lower \
bound then take it, else take quadratic step."""
        #Calculate cubic and quadratic steps
        da = alpha_t - alpha_l
        d1 = g_l + g_t + 3 * (f_l - f_t) / da
        s = max(g_l, g_t, d1)
        d2 = s * ((d1 / s) ** 2 - (g_l / s * g_t / s)) ** 0.5
        if alpha_t < alpha_l:
            d2 = -d2
        cubp = d2 - g_t + d1
        cubq = ((d2 - g_t) + d2) + g_l
        if abs(cubq) > _epsilon:
            r = cubp / cubq #Minimizer of cubic that interpolates f_l, f_t, g_l, g_t
            alpha_c = alpha_t - r * da #Minimizer of quadratic that interpolates f_l, f_t, g_l
            alpha_s = alpha_t - (g_l / (g_l - g_t)) * da
        else:
            #print 'WARNING: VERY SMALL INTERVAL'
            alpha_s = alpha_c = 0.5 * (alpha_l + alpha_t) #Invalid, choose midpoint
        #Calculate new trial
        if Verbose:
            print 'Interpolating guesses:', alpha_c, alpha_s
        if abs(alpha_c - alpha_t) >= abs(alpha_s - alpha_t):
            alpha_tt = alpha_c
        else:
            alpha_tt = alpha_s
        isBracketed = True
        isBounded = False
    elif norm(g_t) <= norm(g_l): #g_t * g_l >= 0 and f_t <= f_l assumed by here
        if Verbose:
            print """Case 3: lower function value and derivatives are of same \
sign, and magnitude of derivative decreased.\n Cubic step used only if it tends \
to infinity in direction of step or if minimum of cubic lies beyond step. \
Otherwise cubic step is defined to be either stpmin or stpmax. Quadratic step is \
taken if it is closer to alpha_t, else take the farthest step."""
        #Calculate cubic and quadratic steps
        da = alpha_t - alpha_l
        d1 = g_l + g_t + 3 * (f_l - f_t) / da
        s = max(g_l, g_t, d1)
        d2 = s * ((d1 / s) ** 2 - (g_l / s * g_t / s)) ** 0.5
        if alpha_t < alpha_l:
            d2 = -d2

        cubp = d2 - g_t + d1
        cubq = ((d2 - g_t) + d2) + g_l
        r = cubp / cubq
        #The case d2 == 0 arises only if cubic does not tend to infinity in
        #direction of step
        if (r < 0. and abs(d2) < _epsilon):
            alpha_c = alpha_t - r * da
        elif alpha_t > alpha_l:
            alpha_c = alpha_max
        else:
            alpha_c = alpha_min

        #Minimizer of quadratic that interpolates f_l, f_t, g_l
        alpha_s = -alpha_t - (g_l / (g_l - g_t)) * da

        if isBracketed:
            if abs(alpha_c - alpha_t) < abs(alpha_s - alpha_t):
                alpha_tt = alpha_c
            else:
                alpha_tt = alpha_s
        else:
            if abs(alpha_c - alpha_t) > abs(alpha_s - alpha_t):
                alpha_tt = alpha_c
            else:
                alpha_tt = alpha_s
    else:
        if Verbose:
            print """Case 4: lower function value and derivatives \
are of same sign, and magnitude of derivative did not decrease.
The minimum is NOT NECESSARILY bracketed. \
If bracketed, take cubic step, otherwise take  alpha_min or alpha_max."""
        #Minimizer of cubic that interpolates f_u, f_t, g_u, g_t
        #Formula from Nocedal and Wright 2/e, Eq. 3.59, p 59
        if isBracketed:
            da = alpha_t - alpha_u
            d1 = g_u + g_t + 3 * (f_u - f_t) / da
            s = max(g_l, g_t, d1)
            d2 = s * ((d1 / s) ** 2 - (g_l / s * g_t / s)) ** 0.5
            if alpha_t < alpha_u:
                d2 = -d2
            cubp = d2 - g_t + d1
            cubq = ((d2 - g_t) + d2) + g_u
            if abs(cubq) > _epsilon:
                r = cubp / cubq
                alpha_tt = alpha_t - r * da
            else:
                #print 'WARNING: VERY SMALL INTERVAL'
                alpha_tt = 0.5 * (alpha_l + alpha_t) #Invalid, choose midpoint
        else:
            if alpha_t > alpha_l:
                alpha_tt = alpha_max
            else:
                alpha_tt = alpha_min

        isBounded = False #abs(g_t) > abs(g_l)

    if Verbose: print 'Now proceeding to update'

    #Update interval of uncertainty
    if f_t > f_l:
        if Verbose: print 'Update case 1'
        alpha_u, f_u, g_u = alpha_t, f_t, g_t
    else:
        if sgnd < 0:
            if Verbose: print 'Update case 2'
            alpha_u, f_u, g_u = alpha_l, f_l, g_l
        else:
            if Verbose: print 'Update case 3'
        alpha_l, f_l, g_l = alpha_t, f_t, g_t


    """Refine trial value if it lies outside [alpha_t, alpha_u] or
    is otherwise too close to alpha_u."""
    alpha_tt = min(alpha_max, alpha_tt)
    alpha_tt = max(alpha_min, alpha_tt) #Update
    alpha_t = alpha_tt
    if isBracketed and isBounded:
        if alpha_u > alpha_l:
            alpha_t = min(alpha_l + delta * (alpha_u - alpha_l), alpha_t)
        else:
            alpha_t = max(alpha_l + delta * (alpha_u - alpha_l), alpha_t)

    if Verbose: print 'Refined trial value = ', alpha_t

    return isBracketed, alpha_t, alpha_l, alpha_u


def MoreThuenteLineSearch(f, df, x, p, df_x = None, f_x = None, args = (),
    mu = 0.01, eta = 0.5, ftol = _epsilon, gtol = _epsilon, rtol = _epsilon,
    maxiter = 9, fmin = 0.0, alpha_min0 = 0.001, alpha_max0 = 100.0, alpha_t = 1.0,
    p5 = 0.5, xtrapf = 4.0, delta = 0.66, Verbose = False):
    """Implements the line search algorithm of Mor\'e and Thuente, ACM TOMS 1994
    "Line search algorithms with guaranteed sufficient decrease"
    doi: 10.1145/192115.192132

    Inputs
    ------
    f
    df
    x
    p
    df_x = None
    f_x = None
    args = ()
    mu = 0.01
    mu is maximum required decrease
     mu >= |f(x+dx) - f(x)| / |df(x).p|
     mu in (0,1)
    eta = 0.5
    eta is maximum permissible value of |df(x+dx).p|/|df(x).p|
    eta in (0,1)
    ftol = _epsilon
    gtol = _epsilon
    rtol = _epsilon
    rtol is relative tolerance for acceptable step
    maxiter: Maximum number of iterations. Default: 9
    fmin = 0.0
    alpha_min0 = 0.001
    alpha_max0 = 100.0
    alpha_t = 1.0
    p5 = 0.5
    xtrapf: An extrapolation parameter. Recommended: between [1.1, 4.0].
            Default: 4.0
    delta = 0.66
    Verbose = True
    """

    ##################################
    # Evaluate function and derivative
    ##################################
    if f_x == None: f_0 = f(x, *args)
    else:           f_0 = f_x

    if df_x == None: df_0 = df(x, *args)
    else:            df_0 = df_x

    g_0 = dot(df_0, p)

    ############################################################
    # Check that initial gradient in search direction is downhill
    #############################################################
    assert g_0.shape == (1, 1) or g_0.shape == (), 'Scipy screwed up the calculation.'
    assert g_0 < 0, 'Attempted to linesearch uphill'

    ############
    # Initialize
    ############
    FoundGoodPoint = isBracketed = False
    gtest = ftol * g_0
    width = alpha_max0 - alpha_min0
    width1 = width / p5

    f_t = f(x + alpha_t * p, *args)
    df_t = df(x + alpha_t * p, *args)
    g_t = dot(df_t, p)

    # alpha = step size
    # f = function value
    # g = gradient value in descent direction
    # _l = best step so far
    # _u = other endpoint of interval of uncertainty
    # _t = current step iterate
    alpha_l = alpha_u = 0.0
    f_l = f_u = f_0
    g_l = g_u = g_0

    if alpha_l == alpha_min0: alpha_l += _epsilon
    if alpha_u == alpha_max0: alpha_u -= _epsilon
    assert alpha_min0 != alpha_max0

    for k in range(maxiter):
        if Verbose: print 'Iteration', k, ':', alpha_t, 'in (', alpha_l, ',', alpha_u, ')'

        ############################################
        # Initialize current interval of uncertainty
        ############################################
        if isBracketed:
            alpha_min = min(alpha_l, alpha_u)
            alpha_max = max(alpha_l, alpha_u)
        else:
            alpha_min = alpha_l
            alpha_max = alpha_t + xtrapf * abs(alpha_t - alpha_l)
            assert alpha_min <= alpha_max

        ##################################################
        # Safeguard trial step within pre-specified bounds
        ##################################################
        alpha_t = max(alpha_t, alpha_min0)
        alpha_t = min(alpha_t, alpha_max0)

        #If something funny happened, set it to lowest point obtained thus far
        if ((isBracketed and (alpha_t <= alpha_min or alpha_t >= alpha_max)) \
         or (isBracketed and (alpha_max - alpha_min <= rtol * alpha_max)) \
            ):
            if Verbose: print 'Something funny happened'
            return alpha_l


        #Evaluate function and gradient at new point
        f_t = f(x + alpha_t * p, *args)
        df_t = df(x + alpha_t * p, *args)
        g_t = dot(df_t, p)
        ftest1 = f_0 + alpha_t * gtest

        ######################
        # Test for convergence
        ######################

        if (alpha_l == alpha_max0 and f_t <= ftest1 and g_t <= gtest):
            if Verbose: print 'Stuck on upper bound'
            break
        if (alpha_l == alpha_min0 and f_t > ftest1 and g_t >= gtest):
            if Verbose: print 'Stuck on lower bound'
            break
        #Check for a) sufficient decrease AND b) strong curvature criterion
        if ((f_t <= f_0 + mu * g_0 * alpha_t) \
            and abs(g_t) <= eta * abs(g_0)):
            if Verbose: print 'Decrease criterion satisfied'
            break

        #Check for bad stuff
        if isBracketed:
            if (abs(alpha_max - alpha_min) >= delta * width1):
                print 'Warning: Could not satisfy curvature conditions'
                alpha_t = alpha_min + p5 * (alpha_max - alpha_min)
            width1 = width
            width = abs(alpha_max - alpha_min)

        #If interval is bracketed to sufficient precision, break
        if k > 0 and abs(alpha_l - alpha_u) < rtol:
            if Verbose: print 'Interval bracketed: alpha = ', alpha_t
            print 'Line search stuck. Emergency abort.'
            break

        ########################################################
        # Nocedal's modification of the More-Thuente line search
        ########################################################
        ftest1 = f_0 + alpha_l * gtest

        # Seek a step for which the modified function has a nonpositive value
        # and a nonnegative derivative

        if (not FoundGoodPoint) and f_t <= ftest1 and g_t >= g_0 * min(ftol, gtol):
            FoundGoodPoint = True

        # The modified function is used only if we don't have a step where the
        # previous condition is attained, and if a lower function value has been
        # obtained but it is not low enough

        if (not FoundGoodPoint) and f_t <= f_l and f_t > ftest1:
            if Verbose: print "Performing Nocedal's modification"
            f_mt = f_t - alpha_t * gtest
            f_ml = f_l - alpha_l * gtest
            f_mu = f_u - alpha_u * gtest
            g_mt = g_t - gtest
            g_ml = g_l - gtest
            g_mu = g_u - gtest

            isBracketed, alpha_t, alpha_l, alpha_u = MoreThuenteLinesearchIterate\
            (delta, alpha_l, alpha_u, alpha_t, alpha_min, alpha_max, f_ml, \
             g_ml, f_mt, g_mt, f_mu, g_mu, isBracketed, Verbose)

            f_l = f_ml + alpha_l * gtest
            f_u = f_mu + alpha_u * gtest
            g_l = g_ml + gtest
            g_u = g_mu + gtest

        else:
            if Verbose: print 'Performing original More-Thuente line search'
            isBracketed, alpha_t, alpha_l, alpha_u = MoreThuenteLinesearchIterate\
            (delta, alpha_l, alpha_u, alpha_t, alpha_min, alpha_max, f_l, g_l, \
             f_t, g_t, f_u, g_u, isBracketed, Verbose)

        ###########################
        # Force sufficient decrease
        ###########################
        if isBracketed:
            if Verbose: print 'Force sufficient decrease'
            if abs(alpha_u - alpha_l) >= delta * width1:
                alpha_t = alpha_l + p5 * (alpha_u - alpha_l)
            width1 = width
            width = abs(alpha_u - alpha_l)


    if Verbose: print k + 1, 'iterations in More-Thuente line search'
    return alpha_t


def CubicLineSearch(f, df, x, p, df_x, f_x = None, args = (),
        alpha = 0.0001, beta = 0.9, eps = 0.0001, Verbose = False):

    if f_x is None:
        f_x = f(x, *args)

    assert df_x.T.shape == p.shape
    assert 0 < alpha < 1, 'Invalid value of alpha in backtracking linesearch'
    assert 0 < beta < 1, 'Invalid value of beta in backtracking linesearch'


    phi = lambda c: f(x + (c * p), *args)

    phi0 = f_x
    derphi0 = dot(df_x, p)

    assert derphi0.shape == (1, 1) or derphi0.shape == ()
    assert derphi0 < 0, 'Attempted to linesearch uphill'

    stp = 1.0

    #Loop until Armijo condition is satisfied
    while phi(stp) > phi0 + alpha * stp * derphi0:
        #Quadratic interpolant
        stp_q = -derphi0 * alpha ** 2 / (2 * phi(stp) - phi0 - derphi0 * stp)
        if phi(stp_q) > f_x + alpha * stp_q * derphi0:
            return stp_q

        #Quadratic interpolant bad, use cubic interpolant
        A = zeros((2, 2))
        b = zeros((2,))

        A[0, 0] = stp * stp
        A[0, 1] = -stp_q ** 2
        A[1, 0] = -stp ** 3
        A[1, 1] = stp_q ** 3

        b[0] = phi(stp_q) - phi0 - derphi0 * stp_q
        b[1] = phi(stp) - phi0 - derphi0 * stp

        xx = dot(A, b) / ((stp * stp_q) ** 2 * (stp_q - stp))

        stp_c = (-xx[1] + (xx[1] ** 2 - 3 * xx[0] * derphi0) ** 0.5) / (3 * xx[0])

        #Safeguard: if new alpha too close to predecessor or too small
        if abs(stp_c - stp) < 0.001 or stp_c < 0.001:
            stp *= beta
        else:
            stp = stp_c
        #print stp

    #print stp
    return stp


def BacktrackingLineSearch(f, df, x, p, df_x = None, f_x = None, args = (),
        alpha = 0.0001, beta = 0.9, eps = _epsilon, Verbose = False):
    """
    Backtracking linesearch
    f: function
    x: current point
    p: direction of search
    df_x: gradient at x
    f_x = f(x) (Optional)
    args: optional arguments to f (optional)
    alpha, beta: backtracking parameters
    eps: (Optional) quit if norm of step produced is less than this
    Verbose: (Optional) Print lots of info about progress

    Reference: Nocedal and Wright 2/e (2006), p. 37

    Usage notes:
    -----------
    Recommended for Newton methods; less appropriate for quasi-Newton or conjugate gradients
    """

    if f_x is None:
        f_x = f(x, *args)
    if df_x is None:
        df_x = df(x, *args)

    assert df_x.T.shape == p.shape
    assert 0 < alpha < 1, 'Invalid value of alpha in backtracking linesearch'
    assert 0 < beta < 1, 'Invalid value of beta in backtracking linesearch'

    derphi = dot(df_x, p)

    assert derphi.shape == (1, 1) or derphi.shape == ()
    assert derphi < 0, 'Attempted to linesearch uphill'

    stp = 1.0
    fc = 0
    len_p = norm(p)


    #Loop until Armijo condition is satisfied
    while f(x + stp * p, *args) > f_x + alpha * stp * derphi:
        stp *= beta
        fc += 1
        if Verbose: print 'linesearch iteration', fc, ':', stp, f(x + stp * p, *args), f_x + alpha * stp * derphi
        if stp * len_p < eps:
            print 'Step is  too small, stop'
            break
    #if Verbose: print 'linesearch iteration 0 :', stp, f_x, f_x

    if Verbose: print 'linesearch done'
    #print fc, 'iterations in linesearch'
    return stp


#######################
# Trust region models #
#######################

class DogLeg:
    def __init__(self, f, df, B, x, g, f_x = None, df_x = None, TrustRadius = 1.0):
        self.TrustRadius = TrustRadius
        self.f = f   #Function
        self.df = df #Gradient function
        self.B = B #Approximate Hessian
        self.x = x #Current point
        self.g = g #Search direction

    def solve(self):
        g = self.g
        B = self.B
        #Step 1: Find unconstrained solution
        pB = g
        #Step 2: Find Cauchy point
        pU = -dot(g, g) / dot(g, dot(B, g)) * g
        #INCOMPLETE


################
# Extrapolator #
################

class DIISExtrapolator:
    def __init__(self, max = 300):
        self.maxvectors = max
        self.Reset()

    def Reset(self):
        self.errorvectors = []
        self.coordvectors = []

    def AddData(self, error, coord):
        #Check max
        while self.GetNumVectors() - self.maxvectors >= 0:
            self.errorvectors.pop()
            self.coordvectors.pop()

        from numpy import asarray
        #print asarray(error).flatten().shape, asarray(coord).flatten().shape

        self.errorvectors.append(deepcopy(asarray(error).flatten()))
        self.coordvectors.append(deepcopy(asarray(coord).flatten()))

    def GetNumVectors(self):
        assert len(self.errorvectors) == len(self.coordvectors)
        return len(self.errorvectors)

    def Extrapolate(self):
        #Construct Overlap matrix (aka Gram matrix)
        N = self.GetNumVectors()
        if N == 0: return None

        B = zeros((N + 1, N + 1))
        for i in range(N):
            for j in range(N):
                B[i, j] = dot(self.errorvectors[i], self.errorvectors[j])

        B[N, :] = -1
        B[:, N] = -1
        B[N, N] = 0

        v = zeros(N + 1)
        v[N] = -1

        #Solve for linear combination
        xex = self.coordvectors[0] * 0
        try:
            c = solve(B, v)
            assert c.shape == v.shape

            #Generate interpolated coordinate
            for i in range(N):
                xex += c[i] * self.coordvectors[i]
        except: #Linear dependency detected; trigger automatic reset
            c = dot(pinv(B), v)
            #Generate interpolated coordinate
            for i in range(N):
                xex += c[i] * self.coordvectors[i]
            print 'Linear dependency detected in DIIS; resetting'
            self.Reset()

        return xex


#######################
# Quasi-Newton driver #
#######################


class ApproxHessianBase:
    def __init__(self, N = None):
        if N is None:
            self.M = None
        else:
            self.M = eye(N)

    def Reset(self):
        self.SetToIdentity()

    def SetToIdentity(self):
        self.M = eye(self.M.shape[0])

    def SetToScaledIdentity(self, c = 1):
        self.M = c * eye(self.M.shape[0])

    def SetToMatrix(self, K):
        self.M = K

    def GetHessian(self):
        return self.M

    def Operate(self, g):
        assert False, 'Should never run this'

    def Update(self, *args):
        assert False, 'Should never run this'

    def __rshift__(self, g): #Overload >> operator
        """If self.B is defined, solves for x in B x = -g
        """
        try:
            dx = self.Operate(g)
        except ValueError, e:
            print e
            print g.shape, self.M.shape
            exit()

        try:
            return asarray_chkfinite(dx)
        except (LinAlgError, ValueError):
            print 'Restarting approximate Hessian'
            'Trigger restart, fall back to steepest descent'
            self.SetToIdentity()
            return g


class ApproxHessian(ApproxHessianBase):
    def __init__(self, N = None):
        ApproxHessianBase.__init__(self, N)

    def SetToNumericalHessian(self, df, x, step = 0.001, UseStencil = None):
        if UseStencil is None:
            import Stencil
            UseStencil = Stencil.FirstOrderCentralDifferenceStencil()
        #Initialize Hessian as numerical Hessian
        self.M = UseStencil.ApplyToFunction(df, x, step)

    def Operate(self, g):
        """If self.B is defined, solves for x in B x = -g
        """
        try:
            dx = -solve(self.M, g)
        except LinAlgError:
            print 'Warning, using pseudoinverse'
            dx = -pinv(self.M, g)
        return dx


class ApproxInvHessian(ApproxHessianBase):
    def __init__(self, N = None):
        ApproxHessianBase.__init__(self, N)

    def SetToNumericalHessian(self, df, x, step = 0.001, UseStencil = None):
        if UseStencil is None:
            import Stencil
            UseStencil = Stencil.FirstOrderCentralDifferenceStencil()
        #Initialize Hessian as numerical Hessian
        K = UseStencil.ApplyToFunction(df, x, step)
        try:
            self.M = inv(K)
        except LinAlgError:
            self.M = pinv(K)

    def GetHessian(self):
        try:
            return inv(self.M)
        except LinAlgError:
            print 'Warning, using pseudoinverse'
            return pinv(self.M)

    def Operate(self, g):
        """If self.H is defined, returns x = H g
        """
        return - dot(self.M, g)


class BFGSInvHessian(ApproxInvHessian):
    def Update(self, s, y, s_dot_y = None):
            if s_dot_y is None: s_dot_y = dot(s, y)
            rhok = 1.0 / s_dot_y
            I = eye(self.M.shape[0])
            A1 = I - outer(s, y) * rhok
            A2 = I - outer(y, s) * rhok
            self.M = dot(A1, dot(self.M, A2))
            self.M += +rhok * outer(s, s)


class LBFGSInvHessian(ApproxInvHessian):
    def __init__(self, M = 20):
        self.maxnumvecs = int(M)
        self.rho = []
        self.gamma = 1.0
        self.Reset()

        print 'Initialized L-BFGS Hessian approximation with M =', self.maxnumvecs

    def Reset(self):
        self.s = []
        self.y = []
        self.rho = []

    def SetToIdentity(self): #Reset
        self.Reset()

    def SetToScaledIdentity(self, c = None): #Reset
        self.Reset()
        if c is not None: self.gamma = c

    def Update(self, s, y, s_dot_y = None):
        if len(self.s) == self.maxnumvecs:
            self.s.pop(0)
            self.y.pop(0)
            self.rho.pop(0)

        if s_dot_y is None: s_dot_y = dot(s, y)
        rho = 1.0 / s_dot_y
        if isinf(rho):
            print 'Warning, s . y = 0; ignoring pair'
        else:
            self.s.append(s)
            self.y.append(y)
            self.rho.append(rho)

    def GetNumVecs(self):
        assert len(self.s) == len(self.y)
        return len(self.s)

    def __rshift__(self, g): #Overload >> as multiply -H g
        q = g
        m = self.GetNumVecs()
        s = self.s
        y = self.y
        a = zeros(m)
        for i in range(m - 1, -1, -1):
            a[i] = self.rho[i] * dot(s[i], q)
            q = q - a[i] * y[i]

        #r = Hguess * q
        gamma = self.gamma
        if m > 0: gamma = dot(s[-1], y[-1]) / dot(y[-1], y[-1])
        #if m > 0: gamma = 0.5 * gamma + 0.5 * dot(s[-1], y[-1]) / dot(y[-1], y[-1])
        self.gamma = gamma
        print 'gamma=',gamma
        r = gamma * q
        for i in range(m):
            b = self.rho[i] * dot(y[i], r)
            r = r + s[i] * (a[i] - b)

        return - r


def Optimizer(f, x0, df = None, xtol = 0.1*_epsilon**0.5,
             gtol =  0.1*_epsilon**0.5, maxstep = 1, maxiter = None,
             line_search = BacktrackingLineSearch, DIISTrigger = 0):

    #Hijack code with call to L-BFGS-B
    #from scipy.optimize import fmin_l_bfgs_b
    #return fmin_l_bfgs_b(f, x0, df, m = len(x0), pgtol = gtol, factr = 1.0/xtol, iprint=1)[0]

    #Do LBFGS
    #If true, will fall back on BFGS automatically
    DoLBFGS = True

    DoLineSearch = True
    TrustRadius = 0.5
    MaxTrustRadius = 10.0

    #Cosine squared of uphill angle
    UphillAngleThresh = 0.0
    UphillFThresh = 0.001

    #Massage x0
    x0 = asarray_chkfinite(x0).ravel()
    if x0.ndim == 0: x0.shape = (1,)

    #Set default number of maxiters
    if maxiter is None: maxiter = max(20000, len(x0) * 4)

    ###########
    #Initialize
    ###########
    x = x0
    f_x, df_x = f(x0), df(x0)
    norm_g = norm(df_x)
    N = len(x0)
    if DoLBFGS:
        #Dimension of 3 --- 20 is normal
        #H = LBFGSInvHessian(2 + N ** 0.2)
        H = LBFGSInvHessian(N)
    else:
        H = BFGSInvHessian(N)
        H.SetToNumericalHessian(df, x)

    if DIISTrigger != 0: DIIS = DIISExtrapolator()


    for k in range(maxiter):
        #######################################################################
        # Solve for unconstrained minimization direction p such that H p = -x #
        #######################################################################
        p = H >> df_x #overloaded
        AcceptStep = True

        ##########################################################
        # Safeguard: perform linesearch or trust-region limiting #
        #            to determine actual step s to take          #
        ##########################################################
        if DoLineSearch:
            ####################
            # Perform linesearch
            ####################
            alpha = line_search(f, df, x, p, df_x, f_x)
            try:
                alpha = line_search(f, df, x, p, df_x, f_x)
                assert alpha > 0
            except AssertionError:
                print 'Line search failed; falling back to exact line search'
                alpha = MinpackLineSearch(f, df, x, p, df_x, f_x)
                assert alpha > 0
            s = alpha * p
            norm_s = norm(s)
        else:
            ##############
            # Do dog-leg #
            ##############
            #First do line search along steepest descent direction
            alpha = line_search(f, df, x, -df_x, df_x, f_x)
            #Cauchy point is x - alpha * df_x
            #Next, move toward unconstrained solution from Cauchy point
            dogleg = p + alpha * df_x
            alpha2 = line_search(f, df, x - alpha * df_x, dogleg)
            s = -alpha * df_x + alpha2 * dogleg
            #Stupidly enforce simple trust radius
            norm_s = norm(s)
            while norm_s > TrustRadius:
                alpha2 *= 0.9
                s = -alpha * df_x + alpha2 * dogleg
                norm_s = norm(s)

        ######################################
        # Do DIIS Extrapolation if requested #
        ######################################
        DidDIIS = False
        if DIISTrigger != 0:
            data = zeros(len(df_x) + 1)
            data[1:] = df_x
            data[0] = f_x
            DIIS.AddData(error = data, coord = x)
            if DIIS.GetNumVectors() % DIISTrigger == 0:
                mindf = min([norm(xx[1:]) for xx in DIIS.errorvectors])
                xdnew = DIIS.Extrapolate()
                norm_df_xdnew = norm(df(xdnew))
                f_xdnew = f(xdnew)
                if norm_df_xdnew < mindf and f_xdnew < f_x:
                    #Accept only if there is actually a reduction in |df|
                    print 'Accepted DIIS extrapolation'
                    DidDIIS = True
                    s = xdnew - x
                    #if DIISTrigger > 1: DIISTrigger -= 1
                    #print 'Improvement ratios', (f_x - f_xdnew) / f_x, (mindf - norm_df_xdnew) / mindf
                    if (f_x - f_xdnew) < 0.000001 * f_x and (mindf - norm_df_xdnew) < 0.00001 * mindf :
                        print 'DIIS reaching diminishing returns, resetting'
                        DIIS.Reset()
                        #DIISTrigger += 1
                else:
                    print 'DIIS wanted to go to', f_xdnew, norm_df_xdnew
                    print "Rejected DIIS extrapolation"
                    DIIS.Reset()
                    DIISTrigger += 1
                    AcceptStep = False
            if DIISTrigger > 50:
                print 'Turning off DIIS'
                DIISTrigger = 0

        ####################################################
        # Take step; compute function value and derivative #
        ####################################################
        xnew = x + s
        f_xnew = f(xnew)
        df_xnew = df(xnew)
        y = df_xnew - df_x
        norm_y = norm(y)
        norm_gnew = norm(df_xnew)
        s_y = dot(s, y)

        #############################################
        # If doing line search, update trust radius #
        #############################################
        if True:
            ReductionRatio = 2 * (f_xnew - f_x) / dot(s, df_x)
            print '\nReduction ratio:', ReductionRatio
            if ReductionRatio <= 0:
                print 'Iterations are producing a WORSE answer!\nBailing.'
                print 'Rejecting step and restarting BFGS'
                AcceptStep = False
                if DoLBFGS:
                    H.Reset()
                    H.Update(s, y)
                else:
                    gamma = s_y / norm_y ** 2
                    H.SetToScaledIdentity(gamma)

        if not DoLineSearch: #ie do trust region
            #Compute model quality?
            ReductionRatio = 2 * (f_xnew - f_x) / dot(s, df_x)
            print '\nReduction ratio:', ReductionRatio
            if ReductionRatio > 0.75:
                if norm(s) == TrustRadius:
                    TrustRadius = min (2 * TrustRadius, MaxTrustRadius)
            elif ReductionRatio < 0.25:
                TrustRadius *= 0.25

        ###################
        # Check convergence
        ###################
        if norm_g < gtol:
            break

        #################################
        # Make sure we are going downhill
        #################################
        NegCosDescentAngle = s_y / (norm_s * norm_y)
        DescentAngle = arccos(-NegCosDescentAngle)
        DescentRatio = (f_x - f_xnew) / f_x
        isGoingUphill = NegCosDescentAngle < UphillAngleThresh
        isIncreasing = -DescentRatio > UphillFThresh
        if isGoingUphill or isIncreasing:
            if isGoingUphill:
                print '\nIteration %d: WARNING: Going uphill with angle %f' % \
                    (k, -DescentAngle)

            if isIncreasing:
                print '\nIteration %d: WARNING: function increased by ratio %f' % \
                    (k, -DescentRatio)

            print 'Rejecting step and restarting BFGS'
            AcceptStep = False
            if DoLBFGS:
                H.Reset()
                H.Update(s, y)
            else:
                gamma = s_y / norm_y ** 2
                H.SetToScaledIdentity(gamma)

        else:
            #####################################
            # Accept step, update quasi-Hessian #
            #####################################

            if not DidDIIS:
                H.Update(s, y)

        #####################################################
        # EXPERIMENTAL: Heuristics for switching algorithms #
        #####################################################
        if False and DoLineSearch and  DescentRatio < 1e-2:
            print '\nTurning off linesearch'
            DoLineSearch = False
        if False and DescentRatio < 1e-2 and DIISTrigger==0:
            print 'Do DIIS'
            DIISTrigger = 2
            DIIS = DIISExtrapolator()
            #print 'Switching to BFGS'
            #H = BFGSInvHessian(N)
            #gamma = dot(s, y) / dot(y, y)
            #H.SetToScaledIdentity(gamma)

        ###################################################
        # Accept step: update function value and gradient #
        ###################################################
        if AcceptStep:
            x = xnew
            df_x = df_xnew
            f_x = f_xnew
            norm_g = norm_gnew

        print "Iteration %d: f(x) = %f (%fx), ||f'(x)|| = %f, || dx || = %f, descent angle = %f" \
            % (k, f_x, DescentRatio, norm_g, norm_s, DescentAngle)


    if k == maxiter - 1: print 'Maximum number of iterations reached'
    print 'Quasi-Newton done'

    """
    W = ApproxHessian()
    W.SetToNumericalHessian(df, x)
    Hess = W.GetHessian()
    from numpy.linalg import svd
    for eigval in svd(Hess)[1]:
        print eigval
    """

    exit()
    return x