basic testing lbfgs;
got some basic convergence, but definitely still issues:
the sparse implementation is not working, and
convergence takes about 3x the iterations of lbfgsb3.0
enzbus committed Jun 4, 2024
1 parent c349d44 commit 7f2ff85
Showing 2 changed files with 218 additions and 4 deletions.
149 changes: 148 additions & 1 deletion project_euromir/tests/lbfgs.py
@@ -40,6 +40,10 @@
https://doi.org/10.1090/S0025-5718-1980-0572855-7
(easy to find non-paywalled)
- Reference implementation: MINPACK-2, vmlm module
https://ftp.mcs.anl.gov/pub/MINPACK-2/
https://github.com/jacobwilliams/MINPACK-2/tree/master/vmlm
- Wikipedia page:
https://en.wikipedia.org/wiki/Limited-memory_BFGS,
https://web.archive.org/web/20240515120721/https://en.wikipedia.org/wiki/Limited-memory_BFGS
@@ -100,7 +104,7 @@ def lbfgs_multiply(
# center part
r = base_inverse_diagonal * q

# wikipedia does this correction, not found in original paper
# scale correction, see MINPACK-2/vmlm
# gamma_correction = np.dot(
# past_steps[-1], past_grad_diffs[-1]) / np.dot(
# past_grad_diffs[-1], past_grad_diffs[-1])
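For context, a sketch of how the commented-out scale correction would be applied if enabled. It reuses the variable names from the surrounding hunk (past_steps, past_grad_diffs, base_inverse_diagonal, q), so it is illustrative only and not part of this commit:

    # illustrative only: standard scaling of the initial inverse-Hessian
    # approximation (Nocedal & Wright, Numerical Optimization, ch. 7),
    # applied to the center part of the two-loop recursion
    gamma_correction = np.dot(
        past_steps[-1], past_grad_diffs[-1]) / np.dot(
        past_grad_diffs[-1], past_grad_diffs[-1])
    r = gamma_correction * base_inverse_diagonal * q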
@@ -140,3 +144,146 @@ def _lbfgs_multiply_dense(
H = left @ H @ right + rho * np.outer(past_steps[i], past_steps[i])

return H @ current_gradient


def strong_wolfe(
        current_point: np.ndarray,
        current_loss: float,
        current_gradient: np.ndarray,
        direction: np.ndarray,
        step_size: float,
        next_loss: float,
        next_gradient: np.ndarray,
        c_1: float = 1e-3,
        c_2: float = 0.9):
"""Simple line search satisfying Wolfe conditions.
This is a much simplified approach versus the canonical dcsrch from
MINPACK. Hopefully it works!
Idea: start from unit step. If Armijo rule is not satisfied, backtrack.
If curvature condition is not satisfied, make step longer. If neither is
satisfied, try both until we find a point where either is. Then restart
from there.
    Default c_1 and c_2 are the same as in MINPACK-2/vmlm; from my tests they
    seem like good choices.
"""

gradient_times_direction = current_gradient @ direction
assert gradient_times_direction < 0

armijo = \
next_loss <= current_loss + c_1 * step_size * gradient_times_direction

curvature = abs(next_gradient @ direction) <= c_2 * abs(
gradient_times_direction)

return armijo, curvature
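A minimal usage sketch of strong_wolfe (not part of this commit), checking both conditions for one steepest-descent step on f(x) = ||x||^2; the import path is an assumption based on this file's location:

import numpy as np

from project_euromir.tests.lbfgs import strong_wolfe  # assumed import path

x = np.array([1.0])
grad = 2 * x                       # gradient of f(x) = x @ x
direction = -grad                  # descent direction, so grad @ direction < 0
step_size = 0.25
x_next = x + step_size * direction
armijo, curvature = strong_wolfe(
    current_point=x, current_loss=float(x @ x), current_gradient=grad,
    direction=direction, step_size=step_size, next_loss=float(x_next @ x_next),
    next_gradient=2 * x_next)
print(armijo, curvature)           # both True for this step size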


# def _wolfe_conditions(
# current_point: np.array,
# current_loss: float,
# current_gradient: np.array,
# direction: np.array,
# step_size: float,
# next_loss: float,
# next_gradient: np.array,
# c_1: float = 1e-4,
# c_2: float = 0.9):
# """Choice satisfies Armijo rule and curvature condition.

# See Numerical Optimization, Nocedal & Wright, Chapter 3.
# """

# gradient_times_direction = current_gradient @ direction
# assert gradient_times_direction < 0

# armijo = \
# next_loss <= current_loss + c_1 * step_size * gradient_times_direction

# curvature = next_gradient @ direction >= c_2 * gradient_times_direction

# return armijo and curvature


# def _armijo(
# current_loss: float,
# current_gradient_dot_direction: float,
# step_size: float,
# next_loss: float,
# c_1: float = 1e-4,):
# """Choice satisfies Armijo rule."""

# assert current_gradient_dot_direction < 0
# # breakpoint()
# # print('armijo rhs', current_loss + c_1 * step_size * current_gradient_dot_direction)

# return next_loss <= current_loss + c_1 * step_size * current_gradient_dot_direction


def minimize_lbfgs(
loss_and_gradient_function, initial_point, memory=5, max_iters=100,
c_1=1e-3, c_2=.9, rho=.9, max_ls=20):
"""Minimize function using back-tracked L-BFGS."""

n = len(initial_point)

past_steps = np.empty((memory, n), dtype=float)
past_grad_diffs = np.empty((memory, n), dtype=float)

current_point = np.empty(n, dtype=float)
current_point[:] = initial_point

current_loss, current_gradient = loss_and_gradient_function(current_point)

for i in range(max_iters):
print('iter', i)
print('current_loss', current_loss)
print('current_gradient', current_gradient)

        # use only the (at most `memory`) pairs stored so far
        direction = - _lbfgs_multiply_dense(
            current_gradient=current_gradient,
            past_steps=past_steps[max(memory - i, 0):],
            past_grad_diffs=past_grad_diffs[max(memory - i, 0):])

# line search
step_size = 1.
for _ in range(max_ls):
# print('backtracking', _)
step = step_size * direction
next_point = current_point + step
next_loss, next_gradient = loss_and_gradient_function(
next_point)

armijo, curvature = strong_wolfe(
current_point=current_point, current_loss=current_loss,
current_gradient=current_gradient, direction=direction,
step_size=step_size, next_loss=next_loss,
next_gradient=next_gradient, c_1=c_1, c_2=c_2)

print(
'step_size', step_size, 'armijo', armijo, 'curvature', curvature)

if not armijo:
step_size *= .5
continue

if not curvature:
step_size *= 1.1
continue

# both are satisfied
past_steps[:-1] = past_steps[1:]
past_grad_diffs[:-1] = past_grad_diffs[1:]
past_steps[-1] = step
past_grad_diffs[-1] = next_gradient - current_gradient
current_point = next_point
current_loss = next_loss
current_gradient = next_gradient
break

else:
print('BACKTRACKING FAILED')
return current_point
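A small, self-contained usage sketch of minimize_lbfgs (not part of this commit) on an under-determined least-squares problem, mirroring the test below; the import path is an assumption:

import numpy as np

from project_euromir.tests.lbfgs import minimize_lbfgs  # assumed import path

np.random.seed(0)
A, b = np.random.randn(10, 20), np.random.randn(10)

def loss_and_gradient(x):
    residual = A @ x - b
    return np.linalg.norm(residual)**2, 2 * A.T @ residual

solution = minimize_lbfgs(
    loss_and_gradient_function=loss_and_gradient,
    initial_point=np.zeros(20), memory=5, max_iters=20)
print(np.linalg.norm(A @ solution - b))  # residual norm should be small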
73 changes: 70 additions & 3 deletions project_euromir/tests/test_lbfgs.py
@@ -31,7 +31,7 @@
from unittest import TestCase

import numpy as np
import scipy.sparse as sp
import scipy as sp

import project_euromir as lib

@@ -41,8 +41,8 @@
class TestLBFGS(TestCase):
"""Test L-BFGS functions."""

def test_base(self):
"""Test Python implementation."""
def test_base_direction(self):
"""Test Python implementation of direction calculation."""

n = 10 # size
for m in [0, 1, 2, 5]: # memory
@@ -65,3 +65,70 @@ def test_base(self):

print(dense_direction)
print(sparse_direction)

def test_lbfgs_python(self):
"""Test Python implementation of l-bfgs."""

np.random.seed(0)
m = 10
n = 20
A = np.random.randn(m, n)
b = np.random.randn(m)

def loss_and_gradient_function(x):
residual = A @ x - b
loss = np.linalg.norm(residual) ** 2
gradient = 2 * A.T @ residual
return loss, gradient

# import matplotlib.pyplot as plt
# l, g = loss_and_gradient_function(np.zeros(n))

# alphas = np.linspace(0,0.03,100)
# losses_on_line = []
# grads_on_line = []
# for alpha in alphas:
# losses_on_line.append(loss_and_gradient_function(-alpha * g)[0])
# grads_on_line.append(loss_and_gradient_function(-alpha * g)[1])
# plt.plot(alphas, losses_on_line)
# plt.show()
# breakpoint()

result = sp.optimize.fmin_l_bfgs_b(
loss_and_gradient_function, x0=np.zeros(n))
print(result)

x = lbfgs.minimize_lbfgs(
loss_and_gradient_function=loss_and_gradient_function,
initial_point=np.zeros(n), memory=10, max_iters=100, c_1=1e-3,
c_2=0.9, rho=.9, max_ls=100)


if __name__ == '__main__': # pragma: no cover
import logging
from unittest import main
logging.basicConfig(level='INFO')
main()
# np.random.seed(0)
# m = 10
# n = 20
# A = np.random.randn(m,n)
# b = np.random.randn(m)

# def loss_and_gradient_function(x):
# residual = A @ x - b
# loss = np.linalg.norm(residual) ** 2
# gradient = 2 * A.T @ residual
# return loss, gradient

# import matplotlib.pyplot as plt
# l, g = loss_and_gradient_function(np.zeros(n))

# alphas = np.linspace(0,0.03,100)
# losses_on_line = []
# grads_on_line = []
# for alpha in alphas:
# losses_on_line.append(loss_and_gradient_function(-alpha * g)[0])
# grads_on_line.append(loss_and_gradient_function(-alpha * g)[1])
# plt.plot(alphas, losses_on_line)
# plt.show()
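To make the "3x the iters of lbfgsb3.0" note from the commit message easy to reproduce, here is a rough, self-contained sketch (not part of this commit) that counts loss/gradient evaluations for SciPy's L-BFGS-B wrapper and for this implementation on the same random least-squares problem as the test above; the import path is an assumption:

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

from project_euromir.tests import lbfgs  # assumed import path

np.random.seed(0)
A, b = np.random.randn(10, 20), np.random.randn(10)
calls = {'count': 0}

def loss_and_gradient(x):
    calls['count'] += 1
    residual = A @ x - b
    return np.linalg.norm(residual)**2, 2 * A.T @ residual

_, _, info = fmin_l_bfgs_b(loss_and_gradient, x0=np.zeros(20))
print('scipy L-BFGS-B loss/gradient evaluations:', info['funcalls'])

calls['count'] = 0
lbfgs.minimize_lbfgs(
    loss_and_gradient_function=loss_and_gradient, initial_point=np.zeros(20))
print('this implementation, loss/gradient evaluations:', calls['count'])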
