basic testing lbfgs;
got some basic convergence, but definitely still issues:
the sparse implementation is not working, and
convergence takes about 3x the iterations of lbfgsb3.0
enzbus committed Jun 4, 2024
1 parent c349d44 commit 7f2ff85
Showing 2 changed files with 218 additions and 4 deletions.
149 changes: 148 additions & 1 deletion project_euromir/tests/lbfgs.py
@@ -40,6 +40,10 @@
https://doi.org/10.1090/S0025-5718-1980-0572855-7
(easy to find non-paywalled)
- Reference implementation: MINPACK-2, vmlm module
https://ftp.mcs.anl.gov/pub/MINPACK-2/
https://github.com/jacobwilliams/MINPACK-2/tree/master/vmlm
- Wikipedia page:
https://en.wikipedia.org/wiki/Limited-memory_BFGS,
https://web.archive.org/web/20240515120721/https://en.wikipedia.org/wiki/Limited-memory_BFGS
@@ -100,7 +104,7 @@ def lbfgs_multiply(
# center part
r = base_inverse_diagonal * q

# wikipedia does this correction, not found in original paper
# scale correction, see MINPACK-2/vmlm
# gamma_correction = np.dot(
# past_steps[-1], past_grad_diffs[-1]) / np.dot(
# past_grad_diffs[-1], past_grad_diffs[-1])
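For context, a sketch of how the commented-out scale correction would be applied if enabled. It reuses the variable names from the surrounding hunk (past_steps, past_grad_diffs, base_inverse_diagonal, q), so it is illustrative only and not part of this commit:

    # illustrative only: standard scaling of the initial inverse-Hessian
    # approximation (Nocedal & Wright, Numerical Optimization, ch. 7),
    # applied to the center part of the two-loop recursion
    gamma_correction = np.dot(
        past_steps[-1], past_grad_diffs[-1]) / np.dot(
        past_grad_diffs[-1], past_grad_diffs[-1])
    r = gamma_correction * base_inverse_diagonal * q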
@@ -140,3 +144,146 @@ def _lbfgs_multiply_dense(
H = left @ H @ right + rho * np.outer(past_steps[i], past_steps[i])

return H @ current_gradient


def strong_wolfe(
        current_point: np.ndarray,
        current_loss: float,
        current_gradient: np.ndarray,
        direction: np.ndarray,
        step_size: float,
        next_loss: float,
        next_gradient: np.ndarray,
        c_1: float = 1e-3,
        c_2: float = 0.9):
"""Simple line search satisfying Wolfe conditions.
This is a much simplified approach versus the canonical dcsrch from
MINPACK. Hopefully it works!
Idea: start from unit step. If Armijo rule is not satisfied, backtrack.
If curvature condition is not satisfied, make step longer. If neither is
satisfied, try both until we find a point where either is. Then restart
from there.
    Default c_1 and c_2 are the same as in MINPACK-2/vmlm; from my tests they
    seem like good choices.
"""

gradient_times_direction = current_gradient @ direction
assert gradient_times_direction < 0

armijo = \
next_loss <= current_loss + c_1 * step_size * gradient_times_direction

curvature = abs(next_gradient @ direction) <= c_2 * abs(
gradient_times_direction)

return armijo, curvature
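A minimal usage sketch of strong_wolfe (not part of this commit), checking both conditions for one steepest-descent step on f(x) = ||x||^2; the import path is an assumption based on this file's location:

import numpy as np

from project_euromir.tests.lbfgs import strong_wolfe  # assumed import path

x = np.array([1.0])
grad = 2 * x                       # gradient of f(x) = x @ x
direction = -grad                  # descent direction, so grad @ direction < 0
step_size = 0.25
x_next = x + step_size * direction
armijo, curvature = strong_wolfe(
    current_point=x, current_loss=float(x @ x), current_gradient=grad,
    direction=direction, step_size=step_size, next_loss=float(x_next @ x_next),
    next_gradient=2 * x_next)
print(armijo, curvature)           # both True for this step size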


# def _wolfe_conditions(
# current_point: np.array,
# current_loss: float,
# current_gradient: np.array,
# direction: np.array,
# step_size: float,
# next_loss: float,
# next_gradient: np.array,
# c_1: float = 1e-4,
# c_2: float = 0.9):
# """Choice satisfies Armijo rule and curvature condition.

# See Numerical Optimization, Nocedal & Wright, Chapter 3.
# """

# gradient_times_direction = current_gradient @ direction
# assert gradient_times_direction < 0

# armijo = \
# next_loss <= current_loss + c_1 * step_size * gradient_times_direction

# curvature = next_gradient @ direction >= c_2 * gradient_times_direction

# return armijo and curvature


# def _armijo(
# current_loss: float,
# current_gradient_dot_direction: float,
# step_size: float,
# next_loss: float,
# c_1: float = 1e-4,):
# """Choice satisfies Armijo rule."""

# assert current_gradient_dot_direction < 0
# # breakpoint()
# # print('armijo rhs', current_loss + c_1 * step_size * current_gradient_dot_direction)

# return next_loss <= current_loss + c_1 * step_size * current_gradient_dot_direction


def minimize_lbfgs(
loss_and_gradient_function, initial_point, memory=5, max_iters=100,
c_1=1e-3, c_2=.9, rho=.9, max_ls=20):
"""Minimize function using back-tracked L-BFGS."""

n = len(initial_point)

past_steps = np.empty((memory, n), dtype=float)
past_grad_diffs = np.empty((memory, n), dtype=float)

current_point = np.empty(n, dtype=float)
current_point[:] = initial_point

current_loss, current_gradient = loss_and_gradient_function(current_point)

for i in range(max_iters):
print('iter', i)
print('current_loss', current_loss)
print('current_gradient', current_gradient)

        # use only the (at most `memory`) pairs stored so far
        direction = - _lbfgs_multiply_dense(
            current_gradient=current_gradient,
            past_steps=past_steps[max(memory - i, 0):],
            past_grad_diffs=past_grad_diffs[max(memory - i, 0):])

# line search
step_size = 1.
for _ in range(max_ls):
# print('backtracking', _)
step = step_size * direction
next_point = current_point + step
next_loss, next_gradient = loss_and_gradient_function(
next_point)

armijo, curvature = strong_wolfe(
current_point=current_point, current_loss=current_loss,
current_gradient=current_gradient, direction=direction,
step_size=step_size, next_loss=next_loss,
next_gradient=next_gradient, c_1=c_1, c_2=c_2)

print(
'step_size', step_size, 'armijo', armijo, 'curvature', curvature)

if not armijo:
step_size *= .5
continue

if not curvature:
step_size *= 1.1
continue

# both are satisfied
past_steps[:-1] = past_steps[1:]
past_grad_diffs[:-1] = past_grad_diffs[1:]
past_steps[-1] = step
past_grad_diffs[-1] = next_gradient - current_gradient
current_point = next_point
current_loss = next_loss
current_gradient = next_gradient
break

else:
print('BACKTRACKING FAILED')
return current_point
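A small, self-contained usage sketch of minimize_lbfgs (not part of this commit) on an under-determined least-squares problem, mirroring the test below; the import path is an assumption:

import numpy as np

from project_euromir.tests.lbfgs import minimize_lbfgs  # assumed import path

np.random.seed(0)
A, b = np.random.randn(10, 20), np.random.randn(10)

def loss_and_gradient(x):
    residual = A @ x - b
    return np.linalg.norm(residual)**2, 2 * A.T @ residual

solution = minimize_lbfgs(
    loss_and_gradient_function=loss_and_gradient,
    initial_point=np.zeros(20), memory=5, max_iters=20)
print(np.linalg.norm(A @ solution - b))  # residual norm should be small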
73 changes: 70 additions & 3 deletions project_euromir/tests/test_lbfgs.py
@@ -31,7 +31,7 @@
from unittest import TestCase

import numpy as np
import scipy.sparse as sp
import scipy as sp

import project_euromir as lib

@@ -41,8 +41,8 @@
class TestLBFGS(TestCase):
"""Test L-BFGS functions."""

def test_base(self):
"""Test Python implementation."""
def test_base_direction(self):
"""Test Python implementation of direction calculation."""

n = 10 # size
for m in [0, 1, 2, 5]: # memory
@@ -65,3 +65,70 @@ def test_base(self):

print(dense_direction)
print(sparse_direction)

def test_lbfgs_python(self):
"""Test Python implementation of l-bfgs."""

np.random.seed(0)
m = 10
n = 20
A = np.random.randn(m, n)
b = np.random.randn(m)

def loss_and_gradient_function(x):
residual = A @ x - b
loss = np.linalg.norm(residual) ** 2
gradient = 2 * A.T @ residual
return loss, gradient

# import matplotlib.pyplot as plt
# l, g = loss_and_gradient_function(np.zeros(n))

# alphas = np.linspace(0,0.03,100)
# losses_on_line = []
# grads_on_line = []
# for alpha in alphas:
# losses_on_line.append(loss_and_gradient_function(-alpha * g)[0])
# grads_on_line.append(loss_and_gradient_function(-alpha * g)[1])
# plt.plot(alphas, losses_on_line)
# plt.show()
# breakpoint()

result = sp.optimize.fmin_l_bfgs_b(
loss_and_gradient_function, x0=np.zeros(n))
print(result)

x = lbfgs.minimize_lbfgs(
loss_and_gradient_function=loss_and_gradient_function,
initial_point=np.zeros(n), memory=10, max_iters=100, c_1=1e-3,
c_2=0.9, rho=.9, max_ls=100)


if __name__ == '__main__': # pragma: no cover
import logging
from unittest import main
logging.basicConfig(level='INFO')
main()
# np.random.seed(0)
# m = 10
# n = 20
# A = np.random.randn(m,n)
# b = np.random.randn(m)

# def loss_and_gradient_function(x):
# residual = A @ x - b
# loss = np.linalg.norm(residual) ** 2
# gradient = 2 * A.T @ residual
# return loss, gradient

# import matplotlib.pyplot as plt
# l, g = loss_and_gradient_function(np.zeros(n))

# alphas = np.linspace(0,0.03,100)
# losses_on_line = []
# grads_on_line = []
# for alpha in alphas:
# losses_on_line.append(loss_and_gradient_function(-alpha * g)[0])
# grads_on_line.append(loss_and_gradient_function(-alpha * g)[1])
# plt.plot(alphas, losses_on_line)
# plt.show()
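To make the "3x the iters of lbfgsb3.0" note from the commit message easy to reproduce, here is a rough, self-contained sketch (not part of this commit) that counts loss/gradient evaluations for SciPy's L-BFGS-B wrapper and for this implementation on the same random least-squares problem as the test above; the import path is an assumption:

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

from project_euromir.tests import lbfgs  # assumed import path

np.random.seed(0)
A, b = np.random.randn(10, 20), np.random.randn(10)
calls = {'count': 0}

def loss_and_gradient(x):
    calls['count'] += 1
    residual = A @ x - b
    return np.linalg.norm(residual)**2, 2 * A.T @ residual

_, _, info = fmin_l_bfgs_b(loss_and_gradient, x0=np.zeros(20))
print('scipy L-BFGS-B loss/gradient evaluations:', info['funcalls'])

calls['count'] = 0
lbfgs.minimize_lbfgs(
    loss_and_gradient_function=loss_and_gradient, initial_point=np.zeros(20))
print('this implementation, loss/gradient evaluations:', calls['count'])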
