
Commit

draft: test impl of lbfgs

enzbus committed May 27, 2024
1 parent 8762876 commit 1126563
Showing 1 changed file with 106 additions and 0 deletions.
106 changes: 106 additions & 0 deletions project_euromir/tests/lbfgs.py
@@ -0,0 +1,106 @@
"""Pure Python implementation of L-BFGS for testing.
We only need the multiplication of the gradient at the current point by the
approximate inverse of the second derivative. Line search and choosing next
point are done externally.
References:
- Original paper:
Updating quasi-Newton matrices with limited storage, Nocedal 1980
https://doi.org/10.1090/S0025-5718-1980-0572855-7
(easy to find non-paywalled)
- Wikipedia page:
https://en.wikipedia.org/wiki/Limited-memory_BFGS,
https://web.archive.org/web/20240515120721/https://en.wikipedia.org/wiki/Limited-memory_BFGS
- Blog post:
https://aria42.com/blog/2014/12/understanding-lbfgs
https://web.archive.org/web/20231002054213/https://aria42.com/blog/2014/12/understanding-lbfgs
"""

# for the float | np.ndarray annotations on Python < 3.10
from __future__ import annotations
import numpy as np

def lbfgs_multiply(
        current_gradient: np.ndarray,
        past_steps: np.ndarray,
        past_grad_diffs: np.ndarray,
        base_inverse_diagonal: float | np.ndarray = 1.
        ):
    r"""Multiply current gradient by the approximate inverse second derivative.

    :param current_gradient: Gradient at the current point.
    :type current_gradient: np.ndarray (1-dimensional)
    :param past_steps: First dimension is the L-BFGS memory. Most recent step
        is the last row.
    :type past_steps: np.ndarray (2-dimensional)
    :param past_grad_diffs: First dimension is the L-BFGS memory. Most recent
        gradient difference is the last row.
    :type past_grad_diffs: np.ndarray (2-dimensional)
    :param base_inverse_diagonal: Diagonal of :math:`H_0`, the base inverse
        Hessian, before the L-BFGS corrections. By default 1, meaning we take
        the identity as base.
    :type base_inverse_diagonal: float or np.ndarray (1-dimensional)
    """

    memory = past_steps.shape[0]
    assert past_grad_diffs.shape[0] == memory

    # compute rhos;
    # TODO: for numerical stability, normalize by the geometric mean of the
    # l2 norms of each (s, y) pair (see the commented-out sketch below)
    rhos = np.empty(memory, dtype=float)
    for i in range(memory):
        rhos[i] = 1. / np.dot(past_steps[i], past_grad_diffs[i])
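
    # Commented-out sketch of that normalization (an assumption of how it
    # could look, not used yet): rescale each (s, y) pair by the geometric
    # mean of its two l2 norms; the scaled pairs would then have to be used
    # consistently in both loops below.
    # scales = np.sqrt(
    #     np.linalg.norm(past_steps, axis=1)
    #     * np.linalg.norm(past_grad_diffs, axis=1))
    # scaled_steps = past_steps / scales[:, None]
    # scaled_grad_diffs = past_grad_diffs / scales[:, None]
    # rhos = 1. / np.sum(scaled_steps * scaled_grad_diffs, axis=1)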

    # using paper notation; copy, so we don't overwrite the caller's gradient
    q = np.copy(current_gradient)

    # right part, backward iteration
    alphas = np.empty(memory, dtype=float)
    for i in range(memory-1, -1, -1):
        alphas[i] = rhos[i] * np.dot(past_steps[i], q)
        q -= alphas[i] * past_grad_diffs[i]

    # center part
    r = base_inverse_diagonal * q

    # wikipedia does this correction, not found in original paper
    # gamma_correction = np.dot(
    #     past_steps[-1], past_grad_diffs[-1]) / np.dot(
    #         past_grad_diffs[-1], past_grad_diffs[-1])
    # r = gamma_correction * (base_inverse_diagonal * q)
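    # (that gamma, dot(s, y) / dot(y, y) for the most recent pair, rescales
    # the base matrix to match the most recent curvature information; it is a
    # commonly recommended initial scaling for L-BFGS)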

    # left part, forward iteration
    betas = np.empty(memory, dtype=float)
    for i in range(memory):
        betas[i] = rhos[i] * np.dot(past_grad_diffs[i], r)
        r += (alphas[i] - betas[i]) * past_steps[i]

    return r


def _lbfgs_multiply_dense(
        current_gradient: np.ndarray,
        past_steps: np.ndarray,
        past_grad_diffs: np.ndarray,
        base_inverse_diagonal: float | np.ndarray = 1.
        ):
    """Same as above, using a dense matrix."""

    memory = past_steps.shape[0]
    assert past_grad_diffs.shape[0] == memory

    n = len(current_gradient)
    # this works both for a scalar and a 1-dimensional base_inverse_diagonal
    H = np.eye(n) * base_inverse_diagonal

    # recursively apply the BFGS inverse Hessian update, oldest pair first:
    # H <- (I - rho s y^T) H (I - rho y s^T) + rho s s^T
    for i in range(memory):
        rho = 1. / np.dot(past_steps[i], past_grad_diffs[i])
        left = np.eye(n) - rho * np.outer(
            past_steps[i], past_grad_diffs[i])
        right = left.T
        H = left @ H @ right + rho * np.outer(past_steps[i], past_steps[i])

    return H @ current_gradient
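

# Quick consistency-check sketch (hypothetical usage, not wired into any test
# runner): on well-conditioned random data the two-loop recursion and the
# dense construction should agree up to floating point error.
if __name__ == '__main__':
    np.random.seed(0)
    n, mem = 10, 3
    past_steps = np.random.randn(mem, n)
    # keep dot(s, y) > 0 for each pair, so the rhos are well behaved
    past_grad_diffs = past_steps + 0.1 * np.random.randn(mem, n)
    grad = np.random.randn(n)
    assert np.allclose(
        lbfgs_multiply(grad, past_steps, past_grad_diffs),
        _lbfgs_multiply_dense(grad, past_steps, past_grad_diffs))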
