
Commit

some work on preconditioning CG;
enzbus committed Jul 9, 2024
1 parent ad1de86 commit 3e77ab6
Showing 4 changed files with 130 additions and 37 deletions.
53 changes: 53 additions & 0 deletions project_euromir/direction_calculator.py
@@ -58,6 +58,10 @@ def nocedal_wright_termination(_current_point, current_gradient):
class DirectionCalculator:
"""Base class for descent direction calculation."""

@property
def statistics(self):
return self._statistics if hasattr(self, '_statistics') else {}

def get_direction(
self, current_point: np.array, current_gradient: np.array) -> np.array:
"""Calculate descent direction at current point given current gradient.
@@ -133,6 +137,7 @@ class CGNewton(DenseNewton):
"""Calculate descent direction using Newton-CG."""

_x0 = None # overwritten in derived class(es)
_preconditioner = None # overwritten in derived class(es)

def __init__(
self, hessian_function,
@@ -158,6 +163,7 @@ def __init__(
self._max_cg_iters = max_cg_iters
self._regularizer = regularizer
self._minres = minres
self._statistics = {'HESSIAN_MATMULS': 0}
super().__init__(hessian_function=hessian_function)

def get_direction(
@@ -184,13 +190,21 @@ def _counter(_):
shape = current_hessian.shape,
matvec = lambda x: orig_hessian @ x + self._regularizer * x
)
# breakpoint()
# diag = np.diag(self._preconditioner.todense())
# real_diag = np.diag( _densify(current_hessian))
# import matplotlib.pyplot as plt
# plt.plot(diag); plt.plot(real_diag); plt.show()
# breakpoint()
result = getattr(sp.sparse.linalg, 'minres' if self._minres else 'cg')(
A=current_hessian,
b=-current_gradient,
x0=self._x0, # this is None in this class
rtol=self._rtol_termination(current_point, current_gradient),
M=self._preconditioner, # this is None in this class
callback=_counter,
maxiter=self._max_cg_iters)[0]
self._statistics['HESSIAN_MATMULS'] += iteration_counter
logger.info(
'%s calculated direction with residual norm %.2e, while the input'
' gradient had norm %.2e, in %d iterations',
@@ -200,6 +214,45 @@ def _counter(_):
iteration_counter)
return result

class ExactDiagPreconditionedCGNewton(CGNewton):
"""CG with exact diagonal preconditioning."""

def get_direction(
self, current_point: np.array, current_gradient: np.array) -> np.array:

diag_H = np.array(
np.diag(_densify(self._hessian_function(current_point))))

diag_H[diag_H < 1e-4] = 1.
self._preconditioner = sp.sparse.diags(1./diag_H)
return super().get_direction(
current_point=current_point, current_gradient=current_gradient)


class DiagPreconditionedCGNewton(CGNewton):
"""CG with diagonal preconditioning."""

def __init__(self, matrix, b, c, zero, **kwargs):

self._matrix = matrix
self._b = b
m = len(b)
self._c = c
n = len(c)
self._zero = zero
gap_part = np.concatenate([self._c, self._b])**2
col_norms = sp.sparse.linalg.norm(matrix, axis=0)**2
assert len(col_norms) == n
row_norms = sp.sparse.linalg.norm(matrix, axis=1)**2
assert len(row_norms) == m
diag = gap_part
diag[:n] += col_norms / np.sqrt(2)
diag[n:] += row_norms
diag[n+zero:] += 0.5
self._preconditioner = sp.sparse.diags(1./diag)
super().__init__(**kwargs)


class WarmStartedCGNewton(CGNewton):
"""Calculate descent direction using warm-started Newton CG.
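For context on the `_preconditioner` attribute and the `M=self._preconditioner` argument introduced above: SciPy's `cg`/`minres` treat `M` as an approximate inverse of the system matrix. Below is a minimal standalone sketch of Jacobi (inverse-diagonal) preconditioning; the SPD test matrix and tolerance are made up for illustration and are not taken from the repository.

# Sketch (not part of the commit): Jacobi preconditioning passed to SciPy's
# CG through the `M` argument, as CGNewton does with self._preconditioner.
import numpy as np
import scipy as sp
import scipy.sparse.linalg

rng = np.random.default_rng(0)
B = rng.standard_normal((50, 50))
A = sp.sparse.csc_matrix(B.T @ B + 50. * np.eye(50))  # SPD test matrix

diag_A = A.diagonal().copy()
diag_A[diag_A < 1e-4] = 1.  # guard tiny entries, as ExactDiagPreconditionedCGNewton does
M = sp.sparse.diags(1. / diag_A)  # approximate inverse of A

b = rng.standard_normal(50)
x, info = sp.sparse.linalg.cg(A, b, M=M, rtol=1e-8)
assert info == 0  # converged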
28 changes: 17 additions & 11 deletions project_euromir/line_searcher.py
@@ -41,6 +41,10 @@ class LineSearchError(Exception):
class LineSearcher:
"""Base class for line search algorithms using strong Wolfe conditions."""

@property
def statistics(self):
return self._statistics if hasattr(self, '_statistics') else {}

def __init__(
self, initial_step = 1., c_1=1e-4, c_2=0.9):
"""Initialize with parameters.
@@ -80,18 +84,16 @@ def get_next(
def armijo(
self,
current_loss: np.array,
current_gradient: np.array,
direction: np.array,
direction_dot_current_gradient: float,
step_size: float,
proposed_loss: np.array):
"""Is Armijo rule satisfied?
:param current_loss: Loss at current point.
:type current_loss: float
:param current_gradient: Gradient at current point.
:type current_gradient: np.array
:param direction: Search direction.
:type direction: np.array
:param direction_dot_current_gradient: Search direction dot gradient
at current point.
:type direction_dot_current_gradient: float
:param step_size: Length of the step along direction.
:type step_size: float
:param proposed_loss: Loss at proposed point.
@@ -100,9 +102,10 @@ def armijo(
:returns: Is Armijo rule satisfied?
:rtype: bool
"""
assert direction_dot_current_gradient < 0

return proposed_loss <= current_loss + self._c_1 * step_size * (
direction @ current_gradient)
direction_dot_current_gradient)

def strong_curvature(
self,
@@ -145,6 +148,7 @@ def __init__(
self._loss_function = loss_function
self._backtrack = backtrack
self._max_iters = max_iters
self._statistics = {'LOSS_EVALS': 0}
super().__init__(initial_step=initial_step, c_1=c_1, c_2=None)

def get_next(
@@ -164,21 +168,23 @@ def get_next(
:returns: Next point.
:rtype: np.array
"""
assert current_gradient @ direction < 0
direction_dot_current_gradient = current_gradient @ direction
assert direction_dot_current_gradient < 0
step_size = float(self._initial_step)
proposed_point = np.empty_like(current_point)

for _ in range(self._max_iters):
for bt_iters in range(self._max_iters):
proposed_point[:] = current_point + direction * step_size
proposed_loss = self._loss_function(proposed_point)
if self.armijo(
current_loss=current_loss,
current_gradient=current_gradient, direction=direction,
direction_dot_current_gradient=direction_dot_current_gradient,
step_size=step_size, proposed_loss=proposed_loss):
logger.info(
'%s: step length %.2e, function calls %d, current loss '
'%.2e, previous loss %.2e', self.__class__.__name__,
step_size, _+1, proposed_loss, current_loss)
step_size, bt_iters+1, proposed_loss, current_loss)
self._statistics['LOSS_EVALS'] += bt_iters + 1  # one loss evaluation per backtracking iteration
return (proposed_point, proposed_loss, None)
step_size *= self._backtrack
raise LineSearchError(
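For reference, the sufficient-decrease test that `armijo` now receives as the precomputed `direction_dot_current_gradient` is f(x + t*d) <= f(x) + c_1 * t * (g @ d). Below is a minimal standalone sketch of the backtracking loop on a made-up quadratic loss; names and constants are illustrative, not the module's API.

# Sketch (not part of the commit): backtracking line search with the Armijo
# (sufficient decrease) condition, as in BacktrackingLineSearcher.get_next.
import numpy as np

def backtrack(loss, x, g, d, t0=1., c1=1e-4, shrink=0.9, max_iters=100):
    gd = g @ d
    assert gd < 0, 'd must be a descent direction'
    f0 = loss(x)
    t = t0
    for _ in range(max_iters):
        if loss(x + t * d) <= f0 + c1 * t * gd:  # Armijo rule
            return t
        t *= shrink
    raise RuntimeError('line search failed')

# toy problem: loss(x) = ||x||^2 / 2, gradient g = x, steepest descent d = -x
x = np.array([3., -4.])
print(backtrack(lambda z: 0.5 * z @ z, x, g=x, d=-x))  # accepts t = 1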
51 changes: 33 additions & 18 deletions project_euromir/minamide.py
@@ -56,7 +56,7 @@ def hessian_x_nogap(x, m, n, zero, matrix, b, regularizer = 0.):

def _matvec(dx):
result = np.empty_like(dx)
result[:] = 2 * (matrix.T @ (s_mask * (matrix @ dx)))
result[:] = (matrix.T @ (s_mask * (matrix @ dx)))
return result + regularizer * dx

return sp.sparse.linalg.LinearOperator(
@@ -76,8 +76,8 @@ def _matvec(dy):
result = np.empty_like(dy)

# dual residual sqnorm
result[:] = 2 * (matrix @ (matrix.T @ dy))
result[zero:] += 2 * y_mask * dy[zero:]
result[:] = (matrix @ (matrix.T @ dy))
result[zero:] += y_mask * dy[zero:]

return result + regularizer * dy

@@ -93,7 +93,7 @@ def _matvec(dy):
class MinamideTest(DirectionCalculator):

def __init__(self, b, c, h_x_nogap, h_y_nogap):
self._constants_sqrt2 = np.concatenate([c, b]) * np.sqrt(2.)
self._constants = np.concatenate([c, b])
self._hessian_x_nogap = h_x_nogap
self._hessian_y_nogap = h_y_nogap
self._n = len(c)
@@ -107,27 +107,42 @@ def get_direction(self, current_point, current_gradient):
# gets stuck on directions of little descent, not sure how much
# regularization is needed

# make all dense for test
hx = _densify(self._hessian_x_nogap(current_point[:self._n]))# + np.eye(self._n) * REGU
hy = _densify(self._hessian_y_nogap(current_point[self._n:]))# + np.eye(self._m) * REGU
# linear operators
hx = self._hessian_x_nogap(current_point[:self._n])# + np.eye(self._n) * REGU
hy = self._hessian_y_nogap(current_point[self._n:])# + np.eye(self._m) * REGU

# minamide notation
S = sp.sparse.bmat([
[hx, None],
[None, hy],
]).tocsc()
phi = self._constants_sqrt2

# S = sp.sparse.bmat([
# [hx, None],
# [None, hy],
# ]).tocsc()

def _s_matvec(array):
return np.concatenate([
hx @ array[:self._n], hy @ array[self._n:]
])

S = sp.sparse.linalg.LinearOperator(
shape = (self._n + self._m, self._n + self._m),
matvec = _s_matvec
)

phi = self._constants

hx_dense = _densify(hx)
hy_dense = _densify(hy)

def _splus_matvec(array):
return np.concatenate([
np.linalg.lstsq(hx, array[:self._n], rcond=None)[0],
np.linalg.lstsq(hy, array[self._n:], rcond=None)[0]
np.linalg.lstsq(hx_dense, array[:self._n], rcond=None)[0],
np.linalg.lstsq(hy_dense, array[self._n:], rcond=None)[0]
])

# def _splus_matvec(array):
# return np.concatenate([
# sp.sparse.linalg.cg(hx, array[:self._n], rtol=1e-5)[0],
# sp.sparse.linalg.cg(hy, array[self._n:], rtol=1e-5)[0]
# sp.sparse.linalg.cg(hx, array[:self._n], rtol=min(0.5, np.linalg.norm(current_gradient)))[0],
# sp.sparse.linalg.cg(hy, array[self._n:], rtol=min(0.5, np.linalg.norm(current_gradient)))[0]
# ])

# def _splus_matvec(array):
@@ -265,7 +280,7 @@ def my_hessian_y_nogap(y):
[hess_x_ng, np.zeros((n, m))],
[np.zeros((m, n)), hess_y_ng]])
gap_constants = np.concatenate([c, b])
hess_rebuilt += np.outer(gap_constants, gap_constants) * 2.
hess_rebuilt += np.outer(gap_constants, gap_constants)
assert np.allclose(hess, hess_rebuilt)
print('\tOK!')

@@ -285,7 +300,7 @@ def my_hessian_y_nogap(y):
S = np.bmat([
[hess_x_ng, np.zeros((n, m))],
[np.zeros((m, n)), hess_y_ng]]).A
phi = np.concatenate([c, b]) * np.sqrt(2.)
phi = np.concatenate([c, b])

# obtain pinv of the reduced system
Splus = np.linalg.pinv(S)
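The structure the test above verifies (with the factors of 2 now removed) is that the full Hessian splits into the block-diagonal no-gap part S plus the rank-one gap term phi phi^T, phi = [c; b]. A hedged sketch assembling that operator with SciPy's LinearOperator follows; the function name and argument layout are illustrative, not the module's API.

# Sketch (not part of the commit): full Hessian as the block-diagonal no-gap
# part S plus the rank-one gap term, mirroring the hess_rebuilt check above.
import numpy as np
import scipy as sp
import scipy.sparse.linalg

def full_hessian_operator(hx, hy, c, b):
    """hx: (n, n) and hy: (m, m) linear operators; c, b: problem vectors."""
    n, m = len(c), len(b)
    phi = np.concatenate([c, b])

    def _matvec(d):
        s_part = np.concatenate([hx @ d[:n], hy @ d[n:]])  # block-diagonal S
        return s_part + phi * (phi @ d)                     # + phi (phi^T d)

    return sp.sparse.linalg.LinearOperator(
        shape=(n + m, n + m), matvec=_matvec)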
35 changes: 27 additions & 8 deletions project_euromir/solver_new.py
@@ -37,11 +37,10 @@
NOHSDE = True

from project_euromir import equilibrate
from project_euromir.direction_calculator import (CGNewton, DenseNewton,
LSMRLevenbergMarquardt,
LSQRLevenbergMarquardt,
WarmStartedCGNewton,
nocedal_wright_termination)
from project_euromir.direction_calculator import (
CGNewton, DenseNewton, DiagPreconditionedCGNewton,
ExactDiagPreconditionedCGNewton, LSMRLevenbergMarquardt,
LSQRLevenbergMarquardt, WarmStartedCGNewton, nocedal_wright_termination)
from project_euromir.line_searcher import (BacktrackingLineSearcher,
LogSpaceLineSearcher,
ScipyLineSearcher)
@@ -147,12 +146,29 @@ def _local_derivative_residual(xy):
regularizer=1e-8, # it seems 1e-10 is best, but it's too sensitive to it :(
)

direction_calculator = CGNewton(
# doesn't improve, yet; we can make many tests
# direction_calculator = DiagPreconditionedCGNewton(
# matrix=matrix_transf,
# b=b_transf,
# c=c_transf,
# zero=zero,
# hessian_function=_local_hessian,
# rtol_termination=lambda x, g: min(0.5, np.linalg.norm(g)), #,**2),
# max_cg_iters=None,
# )

# this one also doesn't seem to improve; must be because my diagonal is
# already quite well scaled, and/or it interacts with the rank 1 part;
# makes sense to try diag plus rank 1;
# direction_calculator = ExactDiagPreconditionedCGNewton(
direction_calculator = CGNewton(
# warm start causes issues if null space changes b/w iterations
hessian_function=_local_hessian,
rtol_termination=lambda x, g: min(0.5, np.linalg.norm(g)),
rtol_termination=lambda x, g: min(0.5, np.linalg.norm(g)), #,**2),
max_cg_iters=None,
minres=False,
# minres=True, # less stable, needs more stringent termination,
# and/or better logic to transition to refinement, but it is a bit faster
# it seems
# regularizer=1e-8, # it seems 1e-10 is best, but it's too sensitive to it :(
)

@@ -215,6 +231,9 @@ def _local_derivative_residual(xy):
'Certificates not yet implemented.')

print('Newton-CG loop took %.2e seconds' % (time.time() - _start ))
print('Newton-CG iterations', newton_iterations)
print('DirectionCalculator statistics', direction_calculator.statistics)
print('LineSearcher statistics', line_searcher.statistics)

# create HSDE variables for refinement
u = np.empty(n+m+1, dtype=float)
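The `rtol_termination` lambda above acts as the forcing term of an inexact Newton method: CG stops once the relative residual falls below min(0.5, ||g||), while the commented `**2` variant asks for a tighter inner solve once the gradient norm drops below one, trading more HESSIAN_MATMULS per outer iteration for potentially fewer Newton steps. A tiny illustrative sketch of the two rules written as plain functions (not part of the commit):

# Sketch (not part of the commit): the two CG-termination rules referenced
# above, with the same (point, gradient) signature used by rtol_termination.
import numpy as np

def rtol_linear(_x, g):
    # looser inner solves, fewer Hessian matvecs per Newton step
    return min(0.5, np.linalg.norm(g))

def rtol_quadratic(_x, g):
    # tighter inner solves as the gradient shrinks below one
    return min(0.5, np.linalg.norm(g)**2)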
