From b8cae2bf54064b2f29323e9d9f71c2225846c7fd Mon Sep 17 00:00:00 2001
From: Enzo Busseti <enzo.busseti@me.com>
Date: Sat, 29 Jun 2024 18:30:16 +0400
Subject: [PATCH] many experiments; made line search in ncg more aggressive;
 activated QR presolver (need to write sparse QR, I am afraid); main issue is
 (not with these tests, but increasing probl sizes) sometimes approx Newton
 direction is not good and we make super small steps, some logic around it to
 truncate Netwon CG much earlier in those situations, ~steepest descent, would
 probably suffice; also to explore Woodbury on gap part of the Hessian,
 keeping saved solution of the (b,c) on the non-gap part; using QR on matrix,
 CG solving on the non-gap part should be much easier.

---
 project_euromir/loss_no_hsde.py |   4 +-
 project_euromir/newton_cg.py    |  10 ++--
 project_euromir/solver_cg.py    | 100 +++++++++++++++++++++++++-------
 3 files changed, 88 insertions(+), 26 deletions(-)

diff --git a/project_euromir/loss_no_hsde.py b/project_euromir/loss_no_hsde.py
index d1b30b2..ec027c5 100644
--- a/project_euromir/loss_no_hsde.py
+++ b/project_euromir/loss_no_hsde.py
@@ -93,7 +93,7 @@ def loss_gradient(xy, m, n, zero, matrix, b, c, workspace):
 
     return loss, workspace['gradient']
 
-def hessian(xy, m, n, zero, matrix, b, c, workspace):
+def hessian(xy, m, n, zero, matrix, b, c, workspace, regularizer = 0.):
     """Hessian to use inside LBFGS loop."""
 
     locals().update(workspace)
@@ -136,7 +136,7 @@ def _matvec(dxdy):
         constants = np.concatenate([c, b])
         result[:] += constants * (2 * (constants @ dxdy))
 
-        return result
+        return result + regularizer * dxdy
 
     return sp.sparse.linalg.LinearOperator(
         shape=(len(xy), len(xy)),
diff --git a/project_euromir/newton_cg.py b/project_euromir/newton_cg.py
index fa97d2e..ec521d5 100644
--- a/project_euromir/newton_cg.py
+++ b/project_euromir/newton_cg.py
@@ -12,7 +12,7 @@
                                         line_search_wolfe2)
 from scipy.sparse.linalg import LinearOperator
 
-ENZO_MODIFIED_MULTIPLIER = 1e-16 # in original it was 3.
+ENZO_MODIFIED_MULTIPLIER = 1e-8 # in original it was 3.
 
 _epsilon = sqrt(np.finfo(float).eps)
 
@@ -368,10 +368,12 @@ def terminate(warnflag, msg):
             # check curvature
             Ap = asarray(Ap).squeeze()  # get rid of matrices...
             curv = np.dot(psupi, Ap)
-            if 0 <= curv <= ENZO_MODIFIED_MULTIPLIER * float64eps:
+            if 0 <= curv <= ENZO_MODIFIED_MULTIPLIER * dri0:
                 # print(f'iter {k}, breaking CG loop at cgiter {k2} with curv {curv:.2e}')
                 break
             elif curv < 0:
+                print('NEGATIVE CURVATURE CLAUSE TRIGGERED!!!!')
+                breakpoint()
                 if (i > 0):
                     break
                 else:
@@ -402,8 +404,8 @@ def terminate(warnflag, msg):
         except _LineSearchError:
             print('ENTERING FALL-BACK BACKTRACKING')
             # ENZO: fallback to back-tracking
-            for bktrit in range(50):
-                if f(xk) < 1e-20:
+            for bktrit in range(100):
+                if f(xk) < 1e-25:
                     continue # we want to go to the else clause
                 if f(xk + pk * 0.5**(-bktrit)) < f(xk):
                     alphak = 0.5**(-bktrit)
diff --git a/project_euromir/solver_cg.py b/project_euromir/solver_cg.py
index 293effe..544846d 100644
--- a/project_euromir/solver_cg.py
+++ b/project_euromir/solver_cg.py
@@ -39,11 +39,11 @@
 from project_euromir.lbfgs import minimize_lbfgs
 
 if not NOHSDE:
-    from project_euromir.loss import (common_computation_main,
+    from project_euromir.loss import (_densify, common_computation_main,
                                       create_workspace_main, gradient, hessian,
                                       loss)
 else:
-    from project_euromir.loss_no_hsde import (create_workspace,  loss_gradient, hessian)
+    from project_euromir.loss_no_hsde import (create_workspace,  loss_gradient, hessian, _densify)
 
 from project_euromir.newton_cg import _epsilon, _minimize_newtoncg
 from project_euromir.refinement import refine
@@ -52,14 +52,23 @@
 if DEBUG:
     import matplotlib.pyplot as plt
 
-QR_PRESOLVE = False
+QR_PRESOLVE = True
 QR_PRESOLVE_AFTER = False
 REFINE = True
 
+CG_REGULARIZER = 0.
+
+SCIPY_EXPERIMENTS = False
+
+HESSIAN_COUNTER = {'ba': 0}
 
 def solve(matrix, b, c, zero, nonneg, lbfgs_memory=10):
     "Main function."
 
+    print(
+        f'PROBLEM SIZE: m={len(b)}, n={len(c)}, zero={zero},'
+        f' nonneg={nonneg}, nnz(matrix)={matrix.nnz}')
+
     # some shape checking
     n = len(c)
     m = len(b)
@@ -131,7 +140,17 @@ def separated_grad(xy):
 
         def separated_hessian(xy):
             return hessian(
-                xy, m=m, n=n, zero=zero, matrix=matrix_transf, b=b_transf, c=c_transf, workspace=workspace)
+                xy, m=m, n=n, zero=zero, matrix=matrix_transf, b=b_transf, c=c_transf, workspace=workspace, regularizer=CG_REGULARIZER)
+
+        def dense_hessian(xy):
+            return _densify(hessian(
+                xy, m=m, n=n, zero=zero, matrix=matrix_transf, b=b_transf, c=c_transf, workspace=workspace, regularizer=1e-10))
+
+        def hessp(xy, var):
+            # breakpoint()
+            HESSIAN_COUNTER['ba'] += 1
+            H = separated_hessian(xy)
+            return H @ var
 
         x_0 = np.zeros(n+m)
 
@@ -148,30 +167,71 @@ def callback(current):
         #     # current['x'] /= current.x[-1]
         # old_x[:] = current.x
 
+    # import matplotlib.pyplot as plt
+    # all_losses = []
+    # def callback_plot(x):
+    #     all_losses.append(separated_loss(x))
+
     start = time.time()
     # call newton-CG, implementation from scipy with modified termination
     c_2 = 0.1
     for i in range(5):
-        result = _minimize_newtoncg(
-            fun=separated_loss,
-            x0=x_0,
-            args=(),
-            jac=separated_grad,
-            hess=separated_hessian,
-            hessp=None,
-            # callback=callback,
-            xtol=0., #1e-5,
-            eps=_epsilon, # unused
-            maxiter=10000,
-            # max_cg_iters=100,
-            disp=99,
-            return_all=False,
-            c1=1e-4, c2=c_2)
+        if not SCIPY_EXPERIMENTS:
+            result = _minimize_newtoncg(
+                fun=separated_loss,
+                x0=x_0,
+                args=(),
+                jac=separated_grad,
+                hess=separated_hessian,
+                hessp=None,
+                # callback=callback,
+                xtol=0., #1e-5,
+                eps=_epsilon, # unused
+                maxiter=10000,
+                # max_cg_iters=100,
+                disp=99,
+                return_all=False,
+                c1=1e-4, c2=c_2)
+            print(result)
+        else:
+            # NO, trust region (at least scipy implementations) don't work as
+            # well, I tried a bunch, only realistic competitor is l-bfgs
+            #global HESSIAN_COUNTER
+            HESSIAN_COUNTER['ba'] = 0
+            result = sp.optimize.minimize(
+                fun=separated_loss,
+                x0=x_0,
+                args=(),
+                jac=separated_grad,
+                # hess=dense_hessian, # for dogleg
+                #callback=lambda x: print(separated_loss(x)),
+                callback=callback_plot,
+                hessp=hessp, # for trust-ncg
+                options = {
+                'initial_trust_radius': .01,
+                'max_trust_radius': .1,
+                'eta': 0., 'gtol': 1e-16,
+                'ftol': 0.,
+                'gtol': 0.,
+                'maxfun': 1e10,
+                'maxiter': 1e10,
+                'maxls': 100,
+                'maxcor': 20,
+                },
+                # method='dogleg',
+                # method='trust-ncg',
+                method='l-bfgs-b',
+                )
+            print("NUMBER HESSIAN MATMULS", HESSIAN_COUNTER['ba'])
+            print(result)
+        # plt.plot(all_losses)
+        # plt.yscale('log')
+        # plt.show()
         # breakpoint()
         print(f'NEWTON-CG took {time.time() - start:.3f} seconds')
         loss_after_cg = separated_loss(result["x"])
         print(f'LOSS {loss_after_cg:.2e}')
-        if loss_after_cg < 1e-20:
+        if loss_after_cg < 1e-25:
             break
         x_0 = result['x']
         # breakpoint()