From ada8ee8be27c68c828478ccdfcd108ddd45eb995 Mon Sep 17 00:00:00 2001
From: Mathieu Blondel
Date: Thu, 3 Dec 2015 23:26:07 +0900
Subject: [PATCH 1/5] Add Frank-Wolfe implementation of l1-constrained regression.

---
 lightning/impl/fw.py    | 114 ++++++++++++++++++++++++++++++++++++++++
 lightning/regression.py |   1 +
 2 files changed, 115 insertions(+)
 create mode 100644 lightning/impl/fw.py

diff --git a/lightning/impl/fw.py b/lightning/impl/fw.py
new file mode 100644
index 00000000..801f455e
--- /dev/null
+++ b/lightning/impl/fw.py
@@ -0,0 +1,114 @@
+import numpy as np
+import scipy.sparse as sp
+
+from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.utils.extmath import safe_sparse_dot
+
+
+def _frank_wolfe(w_init, X, y, beta, max_iter=50, tol=1e-3, max_nz=None,
+                 verbose=0):
+    """
+    Solve
+
+        0.5 * ||np.dot(X, w) - y||^2 s.t. ||w||_1 <= beta
+
+    by the Frank-Wolfe method.
+
+    The method can be seen as a greedy coordinate descent: it adds at most one
+    non-zero coefficient per iteration.
+    """
+    n_samples, n_features = X.shape
+
+    if sp.issparse(X):
+        X = X.tocsc()
+
+    w = w_init.copy()
+
+    for it in range(max_iter):
+        y_pred = safe_sparse_dot(X, w)
+        resid = beta * y_pred - y
+        neg_grad = -safe_sparse_dot(X.T, beta * resid)
+
+        atom = np.argmax(np.abs(neg_grad))
+        s = np.sign(neg_grad[atom])
+
+        error = np.dot(resid, resid)
+        dgap = s * neg_grad[atom] - np.dot(w, neg_grad)
+
+        if it == 0:
+            error_init = error
+            dgap_init = dgap
+
+        if verbose:
+            print("iter", it + 1)
+            print("duality gap", dgap / dgap_init)
+            print("error reduction", error / error_init)
+            print("l1 norm", beta * np.sum(np.abs(w)))
+            print("n_nz", np.sum(w != 0))
+            print()
+
+        # Find optimal step size by exact line search.
+        Xs = s * X[:, atom]
+        if sp.issparse(Xs):
+            Xs_sq = np.dot(Xs.data, Xs.data)
+        else:
+            Xs_sq = np.dot(Xs, Xs)
+        y_pred_sq = np.dot(y_pred, y_pred)
+        # The denominator below is beta * ||Xs - y_pred||^2 in expanded form.
+        gamma = np.dot(resid, y_pred) - safe_sparse_dot(resid, Xs)
+        gamma /= beta * (Xs_sq - 2 * safe_sparse_dot(Xs.T, y_pred) + y_pred_sq)
+        gamma = max(0, min(1, gamma))
+
+        # Update parameters.
+        w *= (1 - gamma)
+        w[atom] += gamma * s
+
+        # Stop if maximum number of non-zero coefficients is reached.
+        if max_nz is not None and np.sum(w != 0) == max_nz:
+            break
+
+        # Stop if desired duality gap tolerance is reached.
+        if dgap / dgap_init <= tol:
+            if verbose:
+                print("Converged")
+            break
+
+    w *= beta
+    return w
+
+
+class FWRegressor(BaseEstimator, RegressorMixin):
+
+    def __init__(self, beta=1.0, max_iter=50, tol=1e-3, max_nz=None, verbose=0):
+        self.beta = beta
+        self.max_iter = max_iter
+        self.tol = tol
+        self.max_nz = max_nz
+        self.verbose = verbose
+
+    def fit(self, X, y):
+        n_features = X.shape[1]
+        coef = np.zeros(n_features)
+        self.coef_ = _frank_wolfe(coef, X, y, beta=self.beta,
+                                  max_iter=self.max_iter, tol=self.tol,
+                                  max_nz=self.max_nz, verbose=self.verbose)
+        return self
+
+    def predict(self, X):
+        return safe_sparse_dot(X, self.coef_)
+
+
+if __name__ == '__main__':
+    from sklearn.datasets import load_diabetes
+    from sklearn.preprocessing import StandardScaler
+
+    diabetes = load_diabetes()
+    X, y = diabetes.data, diabetes.target
+    X = StandardScaler().fit_transform(X)
+    #X = sp.csr_matrix(X)
+
+    reg = FWRegressor(beta=100, max_iter=1000, tol=1e-2, verbose=1)
+    reg.fit(X, y)
+    y_pred = reg.predict(X)
+    error = np.mean((y - y_pred) ** 2)
+    print(error)
diff --git a/lightning/regression.py b/lightning/regression.py
index 0fdf735d..4fa66f11 100644
--- a/lightning/regression.py
+++ b/lightning/regression.py
@@ -2,6 +2,7 @@
 from .impl.dual_cd import LinearSVR
 from .impl.primal_cd import CDRegressor
 from .impl.fista import FistaRegressor
+from .impl.fw import FWRegressor
 from .impl.sag import SAGRegressor
 from .impl.sag import SAGARegressor
 from .impl.sdca import SDCARegressor

From 7fe802d104f859d5105ab557c18ed67da2fcf5c9 Mon Sep 17 00:00:00 2001
From: Mathieu Blondel
Date: Thu, 3 Dec 2015 23:26:37 +0900
Subject: [PATCH 2/5] Add example.

---
 examples/plot_l1_reg.py | 59 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 examples/plot_l1_reg.py

diff --git a/examples/plot_l1_reg.py b/examples/plot_l1_reg.py
new file mode 100644
index 00000000..3ab5f95b
--- /dev/null
+++ b/examples/plot_l1_reg.py
@@ -0,0 +1,59 @@
+
+"""
+========================================
+L1 regression: regularization paths
+========================================
+
+Shows that the regularization paths obtained by coordinate descent (penalized)
+and Frank-Wolfe (constrained) are equivalent.
+""" +print __doc__ +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.datasets import load_diabetes +from sklearn.model_selection import train_test_split + +from lightning.regression import CDRegressor +from lightning.regression import FWRegressor + +diabetes = load_diabetes() +X, y = diabetes.data, diabetes.target + +X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=0.75, random_state=0) + +plt.figure() + +betas = np.logspace(-2, 5, 50) +alphas = np.logspace(-4, 4, 50) + +fw_n_nz = [] +fw_error = [] +cd_n_nz = [] +cd_error = [] + +for beta in betas: + reg = FWRegressor(beta=beta, max_iter=1000, tol=1e-3, verbose=0) + reg.fit(X_tr, y_tr) + y_pred = reg.predict(X_te) + fw_n_nz.append(np.sum(reg.coef_ != 0)) + fw_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2))) + +for alpha in alphas: + reg = CDRegressor(alpha=alpha, penalty="l1", max_iter=1000, tol=1e-3, + verbose=0) + reg.fit(X_tr, y_tr) + y_pred = reg.predict(X_te) + cd_n_nz.append(np.sum(reg.coef_ != 0)) + cd_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2))) + +plt.plot(fw_n_nz, fw_error, label="Frank-Wolfe", linewidth=3) +plt.plot(cd_n_nz, cd_error, label="Coordinate Descent", linewidth=3, linestyle="--") + +plt.xlabel("Number of non-zero coefficients") +plt.ylabel("RMSE") +plt.xlim((0, X_tr.shape[1])) +#plt.ylim((160, 170)) +plt.legend() + +plt.show() From 21e07117d3c8fc323d477a3a2c021cd8f34e9651 Mon Sep 17 00:00:00 2001 From: Mathieu Blondel Date: Tue, 18 Oct 2016 21:02:29 +0900 Subject: [PATCH 3/5] Add FISTA to plot. --- examples/plot_l1_reg.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/plot_l1_reg.py b/examples/plot_l1_reg.py index 3ab5f95b..fcbc48c0 100644 --- a/examples/plot_l1_reg.py +++ b/examples/plot_l1_reg.py @@ -16,6 +16,7 @@ from lightning.regression import CDRegressor from lightning.regression import FWRegressor +from lightning.regression import FistaRegressor diabetes = load_diabetes() X, y = diabetes.data, diabetes.target @@ -29,6 +30,8 @@ fw_n_nz = [] fw_error = [] +fista_n_nz = [] +fista_error = [] cd_n_nz = [] cd_error = [] @@ -39,6 +42,13 @@ fw_n_nz.append(np.sum(reg.coef_ != 0)) fw_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2))) + if beta <= 1000: + reg = FistaRegressor(penalty="l1-ball", alpha=beta, max_iter=1000, verbose=0) + reg.fit(X_tr, y_tr) + y_pred = reg.predict(X_te) + fista_n_nz.append(np.sum(reg.coef_ != 0)) + fista_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2))) + for alpha in alphas: reg = CDRegressor(alpha=alpha, penalty="l1", max_iter=1000, tol=1e-3, verbose=0) @@ -47,7 +57,10 @@ cd_n_nz.append(np.sum(reg.coef_ != 0)) cd_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2))) +#fista_error = np.array(fista_error)[np.argsort(fista_n_nz)] + plt.plot(fw_n_nz, fw_error, label="Frank-Wolfe", linewidth=3) +plt.plot(fista_n_nz, fista_error, label="FISTA", linewidth=3, marker="s") plt.plot(cd_n_nz, cd_error, label="Coordinate Descent", linewidth=3, linestyle="--") plt.xlabel("Number of non-zero coefficients") From 55a4b02692b99b7d3a8fd394e20a9f67e0ef73e4 Mon Sep 17 00:00:00 2001 From: Mathieu Blondel Date: Thu, 20 Oct 2016 23:50:57 +0900 Subject: [PATCH 4/5] Add simplex option. 
---
 lightning/impl/fw.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lightning/impl/fw.py b/lightning/impl/fw.py
index 801f455e..1dde8f09 100644
--- a/lightning/impl/fw.py
+++ b/lightning/impl/fw.py
@@ -6,7 +6,7 @@
 
 
 def _frank_wolfe(w_init, X, y, beta, max_iter=50, tol=1e-3, max_nz=None,
-                 verbose=0):
+                 simplex=False, verbose=0):
     """
     Solve
 
@@ -29,8 +29,12 @@
         resid = beta * y_pred - y
         neg_grad = -safe_sparse_dot(X.T, beta * resid)
 
-        atom = np.argmax(np.abs(neg_grad))
-        s = np.sign(neg_grad[atom])
+        if simplex:
+            atom = np.argmax(neg_grad)
+            s = 1
+        else:
+            atom = np.argmax(np.abs(neg_grad))
+            s = np.sign(neg_grad[atom])
 
         error = np.dot(resid, resid)
         dgap = s * neg_grad[atom] - np.dot(w, neg_grad)
@@ -79,11 +83,13 @@
 
 class FWRegressor(BaseEstimator, RegressorMixin):
 
-    def __init__(self, beta=1.0, max_iter=50, tol=1e-3, max_nz=None, verbose=0):
+    def __init__(self, beta=1.0, max_iter=50, tol=1e-3, max_nz=None,
+                 simplex=False, verbose=0):
         self.beta = beta
         self.max_iter = max_iter
         self.tol = tol
         self.max_nz = max_nz
+        self.simplex = simplex
         self.verbose = verbose
 
     def fit(self, X, y):
@@ -91,7 +97,8 @@ def fit(self, X, y):
         coef = np.zeros(n_features)
         self.coef_ = _frank_wolfe(coef, X, y, beta=self.beta,
                                   max_iter=self.max_iter, tol=self.tol,
-                                  max_nz=self.max_nz, verbose=self.verbose)
+                                  max_nz=self.max_nz, simplex=self.simplex,
+                                  verbose=self.verbose)
         return self
 
     def predict(self, X):

From 27500f08aff37cb8c4a33a91dbaf9ea9f6719866 Mon Sep 17 00:00:00 2001
From: Mathieu Blondel
Date: Wed, 18 Jan 2017 13:10:43 +0900
Subject: [PATCH 5/5] Use abs(x) > 1e-9 instead of x != 0.

---
 examples/plot_l1_reg.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/plot_l1_reg.py b/examples/plot_l1_reg.py
index fcbc48c0..d443bbad 100644
--- a/examples/plot_l1_reg.py
+++ b/examples/plot_l1_reg.py
@@ -39,28 +39,28 @@
     reg = FWRegressor(beta=beta, max_iter=1000, tol=1e-3, verbose=0)
     reg.fit(X_tr, y_tr)
     y_pred = reg.predict(X_te)
-    fw_n_nz.append(np.sum(reg.coef_ != 0))
+    fw_n_nz.append(np.sum(np.abs(reg.coef_) > 1e-9))
     fw_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2)))
 
-    if beta <= 1000:
-        reg = FistaRegressor(penalty="l1-ball", alpha=beta, max_iter=1000, verbose=0)
-        reg.fit(X_tr, y_tr)
-        y_pred = reg.predict(X_te)
-        fista_n_nz.append(np.sum(reg.coef_ != 0))
-        fista_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2)))
+    reg = FistaRegressor(penalty="l1-ball", alpha=beta, max_iter=1000, verbose=0)
+    reg.fit(X_tr, y_tr)
+    y_pred = reg.predict(X_te)
+    fista_n_nz.append(np.sum(np.abs(reg.coef_) > 1e-9))
+    fista_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2)))
 
 for alpha in alphas:
     reg = CDRegressor(alpha=alpha, penalty="l1", max_iter=1000, tol=1e-3,
                       verbose=0)
     reg.fit(X_tr, y_tr)
     y_pred = reg.predict(X_te)
-    cd_n_nz.append(np.sum(reg.coef_ != 0))
+    cd_n_nz.append(np.sum(np.abs(reg.coef_) > 1e-9))
     cd_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2)))
 
 #fista_error = np.array(fista_error)[np.argsort(fista_n_nz)]
 
 plt.plot(fw_n_nz, fw_error, label="Frank-Wolfe", linewidth=3)
-plt.plot(fista_n_nz, fista_error, label="FISTA", linewidth=3, marker="s")
+plt.plot(fista_n_nz, fista_error, label="FISTA", linewidth=3, marker="s",
+         linestyle="--")
 plt.plot(cd_n_nz, cd_error, label="Coordinate Descent", linewidth=3, linestyle="--")
 
 plt.xlabel("Number of non-zero coefficients")
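
Note on patch 1's update rule: the following is a condensed, self-contained
sketch of the same Frank-Wolfe step (function and variable names are mine;
it parameterizes the l1 ball of radius beta directly, whereas `_frank_wolfe`
keeps ||w||_1 <= 1 and rescales by beta at the end, so the iterates coincide):

    import numpy as np

    def fw_l1_least_squares(X, y, beta, n_iter=200, tol=1e-4):
        # Minimize 0.5 * ||np.dot(X, w) - y||^2 s.t. ||w||_1 <= beta.
        w = np.zeros(X.shape[1])
        for _ in range(n_iter):
            resid = np.dot(X, w) - y
            grad = np.dot(X.T, resid)
            # Linear minimization oracle over the l1 ball: the best vertex is
            # v = -sign(grad[j]) * beta * e_j where j maximizes |grad[j]|.
            j = int(np.argmax(np.abs(grad)))
            s = -np.sign(grad[j])
            # Frank-Wolfe duality gap <grad, w - v>: an upper bound on the
            # suboptimality f(w) - f(w_opt), hence a sound stopping criterion.
            gap = np.dot(grad, w) + beta * np.abs(grad[j])
            if gap <= tol:
                break
            # Exact line search: f((1 - t) * w + t * v) is quadratic in t and
            # is minimized at t = gap / ||X(v - w)||^2, clipped to [0, 1].
            Xd = s * beta * X[:, j] - (resid + y)  # X(v - w); np.dot(X, w) = resid + y
            t = min(1.0, gap / np.dot(Xd, Xd))
            w *= 1.0 - t
            w[j] += t * s * beta
        return w

Up to the stopping criteria (the patch uses a relative gap, dgap / dgap_init,
rather than this absolute one), the returned w should match
FWRegressor(beta=beta).fit(X, y).coef_ on dense data.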
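
Note on the example's docstring: the penalized and constrained paths match
because of a standard argument. If w_opt minimizes
0.5 * ||np.dot(X, w) - y||^2 + alpha * ||w||_1, then any w with
||w||_1 <= ||w_opt||_1 pays no more penalty than w_opt, so it cannot achieve
a smaller squared error without contradicting the optimality of w_opt; hence
w_opt also solves the constrained problem with beta = ||w_opt||_1. A quick
numerical check of the correspondence (a sketch, not part of the patches;
agreement holds only up to the two solvers' tolerances):

    import numpy as np
    from sklearn.datasets import load_diabetes
    from sklearn.preprocessing import StandardScaler
    from lightning.regression import CDRegressor, FWRegressor

    diabetes = load_diabetes()
    X = StandardScaler().fit_transform(diabetes.data)
    y = diabetes.target

    pen = CDRegressor(alpha=1.0, penalty="l1", max_iter=5000, tol=1e-6).fit(X, y)
    beta = np.sum(np.abs(pen.coef_))  # l1 radius induced by the penalized fit
    con = FWRegressor(beta=beta, max_iter=5000, tol=1e-6).fit(X, y)

    print(np.max(np.abs(pen.coef_.ravel() - con.coef_)))  # near zero, up to FW accuracy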
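
Note on patch 5's 1e-9 threshold: coordinates that Frank-Wolfe never selects
stay exactly zero, but solvers that reach sparsity through projection or
shrinkage steps can leave round-off-level values in coordinates that are
conceptually zero, so counting reg.coef_ != 0 can overstate the number of
non-zeros. Thresholding makes the sparsity counts comparable across the three
solvers:

    import numpy as np

    coef = np.array([0.3, 1e-13, -0.7])  # 1e-13: a numerically-zero coordinate
    print(np.sum(coef != 0))             # 3
    print(np.sum(np.abs(coef) > 1e-9))   # 2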