diff --git a/examples/plot_l1_reg.py b/examples/plot_l1_reg.py
new file mode 100644
index 00000000..d443bbad
--- /dev/null
+++ b/examples/plot_l1_reg.py
@@ -0,0 +1,72 @@
+"""
+========================================
+L1 regression: regularization paths
+========================================
+
+Shows that the regularization paths obtained by coordinate descent (penalized)
+and Frank-Wolfe (constrained) are equivalent.
+"""
+print(__doc__)
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import load_diabetes
+from sklearn.model_selection import train_test_split
+
+from lightning.regression import CDRegressor
+from lightning.regression import FWRegressor
+from lightning.regression import FistaRegressor
+
+diabetes = load_diabetes()
+X, y = diabetes.data, diabetes.target
+
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=0.75,
+                                          random_state=0)
+
+plt.figure()
+
+betas = np.logspace(-2, 5, 50)
+alphas = np.logspace(-4, 4, 50)
+
+fw_n_nz = []
+fw_error = []
+fista_n_nz = []
+fista_error = []
+cd_n_nz = []
+cd_error = []
+
+for beta in betas:
+    reg = FWRegressor(beta=beta, max_iter=1000, tol=1e-3, verbose=0)
+    reg.fit(X_tr, y_tr)
+    y_pred = reg.predict(X_te)
+    fw_n_nz.append(np.sum(np.abs(reg.coef_) > 1e-9))
+    fw_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2)))
+
+    reg = FistaRegressor(penalty="l1-ball", alpha=beta, max_iter=1000,
+                         verbose=0)
+    reg.fit(X_tr, y_tr)
+    y_pred = reg.predict(X_te)
+    fista_n_nz.append(np.sum(np.abs(reg.coef_) > 1e-9))
+    fista_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2)))
+
+for alpha in alphas:
+    reg = CDRegressor(alpha=alpha, penalty="l1", max_iter=1000, tol=1e-3,
+                      verbose=0)
+    reg.fit(X_tr, y_tr)
+    y_pred = reg.predict(X_te)
+    cd_n_nz.append(np.sum(np.abs(reg.coef_) > 1e-9))
+    cd_error.append(np.sqrt(np.mean((y_te - y_pred) ** 2)))
+
+plt.plot(fw_n_nz, fw_error, label="Frank-Wolfe", linewidth=3)
+plt.plot(fista_n_nz, fista_error, label="FISTA", linewidth=3, marker="s",
+         linestyle="--")
+plt.plot(cd_n_nz, cd_error, label="Coordinate Descent", linewidth=3,
+         linestyle="--")
+
+plt.xlabel("Number of non-zero coefficients")
+plt.ylabel("RMSE")
+plt.xlim((0, X_tr.shape[1]))
+plt.legend()
+
+plt.show()
diff --git a/lightning/impl/fw.py b/lightning/impl/fw.py
new file mode 100644
index 00000000..1dde8f09
--- /dev/null
+++ b/lightning/impl/fw.py
@@ -0,0 +1,121 @@
+import numpy as np
+import scipy.sparse as sp
+
+from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.utils.extmath import safe_sparse_dot
+
+
+def _frank_wolfe(w_init, X, y, beta, max_iter=50, tol=1e-3, max_nz=None,
+                 simplex=False, verbose=0):
+    """
+    Solve
+
+        0.5 * ||np.dot(X, w) - y||^2  s.t.  ||w||_1 <= beta
+
+    by the Frank-Wolfe method.
+
+    The method can be seen as a greedy coordinate descent: it adds at most one
+    non-zero coefficient per iteration.
+    """
+    n_samples, n_features = X.shape
+
+    if sp.issparse(X):
+        X = X.tocsc()
+
+    # w is maintained on the unit l1 ball (or simplex); the returned solution
+    # is rescaled to beta * w at the end.
+    w = w_init.copy()
+
+    for it in range(max_iter):
+        y_pred = safe_sparse_dot(X, w)
+        resid = beta * y_pred - y
+        neg_grad = -safe_sparse_dot(X.T, beta * resid)
+
+        # Linear minimization oracle: pick the vertex of the constraint set
+        # most correlated with the negative gradient.
+        if simplex:
+            atom = np.argmax(neg_grad)
+            s = 1
+        else:
+            atom = np.argmax(np.abs(neg_grad))
+            s = np.sign(neg_grad[atom])
+
+        error = np.dot(resid, resid)
+        dgap = s * neg_grad[atom] - np.dot(w, neg_grad)
+
+        if it == 0:
+            error_init = error
+            dgap_init = dgap
+
+        if verbose:
+            print("iter", it + 1)
+            print("duality gap", dgap / dgap_init)
+            print("error reduction", error / error_init)
+            print("l1 norm", beta * np.sum(np.abs(w)))
+            print("n_nz", np.sum(w != 0))
+            print()
+
+        # Find the optimal step size by exact line search.
+        Xs = s * X[:, atom]
+        if sp.issparse(Xs):
+            Xs_sq = np.dot(Xs.data, Xs.data)
+        else:
+            Xs_sq = np.dot(Xs, Xs)
+        y_pred_sq = np.dot(y_pred, y_pred)
+        gamma = np.dot(resid, y_pred) - safe_sparse_dot(resid, Xs)
+        gamma /= beta * (Xs_sq - 2 * safe_sparse_dot(Xs.T, y_pred) + y_pred_sq)
+        gamma = max(0, min(1, gamma))
+
+        # Update parameters.
+        w *= (1 - gamma)
+        w[atom] += gamma * s
+
+        # Stop if the maximum number of non-zero coefficients is reached.
+        if max_nz is not None and np.sum(w != 0) == max_nz:
+            break
+
+        # Stop if the desired duality gap tolerance is reached.
+        if dgap / dgap_init <= tol:
+            if verbose:
+                print("Converged")
+            break
+
+    w *= beta
+    return w
+
+
+class FWRegressor(BaseEstimator, RegressorMixin):
+
+    def __init__(self, beta=1.0, max_iter=50, tol=1e-3, max_nz=None,
+                 simplex=False, verbose=0):
+        self.beta = beta
+        self.max_iter = max_iter
+        self.tol = tol
+        self.max_nz = max_nz
+        self.simplex = simplex
+        self.verbose = verbose
+
+    def fit(self, X, y):
+        n_features = X.shape[1]
+        coef = np.zeros(n_features)
+        self.coef_ = _frank_wolfe(coef, X, y, beta=self.beta,
+                                  max_iter=self.max_iter, tol=self.tol,
+                                  max_nz=self.max_nz, simplex=self.simplex,
+                                  verbose=self.verbose)
+        return self
+
+    def predict(self, X):
+        return safe_sparse_dot(X, self.coef_)
+
+
+if __name__ == '__main__':
+    from sklearn.datasets import load_diabetes
+    from sklearn.preprocessing import StandardScaler
+
+    diabetes = load_diabetes()
+    X, y = diabetes.data, diabetes.target
+    X = StandardScaler().fit_transform(X)
+    #X = sp.csr_matrix(X)
+
+    reg = FWRegressor(beta=100, max_iter=1000, tol=1e-2, verbose=1)
+    reg.fit(X, y)
+    y_pred = reg.predict(X)
+    error = np.mean((y - y_pred) ** 2)
+    print("MSE:", error)
diff --git a/lightning/regression.py b/lightning/regression.py
index 0fdf735d..4fa66f11 100644
--- a/lightning/regression.py
+++ b/lightning/regression.py
@@ -2,6 +2,7 @@
 from .impl.dual_cd import LinearSVR
 from .impl.primal_cd import CDRegressor
 from .impl.fista import FistaRegressor
+from .impl.fw import FWRegressor
 from .impl.sag import SAGRegressor
 from .impl.sag import SAGARegressor
 from .impl.sdca import SDCARegressor