diff --git a/models/LogReg.py b/models/LogReg.py
index 316a82f..0075491 100644
--- a/models/LogReg.py
+++ b/models/LogReg.py
@@ -17,14 +17,14 @@ class LogReg:
     Add multinomial functions and unit tests.
     Add functionality for regression(?)
     Inputs:
-    params: dictionary of logistic regression input functions.
-        keys max_iter, tol, and C supported.
+    kwargs: logistic regression input parameters.
+        keys random_state, max_iter, tol, and C supported.
     random_state: int/float for reproducible initialization.
     '''
 
     # only binary so far
     def __init__(self, **kwargs):
-        # supported keys = ['max_iter', 'tol', 'C']
+        # supported keys = ['max_iter', 'tol', 'C', 'random_state']
         # defaults to a fixed value for reproducibility
         self.random_state = kwargs.pop('random_state', 0)
         # parameters for logistic regression model:
diff --git a/models/SSML/CoTraining.py b/models/SSML/CoTraining.py
new file mode 100644
index 0000000..9fda2db
--- /dev/null
+++ b/models/SSML/CoTraining.py
@@ -0,0 +1,329 @@
+import numpy as np
+import matplotlib.pyplot as plt
+# For hyperopt (parameter optimization)
+from hyperopt import STATUS_OK
+# sklearn models
+from sklearn import linear_model
+# diagnostics
+from sklearn.metrics import balanced_accuracy_score
+from scripts.utils import run_hyperopt
+import joblib
+
+
+class CoTraining:
+    '''
+    Methods for deploying a basic co-training implementation with
+    logistic regression and hyperparameter optimization.
+    Data agnostic (i.e. user-supplied data inputs).
+    TODO: Currently only supports binary classification.
+        - Add multinomial functions and unit tests.
+        - Add functionality for regression(?)
+    Inputs:
+    kwargs: logistic regression input parameters.
+        keys seed, random_state, n_samples, max_iter, tol,
+        and C supported.
+    random_state: int for reproducible model initialization.
+    seed: int for reproducible sampling of training splits.
+    '''
+
+    # only binary so far
+    def __init__(self, **kwargs):
+        # supported keys = ['max_iter', 'tol', 'C',
+        #                   'random_state', 'seed', 'n_samples']
+        # defaults to a fixed value for reproducibility
+        self.random_state = kwargs.pop('random_state', 0)
+        # set the random seed of training splits for reproducibility
+        self.seed = kwargs.pop('seed', 0)
+        np.random.seed(self.seed)
+        # parameters for co-training logistic regression models:
+        # defaults to sklearn.linear_model.LogisticRegression default vals
+        self.max_iter = kwargs.pop('max_iter', 100)
+        self.tol = kwargs.pop('tol', 0.0001)
+        self.C = kwargs.pop('C', 1.0)
+        self.n_samples = kwargs.pop('n_samples', 1)
+        self.model1 = linear_model.LogisticRegression(
+            random_state=self.random_state,
+            max_iter=self.max_iter,
+            tol=self.tol,
+            C=self.C
+        )
+        self.model2 = linear_model.LogisticRegression(
+            random_state=self.random_state,
+            max_iter=self.max_iter,
+            tol=self.tol,
+            C=self.C
+        )
+
+    def training_loop(self, slr1, slr2, L_lr1, L_lr2,
+                      Ly_lr1, Ly_lr2, U_lr, n_samples,
+                      testx=None, testy=None):
+        '''
+        Main training iteration for co-training.
+        Given two models, labeled training data, and unlabeled
+        training data:
+        - Train both models using their respective labeled datasets.
+        - Randomly sample n_samples unlabeled instances for each of
+            model 1 and model 2.
+        - Label the sampled instances using model 1 (u1) and
+            model 2 (u2).
+        - Remove u1 and u2 from the unlabeled dataset and add each,
+            with its predicted labels, to the other model's labeled
+            dataset for future training (u1 is added to model 2's
+            training data and vice versa).
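+        For example (an illustrative trace, not enforced by the code):
+        with n_samples=2 and 10 unlabeled instances, each iteration
+        labels 2 instances per model (4 total), so the pool shrinks
+        10 -> 6 -> 2; the final iteration reduces n_samples to 1 so
+        both models can still sample, and the loop then terminates.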
+        Inputs:
+        slr1: logistic regression co-training model #1
+        slr2: logistic regression co-training model #2
+        L_lr1: feature training data for co-training model #1
+        L_lr2: feature training data for co-training model #2
+        Ly_lr1: labels for input data for co-training model #1
+        Ly_lr2: labels for input data for co-training model #2
+        U_lr: unlabeled feature training data used by both models
+        n_samples: the number of instances to sample and
+            predict from Ux at one time
+        testx: feature vector/matrix used for testing the performance
+            of each model at every iteration.
+        testy: label vector used for testing the performance
+            of each model at every iteration.
+        '''
+
+        model1_accs, model2_accs = np.array([]), np.array([])
+        # should stay False; if True, the same unlabeled instance
+        # could be sampled multiple times
+        rep = False
+        while U_lr.shape[0] > 1:
+            slr1.fit(L_lr1, Ly_lr1)
+            slr2.fit(L_lr2, Ly_lr2)
+
+            # pull u1
+            # ensure there are enough instances to sample for each model
+            if U_lr.shape[0] < n_samples*2:
+                n_samples = int(U_lr.shape[0]/2)
+            uidx1 = np.random.choice(range(U_lr.shape[0]),
+                                     n_samples,
+                                     replace=rep)
+            u1 = U_lr[uidx1].copy()
+            # remove instances that will be labeled
+            U_lr = np.delete(U_lr, uidx1, axis=0)
+
+            # pull u2
+            uidx2 = np.random.choice(range(U_lr.shape[0]),
+                                     n_samples,
+                                     replace=rep)
+            u2 = U_lr[uidx2].copy()
+            # remove instances that will be labeled
+            U_lr = np.delete(U_lr, uidx2, axis=0)
+
+            # predict unlabeled samples
+            u1y = slr1.predict(u1)
+            u2y = slr2.predict(u2)
+
+            if testx is not None and testy is not None:
+                # test and save model(s) accuracy
+                # over all training iterations
+                model1_accs = np.append(model1_accs,
+                                        balanced_accuracy_score(
+                                            testy, slr1.predict(testx)))
+                model2_accs = np.append(model2_accs,
+                                        balanced_accuracy_score(
+                                            testy, slr2.predict(testx)))
+
+            # add predictions to the co-trained models' labeled samples
+            L_lr1 = np.append(L_lr1, u2, axis=0)
+            L_lr2 = np.append(L_lr2, u1, axis=0)
+            Ly_lr1 = np.append(Ly_lr1, u2y, axis=0)
+            Ly_lr2 = np.append(Ly_lr2, u1y, axis=0)
+
+        return slr1, slr2, model1_accs, model2_accs
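+
+    # A minimal usage sketch (illustrative only; X, y, Ux, testx, and
+    # testy are assumed to be user-supplied numpy arrays):
+    #
+    #   ct = CoTraining(n_samples=5, seed=0)
+    #   acc1_hist, acc2_hist = ct.train(X, y, Ux, testx, testy)
+    #   pred1, acc, pred2, acc1, acc2 = ct.predict(testx, testy)
+    #
+    # train() splits the labeled data between the two models and runs
+    # training_loop() until the unlabeled pool is exhausted.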
+
+    def fresh_start(self, params, data_dict):
+        '''
+        Required method for hyperopt optimization.
+        Trains and tests a fresh co-training model
+        with given input parameters.
+        This method does not overwrite self.model (self.optimize() does).
+        Inputs:
+        params: dictionary of logistic regression input parameters.
+            keys n_samples, max_iter, tol, and C supported.
+        data_dict: compact data representation with the five requisite
+            data structures used for training and testing a model.
+            keys trainx, trainy, testx, testy, and Ux required.
+            NOTE: Uy is not needed since labels for unlabeled data
+            instances are not used.
+        '''
+
+        # unpack data
+        trainx = data_dict['trainx']
+        trainy = data_dict['trainy']
+        testx = data_dict['testx']
+        testy = data_dict['testy']
+        # unlabeled co-training data
+        Ux = data_dict['Ux']
+
+        clf = CoTraining(**params, random_state=self.random_state)
+        # training and testing
+        model1_accs, model2_accs = clf.train(trainx, trainy, Ux,
+                                             testx, testy)
+        # balanced_accuracy accounts for class-imbalanced data
+        pred1, acc, pred2, model1_acc, model2_acc = clf.predict(testx,
+                                                                testy)
+
+        return {'loss': 1-acc,
+                'status': STATUS_OK,
+                'model': clf.model1,
+                'model2': clf.model2,
+                'model1_acc_history': model1_accs,
+                'model2_acc_history': model2_accs,
+                'params': params,
+                'accuracy': acc}
+
+    def optimize(self, space, data_dict, max_evals=50, verbose=True):
+        '''
+        Wrapper method for using hyperopt (see utils.run_hyperopt
+        for more details). After hyperparameter optimization, results
+        are stored, the best model -overwrites- self.model, and the
+        best params -overwrite- self.params.
+        Inputs:
+        space: a hyperopt compliant dictionary with defined optimization
+            spaces. For example:
+                # quniform returns float, some parameters require int;
+                # use this to force int
+                # loguniform samples exp(uniform(low, high)),
+                # so bounds are given in log space
+                space = {'max_iter': scope.int(hp.quniform('max_iter',
+                                                           10,
+                                                           10000,
+                                                           10)),
+                         'tol': hp.loguniform('tol',
+                                              np.log(1e-5),
+                                              np.log(1e-3)),
+                         'C': hp.uniform('C', 1.0, 1000.0),
+                         'n_samples': scope.int(hp.quniform('n_samples',
+                                                            1,
+                                                            20,
+                                                            1))
+                         }
+            See the hyperopt docs for more information.
+        data_dict: compact data representation with the five requisite
+            data structures used for training and testing an SSML model.
+            keys trainx, trainy, testx, testy, and Ux required.
+            NOTE: Uy is not needed since labels for unlabeled data
+            instances are not used.
+        max_evals: the number of epochs for hyperparameter optimization.
+            Each iteration is one set of hyperparameters trained
+            and tested on a fresh model. Convergence for simpler
+            models like logistic regression typically happens well
+            before 50 epochs, but more evaluations can be needed as
+            more complex models, more hyperparameters, and larger
+            hyperparameter spaces are tested.
+        verbose: boolean. If True, print the results of hyperopt.
+            If False, print only the progress bar for optimization.
+        '''
+
+        best, worst = run_hyperopt(space=space,
+                                   model=self.fresh_start,
+                                   data_dict=data_dict,
+                                   max_evals=max_evals,
+                                   verbose=verbose)
+
+        # save the results of hyperparameter optimization
+        self.best = best
+        self.model = best['model']
+        self.params = best['params']
+        self.worst = worst
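+
+    # Example call (continuing the sketch above; variable names are
+    # hypothetical), after which the best co-trained model and its
+    # parameters are stored on the instance:
+    #
+    #   data_dict = {'trainx': trainx, 'trainy': trainy,
+    #                'testx': testx, 'testy': testy, 'Ux': Ux}
+    #   ct.optimize(space, data_dict, max_evals=50)
+    #   best_params = ct.params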
+
+    def train(self, trainx, trainy, Ux,
+              testx=None, testy=None):
+        '''
+        Wrapper method for the basic co-training with logistic
+        regression training loop.
+        Inputs:
+        trainx: nxm feature vector/matrix for training model.
+        trainy: nxk class label vector/matrix for training model.
+        Ux: feature vector/matrix like labeled trainx but unlabeled data.
+        testx: feature vector/matrix used for testing the performance
+            of each model at every iteration.
+        testy: label vector used for testing the performance
+            of each model at every iteration.
+        '''
+
+        # avoid overwriting when deleting in co-training loop
+        U_lr = Ux.copy()
+
+        # TODO: allow a user to specify uneven splits
+        # between the two models
+        split_frac = 0.5
+        # labeled training data
+        idx = np.random.choice(range(trainy.shape[0]),
+                               size=int(split_frac * trainy.shape[0]),
+                               replace=False)
+        # complement of idx (note: ~idx would be a bitwise NOT of the
+        # integer indices, not the complementary index set)
+        cidx = np.setdiff1d(np.arange(trainy.shape[0]), idx)
+
+        # avoid overwriting when deleting in co-training loop
+        L_lr1 = trainx[idx].copy()
+        L_lr2 = trainx[cidx].copy()
+        Ly_lr1 = trainy[idx].copy()
+        Ly_lr2 = trainy[cidx].copy()
+
+        self.model1, self.model2, model1_accs, model2_accs = \
+            self.training_loop(
+                self.model1, self.model2,
+                L_lr1, L_lr2,
+                Ly_lr1, Ly_lr2,
+                U_lr, self.n_samples,
+                testx, testy,
+            )
+
+        # optional returns if a user is interested in training diagnostics
+        return model1_accs, model2_accs
+
+    def predict(self, testx, testy=None):
+        '''
+        Wrapper method for the co-trained logistic regression models'
+        predict methods.
+        Inputs:
+        testx: nxm feature vector/matrix for testing model.
+        testy: nxk class label vector/matrix for testing model.
+            optional: if included, the predicted classes -and-
+            the resulting classification accuracies will be returned.
+        '''
+
+        pred1 = self.model1.predict(testx)
+        pred2 = self.model2.predict(testx)
+
+        acc, model1_acc, model2_acc = None, None, None
+        if testy is not None:
+            # balanced_accuracy accounts for class-imbalanced data
+            # could alternatively use pure accuracy
+            # for a more traditional hyperopt
+            model1_acc = balanced_accuracy_score(testy, pred1)
+            model2_acc = balanced_accuracy_score(testy, pred2)
+            # select the best accuracy for hyperparameter optimization
+            acc = max(model1_acc, model2_acc)
+
+        return pred1, acc, pred2, model1_acc, model2_acc
+
+    def plot_cotraining(self, model1_accs=None, model2_accs=None,
+                        filename='lr-cotraining-learningcurves.png'):
+        '''
+        Plots the test accuracy curves for the two co-training models.
+        NOTE: The user must provide the curves to plot, but each curve
+        is saved by the class under the self.best and self.worst models.
+        Inputs:
+        model1_accs: the accuracy scores over training epochs for model 1
+        model2_accs: the accuracy scores over training epochs for model 2
+        filename: name to store picture under.
+            Must end in .png (it will be added if missing).
+        '''
+
+        fig, ax = plt.subplots(figsize=(10, 8), dpi=300)
+        ax.plot(np.arange(len(model1_accs)), model1_accs,
+                color='tab:blue', label='Model 1')
+        ax.plot(np.arange(len(model2_accs)), model2_accs,
+                color='tab:orange', label='Model 2')
+        ax.legend()
+        ax.set_xlabel('Co-Training Iteration')
+        ax.set_ylabel('Test Accuracy')
+        ax.grid()
+
+        if filename[-4:] != '.png':
+            filename += '.png'
+        fig.savefig(filename)
+
+    def save(self, filename):
+        '''
+        Save class instance to file using joblib.
+        Inputs:
+        filename: string filename to save object to file under.
+            The file must be saved with extension .joblib.
+            Added to filename if not included as input.
+        '''
+
+        if filename[-7:] != '.joblib':
+            filename += '.joblib'
+        joblib.dump(self, filename)
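+
+    # A saved instance can be restored with joblib, e.g.
+    # (assuming models.SSML.CoTraining is importable at load time):
+    #
+    #   ct = joblib.load('my-cotraining-model.joblib')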
diff --git a/models/SSML/__init__.py b/models/SSML/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_models.py b/tests/test_models.py
index 5b66f65..75b6702 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -14,6 +14,7 @@ import scripts.utils as utils
 # models
 from models.LogReg import LogReg
+from models.SSML.CoTraining import CoTraining
 # testing write
 import joblib
 import os
@@ -65,16 +66,6 @@ def test_cross_validation():
     # therefore its accuracy should be less than all other folds
     assert (accs[-1] < accs[:-1]).all()
 
-    # test cross validation for supervised data and StratifiedKFold with LogReg
-    # params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
-    # model = LogReg(params=params)
-    # max_acc_model = utils.cross_validation(model=model,
-    #                                        X=X,
-    #                                        y=y,
-    #                                        params=params,
-    #                                        stratified=True)
-    # assert max_acc_model['accuracy'] >= 0.5
-
     # test cross validation for SSML with LabelProp
     # params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
     # model = LabelProp(params=params)
@@ -104,9 +95,10 @@ def test_pca():
     utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 2)
     os.remove(filename+'.png')
 
-    # filename = 'test_multiD_pca'
-    # utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5)
-    # os.remove(filename+'.png')
+    filename = 'test_multiD_pca'
+    pcs = utils.pca(X_train, Ux, 5)
+    utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 5)
+    os.remove(filename+'.png')
 
     # normalization
     normalizer = StandardScaler()
@@ -129,14 +121,16 @@
 
 def test_LogReg():
     # test saving model input parameters
-    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
+    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0, 'random_state': 0}
     model = LogReg(max_iter=params['max_iter'],
                    tol=params['tol'],
-                   C=params['C'])
+                   C=params['C'],
+                   random_state=params['random_state'])
 
     assert model.model.max_iter == params['max_iter']
     assert model.model.tol == params['tol']
     assert model.model.C == params['C']
+    assert model.random_state == params['random_state']
 
     X_train, X_test, y_train, y_test = train_test_split(pytest.spectra,
                                                         pytest.labels,
@@ -190,3 +184,94 @@ def test_LogReg():
     assert model_file.best['params'] == model.best['params']
 
     os.remove(filename+ext)
+
+
+def test_CoTraining():
+    # test saving model input parameters
+    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0,
+              'random_state': 0, 'seed': 1}
+    model = CoTraining(max_iter=params['max_iter'],
+                       tol=params['tol'],
+                       C=params['C'],
+                       random_state=params['random_state'],
+                       seed=params['seed'])
+
+    assert model.model1.max_iter == params['max_iter']
+    assert model.model1.tol == params['tol']
+    assert model.model1.C == params['C']
+
+    assert model.model2.max_iter == params['max_iter']
+    assert model.model2.tol == params['tol']
+    assert model.model2.C == params['C']
+
+    assert model.random_state == params['random_state']
+    assert model.seed == params['seed']
+
+    X, Ux, y, Uy = train_test_split(pytest.spectra,
+                                    pytest.labels,
+                                    test_size=0.5,
+                                    random_state=0)
+    X_train, X_test, y_train, y_test = train_test_split(X,
+                                                        y,
+                                                        test_size=0.2,
+                                                        random_state=0)
+
+    # normalization
+    normalizer = StandardScaler()
+    normalizer.fit(X_train)
+
+    X_train = normalizer.transform(X_train)
+    X_test = normalizer.transform(X_test)
+    Ux = normalizer.transform(Ux)
+
+    # default behavior
+    model = CoTraining(random_state=0)
+    model.train(X_train, y_train, Ux)
+
+    # testing train and predict methods
+    pred, acc, *_ = model.predict(X_test, y_test)
+
+    # since the test data used here is synthetic/toy data
+    # (i.e. uninteresting), the trained model should be at least
+    # better than a 50-50 guess; if it were worse, something would
+    # be wrong with the ML class
+    assert acc > 0.5
+
+    # testing hyperopt optimize methods
+    space = {'max_iter': scope.int(hp.quniform('max_iter',
+                                               10,
+                                               10000,
+                                               10)),
+             # loguniform expects log-space bounds
+             'tol': hp.loguniform('tol', np.log(1e-5), np.log(1e-3)),
+             'C': hp.uniform('C', 1.0, 1000.0),
+             'n_samples': scope.int(hp.quniform('n_samples',
+                                                1,
+                                                20,
+                                                1)),
+             'seed': 0
+             }
+    data_dict = {'trainx': X_train,
+                 'testx': X_test,
+                 'trainy': y_train,
+                 'testy': y_test,
+                 'Ux': Ux
+                 }
+    model.optimize(space, data_dict, max_evals=2, verbose=True)
+
+    assert model.best['accuracy'] >= model.worst['accuracy']
+    assert model.best['status'] == 'ok'
+
+    # testing model plotting method
+    filename = 'test_plot'
+    model.plot_cotraining(model1_accs=model.best['model1_acc_history'],
+                          model2_accs=model.best['model2_acc_history'],
+                          filename=filename)
+    os.remove(filename+'.png')
+
+    # testing model write to file method
+    filename = 'test_CoTraining'
+    ext = '.joblib'
+    model.save(filename)
+    model_file = joblib.load(filename+ext)
+    assert model_file.best['params'] == model.best['params']
+
+    os.remove(filename+ext)