From 5d87f3f6ae89cb401464c8f44628eb5bc58f9302 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 31 Oct 2022 14:23:39 -0400 Subject: [PATCH 1/5] adding CoTraining class implementation --- models/SSML/CoTraining.py | 335 ++++++++++++++++++++++++++++++++++++++ models/SSML/__init__.py | 0 2 files changed, 335 insertions(+) create mode 100644 models/SSML/CoTraining.py create mode 100644 models/SSML/__init__.py diff --git a/models/SSML/CoTraining.py b/models/SSML/CoTraining.py new file mode 100644 index 0000000..e6757bd --- /dev/null +++ b/models/SSML/CoTraining.py @@ -0,0 +1,335 @@ +import numpy as np +import matplotlib.pyplot as plt +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# sklearn models +from sklearn import linear_model +# diagnostics +from sklearn.metrics import balanced_accuracy_score +from scripts.utils import run_hyperopt +import joblib + + +class CoTraining: + ''' + Methods for deploying a basic co-training with logistic + regression implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + random_state: int/float for reproducible intiailization. + ''' + + # only binary so far + def __init__(self, params=None, random_state=0): + # defaults to a fixed value for reproducibility + self.random_state = random_state + # dictionary of parameters for logistic regression model + self.params = params + if self.params is None: + self.model1 = linear_model.LogisticRegression( + random_state=self.random_state) + self.model2 = linear_model.LogisticRegression( + random_state=self.random_state) + # default needed for training + self.params = {'n_samples': 1} + else: + self.model1 = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=params['max_iter'], + tol=params['tol'], + C=params['C'] + ) + self.model2 = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=params['max_iter'], + tol=params['tol'], + C=params['C'] + ) + + def training_loop(self, slr1, slr2, L_lr1, L_lr2, + Ly_lr1, Ly_lr2, U_lr, n_samples, + testx=None, testy=None): + ''' + Main training iteration for co-training. + Given two models, labeled training data, and unlabeled training data: + - Train both models using their respective labeled datasets + - Randomly sample n_samples number of unlabeled + instances for model 1 and 2 each. + - Label the sampled unlabeled instances using + model 1 (u1) and model 2 (u2). + - Remove u1 and u2 from the unlabeled dataset and + include in each model's respective labeled dataset + with their associated labels for future training. + Inputs: + slr1: logistic regression co-training model #1 + slr2: logistic regression co-training model #2 + L_lr1: feature training data for co-training model #1 + L_lr2: feature training data for co-training model #2 + Ly_lr1: labels for input data for co-training model #1 + Ly_lr2: labels for input data for co-training model #2 + U_lr: unlabeled feature training data used by both models + n_samples: the number of instances to sample and + predict from Ux at one time + testx: feature vector/matrix used for testing the performance + of each model at every iteration. + testy: label vector used for testing the performance + of each model at every iteration. + ''' + + model1_accs, model2_accs = np.array([]), np.array([]) + # should stay false but if true, + # the same unalbeled instance could be sampled multiple times + rep = False + while U_lr.shape[0] > 1: + slr1.fit(L_lr1, Ly_lr1) + slr2.fit(L_lr2, Ly_lr2) + + # pull u1 + # ensuring there is enough instances to sample for each model + if U_lr.shape[0] < n_samples*2: + n_samples = int(U_lr.shape[0]/2) + uidx1 = np.random.choice(range(U_lr.shape[0]), + n_samples, + replace=rep) + u1 = U_lr[uidx1].copy() + # remove instances that will be labeled + U_lr = np.delete(U_lr, uidx1, axis=0) + + # pull u2 + uidx2 = np.random.choice(range(U_lr.shape[0]), + n_samples, + replace=rep) + u2 = U_lr[uidx2].copy() + # remove instances that will be labeled + U_lr = np.delete(U_lr, uidx2, axis=0) + + # predict unlabeled samples + u1y = slr1.predict(u1) + u2y = slr2.predict(u2) + + if testx is not None and testy is not None: + # test and save model(s) accuracy over all training iterations + model1_accs = np.append(model1_accs, + balanced_accuracy_score(testy, + slr1.predict( + testx))) + model2_accs = np.append(model2_accs, + balanced_accuracy_score(testy, + slr2.predict( + testx))) + + # add predictions to cotrained model(s) labeled samples + L_lr1 = np.append(L_lr1, u2, axis=0) + L_lr2 = np.append(L_lr2, u1, axis=0) + Ly_lr1 = np.append(Ly_lr1, u2y, axis=0) + Ly_lr2 = np.append(Ly_lr2, u1y, axis=0) + + return slr1, slr2, model1_accs, model2_accs + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh co-training model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys n_samples, max_iter, tol, and C supported. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + ''' + + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + # unlabeled co-training data + Ux = data_dict['Ux'] + + clf = CoTraining(params=params, random_state=self.random_state) + # training and testing + model1_accs, model2_accs = clf.train(trainx, trainy, Ux, testx, testy) + # uses balanced_accuracy accounts for class imbalanced data + pred1, acc, pred2, model1_acc, model2_acc = clf.predict(testx, testy) + + return {'loss': 1-acc, + 'status': STATUS_OK, + 'model': clf.model1, + 'model2': clf.model2, + 'model1_acc_history': model1_accs, + 'model2_acc_history': model2_accs, + 'params': params, + 'accuracy': acc} + + def optimize(self, space, data_dict, max_evals=50, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a hyperopt compliant dictionary with defined optimization + spaces. For example: + # quniform returns float, some parameters require int; + # use this to force int + space = {'max_iter' : scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol' : hp.loguniform('tol', 1e-5, 1e-3), + 'C' : hp.uniform('C', 1.0, 1000.0), + 'n_samples' : scope.int(hp.quniform('n_samples', + 1, + 20, + 1)) + } + See hyperopt docs for more information. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy, Ux, + testx=None, testy=None): + ''' + Wrapper method for a basic co-training with logistic regression + implementation training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + Ux: feature vector/matrix like labeled trainx but unlabeled data. + testx: feature vector/matrix used for testing the performance + of each model at every iteration. + testy: label vector used for testing the performance + of each model at every iteration. + ''' + + # avoid overwriting when deleting in co-training loop + U_lr = Ux.copy() + + # set the random seed of training splits for reproducibility + # This can be ignored by excluding params['seed'] + # in the hyperopt space dictionary + if 'seed' in self.params.keys(): + np.random.seed(self.params['seed']) + + # TODO: allow a user to specify uneven splits between the two models + split_frac = 0.5 + # labeled training data + idx = np.random.choice(range(trainy.shape[0]), + size=int(split_frac * trainy.shape[0]), + replace=False) + + # avoid overwriting when deleting in co-training loop + L_lr1 = trainx[idx].copy() + L_lr2 = trainx[~idx].copy() + Ly_lr1 = trainy[idx].copy() + Ly_lr2 = trainy[~idx].copy() + + self.model1, self.model2, model1_accs, model2_accs = \ + self.training_loop( + self.model1, self.model2, + L_lr1, L_lr2, + Ly_lr1, Ly_lr2, + U_lr, self.params['n_samples'], + testx, testy, + ) + + # optional returns if a user is interested in training diagnostics + return model1_accs, model2_accs + + def predict(self, testx, testy=None): + ''' + Wrapper method for sklearn's Label Propagation predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + ''' + + pred1 = self.model1.predict(testx) + pred2 = self.model2.predict(testx) + + acc = None + if testy is not None: + # balanced_accuracy accounts for class imbalanced data + # could alternatively use pure accuracy + # for a more traditional hyperopt + model1_acc = balanced_accuracy_score(testy, pred1) + model2_acc = balanced_accuracy_score(testy, pred2) + # select best accuracy for hyperparameter optimization + acc = max(model1_acc, model2_acc) + + return pred1, acc, pred2, model1_acc, model2_acc + + def plot_cotraining(self, model1_accs=None, model2_accs=None, + filename='lr-cotraining-learningcurves.png'): + ''' + Plots the training error curves for two co-training models. + NOTE: The user must provide the curves to plot, but each curve is + saved by the class under self.best and self.worst models. + Inputs: + filename: name to store picture under. + Must end in .png (or will be added if missing). + model1_accs: the accuracy scores over training epochs for model 1 + model2_accs: the accuracy scores over training epochs for model 2 + ''' + + fig, ax = plt.subplots(figsize=(10, 8), dpi=300) + ax.plot(np.arange(len(model1_accs)), model1_accs, + color='tab:blue', label='Model 1') + ax.plot(np.arange(len(model2_accs)), model2_accs, + color='tab:orange', label='Model 2') + ax.legend() + ax.set_xlabel('Co-Training Iteration') + ax.set_ylabel('Test Accuracy') + ax.grid() + + if filename[-4:] != '.png': + filename += '.png' + fig.savefig(filename) + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. + ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/models/SSML/__init__.py b/models/SSML/__init__.py new file mode 100644 index 0000000..e69de29 From 1a85591e318431a89f7134295f43a1c2f1c37cfd Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 31 Oct 2022 14:46:13 -0400 Subject: [PATCH 2/5] adding pytest for cotraining --- tests/test_models.py | 83 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/tests/test_models.py b/tests/test_models.py index 5b66f65..3a28206 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -14,6 +14,8 @@ import scripts.utils as utils # models from models.LogReg import LogReg +# models +from models.SSML.CoTraining import CoTraining # testing write import joblib import os @@ -190,3 +192,84 @@ def test_LogReg(): assert model_file.best['params'] == model.best['params'] os.remove(filename+ext) + + +def test_CoTraining(): + # test saving model input parameters + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = CoTraining(params=params) + + assert model.model1.max_iter == params['max_iter'] + assert model.model1.tol == params['tol'] + assert model.model1.C == params['C'] + + assert model.model2.max_iter == params['max_iter'] + assert model.model2.tol == params['tol'] + assert model.model2.C == params['C'] + + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + Ux = normalizer.transform(Ux) + + # default behavior + model = CoTraining(params=None, random_state=0) + model.train(X_train, y_train, Ux) + + # testing train and predict methods + pred, acc, *_ = model.predict(X_test, y_test) + + assert acc > 0.7 + np.testing.assert_equal(pred, y_test) + + # testing hyperopt optimize methods + space = {'max_iter': scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol': hp.loguniform('tol', 1e-5, 1e-3), + 'C': hp.uniform('C', 1.0, 1000.0), + 'n_samples': scope.int(hp.quniform('n_samples', + 1, + 20, + 1)), + 'seed': 0 + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] == 'ok' + + # testing model plotting method + filename = 'test_plot' + model.plot_cotraining(model1_accs=model.best['model1_acc_history'], + model2_accs=model.best['model2_acc_history'], + filename=filename) + os.remove(filename+'.png') + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) From ec47a631ad10dc9c00f4d625aa6389a4451184a7 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 16 Jan 2023 14:06:11 -0500 Subject: [PATCH 3/5] changes in light of PR #41 comments --- models/LogReg.py | 6 ++-- models/SSML/CoTraining.py | 59 ++++++++++++++++++--------------------- tests/test_models.py | 31 +++++++++----------- 3 files changed, 43 insertions(+), 53 deletions(-) diff --git a/models/LogReg.py b/models/LogReg.py index 316a82f..0075491 100644 --- a/models/LogReg.py +++ b/models/LogReg.py @@ -17,14 +17,14 @@ class LogReg: Add multinomial functions and unit tests. Add functionality for regression(?) Inputs: - params: dictionary of logistic regression input functions. - keys max_iter, tol, and C supported. + kwargs: logistic regression input functions. + keys random_state, max_iter, tol, and C supported. random_state: int/float for reproducible intiailization. ''' # only binary so far def __init__(self, **kwargs): - # supported keys = ['max_iter', 'tol', 'C'] + # supported keys = ['max_iter', 'tol', 'C', 'random_state'] # defaults to a fixed value for reproducibility self.random_state = kwargs.pop('random_state', 0) # parameters for logistic regression model: diff --git a/models/SSML/CoTraining.py b/models/SSML/CoTraining.py index e6757bd..16eac7d 100644 --- a/models/SSML/CoTraining.py +++ b/models/SSML/CoTraining.py @@ -19,37 +19,35 @@ class CoTraining: Add multinomial functions and unit tests. Add functionality for regression(?) Inputs: - params: dictionary of logistic regression input functions. - keys max_iter, tol, and C supported. + kwargs: logistic regression input functions. + keys random_state, max_iter, tol, and C supported. random_state: int/float for reproducible intiailization. ''' # only binary so far - def __init__(self, params=None, random_state=0): + def __init__(self, **kwargs): + # supported keys = ['max_iter', 'tol', 'C', 'random_state'] # defaults to a fixed value for reproducibility - self.random_state = random_state - # dictionary of parameters for logistic regression model - self.params = params - if self.params is None: - self.model1 = linear_model.LogisticRegression( - random_state=self.random_state) - self.model2 = linear_model.LogisticRegression( - random_state=self.random_state) - # default needed for training - self.params = {'n_samples': 1} - else: - self.model1 = linear_model.LogisticRegression( - random_state=self.random_state, - max_iter=params['max_iter'], - tol=params['tol'], - C=params['C'] - ) - self.model2 = linear_model.LogisticRegression( - random_state=self.random_state, - max_iter=params['max_iter'], - tol=params['tol'], - C=params['C'] - ) + self.random_state = kwargs.pop('random_state', 0) + self.seed = kwargs.pop('seed', 0) + # parameters for cotraining logistic regression models: + # defaults to sklearn.linear_model.LogisticRegression default vals + self.max_iter = kwargs.pop('max_iter', 100) + self.tol = kwargs.pop('tol', 0.0001) + self.C = kwargs.pop('C', 1.0) + self.n_samples = kwargs.pop('n_samples', 1) + self.model1 = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=self.max_iter, + tol=self.tol, + C=self.C + ) + self.model2 = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=self.max_iter, + tol=self.tol, + C=self.C + ) def training_loop(self, slr1, slr2, L_lr1, L_lr2, Ly_lr1, Ly_lr2, U_lr, n_samples, @@ -155,7 +153,7 @@ def fresh_start(self, params, data_dict): # unlabeled co-training data Ux = data_dict['Ux'] - clf = CoTraining(params=params, random_state=self.random_state) + clf = CoTraining(**params, random_state=self.random_state) # training and testing model1_accs, model2_accs = clf.train(trainx, trainy, Ux, testx, testy) # uses balanced_accuracy accounts for class imbalanced data @@ -239,10 +237,7 @@ def train(self, trainx, trainy, Ux, U_lr = Ux.copy() # set the random seed of training splits for reproducibility - # This can be ignored by excluding params['seed'] - # in the hyperopt space dictionary - if 'seed' in self.params.keys(): - np.random.seed(self.params['seed']) + np.random.seed(self.seed) # TODO: allow a user to specify uneven splits between the two models split_frac = 0.5 @@ -262,7 +257,7 @@ def train(self, trainx, trainy, Ux, self.model1, self.model2, L_lr1, L_lr2, Ly_lr1, Ly_lr2, - U_lr, self.params['n_samples'], + U_lr, self.n_samples, testx, testy, ) diff --git a/tests/test_models.py b/tests/test_models.py index 3a28206..b7bb087 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -67,16 +67,6 @@ def test_cross_validation(): # therefore its accuracy should be less than all other folds assert (accs[-1] < accs[:-1]).all() - # test cross validation for supervised data and StratifiedKFold with LogReg - # params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} - # model = LogReg(params=params) - # max_acc_model = utils.cross_validation(model=model, - # X=X, - # y=y, - # params=params, - # stratified=True) - # assert max_acc_model['accuracy'] >= 0.5 - # test cross validation for SSML with LabelProp # params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} # model = LabelProp(params=params) @@ -106,9 +96,10 @@ def test_pca(): utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 2) os.remove(filename+'.png') - # filename = 'test_multiD_pca' - # utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5) - # os.remove(filename+'.png') + filename = 'test_multiD_pca' + pcs = utils.pca(X_train, Ux, 5) + utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 5) + os.remove(filename+'.png') # normalization normalizer = StandardScaler() @@ -197,7 +188,9 @@ def test_LogReg(): def test_CoTraining(): # test saving model input parameters params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} - model = CoTraining(params=params) + model = CoTraining(max_iter=params['max_iter'], + tol=params['tol'], + C=params['C']) assert model.model1.max_iter == params['max_iter'] assert model.model1.tol == params['tol'] @@ -207,8 +200,8 @@ def test_CoTraining(): assert model.model2.tol == params['tol'] assert model.model2.C == params['C'] - X, Ux, y, Uy = train_test_split(spectra, - labels, + X, Ux, y, Uy = train_test_split(pytest.spectra, + pytest.labels, test_size=0.5, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, @@ -231,8 +224,10 @@ def test_CoTraining(): # testing train and predict methods pred, acc, *_ = model.predict(X_test, y_test) - assert acc > 0.7 - np.testing.assert_equal(pred, y_test) + # since the test data used here is synthetic/toy data (i.e. uninteresting), + # the trained model should be at least better than a 50-50 guess + # if it was worse, something would be wrong with the ML class + assert acc > 0.5 # testing hyperopt optimize methods space = {'max_iter': scope.int(hp.quniform('max_iter', From 95fc695a89ae209139787e1dfa7cdd7e9de426a6 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Wed, 18 Jan 2023 09:33:35 -0500 Subject: [PATCH 4/5] adjusting numpy.random.seed usage in cotraining --- models/SSML/CoTraining.py | 15 +++++++-------- tests/test_models.py | 16 ++++++++++++---- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/models/SSML/CoTraining.py b/models/SSML/CoTraining.py index 16eac7d..9fda2db 100644 --- a/models/SSML/CoTraining.py +++ b/models/SSML/CoTraining.py @@ -16,20 +16,22 @@ class CoTraining: regression implementation with hyperparameter optimization. Data agnostic (i.e. user supplied data inputs). TODO: Currently only supports binary classification. - Add multinomial functions and unit tests. - Add functionality for regression(?) + - Add multinomial functions and unit tests. + - Add functionality for regression(?) Inputs: kwargs: logistic regression input functions. - keys random_state, max_iter, tol, and C supported. - random_state: int/float for reproducible intiailization. + keys seed, random_state, max_iter, tol, and C supported. + seed/random_state: int/float for reproducible intiailization. ''' # only binary so far def __init__(self, **kwargs): - # supported keys = ['max_iter', 'tol', 'C', 'random_state'] + # supported keys = ['max_iter', 'tol', 'C', 'random_state', 'seed'] # defaults to a fixed value for reproducibility self.random_state = kwargs.pop('random_state', 0) + # set the random seed of training splits for reproducibility self.seed = kwargs.pop('seed', 0) + np.random.seed(self.seed) # parameters for cotraining logistic regression models: # defaults to sklearn.linear_model.LogisticRegression default vals self.max_iter = kwargs.pop('max_iter', 100) @@ -236,9 +238,6 @@ def train(self, trainx, trainy, Ux, # avoid overwriting when deleting in co-training loop U_lr = Ux.copy() - # set the random seed of training splits for reproducibility - np.random.seed(self.seed) - # TODO: allow a user to specify uneven splits between the two models split_frac = 0.5 # labeled training data diff --git a/tests/test_models.py b/tests/test_models.py index b7bb087..334fc19 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -122,14 +122,16 @@ def test_pca(): def test_LogReg(): # test saving model input parameters - params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0, 'random_state': 0} model = LogReg(max_iter=params['max_iter'], tol=params['tol'], - C=params['C']) + C=params['C'], + random_state=params['random_state']) assert model.model.max_iter == params['max_iter'] assert model.model.tol == params['tol'] assert model.model.C == params['C'] + assert model.random_state == params['random_state'] X_train, X_test, y_train, y_test = train_test_split(pytest.spectra, pytest.labels, @@ -187,10 +189,13 @@ def test_LogReg(): def test_CoTraining(): # test saving model input parameters - params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0, + 'random_state': 0, 'seed': 1} model = CoTraining(max_iter=params['max_iter'], tol=params['tol'], - C=params['C']) + C=params['C'], + random_state=params['random_state'], + seed=params['seed']) assert model.model1.max_iter == params['max_iter'] assert model.model1.tol == params['tol'] @@ -200,6 +205,9 @@ def test_CoTraining(): assert model.model2.tol == params['tol'] assert model.model2.C == params['C'] + assert model.random_state == params['random_state'] + assert model.seed == params['seed'] + X, Ux, y, Uy = train_test_split(pytest.spectra, pytest.labels, test_size=0.5, From e1fc82820ef58a4a46d55b4a06e0ab55a3aacb5e Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Thu, 21 Dec 2023 11:09:33 -0500 Subject: [PATCH 5/5] remove comment --- tests/test_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index 334fc19..75b6702 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -14,7 +14,6 @@ import scripts.utils as utils # models from models.LogReg import LogReg -# models from models.SSML.CoTraining import CoTraining # testing write import joblib