diff --git a/config/example_params.json b/config/example_params.json index 4bea76bc..47fce1d4 100644 --- a/config/example_params.json +++ b/config/example_params.json @@ -13,6 +13,7 @@ "flaml-individual-descriptors", "autogluon-manifolds", "kerastuner-reference-embedding", + "kerastuner-eosce-embedding", "molmap" ] } \ No newline at end of file diff --git a/install_linux.sh b/install_linux.sh index 5f083277..abfd1fce 100644 --- a/install_linux.sh +++ b/install_linux.sh @@ -31,7 +31,7 @@ python3 -m pip install autogluon.tabular[all]==0.5.2 python3 -m pip install "xgboost==1.3.3" python3 -m pip install "SQLAlchemy<1.4.0" -# install zairachem +# install extra dependencies python3 -m pip install git+https://github.com/chembl/FPSim2.git@0.3.0 python3 -m pip install -q -U keras-tuner==1.1.3 @@ -39,6 +39,9 @@ python3 -m pip install -q -U keras-tuner==1.1.3 python3 -m pip install git+https://github.com/ersilia-os/ersilia.git ersilia --help +# install ersilia compound embedding +python3 -m pip install git+https://github.com/ersilia-os/compound-embedding-lite.git + # install isaura python3 -m pip install git+https://github.com/ersilia-os/isaura.git@ce293244ad0bdd6d7d4f796d2a84b17208a87b56 @@ -51,5 +54,11 @@ python3 -m pip install git+https://github.com/ersilia-os/lazy-qsar.git # install melloddy-tuner python3 -m pip install git+https://github.com/melloddy/MELLODDY-TUNER.git@2.1.3 +# install tabpfn +python3 -m pip install tabpfn==0.1.8 + +# install imblearn +python3 -m pip install imbalanced-learn==0.10.1 + # install zairachem python3 -m pip install -e . diff --git a/zairachem/automl/binarytabpfn.py b/zairachem/automl/binarytabpfn.py new file mode 100644 index 00000000..aad81454 --- /dev/null +++ b/zairachem/automl/binarytabpfn.py @@ -0,0 +1,142 @@ +import numpy as np +from lol import LOL +import random +import collections +from tabpfn import TabPFNClassifier +from imblearn.combine import SMOTETomek +from imblearn.over_sampling import KMeansSMOTE +from imblearn.under_sampling import EditedNearestNeighbours +import joblib + + +class TabPFNBinaryClassifier(object): + def __init__(self, device="cpu", N_ensemble_configurations=4): + self.device = device + self.N_ensemble_configurations = N_ensemble_configurations + self.max_samples = 1000 + + def _get_balanced_datasets(self, X, y): + try: + smp = SMOTETomek(sampling_strategy="auto") + X_0, y_0 = smp.fit_resample(X, y) + except: + X_0, y_0 = X, y + try: + smp = KMeansSMOTE(sampling_strategy="auto") + X_1, y_1 = smp.fit_resample(X, y) + except: + X_1, y_1 = X, y + try: + smp = EditedNearestNeighbours(sampling_strategy="auto") + X_2, y_2 = smp.fit_resample(X, y) + except: + X_2, y_2 = X, y + results = [(X_0, y_0), (X_1, y_1), (X_2, y_2)] + return results + + def _cap_samples(self, X, y): + if X.shape[0] <= self.max_samples: + return [(X, y)] + idxs = [i for i in range(X.shape[0])] + R = [] + for _ in range(3): + smp_idxs = random.sample(idxs, self.max_samples) + X_, y_ = X[smp_idxs], y[smp_idxs] + if np.sum(y_) == 0: + continue + R += [(X_, y_)] + return R + + def _get_ensemble(self, X, y): + R = [] + for X_0, y_0 in self._get_balanced_datasets(X, y): + for X_1, y_1 in self._cap_samples(X_0, y_0): + R += [(X_1, y_1)] + return R + + def fit(self, X, y): + self.reducer = LOL(n_components=100) + self.reducer.fit(X, y) + X = self.reducer.transform(X) + self.ensemble = self._get_ensemble(X, y) + + def predict_proba(self, X): + model = TabPFNClassifier( + device=self.device, N_ensemble_configurations=self.N_ensemble_configurations + ) + X = self.reducer.transform(X) + R = [] + for X_tr, y_tr in self.ensemble: + # print(X_tr.shape, np.sum(y_tr)) + model.fit(X_tr, y_tr) + R += [model.predict_proba(X)[:, 1]] + model.remove_models_from_memory() + R = np.array(R).T + y_h1 = np.mean(R, axis=1) + y_h0 = 1 - y_h1 + y_h = np.array([y_h0, y_h1]).T + return y_h + + def save(self, file_name): + data = { + "device": self.device, + "N_ensemble_configurations": self.N_ensemble_configurations, + "reducer": self.reducer, + "ensemble": self.ensemble, + } + joblib.dump(data, file_name) + + def load(self, file_name): + data = joblib.load(file_name) + model = TabPFNBinaryClassifier( + device=data["device"], + N_ensemble_configurations=data["N_ensemble_configurations"], + ) + model.ensemble = data["ensemble"] + model.reducer = data["reducer"] + return TabPFNClassifierArtifact(model, 0.5) + + +class Binarizer(object): + def __init__(self, threshold): + self.threshold = threshold + + def binarize(self, y_hat): + y_bin = [] + for y in y_hat: + if y > self.threshold: + y_bin += [1] + else: + y_bin += [0] + return np.array(y_bin, dtype=np.uint8) + + +class TabPFNClassifierArtifact(object): + def __init__(self, model, threshold): + self.model = model + self.threshold = threshold + if threshold is not None: + self.binarizer = Binarizer(self.threshold) + else: + self.binarizer = None + + def predict_proba(self, X): + return self.model.predict_proba(X)[:, 1] + + def predict(self, X): + if self.binarizer is not None: + y_hat = self.predict_proba(X) + y_bin = self.binarizer.binarize(y_hat) + else: + y_bin = self.model.predict(X) + return y_bin + + def run(self, X, y=None): + results = collections.OrderedDict() + results["main"] = { + "idxs": None, + "y": y, + "y_hat": self.predict_proba(X), + "b_hat": self.predict(X), + } + return results diff --git a/zairachem/descriptors/describe.py b/zairachem/descriptors/describe.py index 2fff69f5..c2075463 100644 --- a/zairachem/descriptors/describe.py +++ b/zairachem/descriptors/describe.py @@ -3,6 +3,7 @@ from .raw import RawDescriptors from .treated import TreatedDescriptors from .reference import ReferenceDescriptors, SimpleDescriptors +from .eosce import EosceDescriptors from .manifolds import Manifolds from .. import ZairaBase @@ -47,6 +48,12 @@ def _reference_descriptors(self): ReferenceDescriptors().run() step.update() + def _eosce_descriptors(self): + step = PipelineStep("eosce_descriptors", self.output_dir) + if not step.is_done(): + EosceDescriptors().run() + step.update() + def _manifolds(self): step = PipelineStep("manifolds", self.output_dir) if not step.is_done(): @@ -58,5 +65,6 @@ def run(self): self._raw_descriptions() self._treated_descriptions() self._reference_descriptors() + self._eosce_descriptors() self._manifolds() self.update_elapsed_time() diff --git a/zairachem/descriptors/eosce.py b/zairachem/descriptors/eosce.py new file mode 100644 index 00000000..21194f37 --- /dev/null +++ b/zairachem/descriptors/eosce.py @@ -0,0 +1,63 @@ +import os +import pandas as pd +import h5py + +from eosce.models import ErsiliaCompoundEmbeddings +from ..utils.matrices import Hdf5 +from .. import ZairaBase + +from ..setup import SMILES_COLUMN +from ..vars import DATA_SUBFOLDER, DATA_FILENAME, DESCRIPTORS_SUBFOLDER + +EOSCE_FILE_NAME = "eosce.h5" + + +class EosceEmbedder(ZairaBase): + def __init__(self): + ZairaBase.__init__(self) + self.model = ErsiliaCompoundEmbeddings() + + def calculate(self, smiles_list, output_h5): + X = self.model.transform(smiles_list) + if output_h5 is None: + return X + keys = ["key-{0}".format(i) for i in range(len(smiles_list))] + features = ["feat-{0}".format(i) for i in range(X.shape[1])] + inputs = smiles_list + with h5py.File(output_h5, "w") as f: + f.create_dataset("Keys", data=keys) + f.create_dataset("Features", data=features) + f.create_dataset("Inputs", data=inputs) + f.create_dataset("Values", data=X) + + +class EosceLoader(ZairaBase): + def __init__(self): + ZairaBase.__init__(self) + self.path = self.get_output_dir() + + def open(self, eos_id): + path = os.path.join(self.path, DESCRIPTORS_SUBFOLDER, eos_id, EOSCE_FILE_NAME) + return Hdf5(path) + + +class EosceDescriptors(ZairaBase): + def __init__(self): + ZairaBase.__init__(self) + self.path = self.get_output_dir() + self.input_csv = os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME) + self.smiles_list = self._get_smiles_list() + + def _get_smiles_list(self): + df = pd.read_csv(self.input_csv) + return list(df[SMILES_COLUMN]) + + def output_h5_filename(self): + path = os.path.join(self.path, DESCRIPTORS_SUBFOLDER) + os.makedirs(path, exist_ok=True) + return os.path.join(path, EOSCE_FILE_NAME) + + def run(self): + output_h5 = self.output_h5_filename() + ref = EosceEmbedder() + ref.calculate(self.smiles_list, output_h5) diff --git a/zairachem/estimators/from_ersilia_embedding/__init__.py b/zairachem/estimators/from_ersilia_embedding/__init__.py new file mode 100644 index 00000000..1d7b9c64 --- /dev/null +++ b/zairachem/estimators/from_ersilia_embedding/__init__.py @@ -0,0 +1 @@ +ESTIMATORS_FAMILY_SUBFOLDER = "ersilia_embedding" diff --git a/zairachem/estimators/from_ersilia_embedding/estimate.py b/zairachem/estimators/from_ersilia_embedding/estimate.py new file mode 100644 index 00000000..be6c099e --- /dev/null +++ b/zairachem/estimators/from_ersilia_embedding/estimate.py @@ -0,0 +1,183 @@ +import os +import numpy as np +import pandas as pd +import h5py + +from ... import ZairaBase +from ..base import BaseEstimator, BaseOutcomeAssembler +from ...automl.kerastuner import KerasTunerEstimator +from ...vars import ( + DATA_FILENAME, + DATA_SUBFOLDER, + DESCRIPTORS_SUBFOLDER, + ESTIMATORS_SUBFOLDER, +) +from . import ESTIMATORS_FAMILY_SUBFOLDER +from .. import RESULTS_MAPPED_FILENAME, RESULTS_UNMAPPED_FILENAME + + +class XGetter(ZairaBase): + def __init__(self, path): + ZairaBase.__init__(self) + self.path = path + self.X = [] + self.columns = [] + + def _get_eosce_descriptor(self): + with h5py.File( + os.path.join(self.path, DESCRIPTORS_SUBFOLDER, "eosce.h5"), "r" + ) as f: + X_ = f["Values"][:] + self.X += [X_] + self.columns += [ + "feat-{0}".format(x.decode("utf-8")) for x in f["Features"][:] + ] + + def get(self): + self._get_eosce_descriptor() + X = np.hstack(self.X) + df = pd.DataFrame(X, columns=self.columns) + df.to_csv( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + DATA_FILENAME, + ), + index=False, + ) + return df + + +class Fitter(BaseEstimator): + def __init__(self, path): + BaseEstimator.__init__(self, path=path) + self.trained_path = os.path.join( + self.get_output_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER + ) + self.x_getter = XGetter + + def _get_X(self): + df = self.x_getter(path=self.path).get() + return df + + def _get_y(self, task): + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + return np.array(df[task]) + + def _get_Y(self): + Y = [] + columns = [] + for t in self._get_reg_tasks(): + y = self._get_y(t) + Y += [y] + columns += [t] + for t in self._get_clf_tasks(): + y = self._get_y(t) + Y += [y] + columns += [t] + Y = np.array(Y).T + df = pd.DataFrame(Y, columns=columns) + return df + + def run(self, time_budget_sec=None): + self.reset_time() + if time_budget_sec is None: + time_budget_sec = self._estimate_time_budget() + else: + time_budget_sec = time_budget_sec + train_idxs = self.get_train_indices(path=self.path) + df_X = self._get_X() + df_Y = self._get_Y() + df = pd.concat([df_X, df_Y], axis=1) + labels = list(df_Y.columns) + self.logger.debug("Starting KerasTuner estimation") + estimator = KerasTunerEstimator(save_path=self.trained_path) + self.logger.debug("Fitting") + estimator.fit(data=df.iloc[train_idxs, :], labels=labels) + estimator.save() + estimator = estimator.load() + results = estimator.run(df) + self.update_elapsed_time() + return results + + +class Predictor(BaseEstimator): + def __init__(self, path): + BaseEstimator.__init__(self, path=path) + self.trained_path = os.path.join( + self.get_trained_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER + ) + self.x_getter = XGetter + + def run(self): + self.reset_time() + df = self.x_getter(path=self.path).get() + model = KerasTunerEstimator(save_path=self.trained_path).load() + results = model.run(df) + self.update_elapsed_time() + return results + + +class Assembler(BaseOutcomeAssembler): + def __init__(self, path=None): + BaseOutcomeAssembler.__init__(self, path=path) + + def run(self, df): + df_c = self._get_compounds() + df_y = df + df = pd.concat([df_c, df_y], axis=1) + df.to_csv( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + RESULTS_UNMAPPED_FILENAME, + ), + index=False, + ) + mappings = self._get_mappings() + df = self._remap(df, mappings) + df.to_csv( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + RESULTS_MAPPED_FILENAME, + ), + index=False, + ) + + +class Estimator(ZairaBase): + def __init__(self, path=None): + ZairaBase.__init__(self) + if path is None: + self.path = self.get_output_dir() + else: + self.path = path + path_ = os.path.join( + self.path, ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER + ) + if not os.path.exists(path_): + os.makedirs(path_, exist_ok=True) + if not self.is_predict(): + self.logger.debug("Starting kerastuner fitter") + self.estimator = Fitter(path=self.path) + else: + self.logger.debug("Starting kerastuner predictor") + self.estimator = Predictor(path=self.path) + self.assembler = Assembler(path=self.path) + + def run(self, time_budget_sec=None): + if time_budget_sec is not None: + self.time_budget_sec = int(time_budget_sec) + else: + self.time_budget_sec = None + if not self.is_predict(): + self.logger.debug("Mode: fit") + results = self.estimator.run() + else: + self.logger.debug("Mode: predict") + results = self.estimator.run() + self.assembler.run(results) diff --git a/zairachem/estimators/from_ersilia_embedding/pipe.py b/zairachem/estimators/from_ersilia_embedding/pipe.py new file mode 100644 index 00000000..89adfda8 --- /dev/null +++ b/zairachem/estimators/from_ersilia_embedding/pipe.py @@ -0,0 +1,9 @@ +from .estimate import Estimator + + +class EosceEmbeddingPipeline(object): + def __init__(self, path): + self.e = Estimator(path=path) + + def run(self, time_budget_sec=None): + self.e.run(time_budget_sec=time_budget_sec) diff --git a/zairachem/estimators/from_individual_full_descriptors/assemble.py b/zairachem/estimators/from_individual_full_descriptors/assemble.py index 5b2b7cbc..ba4acda1 100644 --- a/zairachem/estimators/from_individual_full_descriptors/assemble.py +++ b/zairachem/estimators/from_individual_full_descriptors/assemble.py @@ -80,7 +80,7 @@ def _get_model_ids(self): os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" ) as f: model_ids = list(json.load(f)) - model_ids_successfull = [] + model_ids_successful = [] for model_id in model_ids: if os.path.isfile( os.path.join( @@ -91,8 +91,8 @@ def _get_model_ids(self): "y_hat.joblib", ) ): - model_ids_successfull += [model_id] - return model_ids_successfull + model_ids_successful += [model_id] + return model_ids_successful def run(self): model_ids = self._get_model_ids() diff --git a/zairachem/estimators/from_individual_full_descriptors/estimate.py b/zairachem/estimators/from_individual_full_descriptors/estimate.py index a047383e..e96906a6 100644 --- a/zairachem/estimators/from_individual_full_descriptors/estimate.py +++ b/zairachem/estimators/from_individual_full_descriptors/estimate.py @@ -244,13 +244,13 @@ def _get_model_ids(self): os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" ) as f: model_ids = list(json.load(f)) - model_ids_successfull = [] + model_ids_successful = [] for model_id in model_ids: if os.path.isfile( os.path.join(path, DESCRIPTORS_SUBFOLDER, model_id, "treated.h5") ): - model_ids_successfull += [model_id] - return model_ids_successfull + model_ids_successful += [model_id] + return model_ids_successful def run(self, time_budget_sec=None): model_ids = self._get_model_ids() diff --git a/zairachem/estimators/from_individual_full_descriptors/performance.py b/zairachem/estimators/from_individual_full_descriptors/performance.py index 4bcd91b5..d067456f 100644 --- a/zairachem/estimators/from_individual_full_descriptors/performance.py +++ b/zairachem/estimators/from_individual_full_descriptors/performance.py @@ -212,7 +212,7 @@ def _get_model_ids(self): os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" ) as f: model_ids = list(json.load(f)) - model_ids_successfull = [] + model_ids_successful = [] for model_id in model_ids: if os.path.isfile( os.path.join( @@ -223,8 +223,8 @@ def _get_model_ids(self): "y_hat.joblib", ) ): - model_ids_successfull += [model_id] - return model_ids_successfull + model_ids_successful += [model_id] + return model_ids_successful def run(self): model_ids = self._get_model_ids() diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py new file mode 100644 index 00000000..856581cd --- /dev/null +++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py @@ -0,0 +1 @@ +ESTIMATORS_FAMILY_SUBFOLDER = "individual_full_descriptors_tabpfn" diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py new file mode 100644 index 00000000..ba4acda1 --- /dev/null +++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py @@ -0,0 +1,101 @@ +import pandas as pd +import json +import os +import joblib +import collections + +from . import ESTIMATORS_FAMILY_SUBFOLDER +from ... import ZairaBase +from ...vars import DESCRIPTORS_SUBFOLDER, ESTIMATORS_SUBFOLDER +from .. import Y_HAT_FILE, RESULTS_UNMAPPED_FILENAME, RESULTS_MAPPED_FILENAME +from ..base import BaseOutcomeAssembler + + +class IndividualOutcomeAssembler(BaseOutcomeAssembler): + def __init__(self, path=None, model_id=None): + BaseOutcomeAssembler.__init__(self, path=path) + self.model_id = model_id + + def _get_y_hat(self): + results = joblib.load( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + Y_HAT_FILE, + ) + ) + data = collections.OrderedDict() + for c, r in results.items(): + r = r["main"] + data[c] = r["y_hat"] + if "b_hat" in r: + data[c + "_bin"] = r["b_hat"] + return pd.DataFrame(data) + + def run(self): + df_c = self._get_compounds() + df_y = self._get_y_hat() + df = pd.concat([df_c, df_y], axis=1) + df.to_csv( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + RESULTS_UNMAPPED_FILENAME, + ), + index=False, + ) + mappings = self._get_mappings() + df = self._remap(df, mappings) + df.to_csv( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + RESULTS_MAPPED_FILENAME, + ), + index=False, + ) + + +class OutcomeAssembler(ZairaBase): + def __init__(self, path=None): + ZairaBase.__init__(self) + self.path = path + + def _get_model_ids(self): + if self.path is None: + path = self.get_output_dir() + else: + path = self.path + if self.is_predict(): + path_trained = self.get_trained_dir() + else: + path_trained = path + with open( + os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" + ) as f: + model_ids = list(json.load(f)) + model_ids_successful = [] + for model_id in model_ids: + if os.path.isfile( + os.path.join( + path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + model_id, + "y_hat.joblib", + ) + ): + model_ids_successful += [model_id] + return model_ids_successful + + def run(self): + model_ids = self._get_model_ids() + for model_id in model_ids: + o = IndividualOutcomeAssembler(path=self.path, model_id=model_id) + o.run() diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py new file mode 100644 index 00000000..6f9c9f4f --- /dev/null +++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py @@ -0,0 +1,185 @@ +import os +import json +import h5py +import pandas as pd +import numpy as np +import collections +import joblib + +from ...descriptors.treated import TREATED_FILE_NAME + +from ... import ZairaBase +from ...automl.binarytabpfn import TabPFNBinaryClassifier + +from ...vars import ( + DESCRIPTORS_SUBFOLDER, + DATA_SUBFOLDER, + DATA_FILENAME, + ESTIMATORS_SUBFOLDER, +) +from ..base import BaseEstimator + +from .. import Y_HAT_FILE +from . import ESTIMATORS_FAMILY_SUBFOLDER + + +class BaseEstimatorIndividual(BaseEstimator): + def __init__(self, path, model_id): + BaseEstimator.__init__(self, path=path) + path_ = os.path.join( + self.path, ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER, model_id + ) + if not os.path.exists(path_): + os.makedirs(path_) + self.model_id = model_id + + def _get_X(self): + f = os.path.join( + self.path, DESCRIPTORS_SUBFOLDER, self.model_id, TREATED_FILE_NAME + ) + with h5py.File(f, "r") as f: + X = f["Values"][:] + return X + + +class Fitter(BaseEstimatorIndividual): + def __init__(self, path, model_id): + BaseEstimatorIndividual.__init__(self, path=path, model_id=model_id) + self.trained_path = os.path.join( + self.get_output_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER + ) + + def _get_flds(self): + # for now only auxiliary folds are used + col = [f for f in self.schema["folds"] if "_aux" in f][0] + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + return np.array(df[col]) + + def _get_y(self, task): + # for now iterate task by task + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + return np.array(df[task]) + + def run(self, time_budget_sec=60): + self.reset_time() + if time_budget_sec is None: + time_budget_sec = self._estimate_time_budget() + else: + time_budget_sec = time_budget_sec + tasks = collections.OrderedDict() + X = self._get_X() + train_idxs = self.get_train_indices(path=self.path) + valid_idxs = self.get_validation_indices(path=self.path) + for t in self._get_clf_tasks(): + y = self._get_y(t) + model = TabPFNBinaryClassifier() + model.fit(X[train_idxs], y[train_idxs]) + file_name = os.path.join(self.trained_path, self.model_id, t + ".joblib") + model.save(file_name) + model = model.load(file_name) + tasks[t] = model.run(X, y) + _valid_task = model.run(X[valid_idxs], y[valid_idxs]) + tasks[t]["valid"] = _valid_task["main"] + self.update_elapsed_time() + return tasks + + +class Predictor(BaseEstimatorIndividual): + def __init__(self, path, model_id): + BaseEstimatorIndividual.__init__(self, path=path, model_id=model_id) + self.trained_path = os.path.join( + self.get_trained_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER + ) + + def _get_y(self, task): + # for now iterate task by task + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + columns = set(df.columns) + if task in columns: + return np.array(df[task]) + else: + return None + + def run(self): + self.reset_time() + tasks = collections.OrderedDict() + X = self._get_X() + for t in self._get_clf_tasks(): + y = self._get_y(t) + model = TabPFNBinaryClassifier() + file_name = os.path.join(self.trained_path, self.model_id, t + ".joblib") + model = model.load(file_name) + tasks[t] = model.run(X, y) + self.update_elapsed_time() + return tasks + + +class IndividualEstimator(ZairaBase): + def __init__(self, path=None, model_id=None): + ZairaBase.__init__(self) + self.model_id = model_id + if path is None: + self.path = self.get_output_dir() + else: + self.path = path + if not self.is_predict(): + self.estimator = Fitter(path=self.path, model_id=self.model_id) + else: + self.estimator = Predictor(path=self.path, model_id=self.model_id) + + def run(self, time_budget_sec=None): + if time_budget_sec is not None: + self.time_budget_sec = int(time_budget_sec) + else: + self.time_budget_sec = None + if not self.is_predict(): + results = self.estimator.run(time_budget_sec=self.time_budget_sec) + else: + results = self.estimator.run() + joblib.dump( + results, + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + Y_HAT_FILE, + ), + ) + + +class Estimator(ZairaBase): + def __init__(self, path=None): + ZairaBase.__init__(self) + self.path = path + + def _get_model_ids(self): + if self.path is None: + path = self.get_output_dir() + else: + path = self.path + if self.is_predict(): + path_trained = self.get_trained_dir() + else: + path_trained = path + with open( + os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" + ) as f: + model_ids = list(json.load(f)) + model_ids_successful = [] + for model_id in model_ids: + if os.path.isfile( + os.path.join(path, DESCRIPTORS_SUBFOLDER, model_id, "treated.h5") + ): + model_ids_successful += [model_id] + return model_ids_successful + + def run(self, time_budget_sec=None): + model_ids = self._get_model_ids() + if time_budget_sec is not None: + tbs = max(int(time_budget_sec / len(model_ids)), 1) + else: + tbs = None + for model_id in model_ids: + estimator = IndividualEstimator(path=self.path, model_id=model_id) + estimator.run(time_budget_sec=tbs) diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py new file mode 100644 index 00000000..d067456f --- /dev/null +++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py @@ -0,0 +1,233 @@ +import os +import json +import numpy as np +import pandas as pd +import joblib +import collections + +from sklearn import metrics + +from .. import Y_HAT_FILE +from ... import ZairaBase + +from . import ESTIMATORS_FAMILY_SUBFOLDER +from ...vars import ( + DATA_SUBFOLDER, + DESCRIPTORS_SUBFOLDER, + ESTIMATORS_SUBFOLDER, + DATA_FILENAME, +) + +from .. import CLF_REPORT_FILENAME, REG_REPORT_FILENAME + + +class BasePerformance(ZairaBase): + def __init__(self, path=None, model_id=None): + ZairaBase.__init__(self) + if path is None: + self.path = self.get_output_dir() + else: + self.path = path + self.model_id = model_id + + def _get_y_hat_dict(self): + return joblib.load( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + Y_HAT_FILE, + ) + ) + + +class ClassificationPerformance(BasePerformance): + def __init__(self, path, model_id): + BasePerformance.__init__(self, path=path, model_id=model_id) + self.results = self._get_y_hat_dict() + self._prefix = self._get_prefix() + self.results = self.results[self._prefix] + + def _get_prefix(self): + for c in list(self.results.keys()): + if "clf_" in c: + return c + + def _try_metric(self, fun, t, p): + try: + return float(fun(t, p)) + except: + return None + + def _calculate(self, key): + r = self.results[key] + y_true = np.array(r["y"]) + y_pred = np.array(r["y_hat"]) + b_pred = np.array(r["b_hat"]) + try: + confu = metrics.confusion_matrix(y_true, b_pred, labels=[0, 1]) + except: + confu = np.array([[-1, -1], [-1, -1]]) + report = { + "roc_auc_score": self._try_metric(metrics.roc_auc_score, y_true, y_pred), + "precision_score": self._try_metric( + metrics.precision_score, y_true, b_pred + ), + "recall_score": self._try_metric(metrics.recall_score, y_true, b_pred), + "tp": int(confu[1, 1]), + "tn": int(confu[0, 0]), + "fp": int(confu[0, 1]), + "fn": int(confu[1, 0]), + "y_true": [int(y) for y in y_true], + "y_pred": [float(y) for y in y_pred], + "b_pred": [int(y) for y in b_pred], + } + return report + + def calculate(self): + report = collections.OrderedDict() + for k in self.results.keys(): + report[k] = self._calculate(k) + return report + + +class RegressionPerformance(BasePerformance): + def __init__(self, path, model_id): + BasePerformance.__init__(self, path=path, model_id=model_id) + self.results = self._get_y_hat_dict() + self._prefix = self._get_prefix() + self.results = self.results[self._prefix] + + def _get_prefix(self): + for c in list(self.results.keys()): + if "reg_" in c: + return c + + def _calculate(self, key): + r = self.results[key] + y_true = np.array(r["y"]) + y_pred = np.array(r["y_hat"]) + report = { + "r2_score": float(metrics.r2_score(y_true, y_pred)), + "mean_absolute_error": float(metrics.mean_absolute_error(y_true, y_pred)), + "mean_squared_error": float(metrics.mean_squared_error(y_true, y_pred)), + "y_true": [float(y) for y in y_true], + "y_pred": [float(y) for y in y_pred], + } + return report + + def calculate(self): + report = collections.OrderedDict() + for k in self.results.keys(): + report[k] = self._calculate(k) + return report + + +class IndividualPerformanceReporter(ZairaBase): + def __init__(self, path=None, model_id=None): + ZairaBase.__init__(self) + if path is None: + self.path = self.get_output_dir() + else: + self.path = path + self.has_tasks = self._has_tasks() + if self._has_clf_tasks(): + self.clf = ClassificationPerformance(path=path, model_id=model_id) + else: + self.clf = None + if self._has_reg_tasks(): + self.reg = RegressionPerformance(path=path, model_id=model_id) + else: + self.reg = None + self.model_id = model_id + + def _has_tasks(self): + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + for c in list(df.columns): + if "clf_" in c or "reg_" in c: + return True + return False + + def _has_reg_tasks(self): + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + for c in list(df.columns): + if "reg_" in c and "_skip" not in c and "_aux" not in c: + return True + return False + + def _has_clf_tasks(self): + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + for c in list(df.columns): + if "clf_" in c and "_skip" not in c and "_aux" not in c: + return True + return False + + def run(self): + if not self.has_tasks: + return + if self.clf is not None: + clf_rep = self.clf.calculate() + with open( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + CLF_REPORT_FILENAME, + ), + "w", + ) as f: + json.dump(clf_rep, f, indent=4) + if self.reg is not None: + reg_rep = self.reg.calculate() + with open( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + REG_REPORT_FILENAME, + ), + "w", + ) as f: + json.dump(reg_rep, f, indent=4) + + +class PerformanceReporter(ZairaBase): + def __init__(self, path=None): + ZairaBase.__init__(self) + self.path = path + + def _get_model_ids(self): + if self.path is None: + path = self.get_output_dir() + else: + path = self.path + if self.is_predict(): + path_trained = self.get_trained_dir() + else: + path_trained = path + with open( + os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" + ) as f: + model_ids = list(json.load(f)) + model_ids_successful = [] + for model_id in model_ids: + if os.path.isfile( + os.path.join( + path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + model_id, + "y_hat.joblib", + ) + ): + model_ids_successful += [model_id] + return model_ids_successful + + def run(self): + model_ids = self._get_model_ids() + for model_id in model_ids: + p = IndividualPerformanceReporter(path=self.path, model_id=model_id) + p.run() diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py new file mode 100644 index 00000000..8e397fbf --- /dev/null +++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py @@ -0,0 +1,15 @@ +from .estimate import Estimator +from .assemble import OutcomeAssembler +from .performance import PerformanceReporter + + +class IndividualFullDescriptorTabPFNPipeline(object): + def __init__(self, path): + self.e = Estimator(path=path) + self.a = OutcomeAssembler(path=path) + self.p = PerformanceReporter(path=path) + + def run(self, time_budget_sec=None): + self.e.run(time_budget_sec=time_budget_sec) + self.a.run() + self.p.run() diff --git a/zairachem/estimators/pipe.py b/zairachem/estimators/pipe.py index dddd61f6..fe658a71 100644 --- a/zairachem/estimators/pipe.py +++ b/zairachem/estimators/pipe.py @@ -7,8 +7,12 @@ from .from_classic.pipe import ClassicPipeline from .from_fingerprint.pipe import FingerprintPipeline from .from_individual_full_descriptors.pipe import IndividualFullDescriptorPipeline +from .from_individual_full_descriptors_tabpfn.pipe import ( + IndividualFullDescriptorTabPFNPipeline, +) from .from_manifolds.pipe import ManifoldPipeline from .from_reference_embedding.pipe import ReferenceEmbeddingPipeline +from .from_ersilia_embedding.pipe import EosceEmbeddingPipeline from .from_molmap.pipe import MolMapPipeline from .evaluate import SimpleEvaluator @@ -81,6 +85,19 @@ def _individual_estimator_pipeline(self, time_budget_sec): p.run(time_budget_sec=time_budget_sec) step.update() + def _individual_estimator_tabpfn_pipeline(self, time_budget_sec): + if self.is_lazy(): + self.logger.info("Lazy mode skips individual descriptors with tabpfn") + return + if "tabpfn-individual-descriptors" not in self._estimators_to_use: + return + step = PipelineStep("individual_estimator_pipeline_tabpfn", self.output_dir) + if not step.is_done(): + self.logger.debug("Running individual estimator pipeline") + p = IndividualFullDescriptorTabPFNPipeline(path=self.path) + p.run(time_budget_sec=time_budget_sec) + step.update() + def _manifolds_pipeline(self, time_budget_sec): if self.is_lazy(): self.logger.info("Lazy mode skips manifolds") @@ -107,6 +124,16 @@ def _reference_pipeline(self, time_budget_sec): p.run(time_budget_sec=time_budget_sec) step.update() + def _eosce_pipeline(self, time_budget_sec): + if "kerastuner-eosce-embedding" not in self._estimators_to_use: + return + step = PipelineStep("eosce_pipeline", self.output_dir) + if not step.is_done(): + self.logger.debug("Ersilia compound embedding pipeline") + p = EosceEmbeddingPipeline(path=self.path) + p.run(time_budget_sec=time_budget_sec) + step.update() + def _molmap_pipeline(self, time_budget_sec): if self.is_lazy(): self.logger.info("Lazy mode skips molmap") @@ -133,8 +160,10 @@ def _simple_evaluation(self): def run(self, time_budget_sec=None): self._classic_estimator_pipeline(time_budget_sec) self._fingerprint_estimator_pipeline(time_budget_sec) + self._individual_estimator_tabpfn_pipeline(time_budget_sec) self._individual_estimator_pipeline(time_budget_sec) self._manifolds_pipeline(time_budget_sec) self._reference_pipeline(time_budget_sec) + self._eosce_pipeline(time_budget_sec) self._molmap_pipeline(time_budget_sec) self._simple_evaluation() diff --git a/zairachem/pool/bagger.py b/zairachem/pool/bagger.py index e1a3e7c9..535c33d0 100644 --- a/zairachem/pool/bagger.py +++ b/zairachem/pool/bagger.py @@ -5,9 +5,10 @@ import joblib import h5py import collections +from scipy.special import expit from sklearn.linear_model import LogisticRegressionCV, LinearRegression from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import RobustScaler, PowerTransformer from sklearn.metrics import roc_curve, auc, r2_score from .. import ZairaBase @@ -155,8 +156,8 @@ def save(self, filename): class PoolClassifier(object): - def __init__(self, path, mode="scaling"): - assert mode in ["scaling", "median", "model"] + def __init__(self, path, mode="weighting"): + assert mode in ["weighting", "median", "model"] self.path = path if not os.path.exists(self.path): os.makedirs(self.path, exist_ok=True) @@ -168,7 +169,8 @@ def _get_model_filename(self, n): def _fit_just_median(self, df_X, df_y): return np.median(np.array(df_X), axis=1) - def _fit_scaling(self, df_X, df_y): + def _fit_weighting(self, df_X, df_y): + y = np.array(df_y).ravel() cols = list(df_X.columns) X = np.array(df_X) p25 = np.percentile(X.ravel(), 25) @@ -177,10 +179,13 @@ def _fit_scaling(self, df_X, df_y): scale = (p25, p50, p75) for c in cols: X = np.array(df_X[c]).reshape(-1, 1) - mdl = RobustScaler() - mdl.fit(X) + mdl0 = PowerTransformer() + mdl0.fit(X) + X = mdl0.transform(X) + mdl1 = LogisticRegressionCV() + mdl1.fit(X, y) filename = self._get_model_filename(c) - joblib.dump(mdl, filename) + joblib.dump((mdl0, mdl1), filename) filename = self._get_model_filename("overall") joblib.dump(scale, filename) filename = self._get_model_filename("weighting") @@ -188,7 +193,7 @@ def _fit_scaling(self, df_X, df_y): ws.distance_to_leads() ws.importance() ws.save(filename) - return self._predict_scaling(df_X) + return self._predict_weighting(df_X) def _fit_model(self, df_X, df_y): y = np.array(df_y).ravel() @@ -204,22 +209,19 @@ def _fit_model(self, df_X, df_y): def _predict_just_median(self, df_X): return np.median(np.array(df_X), axis=1) - def _predict_scaling(self, df_X): + def _predict_weighting(self, df_X): cols = list(df_X.columns) Y_hat = [] for c in cols: filename = self._get_model_filename(c) if os.path.exists(filename): - mdl = joblib.load(filename) + mdl0, mdl1 = joblib.load(filename) X = np.array(df_X[c]).reshape(-1, 1) - y_hat = mdl.transform(X).ravel() + X = mdl0.transform(X) + y_hat = mdl1.predict_proba(X)[:,1] Y_hat += [y_hat] Y_hat = np.array(Y_hat).T filename = self._get_model_filename("overall") - scale = joblib.load(filename) - iqr = scale[-1] - scale[0] - med = scale[1] - Y_hat = Y_hat * iqr + med filename = self._get_model_filename("weighting") weights = joblib.load(filename) wvals = weights["weights"] @@ -230,7 +232,7 @@ def _predict_scaling(self, df_X): w = w + 1 y_hats += [np.average(Y_hat, axis=1, weights=w)] y_hats += [np.mean(Y_hat, axis=1)] - y_hats = np.clip(np.array(y_hats), 0, 1) + y_hats = np.array(y_hats) y_hat = np.mean(y_hats, axis=0) return y_hat @@ -248,16 +250,16 @@ def _predict_model(self, df_X): return np.median(Y_hat, axis=1) def fit(self, df_X, df_y): - if self.mode == "scaling": - return self._fit_scaling(df_X, df_y) + if self.mode == "weighting": + return self._fit_weighting(df_X, df_y) if self.mode == "median": return self._fit_just_median(df_X, df_y) if self.mdoe == "model": return self._fit_model(df_X, df_y) def predict(self, df_X): - if self.mode == "scaling": - return self._predict_scaling(df_X) + if self.mode == "weighting": + return self._predict_weighting(df_X) if self.mode == "median": return self._predict_just_median(df_X) if self.mode == "model": diff --git a/zairachem/vars.py b/zairachem/vars.py index b1039c66..fc229d10 100644 --- a/zairachem/vars.py +++ b/zairachem/vars.py @@ -41,18 +41,22 @@ # Ersilia Model Hub ERSILIA_HUB_DEFAULT_MODELS = [ - "morgan-counts", + # "morgan-counts", "cc-signaturizer", + "molfeat-chemgpt", + "rdkit-fingerprint", "grover-embedding", "mordred", -] # molbert was removed +] DEFAULT_ESTIMATORS = [ "baseline-classic", "baseline-fingerprint", "flaml-individual-descriptors", + "tabpfn-individual-descriptors", "autogluon-manifolds", "kerastuner-reference-embedding", + "kerastuner-eosce-embedding", "molmap", ]