diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index fcbaeea27f..83153e606d 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -217,6 +217,8 @@ jobs: - docs/tutorials/models/slr.rst - docs/tutorials/sources/complex.rst - docs/tutorials/sources/file.rst + - docs/tutorials/tuner/parameter_grid.rst + - docs/tutorials/tuner/bayes_opt_gp.rst steps: - uses: actions/checkout@v2 diff --git a/dffml/__init__.py b/dffml/__init__.py index f035051aa4..755f9f0124 100644 --- a/dffml/__init__.py +++ b/dffml/__init__.py @@ -57,6 +57,7 @@ class DuplicateName(Exception): "train": "high_level.ml", "predict": "high_level.ml", "score": "high_level.ml", + "tune": "high_level.ml", "load": "high_level.source", "save": "high_level.source", "run": "high_level.dataflow", diff --git a/dffml/cli/cli.py b/dffml/cli/cli.py index b7dbd21fe6..8ce00e5ecc 100644 --- a/dffml/cli/cli.py +++ b/dffml/cli/cli.py @@ -39,7 +39,7 @@ from .dataflow import Dataflow from .config import Config -from .ml import Train, Accuracy, Predict +from .ml import Train, Accuracy, Predict, Tune from .list import List version = VERSION @@ -366,6 +366,7 @@ class CLI(CMD): train = Train accuracy = Accuracy predict = Predict + tune = Tune service = services() dataflow = Dataflow config = Config diff --git a/dffml/cli/ml.py b/dffml/cli/ml.py index 7876ee2de9..72788b5783 100644 --- a/dffml/cli/ml.py +++ b/dffml/cli/ml.py @@ -1,9 +1,10 @@ import inspect from ..model.model import Model +from ..tuner.tuner import Tuner from ..source.source import Sources, SubsetSources from ..util.cli.cmd import CMD, CMDOutputOverride -from ..high_level.ml import train, predict, score +from ..high_level.ml import train, predict, score, tune from ..util.config.fields import FIELD_SOURCES from ..util.cli.cmds import ( SourcesCMD, @@ -15,6 +16,7 @@ ) from ..base import config, field from ..accuracy import AccuracyScorer + from ..feature import Features @@ -118,3 +120,57 @@ class Predict(CMD): record = PredictRecord _all = PredictAll + + +@config +class TuneCMDConfig: + model: Model = field("Model used for ML", required=True) + tuner: Tuner = field("Tuner to optimize hyperparameters", required=True) + scorer: AccuracyScorer = field( + "Method to use to score accuracy", required=True + ) + features: Features = field("Predict Feature(s)", default=Features()) + sources: Sources = FIELD_SOURCES + + +class Tune(MLCMD): + """Optimize hyperparameters of model with given sources""" + + CONFIG = TuneCMDConfig + + async def run(self): + # Instantiate the accuracy scorer class if for some reason it is a class + # at this point rather than an instance. 
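+        # The same applies to the tuner below: when configured from the CLI
+        # both may still be classes, so withconfig() is used to build instances.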
+ if inspect.isclass(self.scorer): + self.scorer = self.scorer.withconfig(self.extra_config) + if inspect.isclass(self.tuner): + self.tuner = self.tuner.withconfig(self.extra_config) + + train_source = test_source = None + + # Check for tags to determine train/test sets + for source in self.sources: + + if hasattr(source, "tag") and source.tag == "train": + train_source = source + if hasattr(source, "tag") and source.tag == "test": + test_source = source + + if not train_source or not test_source: + # If tags not found, default to positional + if len(self.sources) >= 2: + train_source = self.sources[0] + test_source = self.sources[1] + elif not train_source: + raise NotImplementedError("Train set not found.") + else: + raise NotImplementedError("Test set not found.") + + return await tune( + self.model, + self.tuner, + self.scorer, + self.features, + [train_source], + [test_source], + ) diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index ffa110341b..9317f1a4e6 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -1,12 +1,14 @@ import contextlib from typing import Union, Dict, Any, List + from ..record import Record from ..source.source import BaseSource from ..feature import Feature, Features from ..model import Model, ModelContext -from ..util.internal import records_to_sources, list_records_to_dict +from ..util.internal import records_to_sources, list_records_to_dict, records_to_dict_check from ..accuracy.accuracy import AccuracyScorer, AccuracyContext +from ..tuner import Tuner, TunerContext async def train(model, *args: Union[BaseSource, Record, Dict[str, Any], List]): @@ -293,3 +295,133 @@ async def predict( ) if update: await sctx.update(record) + +async def tune( + model, + tuner: Union[Tuner, TunerContext], + accuracy_scorer: Union[AccuracyScorer, AccuracyContext], + features: Union[Feature, Features], + train_ds: Union[BaseSource, Record, Dict[str, Any], List], + valid_ds: Union[BaseSource, Record, Dict[str, Any], List], +) -> float: + + """ + Tune the hyperparameters of a model with a given tuner. + + + Parameters + ---------- + model : Model + Machine Learning model to use. See :doc:`/plugins/dffml_model` for + models options. + tuner: Tuner + Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for + tuner options. + train_ds : list + Input data for training. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + valid_ds : list + Validation data for testing. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + + + Returns + ------- + float + A decimal value representing the result of the accuracy scorer on the given + test set. For instance, ClassificationAccuracy represents the percentage of correct + classifications made by the model. + + Examples + -------- + + >>> import asyncio + >>> from dffml import * + >>> + >>> model = SLRModel( + ... features=Features( + ... Feature("Years", int, 1), + ... ), + ... predict=Feature("Salary", int, 1), + ... location="tempdir", + ... ) + >>> + >>> async def main(): + ... score = await tune( + ... model, + ... ParameterGrid(objective="min"), + ... MeanSquaredErrorAccuracy(), + ... Features( + ... Feature("Years", float, 1), + ... ), + ... [ + ... {"Years": 0, "Salary": 10}, + ... {"Years": 1, "Salary": 20}, + ... {"Years": 2, "Salary": 30}, + ... 
{"Years": 3, "Salary": 40} + ... ], + ... [ + ... {"Years": 6, "Salary": 70}, + ... {"Years": 7, "Salary": 80} + ... ] + ... + ... ) + ... print(f"Tuner score: {score}") + ... + >>> asyncio.run(main()) + Tuner score: 0.0 + """ + + if not isinstance(features, (Feature, Features)): + raise TypeError( + f"features was {type(features)}: {features!r}. Should have been Feature or Features" + ) + if isinstance(features, Feature): + features = Features(features) + if hasattr(model.config, "predict"): + if isinstance(model.config.predict, Features): + predict_feature = [ + feature.name for feature in model.config.predict + ] + else: + predict_feature = [model.config.predict.name] + + train_ds = records_to_dict_check(train_ds, model, predict_feature) + valid_ds = records_to_dict_check(valid_ds, model, predict_feature) + + async with contextlib.AsyncExitStack() as astack: + # Open sources + train = await astack.enter_async_context(records_to_sources(*train_ds)) + test = await astack.enter_async_context(records_to_sources(*valid_ds)) + # Allow for keep models open + if isinstance(model, Model): + model = await astack.enter_async_context(model) + mctx = await astack.enter_async_context(model()) + elif isinstance(model, ModelContext): + mctx = model + + # Allow for scorers to be kept open + if isinstance(accuracy_scorer, AccuracyScorer): + accuracy_scorer = await astack.enter_async_context(accuracy_scorer) + actx = await astack.enter_async_context(accuracy_scorer()) + elif isinstance(accuracy_scorer, AccuracyContext): + actx = accuracy_scorer + else: + # TODO Replace this with static type checking and maybe dynamic + # through something like pydantic. See issue #36 + raise TypeError(f"{accuracy_scorer} is not an AccuracyScorer") + + if isinstance(tuner, Tuner): + tuner = await astack.enter_async_context(tuner) + tctx = await astack.enter_async_context(tuner()) + elif isinstance(tuner, TunerContext): + tctx = tuner + else: + raise TypeError(f"{tuner} is not an Tuner") + + return float( + await tctx.optimize(mctx, features, actx, train, test) + ) + diff --git a/dffml/model/automl.py b/dffml/model/automl.py new file mode 100644 index 0000000000..0612b2a086 --- /dev/null +++ b/dffml/model/automl.py @@ -0,0 +1,179 @@ +import pathlib +import os +import shutil +import json +import tempfile +import contextlib +import pkg_resources +import numpy as np + +from typing import AsyncIterator, Tuple, Any, Type, List +from ..high_level.ml import tune +from ..base import config, field +from ..util.entrypoint import entrypoint +from .model import ModelNotTrained, ModelContext, SimpleModel, Model +from ..feature.feature import Feature, Features +from ..source.source import Sources, SourcesContext +from ..record import Record +from ..model.model import Model +from ..tuner.tuner import Tuner +from ..accuracy import AccuracyScorer + + +@config +class AutoMLModelConfig: + predict: Feature = field("Label or the value to be predicted") + features: Features = field("Features to train on.") + location: pathlib.Path = field("Location where state should be saved") + tuner: Tuner = field("Tuner to optimize hyperparameters with.") + scorer: AccuracyScorer = field("Scorer to evaluate and select best model.") + models: List[str] = field("List of models to tune and compare against", default_factory= lambda:list()) + objective: str = field( + "How to optimize the given scorer. 
Values are min/max", default="max" + ) + parameters: dict = field("Hyperparameter configuration of different models to optimize", default_factory= lambda:dict()) + use_default: bool = field("Whether or not to utilize DFFML's default hyperparameter settings for tuning", default=False) + split_data: bool = field("Whether or not to split data when performing tuning. Assumes dataset is in a tabular format.", default=False) + split_ratio: float = field("The percentage of records in the train set, if splitting is performed.", default=0.8) + + +@entrypoint("automl") +class AutoMLModel(SimpleModel): + r""" + AutoML model for automatic training and tuning based on target datasets and given + models and tuner. + + """ + # The configuration class needs to be set as the CONFIG property + + CONFIG: Type[AutoMLModelConfig] = AutoMLModelConfig + + def __init__(self, config) -> None: + super().__init__(config) + self.saved = None + self.forbidden = ["automl", "autosklearn"] + + async def __aenter__(self): + + dest = pathlib.Path(self.parent.config.location) + best_path = dest / "best_model" + # Check if model has been trained, and if so, get the type of the model + best_model = None + if dest.exists() and best_path.exists() and len(os.listdir(best_path)): + best_model = os.listdir(best_path)[0] + + # We want to allow users to not need to deal with individual model configuration. + # So we accept a list of strings and initialize our models based on that. + self.model_classes = {} + for ep in pkg_resources.iter_entry_points(group='dffml.model'): + if ep.name in self.parent.config.models or ep.name == best_model: + self.model_classes.update({ep.name: ep.load()}) + + # loading a trained model for prediction + if best_model: + model = self.model_classes[best_model]( + location = best_path / best_model, + features = self.parent.config.features, + predict = self.parent.config.predict + ) + async with contextlib.AsyncExitStack() as astack: + if isinstance(model, Model): + model = await astack.enter_async_context(model) + mctx = await astack.enter_async_context(model()) + elif isinstance(model, ModelContext): + mctx = model + self.saved = mctx + self.is_trained = True + return self + + async def __aexit__(self, exc_type, exc_value, traceback): + await super().__aexit__(exc_type, exc_value, traceback) + + async def train(self, sources: Sources) -> None: + + + tuner = self.parent.config.tuner + scorer = self.parent.config.scorer + features = self.parent.config.features + location = self.parent.config.location + + tuner.config.objective = self.parent.config.objective + + if self.parent.config.use_default: + pth = pathlib.Path(__file__).parents[1] / "util" / "autodefault.json" + with open(str(pth), "r") as tar: + self.parent.config.parameters = json.load(tar) + + if self.parent.config.objective == "min": + highest_acc = float("inf") + elif self.parent.config.objective == "max": + highest_acc = -float("inf") + else: + raise NotImplementedError('Objective must be either "min" or "max".') + + dest = pathlib.Path(location) + # We clear the destination directory first, to avoid conflicts. 
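+        # Note: this removes any models saved under ``location`` by previous
+        # runs, so every call to train() starts from an empty directory.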
+ if dest.exists() and dest.is_dir(): + shutil.rmtree(dest) + + train_source = test_source = None + + if self.parent.config.split_data: + data = [] + async for record in sources.with_features( + self.features + [self.parent.config.predict.name] + ): + data.append(record) + train_len = int(self.parent.config.split_ratio * len(data)) + train_source = data[:train_len] + test_source = data[train_len:] + + + else: + train_source = test_source = sources + + best_path = best_name = "" + + for model_name in self.parent.config.models: + if model_name in self.forbidden: + print(f"{model_name} is a forbidden model. Skipping...") + continue + model_dir = dest / model_name + + model = self.model_classes[model_name]( + location = model_dir, + features = features, + predict = self.parent.config.predict + ) + if model_name in self.parent.config.parameters: + tuner.config.parameters = self.parent.config.parameters[model_name] + else: + tuner.config.parameters = {} + + value = await tune(model, tuner, scorer, self.parent.config.predict, train_source, test_source) + + if self.parent.config.objective == "min" and value < highest_acc: + best_path = model_dir + best_name = model_name + highest_acc = value + elif self.parent.config.objective == "max" and value > highest_acc: + best_path = model_dir + best_name = model_name + highest_acc = value + + best_model_dir = dest / "best_model" / best_name + shutil.copytree(best_path, best_model_dir) + + + + + async def predict( + self, sources: SourcesContext + ) -> AsyncIterator[Tuple[Record, Any, float]]: + if not self.is_trained: + raise ModelNotTrained( + "Train the model first before getting predictions" + ) + # Use the child model API to make predictions + async for record in self.saved.predict(sources): + yield record diff --git a/dffml/noasync.py b/dffml/noasync.py index 41d9201138..a7416bad21 100644 --- a/dffml/noasync.py +++ b/dffml/noasync.py @@ -6,6 +6,7 @@ train as high_level_train, score as high_level_score, predict as high_level_predict, + tune as high_level_tune, ) @@ -24,6 +25,21 @@ def train(*args, **kwargs): ) ) +def tune(*args, **kwargs): + return asyncio.run(high_level_tune(*args, **kwargs)) + + +tune.__doc__ = ( + high_level_tune.__doc__.replace("await ", "") + .replace("async ", "") + .replace("asyncio.run(main())", "main()") + .replace(" >>> import asyncio\n", "") + .replace( + " >>> from dffml import *\n", + " >>> from dffml import *\n >>> from dffml.noasync import tune\n", + ) +) + def score(*args, **kwargs): return asyncio.run(high_level_score(*args, **kwargs)) diff --git a/dffml/plugins.py b/dffml/plugins.py index 8e4f7e2ec2..f5bb056ca0 100644 --- a/dffml/plugins.py +++ b/dffml/plugins.py @@ -51,6 +51,7 @@ def inpath(binary): ("operations", "nlp"), ("service", "http"), ("source", "mysql"), + ("tuner", "bayes_opt_gp"), ] diff --git a/dffml/skel/config/README.rst b/dffml/skel/config/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/config/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/config/README.rst b/dffml/skel/config/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/config/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/model/README.rst b/dffml/skel/model/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/model/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git 
a/dffml/skel/model/README.rst b/dffml/skel/model/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/model/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/operations/README.rst b/dffml/skel/operations/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/operations/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/operations/README.rst b/dffml/skel/operations/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/operations/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/service/README.rst b/dffml/skel/service/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/service/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/service/README.rst b/dffml/skel/service/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/service/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/source/README.rst b/dffml/skel/source/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/source/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/source/README.rst b/dffml/skel/source/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/source/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/tuner/__init__.py b/dffml/tuner/__init__.py index 072f34db2e..2ca452c2ef 100644 --- a/dffml/tuner/__init__.py +++ b/dffml/tuner/__init__.py @@ -8,4 +8,3 @@ TunerContext, Tuner, ) -from .parameter_grid import ParameterGrid diff --git a/dffml/tuner/parameter_grid.py b/dffml/tuner/parameter_grid.py index d6a8ead5f6..6bf1352b83 100644 --- a/dffml/tuner/parameter_grid.py +++ b/dffml/tuner/parameter_grid.py @@ -17,7 +17,8 @@ @config class ParameterGridConfig: - parameters: dict = field("Parameters to be optimized") + parameters: dict = field("Parameters to be optimized", default_factory= lambda:dict()) + objective: str = field("How to optimize for the scorer", default="max") class ParameterGridContext(TunerContext): @@ -38,6 +39,8 @@ async def optimize( Uses a grid of hyperparameters in the form of a dictionary present in config, Trains each permutation of the grid of parameters and compares accuracy. Sets model to the best parameters and returns highest accuracy. + If no hyperparameters are provided, the model is simply trained using + default parameters. Parameters ---------- @@ -53,39 +56,61 @@ async def optimize( train_data: SourcesContext The train_data to train models on with the hyperparameters provided. - sources : SourcesContext + test_data : SourcesContext The test_data to score against and optimize hyperparameters. 
Returns ------- float - The highest score value + The best score value """ - highest_acc = -1 + # Score should be optimized based on objective + if self.parent.config.objective == "min": + highest_acc = float("inf") + elif self.parent.config.objective == "max": + highest_acc = -1 + else: + raise NotImplementedError('Objective must be either "min" or "max".') + best_config = dict() logging.info( f"Optimizing model with parameter grid: {self.parent.config.parameters}" ) + names = list(self.parent.config.parameters.keys()) logging.info(names) - with model.config.no_enforce_immutable(): + + with model.parent.config.no_enforce_immutable(): for combination in itertools.product( *list(self.parent.config.parameters.values()) ): logging.info(combination) + for i in range(len(combination)): param = names[i] - setattr(model.config, names[i], combination[i]) - await train(model, *train_data) - acc = await score(model, accuracy_scorer, feature, *test_data) + setattr(model.parent.config, names[i], combination[i]) + + await train(model.parent, *train_data) + + acc = await score( + model.parent, accuracy_scorer, feature, *test_data + ) + logging.info(f"Accuracy of the tuned model: {acc}") - if acc > highest_acc: - highest_acc = acc - for param in names: - best_config[param] = getattr(model.config, param) + if self.parent.config.objective == "min": + if acc < highest_acc: + highest_acc = acc + elif self.parent.config.objective == "max": + if acc > highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) for param in names: - setattr(model.config, param, best_config[param]) - await train(model, *train_data) + setattr(model.parent.config, param, best_config[param]) + await train(model.parent, *train_data) + highest_acc = await score(model.parent, accuracy_scorer, feature, *test_data) logging.info(f"\nOptimal Hyper-parameters: {best_config}") logging.info(f"Accuracy of Optimized model: {highest_acc}") return highest_acc diff --git a/dffml/tuner/random_search.py b/dffml/tuner/random_search.py new file mode 100644 index 0000000000..ca4ccef46c --- /dev/null +++ b/dffml/tuner/random_search.py @@ -0,0 +1,127 @@ +from typing import Union, Dict, Any +import itertools +import logging +import random + +from ..base import ( + config, + field, +) +from ..high_level.ml import train, score +from .tuner import Tuner, TunerContext +from ..util.entrypoint import entrypoint +from ..source.source import BaseSource, Record +from ..accuracy.accuracy import AccuracyScorer, AccuracyContext +from ..model import ModelContext +from ..feature.feature import Feature + + +@config +class RandomSearchConfig: + parameters: dict = field("Parameters to be optimized") + objective: str = field( + "How to optimize the given scorer. Values are min/max", default="max" + ) + trials: int = field("Number of random configurations to try.", default=20) + + +class RandomSearchContext(TunerContext): + """ + Parameter Grid Tuner + """ + + async def optimize( + self, + model: ModelContext, + feature: Feature, + accuracy_scorer: Union[AccuracyScorer, AccuracyContext], + train_data: Union[BaseSource, Record, Dict[str, Any]], + test_data: Union[BaseSource, Record, Dict[str, Any]], + ): + """ + Method to optimize hyperparameters by parameter grid. + Uses a grid of hyperparameters in the form of a dictionary present in config, + Trains each permutation of the grid of parameters and compares accuracy. + Sets model to the best parameters and returns highest accuracy. 
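+        Unlike ParameterGrid, combinations are sampled at random for a fixed
+        number of ``trials`` rather than by enumerating the full grid.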
+ + Parameters + ---------- + model : ModelContext + The Model which needs to be used. + + feature : Feature + The Target feature in the data. + + accuracy_scorer: AccuracyContext + The accuracy scorer that needs to be used. + + train_data: SourcesContext + The train_data to train models on with the hyperparameters provided. + + test_data : SourcesContext + The test_data to score against and optimize hyperparameters. + + Returns + ------- + float + The highest score value + """ + + if self.parent.config.objective == "min": + highest_acc = float("inf") + elif self.parent.config.objective == "max": + highest_acc = -1 + else: + raise NotImplementedError('Objective must be either "min" or "max".') + + best_config = dict() + logging.info( + f"Optimizing model with random search: {self.parent.config.parameters}" + ) + + names = list(self.parent.config.parameters.keys()) + logging.info(names) + + with model.parent.config.no_enforce_immutable(): + for _ in range(self.parent.config.trials): + combination = [] + for pvs in self.parent.config.parameters.values(): + combination.append(random.choice(pvs)) + logging.info(combination) + + for i in range(len(combination)): + param = names[i] + setattr(model.parent.config, names[i], combination[i]) + await train(model.parent, *train_data) + acc = await score( + model.parent, accuracy_scorer, feature, *test_data + ) + + logging.info(f"Accuracy of the tuned model: {acc}") + if self.parent.config.objective == "min": + if acc < highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) + elif self.parent.config.objective == "max": + if acc > highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) + for param in names: + setattr(model.parent.config, param, best_config[param]) + await train(model.parent, *train_data) + logging.info(f"\nOptimal Hyper-parameters: {best_config}") + logging.info(f"Accuracy of Optimized model: {highest_acc}") + return highest_acc + + +@entrypoint("random_search") +class RandomSearch(Tuner): + + CONFIG = RandomSearchConfig + CONTEXT = RandomSearchContext diff --git a/dffml/util/autodefault.json b/dffml/util/autodefault.json new file mode 100644 index 0000000000..b4c1cfe816 --- /dev/null +++ b/dffml/util/autodefault.json @@ -0,0 +1,137 @@ +{ + + "xgbclassifier": {"learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8]}, + "scikitsvc": {"gamma": [0.001, 0.1], "C": [1, 10]}, + "daal4py":{}, + "pytorch":{}, + "automl":{}, + "slr":{}, + "anomalydetection":{}, + "scratchlgrsag":{}, + "xgbregressor": {"learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8]}, + "vwmodel":{}, + "scikitac":{}, + "scikitadaboost":{ + "base_estimator": ["DecisionTreeClassifier", "LogisticRegressor", "SVC"], + "n_estimators": [10, 50, 100, 500, 1000, 5000], + "learning_rate": [0.1, 0.5, 0.9, 1.5], + "tree_depth": [3,5,7,9] + + }, + "scikitap":{}, + "scikitard":{ + "alpha_1": [1e-5, 1e-6, 1e-7], + "alpha_2": [1e-5, 1e-6, 1e-7] + }, + "scikitbgc":{}, + "scikitbirch":{"threshold":[0.3,0.5,0.7]}, + "scikitbnb":{}, + "scikitbyr":{}, + "scikitdtc":{ + "criterion": ["gini", "entropy"], + "max_depth": [3,5,7,9] + }, + "scikitdtr":{ + "criterion": ["gini", "entropy"], + "max_depth": [3,5,7,9] + }, + "scikiteln":{ + "max_iter": [1, 5, 10], + "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100] + + }, + "scikitetc":{ + "n_estimators": [10,50,100], + "criterion": ["mse", "mae"], + 
"max_depth": [2,8,16,32,50], + "min_sample_split": [2,4,6], + "min_sample_leaf": [1,2] + }, + "scikitgbc":{ + "learning_rates": [1, 0.5, 0.25, 0.1, 0.05, 0.01], + "n_estimators": [1, 2, 4, 8, 16, 32, 64, 100, 200] + }, + "scikitgnb":{"var_smoothing":[1e-7,1e-8,1e-9]}, + "scikitgpc":{}, + "scikitgpr":{}, + "scikitkmeans":{ + "n_clusters":[5,10,15], + "tol":[1e-3, 1e-4, 1e-5] + }, + "scikitknn":{"n_neighbours": [2, 4, 8, 16, 32, 64, 128]}, + "scikitlars":{}, + "scikitlas":{}, + "scikitlda":{}, + "scikitlor":{}, + "scikitlr":{}, + "scikitmbkmeans":{}, + "scikitmlp":{ + "hidden_layer_sizes": [[50,50,50], [50,100,50], [100]], + "activation": ["tanh", "relu"], + "solver": ["sgd", "adam"], + "alpha": [0.0001, 0.05], + "learning_rate": ["constant","adaptive"] + + }, + "scikitmnb":{ + "alpha":[0,0.5,1.0] + }, + "scikitms":{}, + "scikitomp":{ + "n_nonzero_coefs":[0.1,0.2,0.3] + }, + "scikitoptics":{"min_samples":[5,10,15]}, + "scikitqda":{"reg_param": [0.1, 0.2, 0.3, 0.4, 0.5]}, + "scikitrfc":{ + "max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], + "max_features": ["auto", "sqrt"], + "min_samples_leaf": [1, 2, 4], + "min_samples_split": [2, 5, 10], + "n_estimators": [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000] + }, + "scikitridge":{ + "alpha_init":[1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.9], + "lambda_init": [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-9] + }, + "scikitrsc":{}, + "scikitsc":{}, + "alexnet":{"optimizer":["Adam", "AdamW"]}, + "densenet121":{"optimizer":["Adam", "AdamW"]}, + "densenet161":{"optimizer":["Adam", "AdamW"]}, + "densenet169":{"optimizer":["Adam", "AdamW"]}, + "densenet201":{"optimizer":["Adam", "AdamW"]}, + "googlenet":{"optimizer":["Adam", "AdamW"]}, + "inception_v3":{"optimizer":["Adam", "AdamW"]}, + "mnasnet0_5":{"optimizer":["Adam", "AdamW"]}, + "mnasnet1_0":{"optimizer":["Adam", "AdamW"]}, + "mobilenet_v2":{"optimizer":["Adam", "AdamW"]}, + "pytorchnet":{"optimizer":["Adam", "AdamW"]}, + "resnet101":{"optimizer":["Adam", "AdamW"]}, + "resnet152":{"optimizer":["Adam", "AdamW"]}, + "resnet18":{"optimizer":["Adam", "AdamW"]}, + "resnet34":{"optimizer":["Adam", "AdamW"]}, + "resnet50":{"optimizer":["Adam", "AdamW"]}, + "resnext101_32x8d":{"optimizer":["Adam", "AdamW"]}, + "resnext50_32x4d":{"optimizer":["Adam", "AdamW"]}, + "shufflenet_v2_x0_5":{"optimizer":["Adam", "AdamW"]}, + "shufflenet_v2_x1_0":{"optimizer":["Adam", "AdamW"]}, + "vgg11":{"optimizer":["Adam", "AdamW"]}, + "vgg11_bn":{"optimizer":["Adam", "AdamW"]}, + "vgg13":{"optimizer":["Adam", "AdamW"]}, + "vgg13_bn":{"optimizer":["Adam", "AdamW"]}, + "vgg16":{"optimizer":["Adam", "AdamW"]}, + "vgg16_bn":{"optimizer":["Adam", "AdamW"]}, + "vgg19":{"optimizer":["Adam", "AdamW"]}, + "vgg19_bn":{"optimizer":["Adam", "AdamW"]}, + "wide_resnet101_2":{"optimizer":["Adam", "AdamW"]}, + "wide_resnet50_2":{"optimizer":["Adam", "AdamW"]}, + "daal4pylr":{}, + "spacyner":{}, + "tfdnnc":{}, + "tfdnnr":{}, + "text_classifier":{} +} \ No newline at end of file diff --git a/dffml/util/internal.py b/dffml/util/internal.py index fcb4dd5255..e26a8698ab 100644 --- a/dffml/util/internal.py +++ b/dffml/util/internal.py @@ -72,3 +72,15 @@ def list_records_to_dict(features, *args, model=None): args[i] = dict(zip(features, args[i])) return args raise CannotConvertToRecord("Model does not exist!") + +def records_to_dict_check(ds, model, predict_feature): + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in ds + ): + return list_records_to_dict( + [feature.name for feature in model.config.features] + + 
predict_feature,
+            *ds,
+            model=model,
+        )
+    return ds
diff --git a/docs/tutorials/models/automl.rst b/docs/tutorials/models/automl.rst
new file mode 100644
index 0000000000..daebce9845
--- /dev/null
+++ b/docs/tutorials/models/automl.rst
@@ -0,0 +1,142 @@
+Using DFFML's AutoML model
+==========================
+
+Automated Machine Learning, abbreviated as AutoML, is a process that automates away the time-consuming and tedious
+aspects of ML by encapsulating common ML models and techniques within a single API. It allows users to approach ML
+from a high-level perspective, abstracting away the minutiae of statistical modelling and making ML accessible to
+data scientists and non-specialists alike. At the same time, AutoML gives users a degree of flexibility by letting
+them select their preferred models and tuners, which maximizes the likelihood of discovering an effective model
+within the search space. In this tutorial, we will see how DFFML's AutoML model can be used to get the most out of
+a dataset.
+
+AutoML is simple to use. Provide your dataset, a list of models to iterate over, and a hyperparameter tuning
+technique to optimize your models with. The AutoML model iterates over all the models provided, saving the model
+with the best results in the user-specified directory.
+
+
+.. code-block:: python
+    :test:
+
+    from sklearn.datasets import load_iris
+    from sklearn.model_selection import train_test_split
+
+    from dffml import Feature
+    from dffml.noasync import train, score
+    from dffml.accuracy import ClassificationAccuracy
+    from dffml.tuner.parameter_grid import ParameterGrid
+    from dffml.model.automl import AutoMLModel
+
+    iris = load_iris()
+    y = iris["target"]
+    X = iris["data"]
+    trainX, testX, trainy, testy = train_test_split(
+        X, y, test_size=0.1, random_state=123
+    )
+    scorer = ClassificationAccuracy()
+
+    # Configure the model
+    model = AutoMLModel(
+        predict="target",
+        features=["data"],
+        location="tempDir",
+        tuner=ParameterGrid(),
+        scorer=scorer,
+        models=["xgbclassifier", "scikitsvc"],
+        objective="max",
+        parameters={
+            "xgbclassifier": {
+                "learning_rate": [0.01, 0.05, 0.1],
+                "n_estimators": [20, 100, 200],
+                "max_depth": [3, 5, 8],
+            },
+            "scikitsvc": {
+                "gamma": [0.001, 0.1],
+                "C": [1, 10],
+            },
+        },
+    )
+
+    # Train the model. Note that this differs from most other DFFML models:
+    # here both a train set and a test set are passed to train().
+    train(model, [
+        [{"data": x, "target": y} for x, y in zip(trainX, trainy)],
+        [{"data": x, "target": y} for x, y in zip(testX, testy)],
+    ])
+
+    # Assess accuracy
+    print(
+        "Test accuracy:",
+        score(
+            model,
+            scorer,
+            Feature("target", float, 1),
+            *[{"data": x, "target": y} for x, y in zip(testX, testy)],
+        ),
+    )
+
+    print(
+        "Training accuracy:",
+        score(
+            model,
+            scorer,
+            Feature("target", float, 1),
+            *[{"data": x, "target": y} for x, y in zip(trainX, trainy)],
+        ),
+    )
+
+
+Command Line Usage
+------------------
+
+First, we download the Iris dataset to the desired folder.
+
+.. code-block:: console
+
+    $ wget http://download.tensorflow.org/data/iris_training.csv
+    $ wget http://download.tensorflow.org/data/iris_test.csv
+    $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv
+
+We create a JSON file with the hyperparameter search space:
+
+parameters.json
+
+.. code-block:: console
+
+    {
+        "xgbclassifier": {"learning_rate": [0.01, 0.05, 0.1],
+        "n_estimators": [20, 100, 200],
+        "max_depth": [3,5,8]},
+        "scikitsvc": {"gamma": [0.001, 0.1], "C": [1, 10]}
+    }
+
+Now, train the model:
+
+.. code-block:: console
+
+    $ dffml train \
+        -model automl \
+        -model-features \
+          SepalLength:float:1 \
+          SepalWidth:float:1 \
+          PetalLength:float:1 \
+        -model-predict classification \
+        -model-location tempDir \
+        -model-tuner parameter_grid \
+        -model-scorer clf \
+        -model-models xgbclassifier scikitsvc \
+        -model-parameters @parameters.json \
+        -model-objective max \
+        -sources train=csv \
+        -source-train-filename iris_training.csv
+
+
+Make predictions with the model:
+
+.. code-block:: console
+
+    $ dffml predict all \
+        -model automl \
+        -model-features \
+          SepalLength:float:1 \
+          SepalWidth:float:1 \
+          PetalLength:float:1 \
+        -model-predict classification \
+        -model-location tempDir \
+        -model-tuner parameter_grid \
+        -model-scorer clf \
+        -model-objective max \
+        -sources test=csv \
+        -source-test-filename iris_test.csv
diff --git a/docs/tutorials/tuners/bayes_opt_gp.rst b/docs/tutorials/tuners/bayes_opt_gp.rst
new file mode 100644
index 0000000000..7ed9d94825
--- /dev/null
+++ b/docs/tutorials/tuners/bayes_opt_gp.rst
@@ -0,0 +1,167 @@
+Tuning a DFFML model with Bayesian Optimization
+===============================================
+
+For an introduction to hyperparameter tuning with the DFFML API, view the :ref:`parameter_grid` tutorial.
+
+For this tutorial, we'll be performing hyperparameter tuning using the BayesOptGP tuner, which works somewhat
+differently from the typical grid search and random search variants. As usual, we will be using XGBClassifier as
+the model to tune.
+
+Unlike grid search and random search, Bayesian optimization is an informed hyperparameter selection process,
+where the hyperparameters selected in the next iteration depend on the results of the previous iteration.
+In each iteration, the Bayesian optimization process updates a surrogate model (a probability distribution of
+scores given hyperparameters), selects the set of hyperparameters that maximizes the expected improvement of the
+score under this surrogate model, and repeats the process. This allows one to search the hyperparameter space
+efficiently, which is especially useful when the model to be tuned is expensive to evaluate (for instance,
+medium or large neural networks).
+
+The BayesOptGP tuner uses the BayesianOptimization library, which uses Gaussian processes as the surrogate model,
+hence the name of the tuner.
+
+
+First, download the xgboost plugin for the DFFML library, which can be done via pip:
+
+.. code-block:: console
+    :test:
+
+    $ pip install -U dffml-model-xgboost
+
+We can utilize DFFML's tune method via the Python API. In the following code, we demonstrate its usage in a Python
+file:
+
+..
+.. code-block:: python
+    :test:
+    :filepath: bayes_opt_gp_xgboost.py
+
+    from sklearn.datasets import load_iris
+    from sklearn.model_selection import train_test_split
+
+    from dffml import Feature, Features
+    from dffml.noasync import tune
+    from dffml.accuracy import ClassificationAccuracy
+    from dffml_tuner_bayes_opt_gp.bayes_opt_gp import BayesOptGP
+    from dffml_model_xgboost.xgbclassifier import (
+        XGBClassifierModel,
+        XGBClassifierModelConfig,
+    )
+
+    iris = load_iris()
+    y = iris["target"]
+    X = iris["data"]
+    trainX, testX, trainy, testy = train_test_split(
+        X, y, test_size=0.1, random_state=123
+    )
+    scorer = ClassificationAccuracy()
+
+    # Configure the model
+    model = XGBClassifierModel(
+        XGBClassifierModelConfig(
+            features=Features(Feature("data", float,)),
+            predict=Feature("target", float, 1),
+            location="model",
+            max_depth=3,
+            learning_rate=0.01,
+            n_estimators=200,
+            reg_lambda=1,
+            reg_alpha=0,
+            gamma=0,
+            colsample_bytree=0,
+            subsample=1,
+        )
+    )
+
+    # Configure the tuner search space in a dictionary. For BayesOptGP each
+    # hyperparameter takes exactly two values: the lower and upper bound of
+    # the range to search. These bounds override any value set in the model.
+    tuner = BayesOptGP(
+        parameters={
+            "learning_rate": [0.01, 0.1],
+            "n_estimators": [20, 200],
+            "max_depth": [3, 8],
+        },
+        objective="max",
+    )
+
+    # The tune function saves the best model and returns its score
+    print(
+        "Tuning accuracy:",
+        tune(
+            model,
+            tuner,
+            scorer,
+            Feature("target", float, 1),
+            [{"data": x, "target": y} for x, y in zip(trainX, trainy)],
+            [{"data": x, "target": y} for x, y in zip(testX, testy)],
+        )
+    )
+
+
+Note that because of its different nature, the BayesOptGP tuner only accepts a specific structure for its
+hyperparameter search space configuration. For each hyperparameter, we accept exactly two values representing the
+minimum and maximum bounds over which the tuner searches. Also, Bayesian optimization only works on numerical
+hyperparameters (technically it should only work on floats, but we made some modifications so it also works on
+discrete values). This is because the selection of the next set of hyperparameters derives from a closed-form
+integral which expects a continuous search space.
+
+Examples of invalid hyperparameter configurations:
+
+.. code-block:: console
+
+    {
+        "learning_rate": [0.01, 0.1, 0.2], // too many values
+        "n_estimators": [20, 200],
+        "max_depth": [3] // too few values
+    }
+
+
+.. code-block:: console
+
+    {
+        "learning_rate": [0.01, 0.1],
+        "sampling_method": ["uniform", "gradient_based"], // no strings
+        "validate_parameters": [True, False] // no booleans
+    }
+
+Command Line Usage
+------------------
+
+First, we download the Iris dataset to the desired folder.
+
+.. code-block:: console
+    :test:
+
+    $ wget http://download.tensorflow.org/data/iris_training.csv
+    $ wget http://download.tensorflow.org/data/iris_test.csv
+    $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv
+
+We create a JSON file with the hyperparameter search space:
+
+parameters.json
+
+.. code-block:: console
+    :test:
+    :filepath: parameters.json
+
+    {
+        "learning_rate": [0.01, 0.1],
+        "n_estimators": [20, 200],
+        "max_depth": [3,8]
+    }
+
+In the same folder, we perform the CLI tune command.
+
+.. code-block:: console
+    :test:
+
+    $ dffml tune \
+        -model xgbclassifier \
+        -model-features \
+          SepalLength:float:1 \
+          SepalWidth:float:1 \
+          PetalLength:float:1 \
+        -model-predict classification \
+        -model-location tempDir \
+        -tuner bayes_opt_gp \
+        -tuner-parameters @parameters.json \
+        -tuner-objective max \
+        -scorer clf \
+        -sources train=csv test=csv \
+        -source-train-filename iris_training.csv \
+        -source-test-filename iris_test.csv \
+        -source-train-tag train \
+        -source-test-tag test \
+        -features classification:int:1
\ No newline at end of file
diff --git a/docs/tutorials/tuners/parameter_grid.rst b/docs/tutorials/tuners/parameter_grid.rst
new file mode 100644
index 0000000000..2b37a8daff
--- /dev/null
+++ b/docs/tutorials/tuners/parameter_grid.rst
@@ -0,0 +1,162 @@
+Tuning a DFFML model with ParameterGrid
+=======================================
+
+For this tutorial, we'll be performing hyperparameter tuning on a DFFML model using DFFML's integrated "tune" method.
+We will be using the XGBClassifier model and the ParameterGrid tuner for this example, but note that these are
+interchangeable with any DFFML Model and Tuner respectively.
+
+As we know, a machine learning model yields accurate predictions on unseen data by fitting itself to the
+training dataset. However, different initial configurations of certain model parameters will affect the performance
+of the trained model. For instance, a neural network that is allowed to train for several epochs on a dataset
+typically outperforms one that has only trained for a single epoch. We call these parameters, which are set before
+training, "hyperparameters", and it is normally the job of the ML engineer to try many different hyperparameter
+configurations to find the best-performing model.
+
+This process can be automated using a hyperparameter tuning method, which tries a series of configurations on your
+behalf; such methods include random search, grid search, Bayesian optimization and more. Here, we will be using
+ParameterGrid, otherwise known as grid search, where the tuner tries all possible combinations of hyperparameters
+provided by the user and selects the best model based on a given metric. We will be tuning the XGBClassifier
+model over a dictionary of values provided in a JSON file, and returning the configuration with the highest
+accuracy on a holdout validation set.
+
+First, download the xgboost plugin for the DFFML library, which can be done via pip:
+
+.. code-block:: console
+    :test:
+
+    $ pip install -U dffml-model-xgboost
+
+We can utilize DFFML's tune method via the Python API. In the following code, we demonstrate its usage in a Python
+file:
+
+..
code-block:: console + :test: + from sklearn.datasets import load_iris + from sklearn.model_selection import train_test_split + + from dffml import Feature, Features + from dffml.noasync import tune + from dffml.accuracy import ClassificationAccuracy + from dffml.tuner.parameter_grid import ParameterGrid + from dffml_model_xgboost.xgbclassifier import ( + XGBClassifierModel, + XGBClassifierModelConfig, + ) + + iris = load_iris() + y = iris["target"] + X = iris["data"] + trainX, testX, trainy, testy = train_test_split( + X, y, test_size=0.1, random_state=123 + ) + + # Configure the model + model = XGBClassifierModel( + XGBClassifierModelConfig( + features=Features(Feature("data", float,)), + predict=Feature("target", float, 1), + location="model", + max_depth=3, + learning_rate=0.01, + n_estimators=200, + reg_lambda=1, + reg_alpha=0, + gamma=0, + colsample_bytree=0, + subsample=1, + ) + ) + + # Configure the tuner search space in a dictionary + # All combinations will be tried, even if the parameter's + # value has been set in the model. + tuner = ParameterGrid( + parameters = { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + + }, + objective = "max" + ) + + # Tune function saves the best model and returns its score + print("Tuning accuracy:", + tune( + model, + tuner, + scorer, + Feature("target", float, 1), + [{"data": x, "target": y} for x, y in zip(trainX, trainy)], + [{"data": x, "target": y} for x, y in zip(testX, testy)], + + ) + ) + +The tune function takes in 6 arguments: + + model : Model + Machine Learning model to use. See :doc:`/plugins/dffml_model` for + models options. + + tuner: Tuner + Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for + tuner options. + + scorer: Scorer + Method to evaluate the performance of the model, inheriting from AccuracyScorer + class. + + predict_feature: Union[Features, Feature] + A feature indicating the feature you wish to predict. + + train_ds : list + Input data for training. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + + valid_ds : list + Validation data for testing. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + +Command Line Usage +------------------ + +First, we download the Iris dataset to the desired folder. + +.. code-block:: console + $ wget http://download.tensorflow.org/data/iris_training.csv + $ wget http://download.tensorflow.org/data/iris_test.csv + $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv + +We create a JSON file with the hyperparameter search space: + +parameters.json +.. code-block:: console + { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + } + +In the same folder, we perform the CLI tune command. + +.. 
code-block:: console
+
+    $ dffml tune \
+        -model xgbclassifier \
+        -model-features \
+          SepalLength:float:1 \
+          SepalWidth:float:1 \
+          PetalLength:float:1 \
+        -model-predict classification \
+        -model-location tempDir \
+        -tuner parameter_grid \
+        -tuner-parameters @parameters.json \
+        -tuner-objective max \
+        -scorer clf \
+        -sources train=csv test=csv \
+        -source-train-filename iris_training.csv \
+        -source-test-filename iris_test.csv \
+        -source-train-tag train \
+        -source-test-tag test \
+        -features classification:int:1
\ No newline at end of file
diff --git a/docs/tutorials/tuners/random_search.rst b/docs/tutorials/tuners/random_search.rst
new file mode 100644
index 0000000000..8a88562d7c
--- /dev/null
+++ b/docs/tutorials/tuners/random_search.rst
@@ -0,0 +1,167 @@
+Tuning a DFFML model with Random Search
+=======================================
+
+For this tutorial, we'll be performing hyperparameter tuning on a DFFML model using DFFML's integrated "tune" method.
+We will be using the XGBClassifier model and the RandomSearch tuner for this example, but note that these are
+interchangeable with any DFFML Model and Tuner respectively.
+
+As we know, a machine learning model yields accurate predictions on unseen data by fitting itself to the
+training dataset. However, different initial configurations of certain model parameters will affect the performance
+of the trained model. For instance, a neural network that is allowed to train for several epochs on a dataset
+typically outperforms one that has only trained for a single epoch. We call these parameters, which are set before
+training, "hyperparameters", and it is normally the job of the ML engineer to try many different hyperparameter
+configurations to find the best-performing model.
+
+This process can be automated using a hyperparameter tuning method, which tries a series of configurations on your
+behalf; such methods include random search, grid search, Bayesian optimization and more. Here, we will be using
+RandomSearch, where the tuner tries random combinations of the hyperparameters provided by the user for a fixed
+number of iterations and selects the best model based on a given metric. We will be tuning the XGBClassifier
+model over a dictionary of values provided in a JSON file, and returning the configuration with the highest
+accuracy on a holdout validation set.
+
+First, download the xgboost plugin for the DFFML library, which can be done via pip:
+
+.. code-block:: console
+    :test:
+
+    $ pip install -U dffml-model-xgboost
+
+We can utilize DFFML's tune method via the Python API. In the following code, we demonstrate its usage in a Python
+file:
+
+..
code-block:: console + :test: + from sklearn.datasets import load_iris + from sklearn.model_selection import train_test_split + + from dffml import Feature, Features + from dffml.noasync import tune + from dffml.accuracy import ClassificationAccuracy + from dffml.tuner.random_search import RandomSearch + from dffml_model_xgboost.xgbclassifier import ( + XGBClassifierModel, + XGBClassifierModelConfig, + ) + + iris = load_iris() + y = iris["target"] + X = iris["data"] + trainX, testX, trainy, testy = train_test_split( + X, y, test_size=0.1, random_state=123 + ) + + # Configure the model + model = XGBClassifierModel( + XGBClassifierModelConfig( + features=Features(Feature("data", float,)), + predict=Feature("target", float, 1), + location="model", + max_depth=3, + learning_rate=0.01, + n_estimators=200, + reg_lambda=1, + reg_alpha=0, + gamma=0, + colsample_bytree=0, + subsample=1, + ) + ) + + # Configure the tuner search space in a dictionary + # All combinations will be tried, even if the parameter's + # value has been set in the model. + tuner = RandomSearch( + parameters = { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + + }, + objective = "max", + trials=15 + ) + + # Tune function saves the best model and returns its score + print("Tuning accuracy:", + tune( + model, + tuner, + scorer, + Feature("target", float, 1), + [{"data": x, "target": y} for x, y in zip(trainX, trainy)], + [{"data": x, "target": y} for x, y in zip(testX, testy)], + + ) + ) + +The tune function takes in 6 arguments: + + model : Model + Machine Learning model to use. See :doc:`/plugins/dffml_model` for + models options. + + tuner: Tuner + Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for + tuner options. + + scorer: Scorer + Method to evaluate the performance of the model, inheriting from AccuracyScorer + class. + + predict_feature: Union[Features, Feature] + A feature indicating the feature you wish to predict. + + train_ds : list + Input data for training. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + + valid_ds : list + Validation data for testing. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + +Command Line Usage +------------------ + +First, we download the Iris dataset to the desired folder. + +.. code-block:: console + :test: + $ wget http://download.tensorflow.org/data/iris_training.csv + $ wget http://download.tensorflow.org/data/iris_test.csv + $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv + +We create a JSON file with the hyperparameter search space: + +parameters.json +.. code-block:: console + :test: + :filepath: parameters.json + { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + } + +In the same folder, we perform the CLI tune command. + +.. 
code-block:: console + :test: + $ dffml tune \ + -model xgbclassifier \ + -model-features \ + SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ + -model-location tempDir \ + -tuner random_search \ + -tuner-parameters @parameters.json \ + -tuner-objective max \ + -scorer clf \ + -sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ + -source-test-tag test \ + -features classification:int:1 \ No newline at end of file diff --git a/examples/rockpaperscissors/parameters.json b/examples/rockpaperscissors/parameters.json new file mode 100644 index 0000000000..a60e592493 --- /dev/null +++ b/examples/rockpaperscissors/parameters.json @@ -0,0 +1 @@ +{"epochs":[2,3,4]} \ No newline at end of file diff --git a/examples/rockpaperscissors/tune.sh b/examples/rockpaperscissors/tune.sh new file mode 100644 index 0000000000..e4613b7980 --- /dev/null +++ b/examples/rockpaperscissors/tune.sh @@ -0,0 +1,27 @@ +dffml tune \ + -model pytorchnet \ + -model-features image:int:$((300*300*3)) \ + -model-clstype str \ + -model-classifications rock paper scissors \ + -model-predict label:int:1 \ + -model-network @model.yaml \ + -model-location rps_model \ + -model-loss crossentropyloss \ + -model-optimizer Adam \ + -model-validation_split 0.2 \ + -model-epochs 10 \ + -model-batch_size 32 \ + -model-imageSize 150 \ + -model-enableGPU \ + -model-patience 2 \ + -scorer pytorchscore \ + -tuner parameter_grid \ + -tuner-parameters @parameters.json \ + -log debug \ + -sources train=dir test=dir \ + -source-train-foldername rps/rps \ + -source-train-feature image \ + -source-train-labels rock paper scissors \ + -source-test-foldername rps-test-set/rps-test-set \ + -source-test-feature image \ + -source-test-labels rock paper scissors \ \ No newline at end of file diff --git a/model/pytorch/examples/resnet18/parameters.json b/model/pytorch/examples/resnet18/parameters.json new file mode 100644 index 0000000000..1dd80ef7a6 --- /dev/null +++ b/model/pytorch/examples/resnet18/parameters.json @@ -0,0 +1 @@ +{"epochs":[1,2]} \ No newline at end of file diff --git a/model/pytorch/examples/resnet18/tune.sh b/model/pytorch/examples/resnet18/tune.sh new file mode 100644 index 0000000000..a53fec0a4f --- /dev/null +++ b/model/pytorch/examples/resnet18/tune.sh @@ -0,0 +1,24 @@ +dffml tune \ + -model resnet18 \ + -model-add_layers \ + -model-layers @layers.yaml \ + -model-clstype str \ + -model-classifications ants bees \ + -model-location resnet18_model \ + -model-imageSize 224 \ + -model-epochs 5 \ + -model-batch_size 32 \ + -model-enableGPU \ + -model-features image:int:$((500*500)) \ + -model-predict label:str:1 \ + -sources train=dir test=dir \ + -source-train-foldername hymenoptera_data/train \ + -source-train-feature image \ + -source-train-labels ants bees \ + -source-test-foldername hymenoptera_data/val \ + -source-test-feature image \ + -source-test-labels ants bees \ + -scorer pytorchscore \ + -tuner parameter_grid \ + -tuner-parameters @parameters.json \ + -log critical \ No newline at end of file diff --git a/model/pytorch/tests/test_pytorchnet.py b/model/pytorch/tests/test_pytorchnet.py index bb4d82cb3d..4a9cdd6a98 100644 --- a/model/pytorch/tests/test_pytorchnet.py +++ b/model/pytorch/tests/test_pytorchnet.py @@ -1,3 +1,4 @@ +from dffml.tuner.parameter_grid import ParameterGrid import torch.nn as nn import os import shutil @@ -7,7 +8,8 @@ from dffml.cli.cli import CLI from 
dffml.util.net import cached_download_unpack_archive from dffml.util.asynctestcase import AsyncTestCase -from dffml.high_level.ml import train, score, predict +from dffml.high_level.ml import train, score, predict, tune +from dffml.tuner.parameter_grid import ParameterGrid from dffml import Features, Feature, DirectorySource from dffml_model_pytorch import PyTorchNeuralNetwork from dffml_model_pytorch.utils import CrossEntropyLossFunction @@ -103,6 +105,7 @@ def setUpClass(cls): ) ) cls.scorer = PytorchAccuracy() + cls.tuner = ParameterGrid(parameters={"epochs":[3,5]}, objective="max") @classmethod def tearDownClass(cls): @@ -149,13 +152,33 @@ async def test_02_predict(self): self.assertIn(results["value"], self.model.config.classifications) self.assertTrue(results["confidence"]) + async def test_03_tune(self): + acc = await tune( + self.model, + self.tuner, + self.scorer, + Feature("label", str, 1), + [DirectorySource( + foldername=str(self.traindir) + "/rps", + feature="image", + labels=["rock", "paper", "scissors"], + )], + [DirectorySource( + foldername=str(self.testdir) + "/rps-test-set", + feature="image", + labels=["rock", "paper", "scissors"], + )], + ) + self.assertGreater(acc, 0.0) + async def test_shell(self): def clean_args(fd, directory): cmnd = " ".join(fd.readlines()).split("\\\n") cmnd = " ".join(cmnd).split() for idx, word in enumerate(cmnd): cmnd[idx] = word.strip() - cmnd[cmnd.index("-source-foldername") + 1] = directory + if "-source-foldername" in cmnd: + cmnd[cmnd.index("-source-foldername") + 1] = directory if "-model-epochs" in cmnd: cmnd[cmnd.index("-model-epochs") + 1] = "1" return cmnd @@ -163,6 +186,9 @@ def clean_args(fd, directory): shutil.copy( sh_filepath("model.yaml"), os.path.join(os.getcwd(), "model.yaml"), ) + shutil.copy( + sh_filepath("parameters.json"), os.path.join(os.getcwd(), "parameters.json"), + ) with open(sh_filepath("train.sh"), "r") as f: train_command = clean_args(f, str(self.traindir) + "/rps") @@ -178,6 +204,12 @@ def clean_args(fd, directory): predict_command = clean_args(f, str(self.predictdir)) results = await CLI.cli(*predict_command[1:-1]) + with open(sh_filepath("tune.sh"), "r") as f: + tc = clean_args(f, str(self.predictdir)) + tc[tc.index("-source-train-foldername") + 1] = str(self.traindir) + "/rps" + tc[tc.index("-source-test-foldername") + 1] = str(self.testdir ) + "/rps-test-set" + acc = await CLI.cli(*tc[1:-1]) + self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0] @@ -187,3 +219,4 @@ def clean_args(fd, directory): self.assertIn("confidence", results) self.assertIn(isinstance(results["value"], str), [True]) self.assertTrue(results["confidence"]) + self.assertTrue(acc>=0.0) diff --git a/model/pytorch/tests/test_resnet18.py b/model/pytorch/tests/test_resnet18.py index fd270592a4..79c92eacd6 100644 --- a/model/pytorch/tests/test_resnet18.py +++ b/model/pytorch/tests/test_resnet18.py @@ -47,6 +47,11 @@ def clean_args(fd, directory): sh_filepath("resnet18", "layers.yaml"), os.path.join(os.getcwd(), "layers.yaml"), ) + shutil.copy( + sh_filepath("resnet18", "parameters.json"), + os.path.join(os.getcwd(), "parameters.json"), + ) + with open(sh_filepath("resnet18", "train.sh"), "r") as f: train_command = clean_args(f, str(tempdir)) @@ -60,6 +65,10 @@ def clean_args(fd, directory): predict_command = clean_args(f, str(tempdir)) results = await CLI.cli(*predict_command[1:-1]) + with open(sh_filepath("resnet18", "tune.sh"), "r") as f: + tune_command = clean_args(f, str(tempdir)) + acc = await 
CLI.cli(*tune_command[1:]) + self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0] @@ -69,3 +78,4 @@ def clean_args(fd, directory): self.assertIn("confidence", results) self.assertIn(isinstance(results["value"], str), [True]) self.assertTrue(results["confidence"]) + self.assertTrue(acc>=0.0) diff --git a/model/spacy/examples/ner/parameters.json b/model/spacy/examples/ner/parameters.json new file mode 100644 index 0000000000..21d5c6841b --- /dev/null +++ b/model/spacy/examples/ner/parameters.json @@ -0,0 +1 @@ +{"n_iter":[3,5]} \ No newline at end of file diff --git a/model/spacy/examples/ner/tune.sh b/model/spacy/examples/ner/tune.sh new file mode 100644 index 0000000000..e69b483c5f --- /dev/null +++ b/model/spacy/examples/ner/tune.sh @@ -0,0 +1,15 @@ +dffml tune \ + -model spacyner \ + -model-model_name en_core_web_sm \ + -model-location temp.zip \ + -model-n_iter 5 \ + -scorer sner \ + -tuner parameter_grid \ + -tuner-parameters @parameters.json \ + -features tag:str:1 \ + -sources train=op test=op \ + -source-train-opimp model.spacy.dffml_model_spacy.ner.utils:parser \ + -source-args train.json False \ + -source-test-opimp model.spacy.dffml_model_spacy.ner.utils:parser \ + -source-args test.json True \ + -log debug diff --git a/model/spacy/tests/test_ner_integration.py b/model/spacy/tests/test_ner_integration.py index 201e55f4d8..fcf3b16ea9 100644 --- a/model/spacy/tests/test_ner_integration.py +++ b/model/spacy/tests/test_ner_integration.py @@ -9,6 +9,7 @@ import tempfile import contextlib import subprocess +import shutil from dffml.cli.cli import CLI from dffml.util.os import chdir @@ -159,6 +160,43 @@ async def test_run(self): "-model-n_iter", "5", ) + + param_path = os.path.join(os.path.dirname(__file__), "../examples/ner/parameters.json") + + # Tune model + await CLI.cli( + "tune", + "-model", + "spacyner", + "-model-model_name", + "en_core_web_sm", + "-model-location", + directory, + "-scorer", + "sner", + "-tuner", + "parameter_grid", + "-tuner-parameters", + "@" + str(param_path), + "-features", + "Tag:str:1", + "-sources", + "train=op", + "test=op", + "-source-train-opimp", + "model.spacy.dffml_model_spacy.ner.utils:parser", + "-source-train-args", + train_data_filename, + "False", + "-source-test-opimp", + "model.spacy.dffml_model_spacy.ner.utils:parser", + "-source-test-args", + test_data_filename, + "True", + "-log", + "debug", + ) + self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0].export() @@ -208,6 +246,10 @@ def clean_args(fd, directory): return cmnd with directory_with_csv_files() as tempdir: + shutil.copy( + os.path.join(os.path.dirname(__file__), "../examples/ner/parameters.json"), + os.path.join(tempdir, "parameters.json"), + ) with open( os.path.join( os.path.dirname(os.path.dirname(__file__)), @@ -232,6 +274,19 @@ def clean_args(fd, directory): accuracy_cmnd = clean_args(f, tempdir) await CLI.cli(*accuracy_cmnd[1:]) + with open( + os.path.join( + os.path.dirname(os.path.dirname(__file__)), + "examples", + "ner", + "tune.sh", + ), + "r", + ) as f: + accuracy_cmnd = clean_args(f, tempdir) + await CLI.cli(*accuracy_cmnd[1:]) + + with open( os.path.join( os.path.dirname(os.path.dirname(__file__)), diff --git a/model/spacy/tests/test_ner_model.py b/model/spacy/tests/test_ner_model.py index c5065b43f6..b858091700 100644 --- a/model/spacy/tests/test_ner_model.py +++ b/model/spacy/tests/test_ner_model.py @@ -5,7 +5,8 @@ from dffml.record import Record from dffml.source.source import Sources from 
dffml.feature.feature import Feature -from dffml import train, score, predict, run_consoletest +from dffml import train, score, predict, tune, run_consoletest +from dffml.tuner.parameter_grid import ParameterGrid from dffml.util.asynctestcase import AsyncTestCase from dffml.source.memory import MemorySource, MemorySourceConfig from dffml_model_spacy.accuracy import SpacyNerAccuracy @@ -56,6 +57,7 @@ def setUpClass(cls): ) ) cls.scorer = SpacyNerAccuracy() + cls.tuner = ParameterGrid(parameters={"n_iter":[5,10]}, objective="max") @classmethod def tearDownClass(cls): @@ -81,6 +83,12 @@ async def test_02_predict(self): self.assertIn( predictions[0][2]["Tag"]["value"][0][1], ["ORG", "PERSON", "LOC"] ) + + async def test_03_tune(self): + res = await tune( + self.model, self.tuner, self.scorer, Feature("Tag", str, 1), [self.train_sources], [self.test_sources] + ) + self.assertGreaterEqual(res, 0) async def test_docstring(self): await run_consoletest(SpacyNERModel) diff --git a/model/tensorflow/examples/parameters.json b/model/tensorflow/examples/parameters.json new file mode 100644 index 0000000000..f9cf0426be --- /dev/null +++ b/model/tensorflow/examples/parameters.json @@ -0,0 +1 @@ +{"epochs":[10,15]} \ No newline at end of file diff --git a/model/tensorflow/tests/test_dnnc.py b/model/tensorflow/tests/test_dnnc.py index 9178dd2ff0..85e11825a6 100644 --- a/model/tensorflow/tests/test_dnnc.py +++ b/model/tensorflow/tests/test_dnnc.py @@ -2,7 +2,7 @@ import pathlib import tempfile -from dffml import train, predict, score +from dffml import train, predict, score, tune from dffml.record import Record from dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig @@ -10,6 +10,7 @@ from dffml.util.cli.arg import parse_unknown from dffml.util.asynctestcase import AsyncTestCase from dffml.accuracy import ClassificationAccuracy +from dffml.tuner.parameter_grid import ParameterGrid from dffml_model_tensorflow.dnnc import ( DNNClassifierModel, @@ -84,6 +85,7 @@ async def test_config(self): async def test_model(self): scorer = ClassificationAccuracy() + tuner = ParameterGrid(parameters={"epochs":[20,30]}, objective="max") for i in range(0, 7): await train(self.model, self.sources) res = await score( @@ -98,7 +100,11 @@ async def test_model(self): location=self.model_dir.name ) continue + res_tune = await tune( + self.model, tuner, scorer, Feature("string", str, 1), [self.sources], [self.sources] + ) self.assertGreater(res, 0.9) + self.assertGreater(res_tune, 0.9) a = Record("a", data={"features": {self.feature.name: 1}}) target_name = self.model.config.predict.name res = [ diff --git a/model/tensorflow/tests/test_dnnr.py b/model/tensorflow/tests/test_dnnr.py index 145337b74e..3074b0ae35 100644 --- a/model/tensorflow/tests/test_dnnr.py +++ b/model/tensorflow/tests/test_dnnr.py @@ -4,14 +4,16 @@ import numpy as np -from dffml import train, score, predict +from dffml import train, score, predict, tune from dffml.record import Record from dffml.source.source import Sources from dffml.accuracy import MeanSquaredErrorAccuracy from dffml.source.memory import MemorySource, MemorySourceConfig +from dffml.tuner.parameter_grid import ParameterGrid from dffml.util.cli.arg import parse_unknown from dffml.util.asynctestcase import AsyncTestCase from dffml.feature import Feature, Features +from dffml.tuner.parameter_grid import ParameterGrid from dffml_model_tensorflow.dnnr import ( DNNRegressionModel, @@ -98,6 +100,7 @@ async def test_model(self): }, ) target_name = 
self.model.config.predict.name + tuner = ParameterGrid(parameters={"epochs":[10,15]}, objective="min") + scorer = MeanSquaredErrorAccuracy() + for i in range(0, 7): + await train(self.model, self.sources) @@ -113,7 +116,11 @@ + location=pathlib.Path(self.model_dir.name) + ) + continue + res_tune = await tune( + self.model, tuner, scorer, Feature("TARGET", float, 1), [self.sources], [self.sources] + ) + self.assertGreater(res, 0.0) + self.assertGreater(res_tune, 0.0) + res = [ + record async for record in predict(self.model, a, keep_record=True) diff --git a/model/tensorflow/tests/test_tf_integration.py b/model/tensorflow/tests/test_tf_integration.py index 9a39650e4c..1a435b2bd3 100644 --- a/model/tensorflow/tests/test_tf_integration.py +++ b/model/tensorflow/tests/test_tf_integration.py @@ -2,6 +2,7 @@ This file contains integration tests. We use the CLI to exercise functionality of various DFFML classes and constructs. """ +import os import csv import pathlib @@ -190,6 +191,33 @@ async def test_run(self): "-source-filename", data_filename, ) + param_path = os.path.join(os.path.dirname(__file__), "../examples/parameters.json") + # Tune model + await CLI.cli( + "tune", + "-model", + "tfdnnr", + *features, + "-model-predict", + "true_target:float:1", + "-model-location", + model_dir, + "-features", + "true_target:float:1", + "-scorer", + "mse", + "-tuner", + "parameter_grid", + "-tuner-parameters", + "@" + str(param_path), + "-sources", + "train=csv", + "test=csv", + "-source-train-filename", + data_filename, + "-source-test-filename", + data_filename, + ) self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0].export() diff --git a/model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json b/model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json new file mode 100644 index 0000000000..f9cf0426be --- /dev/null +++ b/model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json @@ -0,0 +1 @@ +{"epochs":[10,15]} \ No newline at end of file diff --git a/model/tensorflow_hub/tests/test_model.py b/model/tensorflow_hub/tests/test_model.py index b7ffca4e83..8c072535b9 100644 --- a/model/tensorflow_hub/tests/test_model.py +++ b/model/tensorflow_hub/tests/test_model.py @@ -2,7 +2,8 @@ import tempfile from dffml.record import Record -from dffml.high_level.ml import score +from dffml.high_level.ml import score, tune +from dffml.tuner.parameter_grid import ParameterGrid from dffml.source.source import Sources from dffml.util.asynctestcase import AsyncTestCase from dffml.feature import Features, Feature @@ -47,6 +48,7 @@ def setUpClass(cls): ) ) cls.scorer = TextClassifierAccuracy() + cls.tuner = ParameterGrid(parameters={"epochs":[10,15]}, objective="max") @classmethod def tearDownClass(cls): @@ -63,6 +65,7 @@ async def test_01_accuracy(self): ) self.assertGreater(res, 0) + async def test_02_predict(self): async with self.sources as sources, self.model as model: target_name = model.config.predict.name @@ -71,6 +74,12 @@ async def test_02_predict(self): prediction = record.prediction(target_name).value self.assertIn(prediction, ["0", "1"]) + async def test_03_tune(self): + res = await tune( + self.model, self.tuner, self.scorer, Feature("X", int, 1), [self.sources], [self.sources] + ) + self.assertGreater(res, 0) + # Randomly generate sample data POSITIVE_WORDS = ["fun", "great", "cool", "awesome", "rad"] diff --git a/model/tensorflow_hub/tests/test_tfhub_integration.py b/model/tensorflow_hub/tests/test_tfhub_integration.py index
535e10aeed..9cc88e3203 100644 --- a/model/tensorflow_hub/tests/test_tfhub_integration.py +++ b/model/tensorflow_hub/tests/test_tfhub_integration.py @@ -2,6 +2,7 @@ This file contains integration tests. We use the CLI to exercise functionality of various DFFML classes and constructs. """ +import os import csv import json import random @@ -117,6 +118,40 @@ async def test_run(self): "-source-filename", data_filename, ) + param_path = os.path.join(os.path.dirname(__file__), "../examples/tfhub_text_classifier/parameters.json") + # Tune model + await CLI.cli( + "tune", + "-model", + "text_classifier", + *features, + "-model-predict", + "sentiment:int:1", + "-model-location", + model_dir, + "-model-classifications", + "0", + "1", + "-model-clstype", + "int", + "-features", + "sentiment:int:1", + "-sources", + "train=csv", + "test=csv", + "-source-train-filename", + data_filename, + "-source-test-filename", + data_filename, + "-scorer", + "textclf", + "-tuner", + "parameter_grid", + "-tuner-parameters", + "@" + str(param_path) + + ) + self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0].export() diff --git a/model/vowpalWabbit/tests/test_vw.py b/model/vowpalWabbit/tests/test_vw.py index 3d9167d7d5..83af3f5999 100644 --- a/model/vowpalWabbit/tests/test_vw.py +++ b/model/vowpalWabbit/tests/test_vw.py @@ -4,13 +4,14 @@ from sklearn.datasets import make_friedman1 from dffml.record import Record -from dffml.high_level.ml import score +from dffml.high_level.ml import score, tune from dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig from dffml.feature import Feature, Features from dffml.util.asynctestcase import AsyncTestCase from dffml.accuracy import MeanSquaredErrorAccuracy from dffml_model_vowpalWabbit.vw_base import VWModel, VWConfig +from dffml.tuner.parameter_grid import ParameterGrid class TestVWModel(AsyncTestCase): @@ -73,6 +74,7 @@ def setUpClass(cls): ) ) cls.scorer = MeanSquaredErrorAccuracy() + cls.tuner = ParameterGrid(parameters={}, objective="min") @classmethod def tearDownClass(cls): @@ -96,6 +98,12 @@ async def test_02_predict(self): async for record in mctx.predict(sctx): prediction = record.prediction(target).value self.assertTrue(isinstance(prediction, float)) + + async def test_03_tune(self): + res = await tune( + self.model, self.tuner, self.scorer, Feature("X", float, 1), [self.sources], [self.sources] + ) + self.assertTrue(isinstance(res, float)) DATA_LEN = 500 diff --git a/model/vowpalWabbit/tests/test_vw_integration.py b/model/vowpalWabbit/tests/test_vw_integration.py index b8e1874d36..1b2a70db98 100644 --- a/model/vowpalWabbit/tests/test_vw_integration.py +++ b/model/vowpalWabbit/tests/test_vw_integration.py @@ -99,6 +99,37 @@ async def test_run(self): "-source-filename", data_filename, ) + + # Tune model + await CLI.cli( + "tune", + "-model", + "vwmodel", + *features, + "-model-predict", + "true_class:int:1", + "-model-vwcmd", + "binary", + "True", + "-model-use_binary_label", + "-model-location", + model_dir, + "-scorer", + "mse", + "-features", + "true_class:int:1", + "-sources", + "train=csv", + "test=csv", + "-source-train-filename", + data_filename, + "-source-test-filename", + data_filename, + "-tuner", + "parameter_grid", + "-tuner-objective", + "min" + ) self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0].export() diff --git a/model/xgboost/examples/diabetesregression.py b/model/xgboost/examples/diabetesregression.py index 56b5b0e90b..0beba6f12a 
100644 --- a/model/xgboost/examples/diabetesregression.py +++ b/model/xgboost/examples/diabetesregression.py @@ -2,11 +2,12 @@ from sklearn.model_selection import train_test_split from dffml import Feature, Features -from dffml.noasync import train, score +from dffml.noasync import train, score, tune from dffml_model_xgboost.xgbregressor import ( XGBRegressorModel, XGBRegressorModelConfig, ) +from dffml.tuner.parameter_grid import ParameterGrid from dffml.accuracy import MeanSquaredErrorAccuracy @@ -34,6 +35,16 @@ ) ) +tuner = ParameterGrid( + parameters = { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + + }, + objective = "min" +) + # Train the model train(model, *[{"data": x, "target": y} for x, y in zip(trainX, trainy)]) @@ -58,3 +69,16 @@ *[{"data": x, "target": y} for x, y in zip(trainX, trainy)], ), ) + +print("Tuning accuracy:", + tune( + model, + tuner, + scorer, + Feature("target", float, 1), + [{"data": x, "target": y} for x, y in zip(trainX, trainy)], + [{"data": x, "target": y} for x, y in zip(testX, testy)], + + ) +) + diff --git a/model/xgboost/examples/iris_classification.py b/model/xgboost/examples/iris_classification.py index 5043e1dd97..bb0725f1dc 100644 --- a/model/xgboost/examples/iris_classification.py +++ b/model/xgboost/examples/iris_classification.py @@ -2,8 +2,9 @@ from sklearn.model_selection import train_test_split from dffml import Feature, Features -from dffml.noasync import train, score +from dffml.noasync import train, score, tune from dffml.accuracy import ClassificationAccuracy +from dffml.tuner.parameter_grid import ParameterGrid from dffml_model_xgboost.xgbclassifier import ( XGBClassifierModel, XGBClassifierModelConfig, @@ -33,6 +34,16 @@ ) ) +tuner = ParameterGrid( + parameters = { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + + }, + objective = "max" +) + # Train the model train(model, *[{"data": x, "target": y} for x, y in zip(trainX, trainy)]) @@ -56,3 +67,14 @@ *[{"data": x, "target": y} for x, y in zip(trainX, trainy)], ), ) +print("Tuning accuracy:", + tune( + model, + tuner, + scorer, + Feature("target", float, 1), + [{"data": x, "target": y} for x, y in zip(trainX, trainy)], + [{"data": x, "target": y} for x, y in zip(testX, testy)], + + ) +) diff --git a/model/xgboost/tests/test_classifier_model.py b/model/xgboost/tests/test_classifier_model.py index 722c5aef70..67c5367ed4 100644 --- a/model/xgboost/tests/test_classifier_model.py +++ b/model/xgboost/tests/test_classifier_model.py @@ -9,7 +9,8 @@ from dffml.record import Record from dffml.source.source import Sources -from dffml import train, score, predict, run_consoletest +from dffml import train, score, predict, tune, run_consoletest +from dffml.tuner.parameter_grid import ParameterGrid from dffml.util.asynctestcase import AsyncTestCase from dffml.feature.feature import Feature, Features from dffml.source.memory import MemorySource, MemorySourceConfig @@ -61,6 +62,16 @@ def setUpClass(cls): MemorySource(MemorySourceConfig(records=cls.records[1800:])) ) cls.scorer = ClassificationAccuracy() + cls.tuner = ParameterGrid( + parameters = { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + + }, + objective = "max" + ) + @classmethod def tearDownClass(cls): @@ -145,6 +156,18 @@ async def test_05_example(self): "iris_classification.py", ) subprocess.check_call([sys.executable, filepath]) + + async def test_06_tune(self): + # Integration with tuning method 
+ acc = await tune( + self.model, + self.tuner, + self.scorer, + Features(Feature("Target", int, 1)), + self.trainingsource, + self.testsource + ) + self.assertTrue(0.8 <= acc) class TestXGBClassifierDocstring(AsyncTestCase): diff --git a/model/xgboost/tests/test_regressor_model.py b/model/xgboost/tests/test_regressor_model.py index 6cd4920a75..2cbba26420 100644 --- a/model/xgboost/tests/test_regressor_model.py +++ b/model/xgboost/tests/test_regressor_model.py @@ -6,8 +6,9 @@ from dffml.record import Record from dffml.source.source import Sources -from dffml import train, score, predict, run_consoletest +from dffml import train, score, predict, tune, run_consoletest from dffml.util.asynctestcase import AsyncTestCase +from dffml.tuner.parameter_grid import ParameterGrid from dffml.feature.feature import Feature, Features from dffml.accuracy import MeanSquaredErrorAccuracy from dffml.source.memory import MemorySource, MemorySourceConfig @@ -57,6 +58,16 @@ def setUpClass(cls): cls.testsource = Sources( MemorySource(MemorySourceConfig(records=cls.records[1800:])) ) + cls.scorer = MeanSquaredErrorAccuracy() + cls.tuner = ParameterGrid( + parameters = { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + + }, + objective = "min" + ) @classmethod def tearDownClass(cls): @@ -92,6 +103,18 @@ async def test_02_predict(self): # Sometimes causes an issue when only one data point anomalously has high error self.assertLess(error, acceptable) + async def test_03_tune(self): + # Integration with tuning method + acc = await tune( + self.model, + self.tuner, + self.scorer, + Features(Feature("Target", int, 1)), + self.trainingsource, + self.testsource + ) + self.assertTrue(acc <= 10) + class TestXGBClassifierDocstring(AsyncTestCase): async def test_docstring(self): diff --git a/setup.py b/setup.py index c4a9003008..fb02cb0dce 100644 --- a/setup.py +++ b/setup.py @@ -161,7 +161,7 @@ class InstallException(Exception): # Databases "dffml.db": ["sqlite = dffml.db.sqlite:SqliteDatabase"], # Models - "dffml.model": ["slr = dffml.model.slr:SLRModel"], + "dffml.model": ["slr = dffml.model.slr:SLRModel", "automl = dffml.model.automl:AutoMLModel"], # Secrets "dffml.secret": ["ini = dffml.secret.ini:INISecret"], # Accuracy @@ -172,6 +172,7 @@ class InstallException(Exception): # Tuner "dffml.tuner": [ "parameter_grid = dffml.tuner.parameter_grid:ParameterGrid", + "random_search = dffml.tuner.random_search:RandomSearch", ], }, ) diff --git a/tests/model/test_automl.py b/tests/model/test_automl.py new file mode 100644 index 0000000000..3299940b52 --- /dev/null +++ b/tests/model/test_automl.py @@ -0,0 +1,126 @@ +import os +import random +import tempfile +import contextlib +import subprocess +import shutil + + +import numpy as np + +from dffml.record import Record +from dffml.source.source import Sources +from dffml import train, score, chdir +from dffml.util.asynctestcase import AsyncTestCase +from dffml.feature.feature import Feature, Features +from dffml.source.memory import MemorySource, MemorySourceConfig +from dffml.accuracy import ClassificationAccuracy +from dffml.tuner.parameter_grid import ParameterGrid +from dffml.model.automl import AutoMLModel + +def sh_filepath(filename): + return os.path.join(os.path.dirname(__file__), filename) + +@contextlib.contextmanager +def directory_with_csv_files(): + with tempfile.TemporaryDirectory() as tempdir: + with chdir(tempdir): + subprocess.check_output(["bash", sh_filepath("../dataset_cls.sh")]) + shutil.copy( + 
sh_filepath("xgbtest.json"), os.path.join(tempdir, "xgbtest.json"), + ) + yield tempdir + +class TestAutoMLModel(AsyncTestCase): + @classmethod + def setUpClass(cls): + # Create a temporary directory to store the trained model + cls.model_dir = tempfile.TemporaryDirectory() + # Create an instance of the model + + # Generating data f(x1,x2) = (2*x1 + 3*x2)//2 + _n_data = 2000 + _temp_data = np.random.rand(2, _n_data) + cls.records = [ + Record( + "x" + str(random.random()), + data={ + "features": { + "Feature1": float(_temp_data[0][i]), + "Feature2": float(_temp_data[1][i]), + "Target": (2 * _temp_data[0][i] + 3 * _temp_data[1][i]) + // 2, + } + }, + ) + for i in range(0, _n_data) + ] + + cls.trainingsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[:1800])) + ) + cls.testsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[1800:])) + ) + + cls.scorer = ClassificationAccuracy() + cls.tuner = ParameterGrid() + cls.model = AutoMLModel( + predict="Target", + features=["Feature1", "Feature2"], + location=cls.model_dir.name, + tuner = cls.tuner, + scorer = cls.scorer, + models = ["xgbclassifier", "scikitsvc"], + objective="max", + parameters = { + "xgbclassifier": { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + }, + "scikitsvc": { + "gamma": [0.001, 0.1], + "C": [1, 10] + } + }, + split_data = True + + ) + + @classmethod + def tearDownClass(cls): + # Remove the temporary directory where the model was stored to cleanup + cls.model_dir.cleanup() + + + async def test_00_train(self): + await train(self.model, self.trainingsource) + + + async def test_01_score(self): + # Use the test data to assess the model's accuracy + res = await score( + self.model, self.scorer, Feature("Target", float, 1), self.testsource + ) + + self.assertTrue(res > 0.8) + + async def test_02_predict(self): + + res_train = await score( + self.model, + self.scorer, + Feature("Target", float, 1), + self.trainingsource, + ) + + res_test = await score( + self.model, + self.scorer, + Feature("Target", float, 1), + self.testsource, + ) + + self.assertLess(res_train - res_test, 0.05) + diff --git a/tests/tuner/dataset_cls.sh b/tests/tuner/dataset_cls.sh new file mode 100644 index 0000000000..bf506c4e11 --- /dev/null +++ b/tests/tuner/dataset_cls.sh @@ -0,0 +1,3 @@ +wget http://download.tensorflow.org/data/iris_training.csv +wget http://download.tensorflow.org/data/iris_test.csv +sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv \ No newline at end of file diff --git a/tests/tuner/dataset_reg.sh b/tests/tuner/dataset_reg.sh new file mode 100644 index 0000000000..457a6eac14 --- /dev/null +++ b/tests/tuner/dataset_reg.sh @@ -0,0 +1,17 @@ +cat > dataset.csv << EOF +f1,ans +0.1,0 +0.7,1 +0.6,1 +0.2,0 +0.8,1 +EOF + +cat > dataset2.csv << EOF +f1,ans +0.1,0 +0.7,1 +0.6,1 +0.2,0 +0.8,1 +EOF \ No newline at end of file diff --git a/tests/tuner/xgbclassifier/test_classifier.py b/tests/tuner/xgbclassifier/test_classifier.py new file mode 100644 index 0000000000..b92bc4e7e9 --- /dev/null +++ b/tests/tuner/xgbclassifier/test_classifier.py @@ -0,0 +1,120 @@ + +import os +import random +import tempfile +import contextlib +import subprocess +import shutil + + +import numpy as np + +from dffml.record import Record +from dffml.source.source import Sources +from dffml import tune, score, chdir +from dffml.util.asynctestcase import AsyncTestCase +from dffml.feature.feature 
import Feature, Features +from dffml.source.memory import MemorySource, MemorySourceConfig +from dffml.accuracy import ClassificationAccuracy +from dffml.tuner.parameter_grid import ParameterGrid + +from dffml_model_xgboost.xgbclassifier import ( + XGBClassifierModel, + XGBClassifierModelConfig, +) + + +def sh_filepath(filename): + return os.path.join(os.path.dirname(__file__), filename) + + +@contextlib.contextmanager +def directory_with_csv_files(): + with tempfile.TemporaryDirectory() as tempdir: + with chdir(tempdir): + subprocess.check_output(["bash", sh_filepath("../dataset_cls.sh")]) + shutil.copy( + sh_filepath("xgbtest.json"), os.path.join(tempdir, "xgbtest.json"), + ) + yield tempdir + +class TestParameterGrid(AsyncTestCase): + @classmethod + def setUpClass(cls): + # Create a temporary directory to store the trained model + cls.model_dir = tempfile.TemporaryDirectory() + # Create an instance of the model + cls.model = XGBClassifierModel( + XGBClassifierModelConfig( + features=Features( + Feature("Feature1", float, 1), Feature("Feature2") + ), + predict=Feature("Target", float, 1), + location=cls.model_dir.name, + ) + ) + + # Generating data f(x1,x2) = (2*x1 + 3*x2)//2 + _n_data = 2000 + _temp_data = np.random.rand(2, _n_data) + cls.records = [ + Record( + "x" + str(random.random()), + data={ + "features": { + "Feature1": float(_temp_data[0][i]), + "Feature2": float(_temp_data[1][i]), + "Target": (2 * _temp_data[0][i] + 3 * _temp_data[1][i]) + // 2, + } + }, + ) + for i in range(0, _n_data) + ] + + cls.trainSource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[:1800])) + ) + cls.testSource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[1800:])) + ) + + cls.scorer = ClassificationAccuracy() + cls.tuner = ParameterGrid( + parameters = { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + + }, + objective = "max" + ) + + @classmethod + def tearDownClass(cls): + # Remove the temporary directory where the model was stored to cleanup + cls.model_dir.cleanup() + + + async def test_00_tune(self): + # Typical tuning using Python code + acc = await tune(self.model, self.tuner, self.scorer, Features(Feature("Target", int, 1)), self.trainSource, self.testSource) + self.assertTrue(0.8 <= acc) + # Test the best model has been correctly saved + acc = await score(self.model, self.scorer, Features(Feature("Target", int, 1)), self.testSource) + self.assertTrue(0.8 <= acc) + + async def test_01_tune(self): + # Tuning using CLI + with directory_with_csv_files() as tempdir: + stdout = subprocess.check_output( + ["bash", sh_filepath("tune.sh")] + ) + self.assertEqual(round(float(stdout.decode().strip())), 0) + + + + + + + diff --git a/tests/tuner/xgbclassifier/tune.sh b/tests/tuner/xgbclassifier/tune.sh new file mode 100644 index 0000000000..d9aec45950 --- /dev/null +++ b/tests/tuner/xgbclassifier/tune.sh @@ -0,0 +1,18 @@ +dffml tune \ +-model xgbclassifier \ +-model-features \ +SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ +-model-location tempDir \ +-tuner parameter_grid \ +-tuner-parameters @xgbtest.json \ +-tuner-objective max \ + -scorer clf \ +-sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ +-source-test-tag test \ +-features classification:int:1 \ No newline at end of file diff --git a/tests/tuner/xgbclassifier/xgbtest.json b/tests/tuner/xgbclassifier/xgbtest.json 
new file mode 100644 index 0000000000..e1e18981d4 --- /dev/null +++ b/tests/tuner/xgbclassifier/xgbtest.json @@ -0,0 +1,6 @@ +{ + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + +} \ No newline at end of file diff --git a/tests/tuner/xgbregressor/test_regressor.py b/tests/tuner/xgbregressor/test_regressor.py new file mode 100644 index 0000000000..51893e6a0f --- /dev/null +++ b/tests/tuner/xgbregressor/test_regressor.py @@ -0,0 +1,113 @@ +import os +import random +import tempfile +import contextlib +import subprocess +import shutil + +import numpy as np + +from dffml.record import Record +from dffml.source.source import Sources +from dffml import tune, score, chdir +from dffml.util.asynctestcase import AsyncTestCase +from dffml.feature.feature import Feature, Features +from dffml.source.memory import MemorySource, MemorySourceConfig +from dffml.accuracy import ClassificationAccuracy +from dffml.tuner.parameter_grid import ParameterGrid + +from dffml_model_xgboost.xgbregressor import ( + XGBRegressorModel, + XGBRegressorModelConfig, +) + +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split + +def sh_filepath(filename): + return os.path.join(os.path.dirname(__file__), filename) + + +@contextlib.contextmanager +def directory_with_csv_files(): + with tempfile.TemporaryDirectory() as tempdir: + with chdir(tempdir): + subprocess.check_output(["bash", sh_filepath("../dataset_reg.sh")]) + shutil.copy( + sh_filepath("xgbtest.json"), os.path.join(tempdir, "xgbtest.json"), + ) + yield tempdir + + +class TestParameterGrid(AsyncTestCase): + @classmethod + def setUpClass(cls): + # Create a temporary directory to store the trained model + cls.model_dir = tempfile.TemporaryDirectory() + # Create an instance of the model + cls.model = XGBRegressorModel( + XGBRegressorModelConfig( + features=Features( + Feature("Feature1", float, 1), Feature("Feature2") + ), + predict=Feature("Target", float, 1), + location=cls.model_dir.name, + ) + ) + + + # Generating data f(x1,x2) = (2*x1 + 3*x2)//2 + _n_data = 2000 + _temp_data = np.random.rand(2, _n_data) + cls.records = [ + Record( + "x" + str(random.random()), + data={ + "features": { + "Feature1": float(_temp_data[0][i]), + "Feature2": float(_temp_data[1][i]), + "Target": (2 * _temp_data[0][i] + 3 * _temp_data[1][i]), + } + }, + ) + for i in range(0, _n_data) + ] + + cls.trainSource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[:1800])) + ) + cls.testSource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[1800:])) + ) + + cls.scorer = ClassificationAccuracy() + cls.tuner = ParameterGrid( + parameters = { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + + }, + objective = "min" + ) + + @classmethod + def tearDownClass(cls): + # Remove the temporary directory where the model was stored to cleanup + cls.model_dir.cleanup() + + async def test_00_tune(self): + # Train the model on the training data + acc = await tune(self.model, self.tuner, self.scorer, Features(Feature("Target", int, 1)), self.trainSource, self.testSource) + self.assertTrue(acc <= 10) + acc = await score(self.model, self.scorer, Features(Feature("Target", int, 1)), self.testSource) + self.assertTrue(acc <= 10) + + async def test_01_tune(self): + # Tuning using CLI + with directory_with_csv_files() as tempdir: + stdout = subprocess.check_output( + ["bash", sh_filepath("tune.sh")] + ) + 
self.assertEqual(round(float(stdout.decode().strip())), 0) + diff --git a/tests/tuner/xgbregressor/tune.sh b/tests/tuner/xgbregressor/tune.sh new file mode 100644 index 0000000000..18842cc166 --- /dev/null +++ b/tests/tuner/xgbregressor/tune.sh @@ -0,0 +1,16 @@ +dffml tune \ +-model xgbregressor \ +-model-features f1:float:1 \ + -model-predict ans:int:1 \ +-model-location tempDir \ +-tuner parameter_grid \ +-tuner-parameters @xgbtest.json \ +-tuner-objective min \ + -scorer mse \ + -features ans:int:1 \ +-sources train=csv test=csv \ +-source-train-tag train \ +-source-test-tag test \ + -source-train-filename dataset.csv \ + -source-test-filename dataset2.csv \ + diff --git a/tests/tuner/xgbregressor/xgbtest.json b/tests/tuner/xgbregressor/xgbtest.json new file mode 100644 index 0000000000..e1e18981d4 --- /dev/null +++ b/tests/tuner/xgbregressor/xgbtest.json @@ -0,0 +1,6 @@ +{ + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + +} \ No newline at end of file diff --git a/tuner/bayes_opt_gp/.coveragerc b/tuner/bayes_opt_gp/.coveragerc new file mode 100644 index 0000000000..4cf9aab94b --- /dev/null +++ b/tuner/bayes_opt_gp/.coveragerc @@ -0,0 +1,13 @@ +[run] +source = + dffml_tuner_bayes_opt_gp + tests +branch = True + +[report] +exclude_lines = + no cov + no qa + noqa + pragma: no cover + if __name__ == .__main__.: diff --git a/tuner/bayes_opt_gp/.gitignore b/tuner/bayes_opt_gp/.gitignore new file mode 100644 index 0000000000..070ee81c83 --- /dev/null +++ b/tuner/bayes_opt_gp/.gitignore @@ -0,0 +1,20 @@ +*.log +*.pyc +.cache/ +.coverage +.idea/ +.vscode/ +*.egg-info/ +build/ +dist/ +docs/build/ +venv/ +wheelhouse/ +*.egss +.mypy_cache/ +*.swp +.venv/ +.eggs/ +*.modeldir +*.db +htmlcov/ diff --git a/tuner/bayes_opt_gp/LICENSE b/tuner/bayes_opt_gp/LICENSE new file mode 100644 index 0000000000..456e449824 --- /dev/null +++ b/tuner/bayes_opt_gp/LICENSE @@ -0,0 +1,21 @@ +Copyright (c) 2020 Intel, Oliver O'Brien + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
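The xgbclassifier and xgbregressor tuner tests above exercise the same pattern twice: build a model, hand the high-level ``tune()`` entrypoint a ``ParameterGrid`` plus a scorer, and point it at separate train and test sources, either from Python or via the ``tune.sh`` CLI scripts. A minimal standalone sketch of that Python pattern follows; it assumes ``dffml`` and ``dffml-model-xgboost`` are installed, and the toy records, feature names, and ``tempDir`` location are illustrative stand-ins rather than the datasets used by the tests.

```python
# Minimal sketch of the tune() + ParameterGrid pattern used by the tuner tests.
# Data, feature names, and the model location are hypothetical placeholders.
from dffml import Feature, Features
from dffml.noasync import tune
from dffml.accuracy import MeanSquaredErrorAccuracy
from dffml.tuner.parameter_grid import ParameterGrid
from dffml_model_xgboost.xgbregressor import (
    XGBRegressorModel,
    XGBRegressorModelConfig,
)

model = XGBRegressorModel(
    XGBRegressorModelConfig(
        features=Features(Feature("f1", float, 1)),
        predict=Feature("ans", int, 1),
        location="tempDir",
    )
)

# Grid of candidate hyperparameter values; every combination is tried.
tuner = ParameterGrid(
    parameters={
        "learning_rate": [0.01, 0.05, 0.1],
        "n_estimators": [20, 100, 200],
        "max_depth": [3, 5, 8],
    },
    objective="min",
)
scorer = MeanSquaredErrorAccuracy()

train_records = [{"f1": 0.1, "ans": 0}, {"f1": 0.7, "ans": 1}, {"f1": 0.6, "ans": 1}]
test_records = [{"f1": 0.2, "ans": 0}, {"f1": 0.8, "ans": 1}]

# tune() trains the model for each grid point, scores it on the test records,
# keeps the best configuration, and returns that configuration's score.
print(
    tune(
        model,
        tuner,
        scorer,
        Feature("ans", int, 1),
        train_records,
        test_records,
    )
)
```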
diff --git a/tuner/bayes_opt_gp/MANIFEST.in b/tuner/bayes_opt_gp/MANIFEST.in new file mode 100644 index 0000000000..19f3196490 --- /dev/null +++ b/tuner/bayes_opt_gp/MANIFEST.in @@ -0,0 +1,3 @@ +include README.md +include LICENSE +include setup_common.py diff --git a/tuner/bayes_opt_gp/README.md b/tuner/bayes_opt_gp/README.md new file mode 100644 index 0000000000..fbb5511412 --- /dev/null +++ b/tuner/bayes_opt_gp/README.md @@ -0,0 +1,15 @@ +# DFFML Bayesian Optimization GP Tuner + +## About + +dffml_tuner_bayes_opt_gp is a hyperparameter tuner based on Bayesian optimization with Gaussian processes. +It is built on the [Bayesian Optimization](https://github.com/fmfn/BayesianOptimization) library. + +## Documentation + +Documentation is hosted at https://intel.github.io/dffml/plugins/dffml_tuner.html#dffml-tuner-bayes-opt-gp + +## License + +The dffml_tuner_bayes_opt_gp tuner is distributed under the terms of the +[MIT License](LICENSE). \ No newline at end of file diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/__init__.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py new file mode 100644 index 0000000000..d906574e69 --- /dev/null +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py @@ -0,0 +1,169 @@ +from typing import Union, Dict, Any, List +import itertools +import logging +import functools + +from dffml.base import ( + config, + field, +) +from dffml.noasync import train, score +from dffml.high_level.ml import train as async_train +from dffml.tuner import Tuner, TunerContext +from dffml.util.entrypoint import entrypoint +from dffml.record import Record +from dffml.source.source import BaseSource +from dffml.accuracy import AccuracyScorer, AccuracyContext +from dffml.model import ModelContext +from dffml.feature import Feature +import nest_asyncio +from bayes_opt import BayesianOptimization + + +class InvalidParametersException(Exception): + pass + + +@config +class BayesOptGPConfig: + parameters: dict = field( + "Parameters to be optimized", default_factory=lambda: dict() + ) + objective: str = field( + "How to optimize the given scorer. Values are min/max", default="max" + ) + init_points: int = field( + "How many steps of random exploration you want to perform.", default=5 + ) + n_iter: int = field( + "How many steps of bayesian optimization you want to perform.", + default=10, + ) + + +class BayesOptGPContext(TunerContext): + """ + Bayesian Optimization GP Tuner + """ + + def check_parameters(self, pars): + for (pax, vals) in pars.items(): + if len(vals) != 2: + raise InvalidParametersException( + f"2 values are not provided for parameter {pax}" + ) + for val in vals: + if not type(val) is float and not type(val) is int: + raise InvalidParametersException( + f"Parameter {pax} is not of type int or float."
+ ) + return True + + def obj_func(self, model, train_data, accuracy_scorer, feature, test_data, **vals): + + with model.parent.config.no_enforce_immutable(): + for param in vals.keys(): + + if ( + hasattr(model.parent.config, param) + and model.parent.config.__annotations__[param].__name__ + == "int" + ): + setattr(model.parent.config, param, int(vals[param])) + else: + setattr(model.parent.config, param, vals[param]) + + train(model.parent, *train_data) + acc = score(model.parent, accuracy_scorer, feature, *test_data) + + if self.parent.config.objective == "min": + return -acc + elif self.parent.config.objective == "max": + return acc + + async def optimize( + self, + model: ModelContext, + feature: Feature, + accuracy_scorer: Union[AccuracyScorer, AccuracyContext], + train_data: Union[BaseSource, Record, Dict[str, Any]], + test_data: Union[BaseSource, Record, Dict[str, Any]], + ): + """ + Method to optimize hyperparameters by Bayesian optimization using Gaussian Processes + as the surrogate model. + Uses the parameter bounds given in the config to define the search space, evaluates + candidate hyperparameter settings by training and scoring the model, and finally + sets the model to the best parameters found and returns the corresponding score. + + Note that for this tuner, each hyperparameter field to be tuned must have exactly 2 values + specified, representing the minimum and maximum values in the search space for that + hyperparameter. Additionally, they must be either float/integer values. Otherwise, + an error is raised. + + Parameters + ---------- + model : ModelContext + The Model which needs to be used. + + feature : Feature + The Target feature in the data. + + accuracy_scorer: AccuracyContext + The accuracy scorer that needs to be used. + + train_data: SourcesContext + The train_data to train models on with the hyperparameters provided. + + test_data : SourcesContext + The test_data to score against and optimize hyperparameters.
+ + Returns + ------- + float + The highest score value + """ + + nest_asyncio.apply() + + self.check_parameters(self.parent.config.parameters) + + logging.info( + f"Optimizing model with Bayesian optimization with gaussian processes: {self.parent.config.parameters}" + ) + + optimizer = BayesianOptimization( + f=functools.partial(self.obj_func, model, train_data, accuracy_scorer, feature, test_data), + pbounds=self.parent.config.parameters, + random_state=1, + ) + + optimizer.maximize( + init_points=self.parent.config.init_points, + n_iter=self.parent.config.n_iter, + ) + with model.parent.config.no_enforce_immutable(): + for (param, val) in optimizer.max["params"].items(): + + if ( + hasattr(model.parent.config, param) + and model.parent.config.__annotations__[param].__name__ + == "int" + ): + setattr(model.parent.config, param, int(val)) + else: + setattr(model.parent.config, param, val) + + await async_train(model.parent, *train_data) + + if self.parent.config.objective == "min": + return -optimizer.max["target"] + elif self.parent.config.objective == "max": + return optimizer.max["target"] + + +@entrypoint("bayes_opt_gp") +class BayesOptGP(Tuner): + + CONFIG = BayesOptGPConfig + CONTEXT = BayesOptGPContext diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/__init__.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py new file mode 100644 index 0000000000..8c1177b5a4 --- /dev/null +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py @@ -0,0 +1,103 @@ +import os +import sys +import random +import tempfile +import subprocess + +import numpy as np +from sklearn.metrics import f1_score + +from dffml.record import Record +from dffml.source.source import Sources +from dffml import train, score, predict, tune, run_consoletest +from dffml.util.asynctestcase import AsyncTestCase +from dffml.feature.feature import Feature, Features +from dffml.source.memory import MemorySource, MemorySourceConfig +from dffml.accuracy import ClassificationAccuracy + +from dffml_model_xgboost.xgbclassifier import ( + XGBClassifierModel, + XGBClassifierModelConfig, +) + +from dffml_tuner_bayes_opt_gp.bayes_opt_gp import BayesOptGP + + + +class TestXGBClassifier(AsyncTestCase): + @classmethod + def setUpClass(cls): + # Create a temporary directory to store the trained model + cls.model_dir = tempfile.TemporaryDirectory() + # Create an instance of the model + cls.features = Features( + Feature("Feature1", float, 1), Feature("Feature2") + ) + cls.model = XGBClassifierModel( + XGBClassifierModelConfig( + features=Features( + Feature("Feature1", float, 1), Feature("Feature2") + ), + predict=Feature("Target", float, 1), + location=cls.model_dir.name, + ) + ) + cls.tuner = BayesOptGP( + parameters= + { + "learning_rate": [0.01, 0.1], + "n_estimators": [20, 200], + "max_depth": [3, 8] + }, + objective="max", + init_points=5, + n_iter=10 + ) + # Generating data f(x1,x2) = (2*x1 + 3*x2)//2 + _n_data = 2000 + _temp_data = np.random.rand(2, _n_data) + cls.records = [ + Record( + "x" + str(random.random()), + data={ + "features": { + "Feature1": float(_temp_data[0][i]), + "Feature2": float(_temp_data[1][i]), + "Target": (2 * _temp_data[0][i] + 3 * _temp_data[1][i]) + // 2, + } + }, + ) + for
i in range(0, _n_data) + ] + + cls.trainingsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[:1800])) + ) + cls.testsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[1800:])) + ) + cls.scorer = ClassificationAccuracy() + + @classmethod + def tearDownClass(cls): + # Remove the temporary directory where the model was stored to cleanup + cls.model_dir.cleanup() + + async def test_00_train(self): + # Train the model on the training data + await tune( + self.model, + self.tuner, + self.scorer, + self.features, + [self.trainingsource], + [self.testsource], + ) + + + + +class TestXGBClassifierDocstring(AsyncTestCase): + async def test_docstring(self): + await run_consoletest(XGBClassifierModel) diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py new file mode 100644 index 0000000000..1cf6267ae5 --- /dev/null +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py @@ -0,0 +1 @@ +VERSION = "0.1.0" diff --git a/tuner/bayes_opt_gp/pyproject.toml b/tuner/bayes_opt_gp/pyproject.toml new file mode 100644 index 0000000000..8b9d32fa10 --- /dev/null +++ b/tuner/bayes_opt_gp/pyproject.toml @@ -0,0 +1,20 @@ +[tool.black] +line-length = 79 +target-version = ['py37'] + +exclude = ''' +( + /( + \.eggs # exclude a few common directories in the + | \.git # root of the project + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + ) +) +''' diff --git a/tuner/bayes_opt_gp/setup.cfg b/tuner/bayes_opt_gp/setup.cfg new file mode 100644 index 0000000000..00a065a39a --- /dev/null +++ b/tuner/bayes_opt_gp/setup.cfg @@ -0,0 +1,10 @@ +[options] +zip_safe = False +include_package_data = True +packages = find: +install_requires = + dffml>=0.4.0 + bayesian-optimization>=1.2.0 + pandas>=0.25.0 + scikit-learn>=0.22.0 + joblib>=0.16.0 \ No newline at end of file diff --git a/tuner/bayes_opt_gp/setup.py b/tuner/bayes_opt_gp/setup.py new file mode 100644 index 0000000000..d38d37ea92 --- /dev/null +++ b/tuner/bayes_opt_gp/setup.py @@ -0,0 +1,19 @@ +import os +import sys +import site +import importlib.util +from setuptools import setup + +# See https://github.com/pypa/pip/issues/7953 +site.ENABLE_USER_SITE = "--user" in sys.argv[1:] + +# Boilerplate to load commonalities +spec = importlib.util.spec_from_file_location( + "setup_common", os.path.join(os.path.dirname(__file__), "setup_common.py") +) +common = importlib.util.module_from_spec(spec) +spec.loader.exec_module(common) + +common.KWARGS["entry_points"] = {"dffml.tuner": [f"bayes_opt_gp = {common.IMPORT_NAME}.bayes_opt_gp:BayesOptGP"]} + +setup(**common.KWARGS) diff --git a/tuner/bayes_opt_gp/setup_common.py b/tuner/bayes_opt_gp/setup_common.py new file mode 100644 index 0000000000..7dfb09b35c --- /dev/null +++ b/tuner/bayes_opt_gp/setup_common.py @@ -0,0 +1,55 @@ +import os +import sys +import ast +from pathlib import Path + +ORG = "dffml" +NAME = "dffml-tuner-bayes-opt-gp" +DESCRIPTION = "DFFML model dffml-tuner-bayes-opt-gp" +AUTHOR_NAME = "Edison Siow" +AUTHOR_EMAIL = "edisonsiowxiong@gmail.com" + +IMPORT_NAME = ( + NAME + if "replace_package_name".upper() != NAME + else "replace_import_package_name".upper() +).replace("-", "_") + +SELF_PATH = Path(sys.argv[0]).parent.resolve() +if not (SELF_PATH / Path(IMPORT_NAME, "version.py")).is_file(): + SELF_PATH = os.path.dirname(os.path.realpath(__file__)) + +VERSION = ast.literal_eval( + Path(SELF_PATH, IMPORT_NAME, "version.py") + .read_text() + .split("=")[-1] + .strip() 
+) + +README = Path(SELF_PATH, "README.md").read_text() + +KWARGS = dict( + name=NAME, + version=VERSION, + description=DESCRIPTION, + long_description=README, + long_description_content_type="text/markdown", + author=AUTHOR_NAME, + author_email=AUTHOR_EMAIL, + maintainer=AUTHOR_NAME, + maintainer_email=AUTHOR_EMAIL, + url=f"https://github.com/{ORG}/{NAME}", + license="MIT", + keywords=["dffml"], + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + ], +)
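The new BayesOptGP tuner is registered under the ``bayes_opt_gp`` entrypoint and is driven through the same high-level ``tune()`` call as ParameterGrid, with the difference that every entry in ``parameters`` must be a two-element numeric list giving the lower and upper bound of that hyperparameter's search space. The sketch below mirrors the shape of ``dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py``; the model, records, and ``bayes_model`` location are illustrative assumptions, not part of the patch.

```python
# Hedged sketch of driving BayesOptGP from Python. Assumes dffml,
# dffml-model-xgboost, and dffml-tuner-bayes-opt-gp are installed;
# the data and model location below are hypothetical.
import asyncio

from dffml import Feature, Features, tune
from dffml.accuracy import ClassificationAccuracy
from dffml_model_xgboost.xgbclassifier import (
    XGBClassifierModel,
    XGBClassifierModelConfig,
)
from dffml_tuner_bayes_opt_gp.bayes_opt_gp import BayesOptGP

model = XGBClassifierModel(
    XGBClassifierModelConfig(
        features=Features(Feature("Feature1", float, 1), Feature("Feature2")),
        predict=Feature("Target", float, 1),
        location="bayes_model",
    )
)

tuner = BayesOptGP(
    # Bounds only: BayesianOptimization samples values in between, and
    # integer-typed config fields are cast back to int before training.
    parameters={
        "learning_rate": [0.01, 0.1],
        "n_estimators": [20, 200],
        "max_depth": [3, 8],
    },
    objective="max",
    init_points=5,
    n_iter=10,
)

train_records = [
    {"Feature1": 0.1, "Feature2": 0.2, "Target": 0},
    {"Feature1": 0.6, "Feature2": 0.7, "Target": 1},
]
test_records = [
    {"Feature1": 0.2, "Feature2": 0.1, "Target": 0},
    {"Feature1": 0.8, "Feature2": 0.9, "Target": 1},
]


async def main():
    # tune() explores the bounded search space, keeps the best model found,
    # and returns its score on the validation records.
    best = await tune(
        model,
        tuner,
        ClassificationAccuracy(),
        Features(Feature("Target", float, 1)),
        train_records,
        test_records,
    )
    print("Best score:", best)


asyncio.run(main())
```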