diff --git a/CHANGES.rst b/CHANGES.rst index 20a98c85..92588f7a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,18 @@ Changelog ========= +0.10.1 (2019-08-29) +------------------- + +* Don't include typing dependency on Python 3.5+ + to fix installation on Python 3.7 + +0.10.0 (2019-08-21) +------------------- + +* Keras image classifiers: explaining predictions with Grad-CAM + (GSoC-2019 project by @teabolt). + 0.9.0 (2019-07-05) ------------------ diff --git a/eli5/__init__.py b/eli5/__init__.py index 416681eb..1b125976 100644 --- a/eli5/__init__.py +++ b/eli5/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -__version__ = '0.9.0' +__version__ = '0.10.1' from .formatters import ( format_as_html, @@ -96,6 +96,7 @@ except ImportError: # keras is not available pass +<<<<<<< HEAD try: @@ -104,4 +105,6 @@ ) except ImportError: # pytorch is not available - pass \ No newline at end of file + pass +======= +>>>>>>> ec0f51c60aaf360327ca18e3e0cdae2222cec6bf diff --git a/eli5/_feature_names.py b/eli5/_feature_names.py index fecb820f..ff1fd80c 100644 --- a/eli5/_feature_names.py +++ b/eli5/_feature_names.py @@ -5,8 +5,8 @@ Union, Callable, Pattern ) -import numpy as np # type: ignore -import scipy.sparse as sp # type: ignore +import numpy as np +import scipy.sparse as sp class FeatureNames(Sized, Iterable): diff --git a/eli5/_feature_weights.py b/eli5/_feature_weights.py index 0b737795..1c096e3a 100644 --- a/eli5/_feature_weights.py +++ b/eli5/_feature_weights.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -import numpy as np # type: ignore +import numpy as np from eli5.base import FeatureWeights, FeatureWeight from .utils import argsort_k_largest_positive, argsort_k_smallest, mask diff --git a/eli5/_graphviz.py b/eli5/_graphviz.py index 4632a925..36b26a17 100644 --- a/eli5/_graphviz.py +++ b/eli5/_graphviz.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -import graphviz # type: ignore +import 
graphviz def is_supported(): diff --git a/eli5/base.py b/eli5/base.py index 1f120fef..ef50fb0a 100644 --- a/eli5/base.py +++ b/eli5/base.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from typing import Any, List, Tuple, Union, Optional -import numpy as np # type: ignore +import numpy as np from .base_utils import attrs from .formatters.features import FormattedFeatureName diff --git a/eli5/base_utils.py b/eli5/base_utils.py index 779c6d64..f1081c3b 100644 --- a/eli5/base_utils.py +++ b/eli5/base_utils.py @@ -1,9 +1,9 @@ import inspect -import attr # type: ignore +import attr try: - from functools import singledispatch # type: ignore + from functools import singledispatch except ImportError: from singledispatch import singledispatch # type: ignore diff --git a/eli5/catboost.py b/eli5/catboost.py index 2e495cb6..56abfa9e 100644 --- a/eli5/catboost.py +++ b/eli5/catboost.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division -import numpy as np # type: ignore -import catboost # type: ignore +import numpy as np +import catboost from eli5.explain import explain_weights from eli5._feature_importances import get_feature_importance_explanation diff --git a/eli5/formatters/as_dataframe.py b/eli5/formatters/as_dataframe.py index 6f2fc302..5b801e75 100644 --- a/eli5/formatters/as_dataframe.py +++ b/eli5/formatters/as_dataframe.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List, Optional import warnings -import pandas as pd # type: ignore +import pandas as pd import eli5 from eli5.base import ( diff --git a/eli5/formatters/as_dict.py b/eli5/formatters/as_dict.py index 1878d8b0..fbad5ee5 100644 --- a/eli5/formatters/as_dict.py +++ b/eli5/formatters/as_dict.py @@ -1,7 +1,7 @@ import six -import attr # type: ignore -import numpy as np # type: ignore +import attr +import numpy as np from .features import FormattedFeatureName diff --git a/eli5/formatters/html.py b/eli5/formatters/html.py index 54cf2e6c..0167b33e 100644 --- a/eli5/formatters/html.py +++ 
b/eli5/formatters/html.py @@ -3,8 +3,8 @@ from itertools import groupby from typing import List, Optional, Tuple -import numpy as np # type: ignore -from jinja2 import Environment, PackageLoader # type: ignore +import numpy as np +from jinja2 import Environment, PackageLoader from eli5 import _graphviz from eli5.base import (Explanation, TargetExplanation, FeatureWeights, diff --git a/eli5/formatters/image.py b/eli5/formatters/image.py index e2f15684..5175961f 100644 --- a/eli5/formatters/image.py +++ b/eli5/formatters/image.py @@ -2,11 +2,14 @@ from __future__ import absolute_import from typing import Union, Optional, Callable -import numpy as np # type: ignore -from PIL import Image # type: ignore -import matplotlib.cm # type: ignore +import numpy as np +from PIL import Image +import matplotlib.cm from eli5.base import Explanation +from eli5.nn.gradcam import ( + _validate_heatmap, +) def format_as_image(expl, # type: Explanation @@ -287,14 +290,6 @@ def _validate_image(image): 'Got: {}'.format(image)) -def _validate_heatmap(heatmap): - # type: (np.ndarray) -> None - """Check that ``heatmap`` has the right type.""" - if not isinstance(heatmap, np.ndarray): - raise TypeError('heatmap must be a numpy.ndarray instance. 
' - 'Got: {}'.format(heatmap)) - - def _needs_normalization(heatmap): # type: (np.ndarray) -> bool """Return whether ``heatmap`` values are in the interval [0, 1].""" @@ -311,4 +306,4 @@ def _normalize_heatmap(h, epsilon=1e-07): # https://datascience.stackexchange.com/questions/5885/how-to-scale-an-array-of-signed-integers-to-range-from-0-to-1 # add eps to avoid division by zero in case heatmap is all 0's # this also means that lmap max will be slightly less than the 'true' max - return (h - h.min()) / (h.max() - h.min() + epsilon) + return (h - h.min()) / (h.max() - h.min() + epsilon) \ No newline at end of file diff --git a/eli5/formatters/text.py b/eli5/formatters/text.py index a6269ed3..e6abb286 100644 --- a/eli5/formatters/text.py +++ b/eli5/formatters/text.py @@ -2,6 +2,7 @@ from __future__ import absolute_import from itertools import chain import six +from tabulate import tabulate from typing import List, Optional, Iterator from eli5.base import Explanation, FeatureImportances @@ -9,7 +10,8 @@ from .features import FormattedFeatureName from .utils import ( format_signed, format_value, format_weight, has_any_values_for_weights, - replace_spaces, should_highlight_spaces, tabulate) + replace_spaces, should_highlight_spaces) +from .utils import tabulate as eli5_tabulate from .trees import tree2text @@ -153,7 +155,6 @@ def _decision_tree_lines(explanation): def _transition_features_lines(explanation): # type: (Explanation) -> List[str] - from tabulate import tabulate # type: ignore tf = explanation.transition_features assert tf is not None return [ @@ -203,7 +204,7 @@ def _targets_lines(explanation, # type: Explanation w = target.feature_weights assert w is not None - table = tabulate( + table = eli5_tabulate( [table_line(fw) for fw in chain(w.pos, reversed(w.neg))], header=table_header, col_align=col_align, diff --git a/eli5/formatters/text_helpers.py b/eli5/formatters/text_helpers.py index dc5ff28a..c63c66b2 100644 --- a/eli5/formatters/text_helpers.py +++ 
b/eli5/formatters/text_helpers.py @@ -1,7 +1,7 @@ from collections import Counter from typing import List, Optional -import numpy as np # type: ignore +import numpy as np from eli5.base import TargetExplanation, WeightedSpans, DocWeightedSpans from eli5.base_utils import attrs diff --git a/eli5/formatters/trees.py b/eli5/formatters/trees.py index 7be1e9c3..5cfbee10 100644 --- a/eli5/formatters/trees.py +++ b/eli5/formatters/trees.py @@ -24,6 +24,8 @@ def p(*args): else: assert node.left is not None assert node.right is not None + assert node.threshold is not None + feat_name = node.feature_name if depth > 0: diff --git a/eli5/formatters/utils.py b/eli5/formatters/utils.py index 542402d1..2e6d2d39 100644 --- a/eli5/formatters/utils.py +++ b/eli5/formatters/utils.py @@ -2,10 +2,9 @@ from itertools import chain import re import six -from numbers import Real from typing import Any, Union, List, Dict, Callable, Match, Optional -import numpy as np # type: ignore +import numpy as np from eli5.base import Explanation from .features import FormattedFeatureName @@ -143,12 +142,12 @@ def tabulate(data, # type: List[List[Any]] def format_weight(value): - # type: (Real) -> str + # type: (float) -> str return '{:+.3f}'.format(value) def format_value(value): - # type: (Optional[Real]) -> str + # type: (Optional[float]) -> str if value is None: return '' elif np.isnan(value): diff --git a/eli5/ipython.py b/eli5/ipython.py index 0039c39e..033a840c 100644 --- a/eli5/ipython.py +++ b/eli5/ipython.py @@ -3,7 +3,7 @@ from typing import Any, Dict, Tuple import warnings -from IPython.display import HTML, Image # type: ignore +from IPython.display import HTML, Image from .explain import explain_weights, explain_prediction from .formatters import format_as_html, fields @@ -11,7 +11,7 @@ from .formatters.image import format_as_image except ImportError as e: # missing dependencies - format_as_image = e # type: ignore + format_as_image = e # type: ignore FORMAT_KWARGS = {'include_styles', 
'force_weights', diff --git a/eli5/keras/explain_prediction.py b/eli5/keras/explain_prediction.py index 6d465f34..1613af72 100644 --- a/eli5/keras/explain_prediction.py +++ b/eli5/keras/explain_prediction.py @@ -2,14 +2,14 @@ from __future__ import absolute_import from typing import Union, Optional, Callable, List, Tuple, Generator, TYPE_CHECKING if TYPE_CHECKING: - import PIL # type: ignore - -import numpy as np # type: ignore -import keras # type: ignore -import keras.backend as K # type: ignore -from keras.models import Model # type: ignore -from keras.layers import Layer # type: ignore -from keras.layers import ( # type: ignore + import PIL + +import numpy as np +import keras +import keras.backend as K +from keras.models import Model +from keras.layers import Layer +from keras.layers import ( Conv2D, MaxPooling2D, AveragePooling2D, @@ -26,7 +26,7 @@ GRU, Bidirectional, ) -from keras.preprocessing.image import array_to_img # type: ignore +from keras.preprocessing.image import array_to_img from eli5.base import ( Explanation, @@ -36,12 +36,9 @@ from eli5.nn.gradcam import ( gradcam_heatmap, DESCRIPTION_GRADCAM, - _validate_targets, - _validate_classification_target, ) from eli5.nn.text import ( - gradcam_text_spans, - _is_character_tokenization, + gradcam_spans, ) from .gradcam import ( gradcam_backend_keras, @@ -248,7 +245,7 @@ def explain_prediction_keras_image(model, * ``target`` ID of target class. * ``score`` value for predicted class. 
""" - _validate_params(model, doc, targets=targets) + _validate_params(model, doc) if image is None: image = _extract_image(doc) @@ -354,7 +351,7 @@ def explain_prediction_keras_text(model, """ assert tokens is not None - _validate_params(model, doc, targets=targets, tokens=tokens) + _validate_params(model, doc) tokens = _unbatch_tokens(tokens) if layer is not None: @@ -373,13 +370,13 @@ def explain_prediction_keras_text(model, predicted_idx, = predicted_idx predicted_val, = predicted_val heatmap, = heatmap - text_vals = gradcam_text_spans(heatmap, - tokens, - doc, - pad_value=pad_value, - pad_token=pad_token, - interpolation_kind=interpolation_kind, - ) + text_vals = gradcam_spans(heatmap, + tokens, + doc, + pad_value=pad_value, + pad_token=pad_token, + interpolation_kind=interpolation_kind, + ) # TODO: padding could be relevant for images too? tokens, heatmap, weighted_spans = text_vals return Explanation( @@ -553,18 +550,11 @@ def _backward_layers(model): def _validate_params(model, # type: Model doc, # type: np.ndarray - targets=None, # type: Optional[list] - tokens=None, # type: Optional[Union[np.ndarray, list]] ): # type: (...) -> None - """Helper for validating all explanation function parameters.""" + """Helper for validating explanation function parameters.""" _validate_model(model) _validate_doc(doc) - if targets is not None: - _validate_targets(targets) - _validate_classification_target(targets[0], model.output_shape) - if tokens is not None: - _validate_tokens(doc, tokens) def _validate_model(model): @@ -589,59 +579,4 @@ def _validate_doc(doc): raise ValueError('"doc" batch size must be 1. 
' 'Got doc with batch size: %d' % batch_size) - # Note that validation of the input shape, etc is done by Keras - - -# FIXME: break this function up -def _validate_tokens(doc, tokens): - # type: (np.ndarray, Union[np.ndarray, list]) -> None - """Check that ``tokens`` contains correct items and matches ``doc``.""" - batch_size, doc_len = doc.shape - if not isinstance(tokens, (list, np.ndarray)): - # wrong type - raise TypeError('"tokens" must be list or numpy.ndarray. ' - 'Got "{}".'.format(tokens)) - - if len(tokens) == 0: - # empty list - raise ValueError('"tokens" is empty: {}'.format(tokens)) - - an_entry = tokens[0] - if isinstance(an_entry, str): - # no batch - if batch_size != 1: - # doc is batched but tokens is not - raise ValueError('If passing "tokens" without batch dimension, ' - '"doc" must have batch size = 1.' - 'Got "doc" with batch size = %d.' % batch_size) - tokens_len = len(tokens) - elif isinstance(an_entry, (list, np.ndarray)): - # batched - tokens_batch_size = len(tokens) - if tokens_batch_size != batch_size: - # batch lengths do not match - raise ValueError('"tokens" must have same number of samples ' - 'as in doc batch. Got: "tokens" samples: %d, ' - 'doc samples: %d' % (tokens_batch_size, batch_size)) - - a_token = an_entry[0] - if not isinstance(a_token, str): - # actual contents are not strings - raise TypeError('Second axis in "tokens" must contain strings. ' - 'Found "{}" (type "{}")'.format(a_token, type(a_token))) - - # https://stackoverflow.com/a/35791116/11555448 - it = iter(tokens) - the_len = len(next(it)) - if not all(len(l) == the_len for l in it): - raise ValueError('"tokens" samples do not have the same length.') - tokens_len = the_len - else: - raise TypeError('"tokens" must be an array of strings, ' - 'or an array of string arrays. ' - 'Got "{}".'.format(tokens)) - - if tokens_len != doc_len: - raise ValueError('"tokens" and "doc" lengths must match. ' - '"tokens" length: "%d". 
"doc" length: "%d"' - % (tokens_len, doc_len)) + # Note that validation of the input shape, etc is done by Keras \ No newline at end of file diff --git a/eli5/keras/gradcam.py b/eli5/keras/gradcam.py index c5f06fd2..5fd09526 100644 --- a/eli5/keras/gradcam.py +++ b/eli5/keras/gradcam.py @@ -7,24 +7,29 @@ """ from __future__ import absolute_import -from typing import Optional, Tuple, List +from typing import Any, Optional, Tuple, List -import numpy as np # type: ignore -import keras # type: ignore -import keras.backend as K # type: ignore -from keras.models import Model # type: ignore -from keras.layers import Layer # type: ignore +import numpy as np +import keras +import keras.backend as K +from keras.models import Model +from keras.layers import Layer + +from eli5.nn.gradcam import ( + _validate_targets, + _validate_classification_target, +) def gradcam_backend_keras(model, # type: Model - doc, # type: np.ndarray - targets, # type: Optional[List[int]] - activation_layer, # type: Layer - ): + doc, # type: np.ndarray + targets, # type: Optional[List[int]] + activation_layer, # type: Layer + ): # type: (...) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray] """ Compute the terms and by-products required by the Grad-CAM formula. - + Parameters ---------- model : keras.models.Model @@ -51,24 +56,13 @@ def gradcam_backend_keras(model, # type: Model (numpy.ndarray, np.ndarray, np.ndarray, np.ndarray) Values of variables. """ - # score for class in targets - # TODO: maybe do the sum / loss calculation in this function and pass it to gradcam. 
- # This would be consistent with what is done in - # https://github.com/ramprs/grad-cam/blob/master/misc/utils.lua - # and https://github.com/ramprs/grad-cam/blob/master/classification.lua - # TODO: as in pytorch PR, separate out classification tensor code - if targets is not None: - target, = targets - predicted_idx = K.constant([target], dtype='int64') - else: - predicted_idx = _autoget_target_prediction(model) - # access value by index - predicted_val = K.gather(model.output[0, :], predicted_idx) + # class score + predicted_idx, predicted_val = _classification_target(model, targets) # output of target activation layer, i.e. activation maps of a conv layer activation_output = activation_layer.output - # score for class w.r.p.t. activation layer + # score w.r.p.t. activation layer grads = _calc_gradient(predicted_val, [activation_output]) # TODO: gradcam on input layer @@ -85,12 +79,12 @@ def gradcam_backend_keras(model, # type: Model def _calc_gradient(ys, xs): - # (K.variable, list) -> K.variable - # FIXME: K.variable is not the right type to use? + # (Any, list) -> Any + # TODO: In the future we can replace the annotation Any with a tensor type in Keras backend """ Return the gradient of scalar ``ys`` with respect to each of list ``xs``, (must be singleton) - and apply grad normalization. + and apply gradient normalization. 
""" # differentiate ys (scalar) with respect to each variable in xs # K.gradients tends to produce bigger values than tf.gradients @@ -116,9 +110,43 @@ def _calc_gradient(ys, xs): return grads -def _autoget_target_prediction(model): - # type: (Model) -> K.variable - """Automatically get the index with - the highest predicted output from ``model``""" - output = model.output - return K.argmax(output, axis=-1) \ No newline at end of file +def _classification_target(model, targets): + # type: (Model, Optional[List[int]]) -> Tuple[Any, Any] + """Get a predicted index and its value from a classification based model.""" + # TODO: maybe pass the loss/score to the gradcam function. + # This would be consistent with what is done in + # https://github.com/ramprs/grad-cam/blob/master/misc/utils.lua + # and https://github.com/ramprs/grad-cam/blob/master/classification.lua + if targets is not None: + _validate_targets(targets) + target, = targets + _validate_classification_target(target, model.output_shape) + # make a dummy index + predicted_idx = K.constant([target], dtype='int64') + else: + # take the index with the highest value + # from the array of predictions + predicted_idx = K.argmax(model.output, axis=-1) + +<<<<<<< HEAD + # access value by index + predicted_val = K.gather(model.output[0, :], predicted_idx) + return predicted_idx, predicted_val +======= +def _validate_target(target, output_shape): + # type: (int, tuple) -> None + """ + Check whether ``target``, + an integer index into the model's output + is valid for the given ``output_shape``. + """ + if isinstance(target, int): + output_nodes = output_shape[1:][0] + if not (0 <= target < output_nodes): + raise ValueError('Prediction target index is ' + 'outside the required range [0, {}). ' + 'Got {}'.format(output_nodes, target)) + else: + raise TypeError('Prediction target must be int. 
' + 'Got: {}'.format(target)) +>>>>>>> 017c738f8dcf3e31346de49a390835ffafad3f1b diff --git a/eli5/lightgbm.py b/eli5/lightgbm.py index b4510912..c54236b7 100644 --- a/eli5/lightgbm.py +++ b/eli5/lightgbm.py @@ -3,8 +3,8 @@ from collections import defaultdict from typing import DefaultDict, Optional -import numpy as np # type: ignore -import lightgbm # type: ignore +import numpy as np +import lightgbm from eli5.explain import explain_weights, explain_prediction from eli5._feature_importances import get_feature_importance_explanation diff --git a/eli5/lightning.py b/eli5/lightning.py index 417d4f15..2f648064 100644 --- a/eli5/lightning.py +++ b/eli5/lightning.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -from lightning.impl.base import BaseEstimator # type: ignore -from lightning import classification, regression # type: ignore -from sklearn.multiclass import OneVsRestClassifier # type: ignore +from lightning.impl.base import BaseEstimator +from lightning import classification, regression +from sklearn.multiclass import OneVsRestClassifier from eli5.base import Explanation from eli5.base_utils import singledispatch diff --git a/eli5/lime/_vectorizer.py b/eli5/lime/_vectorizer.py index d5168454..5356d6cd 100644 --- a/eli5/lime/_vectorizer.py +++ b/eli5/lime/_vectorizer.py @@ -2,8 +2,8 @@ from __future__ import absolute_import from typing import Tuple, Callable, Dict, Optional, List -import numpy as np # type: ignore -from sklearn.base import BaseEstimator, TransformerMixin # type: ignore +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin from eli5.base import DocWeightedSpans, FeatureWeights from eli5.sklearn.text import _get_feature_weights_dict @@ -13,8 +13,8 @@ class SingleDocumentVectorizer(BaseEstimator, TransformerMixin): """ Fake vectorizer which converts document just to a vector of ones """ - def __init__(self, token_pattern=None): - # type: (Optional[str]) -> None + def __init__(self, 
token_pattern): + # type: (str) -> None self.token_pattern = token_pattern def fit(self, X, y=None): diff --git a/eli5/lime/lime.py b/eli5/lime/lime.py index 924675bb..2968da04 100644 --- a/eli5/lime/lime.py +++ b/eli5/lime/lime.py @@ -6,12 +6,12 @@ from __future__ import absolute_import from typing import Any, Callable, Dict, Optional -import numpy as np # type: ignore -from sklearn.feature_extraction.text import CountVectorizer # type: ignore -from sklearn.linear_model import SGDClassifier # type: ignore -from sklearn.model_selection import train_test_split # type: ignore -from sklearn.utils import check_random_state # type: ignore -from sklearn.base import clone, BaseEstimator # type: ignore +import numpy as np +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.linear_model import SGDClassifier +from sklearn.model_selection import train_test_split +from sklearn.utils import check_random_state +from sklearn.base import clone, BaseEstimator import eli5 from eli5.sklearn.utils import sklearn_version @@ -148,7 +148,7 @@ def __init__(self, rbf_sigma=None, # type: float random_state=None, expand_factor=10, # type: Optional[int] - token_pattern=None, # type: str + token_pattern=None, # type: Optional[str] ): # type: (...) 
-> None self.n_samples = n_samples @@ -162,7 +162,7 @@ def __init__(self, if char_based is None: if token_pattern is None: self.char_based = False # type: Optional[bool] - self.token_pattern = DEFAULT_TOKEN_PATTERN + self.token_pattern = DEFAULT_TOKEN_PATTERN # type: str else: self.char_based = None self.token_pattern = token_pattern diff --git a/eli5/lime/samplers.py b/eli5/lime/samplers.py index ff72f568..d079b471 100644 --- a/eli5/lime/samplers.py +++ b/eli5/lime/samplers.py @@ -5,13 +5,13 @@ from typing import List, Tuple, Any, Union, Dict, Optional import six -import numpy as np # type: ignore -from scipy.stats import itemfreq # type: ignore -from sklearn.base import BaseEstimator, clone # type: ignore -from sklearn.neighbors import KernelDensity # type: ignore -from sklearn.metrics import pairwise_distances # type: ignore -from sklearn.model_selection import GridSearchCV, KFold # type: ignore -from sklearn.utils import check_random_state # type: ignore +import numpy as np +from scipy.stats import itemfreq +from sklearn.base import BaseEstimator, clone +from sklearn.neighbors import KernelDensity +from sklearn.metrics import pairwise_distances +from sklearn.model_selection import GridSearchCV, KFold +from sklearn.utils import check_random_state from eli5.utils import vstack from eli5.lime.utils import rbf diff --git a/eli5/lime/textutils.py b/eli5/lime/textutils.py index e896f347..98da0428 100644 --- a/eli5/lime/textutils.py +++ b/eli5/lime/textutils.py @@ -7,8 +7,8 @@ import math from typing import List, Tuple, Union, Optional -import numpy as np # type: ignore -from sklearn.utils import check_random_state # type: ignore +import numpy as np +from sklearn.utils import check_random_state from eli5.utils import indices_to_bool_mask, vstack diff --git a/eli5/lime/utils.py b/eli5/lime/utils.py index ee271a65..120dbfbd 100644 --- a/eli5/lime/utils.py +++ b/eli5/lime/utils.py @@ -2,12 +2,12 @@ from __future__ import absolute_import from typing import List, Any 
-import numpy as np # type: ignore -from scipy.stats import entropy # type: ignore -from sklearn.pipeline import Pipeline # type: ignore -from sklearn.utils import check_random_state, issparse # type: ignore -from sklearn.utils.metaestimators import if_delegate_has_method # type: ignore -from sklearn.utils import shuffle as _shuffle # type: ignore +import numpy as np +from scipy.stats import entropy +from sklearn.pipeline import Pipeline +from sklearn.utils import check_random_state, issparse +from sklearn.utils.metaestimators import if_delegate_has_method +from sklearn.utils import shuffle as _shuffle from eli5.utils import vstack from eli5.sklearn.utils import sklearn_version diff --git a/eli5/nn/__init__.py b/eli5/nn/__init__.py index 7bfe97d5..7422dd60 100644 --- a/eli5/nn/__init__.py +++ b/eli5/nn/__init__.py @@ -6,6 +6,6 @@ compute_weights, ) from .text import ( - gradcam_text_spans, + gradcam_spans, resize_1d, ) \ No newline at end of file diff --git a/eli5/nn/gradcam.py b/eli5/nn/gradcam.py index 34af00b1..ec6af934 100644 --- a/eli5/nn/gradcam.py +++ b/eli5/nn/gradcam.py @@ -9,7 +9,6 @@ """ -# FIXME: rename functions def gradcam_heatmap(activations, grads, relu=True, counterfactual=False): # type: (np.ndarray, np.ndarray, bool, bool) -> np.ndarray """ @@ -196,4 +195,13 @@ def _validate_classification_target(target, output_shape): if not (0 <= target < output_nodes): raise ValueError('Prediction target index is ' 'outside the required range [0, {}). ', - 'Got {}'.format(output_nodes, target)) \ No newline at end of file + 'Got {}'.format(output_nodes, target)) + + +def _validate_heatmap(heatmap): + # type: (np.ndarray) -> None + """Utility function to check that the ``heatmap`` + argument has the right type.""" + if not isinstance(heatmap, np.ndarray): + raise TypeError('heatmap must be a numpy.ndarray instance. 
' + 'Got: "{}" (type "{}").'.format(heatmap, type(heatmap))) \ No newline at end of file diff --git a/eli5/nn/text.py b/eli5/nn/text.py index 4612a61f..81f69155 100644 --- a/eli5/nn/text.py +++ b/eli5/nn/text.py @@ -8,15 +8,18 @@ WeightedSpans, DocWeightedSpans, ) +from eli5.nn.gradcam import ( + _validate_heatmap, +) -def gradcam_text_spans(heatmap, # type: np.ndarray - tokens, # type: Union[np.ndarray, list] - doc, # type: np.ndarray - pad_value=None, # type: Optional[Union[int, float]] - pad_token=None, # type: Optional[str] - interpolation_kind='linear' # type: Union[str, int] - ): +def gradcam_spans(heatmap, # type: np.ndarray + tokens, # type: Union[np.ndarray, list] + doc, # type: np.ndarray + pad_value=None, # type: Optional[Union[int, float]] + pad_token=None, # type: Optional[str] + interpolation_kind='linear' # type: Union[str, int] + ): # type: (...) -> Tuple[Union[np.ndarray, list], np.ndarray, WeightedSpans] """ Create text spans from a Grad-CAM ``heatmap`` imposed over ``tokens``. @@ -30,9 +33,16 @@ def gradcam_text_spans(heatmap, # type: np.ndarray **Should be rank 1 (no batch dimension).** + + :raises TypeError: if ``heatmap`` is wrong type. + tokens : numpy.ndarray or list Tokens that will be highlighted using weights from ``heatmap``. + + :raises TypeError: if ``tokens`` is wrong type. + :raises ValueError: if ``tokens`` contents are unexpected. + doc: numpy.ndarray Original input to the network, from which ``heatmap`` was created. @@ -53,8 +63,15 @@ def gradcam_text_spans(heatmap, # type: np.ndarray ``tokens`` and ``heatmap`` optionally cut from padding. A :class:`eli5.base.WeightedSpans` object with a weight for each token. """ - # FIXME: might want to do this when formatting the explanation? + # We call this before returning the explanation, NOT when formatting the explanation + # Because WeightedSpans, etc are attributes of a returned explanation # TODO: might want to add validation for heatmap and other arguments? 
+ _validate_tokens(tokens) + _validate_tokens_value(tokens, doc) + if isinstance(tokens, list): + # convert to a common data type + tokens = np.array(tokens) + length = len(tokens) heatmap = resize_1d(heatmap, length, interpolation_kind=interpolation_kind) @@ -62,14 +79,16 @@ def gradcam_text_spans(heatmap, # type: np.ndarray if pad_value is not None or pad_token is not None: # remove padding pad_indices = _find_padding(pad_value=pad_value, pad_token=pad_token, doc=doc, tokens=tokens) - # If pad_value is not the actual padding value, behaviour is unknown + # If passed padding argument is not the actual padding token/value, behaviour is unknown tokens, heatmap = _trim_padding(pad_indices, tokens, heatmap) + document = _construct_document(tokens) spans = _build_spans(tokens, heatmap, document) weighted_spans = WeightedSpans([ DocWeightedSpans(document, spans=spans) - ]) # why list? - for each vectorized - don't need multiple vectorizers? - # multiple highlights? - could do positive and negative expl? + ]) + # why do we have a list of WeightedSpans? One for each vectorizer? + # But we do not use multiple vectorizers? return tokens, heatmap, weighted_spans @@ -87,6 +106,9 @@ def resize_1d(heatmap, length, interpolation_kind='linear'): heatmap : numpy.ndarray Heatmap to be resized. + + :raises TypeError: if ``heatmap`` is wrong type. + length : int Required width. @@ -102,6 +124,8 @@ def resize_1d(heatmap, length, interpolation_kind='linear'): heatmap : numpy.ndarray The heatmap resized. 
""" + _validate_heatmap(heatmap) + _validate_length(length) if len(heatmap.shape) == 1 and heatmap.shape[0] == 1: # single weight, no batch heatmap = heatmap.repeat(length) @@ -144,7 +168,7 @@ def _build_spans(tokens, # type: Union[np.ndarray, list] def _construct_document(tokens): # type: (Union[list, np.ndarray]) -> str - """Create a document string by joining ``tokens``.""" + """Create a document string by joining ``tokens`` sequence.""" if _is_character_tokenization(tokens): sep = '' else: @@ -154,10 +178,7 @@ def _construct_document(tokens): def _is_character_tokenization(tokens): # type: (Union[list, np.ndarray]) -> bool - """ - Check whether tokenization is character-level - (returns True) or word-level (returns False). - """ + """Check whether tokenization is character-level (True) or word-level (False).""" return any(' ' in t for t in tokens) @@ -178,27 +199,27 @@ def _find_padding(pad_value=None, # type: Union[int, float] else: raise TypeError('Pass "doc" and "pad_value", ' 'or "tokens" and "pad_token".') - # TODO: warn if indices is empty - passed wrong padding char/value? def _find_padding_values(pad_value, doc): # type: (Union[int, float], np.ndarray) -> np.ndarray if not isinstance(pad_value, (int, float)): raise TypeError('"pad_value" must be int or float. Got "{}"'.format(type(pad_value))) + _validate_doc(doc) values, indices = np.where(doc == pad_value) return indices def _find_padding_tokens(pad_token, tokens): - # type: (str, Union[list, np.ndarray]) -> np.ndarray + # type: (str, np.ndarray) -> np.ndarray if not isinstance(pad_token, str): raise TypeError('"pad_token" must be str. Got "{}"'.format(type(pad_token))) - indices = [idx for idx, token in enumerate(tokens) if token == pad_token] - return np.array(indices) + indices = np.where(tokens == pad_token) + return indices def _trim_padding(pad_indices, # type: np.ndarray - tokens, # type: Union[list, np.ndarray] + tokens, # type: np.ndarray heatmap, # type: np.ndarray ): # type: (...) 
-> Tuple[Union[list, np.ndarray], np.ndarray] @@ -212,4 +233,82 @@ def _trim_padding(pad_indices, # type: np.ndarray # and we can not detect and raise an error if there is padding in the middle of the text tokens = np.delete(tokens, pad_indices) heatmap = np.delete(heatmap, pad_indices) - return tokens, heatmap \ No newline at end of file + return tokens, heatmap + + +def _validate_doc(doc): + if not isinstance(doc, np.ndarray): + raise TypeError('"doc" must be an instance of numpy.ndarray. ' + 'Got "{}" (type "{}")'.format(doc, type(doc))) + + +def _validate_length(length): + if not isinstance(length, int): + raise TypeError('"length" must be an integer. Got "{}" ' + '(type "{}")'.format(length, type(length))) + if length < 0: + raise ValueError('"length" must be a non-negative integer. ' + 'Got "{}"'.format(length)) + + +# TODO: +# docs for raises in here +# coverage tests for new validation + + +# FIXME: break this function up +def _validate_tokens(tokens): + # type: (Union[np.ndarray, list]) -> None + """Check that ``tokens`` contains correct items and matches ``doc``.""" + if not isinstance(tokens, (list, np.ndarray)): + # wrong type + raise TypeError('"tokens" must be list or numpy.ndarray. ' + 'Got "{}".'.format(tokens)) + if len(tokens) == 0: + # empty list + raise ValueError('"tokens" is empty: {}'.format(tokens)) + + +def _validate_tokens_value(tokens, doc): + # type: (Union[np.ndarray, list], np.ndarray) -> None + doc_batch, doc_len = doc.shape[0], doc.shape[1] + an_entry = tokens[0] + if isinstance(an_entry, str): + # no batch + if doc_batch != 1: + # doc is batched but tokens is not + raise ValueError('If passing "tokens" without batch dimension, ' + '"doc" must have batch size = 1.' + 'Got "doc" with batch size = %d.' 
% doc_batch) + tokens_len = len(tokens) + elif isinstance(an_entry, (list, np.ndarray)): + # batched + tokens_batch = len(tokens) + if tokens_batch != doc_batch: + # batch lengths do not match + raise ValueError('"tokens" must have same number of samples ' + 'as in doc batch. Got: "tokens" samples: %d, ' + 'doc samples: %d' % (tokens_batch, doc_batch)) + + a_token = an_entry[0] + if not isinstance(a_token, str): + # actual contents are not strings + raise TypeError('Second axis in "tokens" must contain strings. ' + 'Found "{}" (type "{}")'.format(a_token, type(a_token))) + + # a way to check that all elements match some condition + # https://stackoverflow.com/a/35791116/11555448 + it = iter(tokens) + the_len = len(next(it)) + if not all(len(l) == the_len for l in it): + raise ValueError('"tokens" samples do not all have the same length.') + tokens_len = the_len + else: + raise TypeError('"tokens" must be an array of strings, ' + 'or an array of string arrays. ' + 'Got "{}".'.format(tokens)) + + if tokens_len != doc_len: + raise ValueError('"tokens" and "doc" lengths must match. ' + '"tokens" length: "%d". 
"doc" length: "%d"' + % (tokens_len, doc_len)) \ No newline at end of file diff --git a/eli5/permutation_importance.py b/eli5/permutation_importance.py index b5c4a3f0..8ff5d40d 100644 --- a/eli5/permutation_importance.py +++ b/eli5/permutation_importance.py @@ -13,8 +13,8 @@ from __future__ import absolute_import from typing import Tuple, List, Callable, Any -import numpy as np # type: ignore -from sklearn.utils import check_random_state # type: ignore +import numpy as np +from sklearn.utils import check_random_state def iter_shuffled(X, columns_to_shuffle=None, pre_shuffle=False, diff --git a/eli5/sklearn/explain_prediction.py b/eli5/sklearn/explain_prediction.py index 88586f05..18dcc36f 100644 --- a/eli5/sklearn/explain_prediction.py +++ b/eli5/sklearn/explain_prediction.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- from functools import partial -import numpy as np # type: ignore -import scipy.sparse as sp # type: ignore -from sklearn.base import BaseEstimator # type: ignore -from sklearn.ensemble import ( # type: ignore +import numpy as np +import scipy.sparse as sp +from sklearn.base import BaseEstimator +from sklearn.ensemble import ( ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier, @@ -12,7 +12,7 @@ RandomForestClassifier, RandomForestRegressor, ) -from sklearn.linear_model import ( # type: ignore +from sklearn.linear_model import ( ElasticNet, # includes Lasso, MultiTaskElasticNet, etc. 
ElasticNetCV, HuberRegressor, @@ -34,7 +34,7 @@ SGDRegressor, TheilSenRegressor, ) -from sklearn.svm import ( # type: ignore +from sklearn.svm import ( LinearSVC, LinearSVR, SVC, @@ -43,8 +43,8 @@ NuSVR, OneClassSVM, ) -from sklearn.multiclass import OneVsRestClassifier # type: ignore -from sklearn.tree import ( # type: ignore +from sklearn.multiclass import OneVsRestClassifier +from sklearn.tree import ( DecisionTreeClassifier, DecisionTreeRegressor ) diff --git a/eli5/sklearn/explain_weights.py b/eli5/sklearn/explain_weights.py index 019140a6..49010fb4 100644 --- a/eli5/sklearn/explain_weights.py +++ b/eli5/sklearn/explain_weights.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -import numpy as np # type: ignore +import numpy as np -from sklearn.base import BaseEstimator, RegressorMixin # type: ignore -from sklearn.pipeline import Pipeline # type: ignore -from sklearn.linear_model import ( # type: ignore +from sklearn.base import BaseEstimator, RegressorMixin +from sklearn.pipeline import Pipeline +from sklearn.linear_model import ( ElasticNet, # includes Lasso, MultiTaskElasticNet, etc. 
ElasticNetCV, HuberRegressor, @@ -27,8 +27,8 @@ SGDRegressor, TheilSenRegressor, ) -from sklearn.multiclass import OneVsRestClassifier # type: ignore -from sklearn.svm import ( # type: ignore +from sklearn.multiclass import OneVsRestClassifier +from sklearn.svm import ( LinearSVC, LinearSVR, SVC, @@ -38,8 +38,8 @@ OneClassSVM, ) # TODO: see https://github.com/scikit-learn/scikit-learn/pull/2250 -from sklearn.naive_bayes import BernoulliNB, MultinomialNB # type: ignore -from sklearn.ensemble import ( # type: ignore +from sklearn.naive_bayes import BernoulliNB, MultinomialNB +from sklearn.ensemble import ( GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, @@ -49,7 +49,7 @@ ExtraTreesClassifier, ExtraTreesRegressor, ) -from sklearn.tree import ( # type: ignore +from sklearn.tree import ( DecisionTreeClassifier, DecisionTreeRegressor, ) diff --git a/eli5/sklearn/permutation_importance.py b/eli5/sklearn/permutation_importance.py index 30ab3cad..370be8be 100644 --- a/eli5/sklearn/permutation_importance.py +++ b/eli5/sklearn/permutation_importance.py @@ -2,23 +2,23 @@ from functools import partial from typing import List -import numpy as np # type: ignore -from sklearn.model_selection import check_cv # type: ignore -from sklearn.utils.metaestimators import if_delegate_has_method # type: ignore -from sklearn.utils import check_array, check_random_state # type: ignore -from sklearn.base import ( # type: ignore +import numpy as np +from sklearn.model_selection import check_cv +from sklearn.utils.metaestimators import if_delegate_has_method +from sklearn.utils import check_array, check_random_state +from sklearn.base import ( BaseEstimator, MetaEstimatorMixin, clone, is_classifier ) -from sklearn.metrics.scorer import check_scoring # type: ignore +from sklearn.metrics.scorer import check_scoring from eli5.permutation_importance import get_score_importances from eli5.sklearn.utils import pandas_available if pandas_available: - import pandas as pd # 
type: ignore + import pandas as pd CAVEATS_CV_NONE = """ Feature importances are computed on the same data as used for training, @@ -87,11 +87,13 @@ class PermutationImportance(BaseEstimator, MetaEstimatorMixin): scoring : string, callable or None, default=None Scoring function to use for computing feature importances. - A string with scoring name (see scikit-learn docs) or + A string with scoring name (see scikit-learn `docs`_) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If ``None``, the ``score`` method of the estimator is used. + .. _docs: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values + n_iter : int, default 5 Number of random shuffle iterations. Decrease to improve speed, increase to get more precise estimates. @@ -212,8 +214,12 @@ def _cv_scores_importances(self, X, y, groups=None, **fit_params): cv = check_cv(self.cv, y, is_classifier(self.estimator)) feature_importances = [] # type: List base_scores = [] # type: List[float] + weights = fit_params.pop('sample_weight', None) + fold_fit_params = fit_params.copy() for train, test in cv.split(X, y, groups): - est = clone(self.estimator).fit(X[train], y[train], **fit_params) + if weights is not None: + fold_fit_params['sample_weight'] = weights[train] + est = clone(self.estimator).fit(X[train], y[train], **fold_fit_params) score_func = partial(self.scorer_, est) _base_score, _importances = self._get_score_importances( score_func, X[test], y[test]) diff --git a/eli5/sklearn/text.py b/eli5/sklearn/text.py index 57296a6f..fb2748bf 100644 --- a/eli5/sklearn/text.py +++ b/eli5/sklearn/text.py @@ -1,8 +1,11 @@ from __future__ import absolute_import from typing import Any, Union, Callable, Dict, List, Optional, Set, Tuple -from sklearn.feature_extraction.text import VectorizerMixin # type: ignore -from sklearn.pipeline import FeatureUnion # type: ignore +from sklearn.pipeline import FeatureUnion +try: + from 
sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin +except ImportError: # Changed in scikit-learn 0.22 + from sklearn.feature_extraction.text import VectorizerMixin from eli5.base import ( DocWeightedSpans, WeightedSpans, FeatureWeights, FeatureWeight, diff --git a/eli5/sklearn/transform.py b/eli5/sklearn/transform.py index 2d431f8c..8d79f555 100644 --- a/eli5/sklearn/transform.py +++ b/eli5/sklearn/transform.py @@ -1,11 +1,11 @@ """transform_feature_names implementations for scikit-learn transformers """ -import numpy as np # type: ignore -from sklearn.pipeline import Pipeline, FeatureUnion # type: ignore -from sklearn.feature_selection.base import SelectorMixin # type: ignore +import numpy as np +from sklearn.pipeline import Pipeline, FeatureUnion +from sklearn.feature_selection.base import SelectorMixin -from sklearn.preprocessing import ( # type: ignore +from sklearn.preprocessing import ( MinMaxScaler, StandardScaler, MaxAbsScaler, @@ -26,7 +26,7 @@ def _select_names(est, in_names=None): return [in_names[i] for i in np.flatnonzero(mask)] try: - from sklearn.linear_model import ( # type: ignore + from sklearn.linear_model import ( RandomizedLogisticRegression, RandomizedLasso, ) diff --git a/eli5/sklearn/treeinspect.py b/eli5/sklearn/treeinspect.py index 373397bd..d0e9f76a 100644 --- a/eli5/sklearn/treeinspect.py +++ b/eli5/sklearn/treeinspect.py @@ -7,8 +7,8 @@ """ from __future__ import absolute_import, division -from sklearn.base import ClassifierMixin # type: ignore -from sklearn.tree import _tree, export_graphviz # type: ignore +from sklearn.base import ClassifierMixin +from sklearn.tree import _tree, export_graphviz from eli5.base import TreeInfo, NodeInfo diff --git a/eli5/sklearn/unhashing.py b/eli5/sklearn/unhashing.py index 64b44d86..f4f79b8d 100644 --- a/eli5/sklearn/unhashing.py +++ b/eli5/sklearn/unhashing.py @@ -7,14 +7,14 @@ from itertools import chain from typing import List, Iterable, Any, Dict, Tuple, Union -import 
numpy as np # type: ignore +import numpy as np import six -from sklearn.base import BaseEstimator, TransformerMixin # type: ignore -from sklearn.feature_extraction.text import ( # type: ignore +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.feature_extraction.text import ( HashingVectorizer, FeatureHasher, ) -from sklearn.pipeline import FeatureUnion # type: ignore +from sklearn.pipeline import FeatureUnion from eli5._feature_names import FeatureNames diff --git a/eli5/sklearn/utils.py b/eli5/sklearn/utils.py index ba3680aa..286d078a 100644 --- a/eli5/sklearn/utils.py +++ b/eli5/sklearn/utils.py @@ -3,9 +3,9 @@ from distutils.version import LooseVersion from typing import Any, Optional, List, Tuple -import numpy as np # type: ignore -import scipy.sparse as sp # type: ignore -from sklearn.multiclass import OneVsRestClassifier # type: ignore +import numpy as np +import scipy.sparse as sp +from sklearn.multiclass import OneVsRestClassifier from eli5.sklearn.unhashing import invert_hashing_and_fit, handle_hashing_vec from eli5._feature_names import FeatureNames @@ -214,7 +214,7 @@ def get_num_features(estimator): try: - import pandas as pd # type: ignore + import pandas as pd pandas_available = True except ImportError: pandas_available = False @@ -277,5 +277,5 @@ def sklearn_version(): >>> sklearn_version() > '0.17' True """ - from sklearn import __version__ # type: ignore + from sklearn import __version__ return LooseVersion(__version__) diff --git a/eli5/sklearn_crfsuite/explain_weights.py b/eli5/sklearn_crfsuite/explain_weights.py index e2a9435d..6007efd7 100644 --- a/eli5/sklearn_crfsuite/explain_weights.py +++ b/eli5/sklearn_crfsuite/explain_weights.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -import numpy as np # type: ignore -from scipy import sparse as sp # type: ignore -from sklearn_crfsuite import CRF # type: ignore +import numpy as np +from scipy import sparse as sp +from sklearn_crfsuite import 
CRF from eli5.base import Explanation, TargetExplanation, TransitionFeatureWeights from eli5.explain import explain_weights diff --git a/eli5/utils.py b/eli5/utils.py index f1ea32c4..e5f669f8 100644 --- a/eli5/utils.py +++ b/eli5/utils.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -import numpy as np # type: ignore -from scipy import sparse as sp # type: ignore +import numpy as np +from scipy import sparse as sp def argsort_k_largest(x, k): diff --git a/eli5/xgboost.py b/eli5/xgboost.py index 86d3ad58..ce6a079a 100644 --- a/eli5/xgboost.py +++ b/eli5/xgboost.py @@ -4,9 +4,9 @@ import re from typing import Any, Dict, List, Tuple, Optional, Pattern -import numpy as np # type: ignore -import scipy.sparse as sp # type: ignore -from xgboost import ( # type: ignore +import numpy as np +import scipy.sparse as sp +from xgboost import ( XGBClassifier, XGBRegressor, Booster, diff --git a/setup.py b/setup.py index 011fdd28..ec561515 100755 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ def get_long_description(): 'scipy', 'six', 'scikit-learn >= 0.18', - 'typing', 'graphviz', 'tabulate>=0.7.7', ], @@ -45,6 +44,7 @@ def get_long_description(): ":python_version<'3.5.6'": [ 'singledispatch >= 3.4.0.3', ], + ":python_version<'3.5'": ['typing'], }, classifiers=[ 'Development Status :: 4 - Beta', diff --git a/tests/test_formatters_image.py b/tests/test_formatters_image.py index bc395afd..c287062f 100644 --- a/tests/test_formatters_image.py +++ b/tests/test_formatters_image.py @@ -15,7 +15,6 @@ _cap_alpha, _overlay_heatmap, _validate_image, - _validate_heatmap, ) from .utils_image import assert_pixel_by_pixel_equal import eli5 @@ -153,12 +152,6 @@ def test_validate_image(): _validate_image(np.zeros((2, 2, 4,))) -def test_validate_heatmap(): - with pytest.raises(TypeError): - # heatmap must be a numpy array, not a Pillow image - _validate_heatmap(PIL.Image.new('L', (2, 2,))) - - def test_format_as_image_notransparency(catdog_rgba): # heatmap with full transparency expl = 
Explanation('mock', diff --git a/tests/test_keras.py b/tests/test_keras.py index 9ea1c50b..b765abc1 100644 --- a/tests/test_keras.py +++ b/tests/test_keras.py @@ -33,13 +33,11 @@ from eli5.keras.explain_prediction import ( _validate_model, _validate_doc, - _validate_tokens, _get_layer, _autoget_layer_image, _autoget_layer_text, ) from eli5.keras.gradcam import ( - _autoget_target_prediction, _calc_gradient, ) @@ -147,42 +145,6 @@ def test_validate_doc(): _validate_doc(np.zeros((3, 2, 2, 1))) -def test_validate_tokens(): - _validate_tokens(np.zeros((1, 3)), ['a', 'b', 'c']) - _validate_tokens(np.zeros((2, 2)), [['a', 'b'], ['c', 'd']]) - - -def test_validate_tokens_invalid(): - with pytest.raises(TypeError): - # should be in a list - _validate_tokens(np.zeros((1, 1)), 'a') - with pytest.raises(ValueError): - # empty list - _validate_tokens(np.zeros((1, 1)), []) - with pytest.raises(ValueError): - # single list but multiple samples in batch - _validate_tokens(np.zeros((3, 2)), ['a', 'b']) - - # list doesn't contain strings - with pytest.raises(TypeError): - _validate_tokens(np.zeros((1, 1)), [0]) - with pytest.raises(TypeError): - _validate_tokens(np.zeros((1, 1)), [[0]]) - - with pytest.raises(ValueError): - # not enough samples in batched list - _validate_tokens(np.zeros((3, 1)), np.array([['a'], ['b']])) - with pytest.raises(ValueError): - # tokens lengths vary - _validate_tokens(np.zeros((2, 2)), [['a', 'b'], ['c']]) - with pytest.raises(ValueError): - # tokens sample lengths do not match - _validate_tokens(np.zeros((1, 1)), ['a', 'b']) - with pytest.raises(TypeError): - # too many axes - _validate_tokens(np.zeros((1, 1,)), [[['a']]]) - - def test_explain_prediction_attributes(simple_seq_image, dummy_image): expl = eli5.explain_prediction(simple_seq_image, np.zeros((1, 32, 32, 1))) assert expl.layer is not None @@ -225,9 +187,14 @@ def test_calc_gradient(differentiable_model): def test_calc_gradient_nondifferentiable(nondifferentiable_model): with 
pytest.raises(ValueError): - grads = _calc_gradient(nondifferentiable_model.output, - [nondifferentiable_model.input]) + _calc_gradient(nondifferentiable_model.output, + [nondifferentiable_model.input]) + + +# TODO: test choosing multiple target from multiple maximum values, etc -# TODO: test_autoget_target_prediction with multiple maximum values, etc \ No newline at end of file +def test_import(): + # test that package imports without errors + import eli5.keras \ No newline at end of file diff --git a/tests/test_lightning.py b/tests/test_lightning.py index 6519d9d5..cebbb7a8 100644 --- a/tests/test_lightning.py +++ b/tests/test_lightning.py @@ -73,6 +73,7 @@ def test_explain_weights_regressors(boston_train, reg): has_bias=False) +@pytest.mark.xfail(reason='lightning does not work with pandas dataframes any more') @pytest.mark.parametrize(['reg'], _instances(_REGRESSORS)[:2]) def test_explain_prediction_pandas(reg, boston_train): _check_explain_prediction_pandas(reg, boston_train) diff --git a/tests/test_nn.py b/tests/test_nn.py new file mode 100644 index 00000000..7dce3b8d --- /dev/null +++ b/tests/test_nn.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- +"""Test eli5.nn package""" + + +def test_import(): + # check that package imports without errors + import eli5.nn \ No newline at end of file diff --git a/tests/test_nn_gradcam.py b/tests/test_nn_gradcam.py index a82e3d5b..1516a61c 100644 --- a/tests/test_nn_gradcam.py +++ b/tests/test_nn_gradcam.py @@ -3,11 +3,13 @@ import pytest import numpy as np +PIL = pytest.importorskip('PIL') from eli5.nn.gradcam import ( gradcam_heatmap, _validate_targets, _validate_classification_target, + _validate_heatmap, ) @@ -64,10 +66,19 @@ def test_validate_targets(): # target index must correctly reference one of the nodes in the final layer -def _validate_classification_target(): +def test_validate_classification_target(): with pytest.raises(ValueError): # one over _validate_classification_target(2, (1, 2,)) with
pytest.raises(ValueError): # one less - _validate_classification_target(-1, (1, 1,)) \ No newline at end of file + _validate_classification_target(-1, (1, 1,)) + + +def test_validate_heatmap(): + with pytest.raises(TypeError): + # heatmap must be a numpy array, not a Pillow image + _validate_heatmap(PIL.Image.new('L', (2, 2,))) + with pytest.raises(TypeError): + # heatmap must not be a Python list + _validate_heatmap([2, 3]) \ No newline at end of file diff --git a/tests/test_nn_text.py b/tests/test_nn_text.py index 2a9b2110..1beac020 100644 --- a/tests/test_nn_text.py +++ b/tests/test_nn_text.py @@ -5,7 +5,7 @@ import numpy as np from eli5.nn.text import ( - gradcam_text_spans, + gradcam_spans, resize_1d, _build_spans, _construct_document, @@ -13,6 +13,8 @@ _find_padding_values, _find_padding_tokens, _trim_padding, + _validate_tokens, + _validate_tokens_value, ) from eli5.base import ( WeightedSpans, @@ -66,11 +68,11 @@ def test_find_padding_values(): def test_find_padding_tokens(): - indices = _find_padding_tokens('', ['the', 'test', '', '']) + indices = _find_padding_tokens('', np.array(['the', 'test', '', ''])) np.array_equal(indices, np.array([2, 3])) with pytest.raises(TypeError): - _find_padding_tokens(0, ['']) + _find_padding_tokens(0, np.array([''])) @pytest.mark.parametrize('pad_indices, tokens, heatmap, expected_tokens, expected_heatmap', [ @@ -93,16 +95,48 @@ def test_trim_padding_invalid(): assert np.array_equal(tokens, tokens_trimmed) assert np.array_equal(heatmap, heatmap_trimmed) - # with pytest.raises(ValueError): - # _trim_padding([1], ['a', 'PAD', 'b'], np.array([1, 0, 2])) - -def test_gradcam_text_spans(): - heatmap, tokens, doc = np.array([2.0]), ['a'], [2] - res_tokens, res_heatmap, res_weighted_spans = gradcam_text_spans(heatmap, tokens, doc) +def test_gradcam_spans(): + heatmap, tokens, doc = np.array([2.0]), ['a'], np.array([[2]]) + res_tokens, res_heatmap, res_weighted_spans = gradcam_spans(heatmap, tokens, doc) assert 
np.array_equal(heatmap, res_heatmap) assert np.array_equal(tokens, res_tokens) assert res_weighted_spans == WeightedSpans([DocWeightedSpans( 'a', spans=[('a', [(0, 1)], 2.0)] - )]) \ No newline at end of file + )]) + + +def test_validate_tokens_invalid(): + # should be in a list or numpy array + with pytest.raises(TypeError): + _validate_tokens('a') + + # empty list + with pytest.raises(ValueError): + _validate_tokens([]) + + +def test_validate_tokens_value_invalid(): + with pytest.raises(ValueError): + # single tokens list but multiple samples in input doc + _validate_tokens_value(['a', 'b'], np.zeros((3, 2))) + + # list doesn't contain strings + with pytest.raises(TypeError): + _validate_tokens_value([0], np.zeros((1, 1))) + with pytest.raises(TypeError): + _validate_tokens_value([[0]], np.zeros((1, 1))) + + with pytest.raises(ValueError): + # not enough samples in batched list + _validate_tokens_value(np.array([['a'], ['b']]), np.zeros((3, 1))) + with pytest.raises(ValueError): + # tokens lengths vary + _validate_tokens_value([['a', 'b'], ['c']], np.zeros((2, 2))) + with pytest.raises(ValueError): + # tokens sample lengths do not match + _validate_tokens_value(['a', 'b'], np.zeros((1, 1))) + with pytest.raises(TypeError): + # too many axes + _validate_tokens_value([[['a']]], np.zeros((1, 1,))) \ No newline at end of file diff --git a/tests/test_permutation_importance.py b/tests/test_permutation_importance.py index effb4ff9..f4b95233 100644 --- a/tests/test_permutation_importance.py +++ b/tests/test_permutation_importance.py @@ -41,7 +41,7 @@ def is_shuffled(X, X_sh, col): def test_get_feature_importances(boston_train): X, y, feat_names = boston_train - svr = SVR(C=20).fit(X, y) + svr = SVR(C=20, gamma='auto').fit(X, y) score, importances = get_score_importances(svr.score, X, y) assert score > 0.7 importances = dict(zip(feat_names, np.mean(importances, axis=0))) diff --git a/tests/test_sklearn_explain_prediction.py b/tests/test_sklearn_explain_prediction.py index 
646c8c7d..277fee46 100644 --- a/tests/test_sklearn_explain_prediction.py +++ b/tests/test_sklearn_explain_prediction.py @@ -379,7 +379,7 @@ def test_explain_linear_binary(newsgroups_train_binary, clf): def test_explain_one_class_svm(): X = np.array([[0, 0], [0, 1], [5, 3], [93, 94], [90, 91]]) - clf = OneClassSVM(kernel='linear', random_state=42).fit(X) + clf = OneClassSVM(kernel='linear').fit(X) res = explain_prediction(clf, X[0]) assert res.targets[0].score < 0 for expl in format_as_all(res, clf): diff --git a/tests/test_sklearn_explain_weights.py b/tests/test_sklearn_explain_weights.py index 6ca5f519..7f5469c2 100644 --- a/tests/test_sklearn_explain_weights.py +++ b/tests/test_sklearn_explain_weights.py @@ -210,7 +210,7 @@ def test_explain_linear_unsupported_multiclass(clf, newsgroups_train): def test_explain_one_class_svm(): X = np.array([[0,0], [0, 1], [5, 3], [93, 94], [90, 91]]) - clf = OneClassSVM(kernel='linear', random_state=42).fit(X) + clf = OneClassSVM(kernel='linear', gamma='auto').fit(X) res = explain_weights(clf) assert len(res.targets) == 1 target = res.targets[0] @@ -451,7 +451,7 @@ def test_explain_random_forest_and_tree_feature_filter(newsgroups_train, clf): def test_explain_empty(newsgroups_train): - clf = LogisticRegression(C=0.01, penalty='l1', random_state=42) + clf = LogisticRegression(C=0.01, penalty='l1', solver='liblinear', random_state=42) docs, y, target_names = newsgroups_train vec = TfidfVectorizer() diff --git a/tests/test_sklearn_permutation_importance.py b/tests/test_sklearn_permutation_importance.py index 19e54e2d..4ffec3ba 100644 --- a/tests/test_sklearn_permutation_importance.py +++ b/tests/test_sklearn_permutation_importance.py @@ -3,7 +3,7 @@ import numpy as np from sklearn.base import is_classifier, is_regressor from sklearn.svm import SVR, SVC -from sklearn.ensemble import RandomForestRegressor +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.model_selection import train_test_split, 
cross_val_score from sklearn.pipeline import make_pipeline from sklearn.feature_selection import SelectFromModel @@ -73,7 +73,7 @@ def test_cv(boston_train): *boston_train, noise_ratio=0.99) reg = PermutationImportance( - SVR(C=100), + SVR(C=100, gamma='auto'), random_state=42, cv=None, n_iter=50, # use the same number of experiments as with cv=10 @@ -86,7 +86,7 @@ def test_cv(boston_train): # CV feature importances reg = PermutationImportance( - SVR(C=100), + SVR(C=100, gamma='auto'), random_state=42, cv=10, ).fit(X_test, y_test) @@ -132,9 +132,9 @@ def test_feature_selection(boston_train): ), threshold=0.1, ) - pipe = make_pipeline(sel, SVR(C=10)) + pipe = make_pipeline(sel, SVR(C=10, gamma='auto')) score1 = cross_val_score(pipe, X, y).mean() - score2 = cross_val_score(SVR(C=10), X, y).mean() + score2 = cross_val_score(SVR(C=10, gamma='auto'), X, y).mean() print(score1, score2) assert score1 > score2 @@ -165,6 +165,7 @@ def test_explain_weights(iris_train): for _expl in res: assert "petal width (cm)" in _expl + def test_pandas_xgboost_support(iris_train): xgboost = pytest.importorskip('xgboost') pd = pytest.importorskip('pandas') @@ -175,3 +176,17 @@ def test_pandas_xgboost_support(iris_train): est.fit(X, y) # we expect no exception to be raised here when using xgboost with pd.DataFrame perm = PermutationImportance(est).fit(X, y) + + +def test_cv_sample_weight(iris_train): + X, y, feature_names, target_names = iris_train + weights_ones = np.ones(len(y)) + model = RandomForestClassifier(random_state=42) + + # we expect no exception to be raised when passing weights with a CV + perm_weights = PermutationImportance(model, cv=5, random_state=42).\ + fit(X, y, sample_weight=weights_ones) + perm = PermutationImportance(model, cv=5, random_state=42).fit(X, y) + + # passing a vector of weights filled with one should be the same as passing no weights + assert (perm.feature_importances_ == perm_weights.feature_importances_).all() \ No newline at end of file diff --git 
a/tests/test_sklearn_transform.py b/tests/test_sklearn_transform.py index aa200937..0c7123f5 100644 --- a/tests/test_sklearn_transform.py +++ b/tests/test_sklearn_transform.py @@ -81,19 +81,19 @@ def selection_score_func(X, y): (VarianceThreshold(1.0), ['']), (GenericUnivariateSelect(), ['']), (GenericUnivariateSelect(mode='k_best', param=2), ['', '']), - (SelectFromModel(LogisticRegression('l1', C=0.01, random_state=42)), + (SelectFromModel(LogisticRegression('l1', C=0.01, solver='liblinear', random_state=42, multi_class='ovr')), ['', '']), (SelectFromModel( PermutationImportance( - LogisticRegression(random_state=42), + LogisticRegression(solver='liblinear', random_state=42), cv=5, random_state=42, refit=False, ), threshold=0.1, ), ['', '']), - (RFE(LogisticRegression(random_state=42), 2), + (RFE(LogisticRegression(solver='liblinear', random_state=42, multi_class='ovr'), 2), ['', '']), - (RFECV(LogisticRegression(random_state=42)), + (RFECV(LogisticRegression(solver='liblinear', random_state=42, multi_class='ovr'), cv=3), ['', '', '', '']), ] + _additional_test_cases) def test_transform_feature_names_iris(transformer, expected, iris_train): diff --git a/tox.ini b/tox.ini index 81465eba..919c0eb2 100644 --- a/tox.ini +++ b/tox.ini @@ -87,10 +87,10 @@ commands={[testenv:py35-extra]commands} basepython=python3.6 deps= {[testenv]deps} - mypy == 0.641 + mypy == 0.750 lxml commands= - mypy --html-report ./mypy-cov --check-untyped-defs eli5 + mypy --html-report ./mypy-cov --check-untyped-defs --ignore-missing-imports eli5 [testenv:docs]