From f7c34469e28f1217491153b92fdc6b103b206685 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Wed, 21 Aug 2019 09:29:27 +0200 Subject: [PATCH 01/23] Prepare 0.10.0 release --- CHANGES.rst | 6 ++++++ eli5/__init__.py | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 20a98c85..f4cb72e5 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changelog ========= +0.10.0 (2019-08-21) +------------------- + +* Keras image classifiers: explaining predictions with Grad-CAM + (GSoC-2019 project by @teabolt). + 0.9.0 (2019-07-05) ------------------ diff --git a/eli5/__init__.py b/eli5/__init__.py index 610a84f5..c12cce0c 100644 --- a/eli5/__init__.py +++ b/eli5/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -__version__ = '0.9.0' +__version__ = '0.10.0' from .formatters import ( format_as_html, @@ -95,4 +95,4 @@ ) except ImportError: # keras is not available - pass \ No newline at end of file + pass From 24f8c9ac168eeec807248c892d7ed406c04c852b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Wed, 28 Aug 2019 11:41:48 +0200 Subject: [PATCH 02/23] Install typing only for old python --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 011fdd28..ec561515 100755 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ def get_long_description(): 'scipy', 'six', 'scikit-learn >= 0.18', - 'typing', 'graphviz', 'tabulate>=0.7.7', ], @@ -45,6 +44,7 @@ def get_long_description(): ":python_version<'3.5.6'": [ 'singledispatch >= 3.4.0.3', ], + ":python_version<'3.5'": ['typing'], }, classifiers=[ 'Development Status :: 4 - Beta', From 4cd66ac5e0ffde0c002d8f46dc64d7091f49445f Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 29 Aug 2019 09:18:08 +0300 Subject: [PATCH 03/23] 0.10.1 release --- CHANGES.rst | 6 ++++++ eli5/__init__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index f4cb72e5..92588f7a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changelog ========= +0.10.1 (2019-08-29) +------------------- + +* Don't include typing dependency on Python 3.5+ + to fix installation on Python 3.7 + 0.10.0 (2019-08-21) ------------------- diff --git a/eli5/__init__.py b/eli5/__init__.py index c12cce0c..ffb406cd 100644 --- a/eli5/__init__.py +++ b/eli5/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -__version__ = '0.10.0' +__version__ = '0.10.1' from .formatters import ( format_as_html, From 69fe7516aeff33b7e6cf22a874546fb7369606b6 Mon Sep 17 00:00:00 2001 From: teabolt Date: Sat, 7 Sep 2019 20:29:39 +0200 Subject: [PATCH 04/23] Rename function. 
Add tests for package imports --- eli5/keras/explain_prediction.py | 17 ++++++++--------- eli5/nn/__init__.py | 2 +- eli5/nn/gradcam.py | 1 - eli5/nn/text.py | 14 +++++++------- tests/test_keras.py | 7 ++++++- tests/test_nn.py | 7 +++++++ tests/test_nn_gradcam.py | 2 +- 7 files changed, 30 insertions(+), 20 deletions(-) create mode 100644 tests/test_nn.py diff --git a/eli5/keras/explain_prediction.py b/eli5/keras/explain_prediction.py index 6d465f34..5dc6845a 100644 --- a/eli5/keras/explain_prediction.py +++ b/eli5/keras/explain_prediction.py @@ -40,8 +40,7 @@ _validate_classification_target, ) from eli5.nn.text import ( - gradcam_text_spans, - _is_character_tokenization, + gradcam_spans, ) from .gradcam import ( gradcam_backend_keras, @@ -373,13 +372,13 @@ def explain_prediction_keras_text(model, predicted_idx, = predicted_idx predicted_val, = predicted_val heatmap, = heatmap - text_vals = gradcam_text_spans(heatmap, - tokens, - doc, - pad_value=pad_value, - pad_token=pad_token, - interpolation_kind=interpolation_kind, - ) + text_vals = gradcam_spans(heatmap, + tokens, + doc, + pad_value=pad_value, + pad_token=pad_token, + interpolation_kind=interpolation_kind, + ) # TODO: padding could be relevant for images too? tokens, heatmap, weighted_spans = text_vals return Explanation( diff --git a/eli5/nn/__init__.py b/eli5/nn/__init__.py index 7bfe97d5..7422dd60 100644 --- a/eli5/nn/__init__.py +++ b/eli5/nn/__init__.py @@ -6,6 +6,6 @@ compute_weights, ) from .text import ( - gradcam_text_spans, + gradcam_spans, resize_1d, ) \ No newline at end of file diff --git a/eli5/nn/gradcam.py b/eli5/nn/gradcam.py index 34af00b1..c92630eb 100644 --- a/eli5/nn/gradcam.py +++ b/eli5/nn/gradcam.py @@ -9,7 +9,6 @@ """ -# FIXME: rename functions def gradcam_heatmap(activations, grads, relu=True, counterfactual=False): # type: (np.ndarray, np.ndarray, bool, bool) -> np.ndarray """ diff --git a/eli5/nn/text.py b/eli5/nn/text.py index 4612a61f..ff382dd0 100644 --- a/eli5/nn/text.py +++ b/eli5/nn/text.py @@ -10,13 +10,13 @@ ) -def gradcam_text_spans(heatmap, # type: np.ndarray - tokens, # type: Union[np.ndarray, list] - doc, # type: np.ndarray - pad_value=None, # type: Optional[Union[int, float]] - pad_token=None, # type: Optional[str] - interpolation_kind='linear' # type: Union[str, int] - ): +def gradcam_spans(heatmap, # type: np.ndarray + tokens, # type: Union[np.ndarray, list] + doc, # type: np.ndarray + pad_value=None, # type: Optional[Union[int, float]] + pad_token=None, # type: Optional[str] + interpolation_kind='linear' # type: Union[str, int] + ): # type: (...) -> Tuple[Union[np.ndarray, list], np.ndarray, WeightedSpans] """ Create text spans from a Grad-CAM ``heatmap`` imposed over ``tokens``. 
diff --git a/tests/test_keras.py b/tests/test_keras.py index 9ea1c50b..de68489c 100644 --- a/tests/test_keras.py +++ b/tests/test_keras.py @@ -230,4 +230,9 @@ def test_calc_gradient_nondifferentiable(nondifferentiable_model): -# TODO: test_autoget_target_prediction with multiple maximum values, etc \ No newline at end of file +# TODO: test_autoget_target_prediction with multiple maximum values, etc + + +def test_import(): + # test that package imports without errors + import eli5.keras \ No newline at end of file diff --git a/tests/test_nn.py b/tests/test_nn.py new file mode 100644 index 00000000..7dce3b8d --- /dev/null +++ b/tests/test_nn.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- +"""Test eli5.nn package""" + + +def test_import(): + # check that package imports without errors + import eli5.nn \ No newline at end of file diff --git a/tests/test_nn_gradcam.py b/tests/test_nn_gradcam.py index a82e3d5b..fb8c81e5 100644 --- a/tests/test_nn_gradcam.py +++ b/tests/test_nn_gradcam.py @@ -64,7 +64,7 @@ def test_validate_targets(): # target index must correctly reference one of the nodes in the final layer -def _validate_classification_target(): +def test_validate_classification_target(): with pytest.raises(ValueError): # one over _validate_classification_target(2, (1, 2,)) From 9788c22b1f6fc8d75fd06e5ce7305f66c70002b4 Mon Sep 17 00:00:00 2001 From: teabolt Date: Sat, 7 Sep 2019 22:10:37 +0200 Subject: [PATCH 05/23] Move argument validation to relevant modules --- eli5/keras/explain_prediction.py | 70 ++------------------------------ eli5/keras/gradcam.py | 17 +++++--- eli5/nn/text.py | 60 ++++++++++++++++++++++++++- tests/test_keras.py | 37 ----------------- tests/test_nn_text.py | 50 +++++++++++++++++++---- 5 files changed, 116 insertions(+), 118 deletions(-) diff --git a/eli5/keras/explain_prediction.py b/eli5/keras/explain_prediction.py index 5dc6845a..5368e506 100644 --- a/eli5/keras/explain_prediction.py +++ b/eli5/keras/explain_prediction.py @@ -36,8 +36,6 @@ from eli5.nn.gradcam import ( gradcam_heatmap, DESCRIPTION_GRADCAM, - _validate_targets, - _validate_classification_target, ) from eli5.nn.text import ( gradcam_spans, @@ -247,7 +245,7 @@ def explain_prediction_keras_image(model, * ``target`` ID of target class. * ``score`` value for predicted class. """ - _validate_params(model, doc, targets=targets) + _validate_params(model, doc) if image is None: image = _extract_image(doc) @@ -353,7 +351,7 @@ def explain_prediction_keras_text(model, """ assert tokens is not None - _validate_params(model, doc, targets=targets, tokens=tokens) + _validate_params(model, doc) tokens = _unbatch_tokens(tokens) if layer is not None: @@ -552,18 +550,11 @@ def _backward_layers(model): def _validate_params(model, # type: Model doc, # type: np.ndarray - targets=None, # type: Optional[list] - tokens=None, # type: Optional[Union[np.ndarray, list]] ): # type: (...) -> None """Helper for validating all explanation function parameters.""" _validate_model(model) _validate_doc(doc) - if targets is not None: - _validate_targets(targets) - _validate_classification_target(targets[0], model.output_shape) - if tokens is not None: - _validate_tokens(doc, tokens) def _validate_model(model): @@ -588,59 +579,4 @@ def _validate_doc(doc): raise ValueError('"doc" batch size must be 1. 
' 'Got doc with batch size: %d' % batch_size) - # Note that validation of the input shape, etc is done by Keras - - -# FIXME: break this function up -def _validate_tokens(doc, tokens): - # type: (np.ndarray, Union[np.ndarray, list]) -> None - """Check that ``tokens`` contains correct items and matches ``doc``.""" - batch_size, doc_len = doc.shape - if not isinstance(tokens, (list, np.ndarray)): - # wrong type - raise TypeError('"tokens" must be list or numpy.ndarray. ' - 'Got "{}".'.format(tokens)) - - if len(tokens) == 0: - # empty list - raise ValueError('"tokens" is empty: {}'.format(tokens)) - - an_entry = tokens[0] - if isinstance(an_entry, str): - # no batch - if batch_size != 1: - # doc is batched but tokens is not - raise ValueError('If passing "tokens" without batch dimension, ' - '"doc" must have batch size = 1.' - 'Got "doc" with batch size = %d.' % batch_size) - tokens_len = len(tokens) - elif isinstance(an_entry, (list, np.ndarray)): - # batched - tokens_batch_size = len(tokens) - if tokens_batch_size != batch_size: - # batch lengths do not match - raise ValueError('"tokens" must have same number of samples ' - 'as in doc batch. Got: "tokens" samples: %d, ' - 'doc samples: %d' % (tokens_batch_size, batch_size)) - - a_token = an_entry[0] - if not isinstance(a_token, str): - # actual contents are not strings - raise TypeError('Second axis in "tokens" must contain strings. ' - 'Found "{}" (type "{}")'.format(a_token, type(a_token))) - - # https://stackoverflow.com/a/35791116/11555448 - it = iter(tokens) - the_len = len(next(it)) - if not all(len(l) == the_len for l in it): - raise ValueError('"tokens" samples do not have the same length.') - tokens_len = the_len - else: - raise TypeError('"tokens" must be an array of strings, ' - 'or an array of string arrays. ' - 'Got "{}".'.format(tokens)) - - if tokens_len != doc_len: - raise ValueError('"tokens" and "doc" lengths must match. ' - '"tokens" length: "%d". "doc" length: "%d"' - % (tokens_len, doc_len)) + # Note that validation of the input shape, etc is done by Keras \ No newline at end of file diff --git a/eli5/keras/gradcam.py b/eli5/keras/gradcam.py index c5f06fd2..f774c61f 100644 --- a/eli5/keras/gradcam.py +++ b/eli5/keras/gradcam.py @@ -15,16 +15,21 @@ from keras.models import Model # type: ignore from keras.layers import Layer # type: ignore +from eli5.nn.gradcam import ( + _validate_targets, + _validate_classification_target, +) + def gradcam_backend_keras(model, # type: Model - doc, # type: np.ndarray - targets, # type: Optional[List[int]] - activation_layer, # type: Layer - ): + doc, # type: np.ndarray + targets, # type: Optional[List[int]] + activation_layer, # type: Layer + ): # type: (...) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray] """ Compute the terms and by-products required by the Grad-CAM formula. - + Parameters ---------- model : keras.models.Model @@ -58,6 +63,8 @@ def gradcam_backend_keras(model, # type: Model # and https://github.com/ramprs/grad-cam/blob/master/classification.lua # TODO: as in pytorch PR, separate out classification tensor code if targets is not None: + _validate_targets(targets) + _validate_classification_target(targets[0], model.output_shape) target, = targets predicted_idx = K.constant([target], dtype='int64') else: diff --git a/eli5/nn/text.py b/eli5/nn/text.py index ff382dd0..7316f4db 100644 --- a/eli5/nn/text.py +++ b/eli5/nn/text.py @@ -55,6 +55,8 @@ def gradcam_spans(heatmap, # type: np.ndarray """ # FIXME: might want to do this when formatting the explanation? 
# TODO: might want to add validation for heatmap and other arguments? + _validate_tokens(doc, tokens) + length = len(tokens) heatmap = resize_1d(heatmap, length, interpolation_kind=interpolation_kind) @@ -212,4 +214,60 @@ def _trim_padding(pad_indices, # type: np.ndarray # and we can not detect and raise an error if there is padding in the middle of the text tokens = np.delete(tokens, pad_indices) heatmap = np.delete(heatmap, pad_indices) - return tokens, heatmap \ No newline at end of file + return tokens, heatmap + + +# FIXME: break this function up +def _validate_tokens(doc, tokens): + # type: (np.ndarray, Union[np.ndarray, list]) -> None + """Check that ``tokens`` contains correct items and matches ``doc``.""" + if not isinstance(tokens, (list, np.ndarray)): + # wrong type + raise TypeError('"tokens" must be list or numpy.ndarray. ' + 'Got "{}".'.format(tokens)) + + batch_size, doc_len = doc.shape[0], doc.shape[1] + if len(tokens) == 0: + # empty list + raise ValueError('"tokens" is empty: {}'.format(tokens)) + + an_entry = tokens[0] + if isinstance(an_entry, str): + # no batch + if batch_size != 1: + # doc is batched but tokens is not + raise ValueError('If passing "tokens" without batch dimension, ' + '"doc" must have batch size = 1.' + 'Got "doc" with batch size = %d.' % batch_size) + tokens_len = len(tokens) + elif isinstance(an_entry, (list, np.ndarray)): + # batched + tokens_batch_size = len(tokens) + if tokens_batch_size != batch_size: + # batch lengths do not match + raise ValueError('"tokens" must have same number of samples ' + 'as in doc batch. Got: "tokens" samples: %d, ' + 'doc samples: %d' % (tokens_batch_size, batch_size)) + + a_token = an_entry[0] + if not isinstance(a_token, str): + # actual contents are not strings + raise TypeError('Second axis in "tokens" must contain strings. ' + 'Found "{}" (type "{}")'.format(a_token, type(a_token))) + + # a way to check that all elements match some condition + # https://stackoverflow.com/a/35791116/11555448 + it = iter(tokens) + the_len = len(next(it)) + if not all(len(l) == the_len for l in it): + raise ValueError('"tokens" samples do not have the same length.') + tokens_len = the_len + else: + raise TypeError('"tokens" must be an array of strings, ' + 'or an array of string arrays. ' + 'Got "{}".'.format(tokens)) + + if tokens_len != doc_len: + raise ValueError('"tokens" and "doc" lengths must match. ' + '"tokens" length: "%d". 
"doc" length: "%d"' + % (tokens_len, doc_len)) \ No newline at end of file diff --git a/tests/test_keras.py b/tests/test_keras.py index de68489c..bfdd1f8e 100644 --- a/tests/test_keras.py +++ b/tests/test_keras.py @@ -33,7 +33,6 @@ from eli5.keras.explain_prediction import ( _validate_model, _validate_doc, - _validate_tokens, _get_layer, _autoget_layer_image, _autoget_layer_text, @@ -147,42 +146,6 @@ def test_validate_doc(): _validate_doc(np.zeros((3, 2, 2, 1))) -def test_validate_tokens(): - _validate_tokens(np.zeros((1, 3)), ['a', 'b', 'c']) - _validate_tokens(np.zeros((2, 2)), [['a', 'b'], ['c', 'd']]) - - -def test_validate_tokens_invalid(): - with pytest.raises(TypeError): - # should be in a list - _validate_tokens(np.zeros((1, 1)), 'a') - with pytest.raises(ValueError): - # empty list - _validate_tokens(np.zeros((1, 1)), []) - with pytest.raises(ValueError): - # single list but multiple samples in batch - _validate_tokens(np.zeros((3, 2)), ['a', 'b']) - - # list doesn't contain strings - with pytest.raises(TypeError): - _validate_tokens(np.zeros((1, 1)), [0]) - with pytest.raises(TypeError): - _validate_tokens(np.zeros((1, 1)), [[0]]) - - with pytest.raises(ValueError): - # not enough samples in batched list - _validate_tokens(np.zeros((3, 1)), np.array([['a'], ['b']])) - with pytest.raises(ValueError): - # tokens lengths vary - _validate_tokens(np.zeros((2, 2)), [['a', 'b'], ['c']]) - with pytest.raises(ValueError): - # tokens sample lengths do not match - _validate_tokens(np.zeros((1, 1)), ['a', 'b']) - with pytest.raises(TypeError): - # too many axes - _validate_tokens(np.zeros((1, 1,)), [[['a']]]) - - def test_explain_prediction_attributes(simple_seq_image, dummy_image): expl = eli5.explain_prediction(simple_seq_image, np.zeros((1, 32, 32, 1))) assert expl.layer is not None diff --git a/tests/test_nn_text.py b/tests/test_nn_text.py index 2a9b2110..bbea4540 100644 --- a/tests/test_nn_text.py +++ b/tests/test_nn_text.py @@ -5,7 +5,7 @@ import numpy as np from eli5.nn.text import ( - gradcam_text_spans, + gradcam_spans, resize_1d, _build_spans, _construct_document, @@ -13,6 +13,7 @@ _find_padding_values, _find_padding_tokens, _trim_padding, + _validate_tokens, ) from eli5.base import ( WeightedSpans, @@ -93,16 +94,49 @@ def test_trim_padding_invalid(): assert np.array_equal(tokens, tokens_trimmed) assert np.array_equal(heatmap, heatmap_trimmed) - # with pytest.raises(ValueError): - # _trim_padding([1], ['a', 'PAD', 'b'], np.array([1, 0, 2])) - -def test_gradcam_text_spans(): - heatmap, tokens, doc = np.array([2.0]), ['a'], [2] - res_tokens, res_heatmap, res_weighted_spans = gradcam_text_spans(heatmap, tokens, doc) +def test_gradcam_spans(): + heatmap, tokens, doc = np.array([2.0]), ['a'], np.array([[2]]) + res_tokens, res_heatmap, res_weighted_spans = gradcam_spans(heatmap, tokens, doc) assert np.array_equal(heatmap, res_heatmap) assert np.array_equal(tokens, res_tokens) assert res_weighted_spans == WeightedSpans([DocWeightedSpans( 'a', spans=[('a', [(0, 1)], 2.0)] - )]) \ No newline at end of file + )]) + + +def test_validate_tokens(): + _validate_tokens(np.zeros((1, 3)), ['a', 'b', 'c']) + _validate_tokens(np.zeros((2, 2)), [['a', 'b'], ['c', 'd']]) + + +def test_validate_tokens_invalid(): + with pytest.raises(TypeError): + # should be in a list + _validate_tokens(np.zeros((1, 1)), 'a') + with pytest.raises(ValueError): + # empty list + _validate_tokens(np.zeros((1, 1)), []) + with pytest.raises(ValueError): + # single list but multiple samples in batch + 
        _validate_tokens(np.zeros((3, 2)), ['a', 'b'])
+
+    # list doesn't contain strings
+    with pytest.raises(TypeError):
+        _validate_tokens(np.zeros((1, 1)), [0])
+    with pytest.raises(TypeError):
+        _validate_tokens(np.zeros((1, 1)), [[0]])
+
+    with pytest.raises(ValueError):
+        # not enough samples in batched list
+        _validate_tokens(np.zeros((3, 1)), np.array([['a'], ['b']]))
+    with pytest.raises(ValueError):
+        # tokens lengths vary
+        _validate_tokens(np.zeros((2, 2)), [['a', 'b'], ['c']])
+    with pytest.raises(ValueError):
+        # tokens sample lengths do not match
+        _validate_tokens(np.zeros((1, 1)), ['a', 'b'])
+    with pytest.raises(TypeError):
+        # too many axes
+        _validate_tokens(np.zeros((1, 1,)), [[['a']]])
\ No newline at end of file

From 18980b72ee41f773c20069b2b4aa9ccedbfb7d7e Mon Sep 17 00:00:00 2001
From: teabolt
Date: Mon, 9 Sep 2019 22:09:57 +0200
Subject: [PATCH 06/23] Separate out classification-specific code into a
 function

---
 eli5/keras/gradcam.py | 48 ++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/eli5/keras/gradcam.py b/eli5/keras/gradcam.py
index f774c61f..fd366456 100644
--- a/eli5/keras/gradcam.py
+++ b/eli5/keras/gradcam.py
@@ -56,26 +56,13 @@ def gradcam_backend_keras(model, # type: Model
         (numpy.ndarray, np.ndarray, np.ndarray, np.ndarray)
         Values of variables.
     """
-    # score for class in targets
-    # TODO: maybe do the sum / loss calculation in this function and pass it to gradcam.
-    # This would be consistent with what is done in
-    # https://github.com/ramprs/grad-cam/blob/master/misc/utils.lua
-    # and https://github.com/ramprs/grad-cam/blob/master/classification.lua
-    # TODO: as in pytorch PR, separate out classification tensor code
-    if targets is not None:
-        _validate_targets(targets)
-        _validate_classification_target(targets[0], model.output_shape)
-        target, = targets
-        predicted_idx = K.constant([target], dtype='int64')
-    else:
-        predicted_idx = _autoget_target_prediction(model)
-    # access value by index
-    predicted_val = K.gather(model.output[0, :], predicted_idx)
+    # class score
+    predicted_idx, predicted_val = _classification_target(model, targets)

     # output of target activation layer, i.e. activation maps of a conv layer
     activation_output = activation_layer.output

-    # score for class w.r.p.t. activation layer
+    # score w.r.t. activation layer
     grads = _calc_gradient(predicted_val, [activation_output])

     # TODO: gradcam on input layer
@@ -97,7 +84,7 @@ def _calc_gradient(ys, xs):
     """
     Return the gradient of scalar ``ys`` with respect to each of list ``xs``,
     (must be singleton)
-    and apply grad normalization.
+    and apply gradient normalization.
     """
     # differentiate ys (scalar) with respect to each variable in xs
     # K.gradients tends to produce bigger values than tf.gradients
@@ -123,9 +110,24 @@
     return grads


-def _autoget_target_prediction(model):
-    # type: (Model) -> K.variable
-    """Automatically get the index with
-    the highest predicted output from ``model``"""
-    output = model.output
-    return K.argmax(output, axis=-1)
\ No newline at end of file
+def _classification_target(model, targets):
+    # type: (Model, Optional[List[int]]) -> Tuple[K.variable, K.variable]
+    """Get a predicted index and its value from a classification based model."""
+    # TODO: maybe pass the loss/score to the gradcam function. 
+ # This would be consistent with what is done in + # https://github.com/ramprs/grad-cam/blob/master/misc/utils.lua + # and https://github.com/ramprs/grad-cam/blob/master/classification.lua + if targets is not None: + _validate_targets(targets) + target, = targets + _validate_classification_target(target, model.output_shape) + # make a dummy index + predicted_idx = K.constant([target], dtype='int64') + else: + # take the index with the highest value + # from the array of predictions + predicted_idx = K.argmax(model.output, axis=-1) + + # access value by index + predicted_val = K.gather(model.output[0, :], predicted_idx) + return predicted_idx, predicted_val \ No newline at end of file From 6cfd526970b8f53857fcb111a0a2d4d039f78d06 Mon Sep 17 00:00:00 2001 From: teabolt Date: Mon, 9 Sep 2019 22:24:16 +0200 Subject: [PATCH 07/23] Replace mypy annotation K.variable with Any --- eli5/keras/gradcam.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/eli5/keras/gradcam.py b/eli5/keras/gradcam.py index fd366456..4fd4e8be 100644 --- a/eli5/keras/gradcam.py +++ b/eli5/keras/gradcam.py @@ -7,7 +7,7 @@ """ from __future__ import absolute_import -from typing import Optional, Tuple, List +from typing import Any, Optional, Tuple, List import numpy as np # type: ignore import keras # type: ignore @@ -79,8 +79,8 @@ def gradcam_backend_keras(model, # type: Model def _calc_gradient(ys, xs): - # (K.variable, list) -> K.variable - # FIXME: K.variable is not the right type to use? + # (Any, list) -> Any + # TODO: In the future we can replace the annotation Any with a tensor type in Keras backend """ Return the gradient of scalar ``ys`` with respect to each of list ``xs``, (must be singleton) @@ -111,7 +111,7 @@ def _calc_gradient(ys, xs): def _classification_target(model, targets): - # type: (Model, Optional[List[int]]) -> Tuple[K.variable, K.variable] + # type: (Model, Optional[List[int]]) -> Tuple[Any, Any] """Get a predicted index and its value from a classification based model.""" # TODO: maybe pass the loss/score to the gradcam function. # This would be consistent with what is done in From 250bc0a0cdc0e592183b4bf3a730e8382ceef54e Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 10 Oct 2019 21:40:43 +0200 Subject: [PATCH 08/23] DOC add a link to sklearn docs for "scoring" argument --- eli5/sklearn/permutation_importance.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/eli5/sklearn/permutation_importance.py b/eli5/sklearn/permutation_importance.py index 30ab3cad..987343c5 100644 --- a/eli5/sklearn/permutation_importance.py +++ b/eli5/sklearn/permutation_importance.py @@ -87,11 +87,13 @@ class PermutationImportance(BaseEstimator, MetaEstimatorMixin): scoring : string, callable or None, default=None Scoring function to use for computing feature importances. - A string with scoring name (see scikit-learn docs) or + A string with scoring name (see scikit-learn `docs`_) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If ``None``, the ``score`` method of the estimator is used. + .. _docs: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values + n_iter : int, default 5 Number of random shuffle iterations. Decrease to improve speed, increase to get more precise estimates. 
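
[Editor's note: illustration only, not part of the patch series.] The "scoring"
parameter documented in the patch above accepts a predefined scikit-learn scorer
name or a scorer callable; a minimal sketch of how it is typically passed (the
dataset, estimator, and scorer name below are assumptions chosen for the example)::

    from sklearn.datasets import make_regression
    from sklearn.svm import SVR
    from eli5.sklearn import PermutationImportance

    # toy regression problem and a prefit estimator
    X, y = make_regression(n_samples=200, n_features=5, random_state=42)
    svr = SVR(C=20, gamma='auto').fit(X, y)

    # scoring may be a scorer name string; None falls back to svr.score
    perm = PermutationImportance(svr, scoring='neg_mean_absolute_error',
                                 n_iter=5, random_state=42).fit(X, y)
    print(perm.feature_importances_)  # one mean importance per feature
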
From c94eb5207a47598bb37782e20c15872f9baf7a5e Mon Sep 17 00:00:00 2001 From: teabolt Date: Mon, 14 Oct 2019 00:22:54 +0200 Subject: [PATCH 09/23] Add validation for public function arguments --- eli5/formatters/image.py | 13 ++--- eli5/keras/explain_prediction.py | 2 +- eli5/nn/gradcam.py | 11 +++- eli5/nn/text.py | 91 +++++++++++++++++++++++--------- tests/test_formatters_image.py | 7 --- tests/test_keras.py | 7 ++- tests/test_nn_gradcam.py | 13 ++++- tests/test_nn_text.py | 38 ++++++------- 8 files changed, 115 insertions(+), 67 deletions(-) diff --git a/eli5/formatters/image.py b/eli5/formatters/image.py index e2f15684..8ae6c857 100644 --- a/eli5/formatters/image.py +++ b/eli5/formatters/image.py @@ -7,6 +7,9 @@ import matplotlib.cm # type: ignore from eli5.base import Explanation +from eli5.nn.gradcam import ( + _validate_heatmap, +) def format_as_image(expl, # type: Explanation @@ -287,14 +290,6 @@ def _validate_image(image): 'Got: {}'.format(image)) -def _validate_heatmap(heatmap): - # type: (np.ndarray) -> None - """Check that ``heatmap`` has the right type.""" - if not isinstance(heatmap, np.ndarray): - raise TypeError('heatmap must be a numpy.ndarray instance. ' - 'Got: {}'.format(heatmap)) - - def _needs_normalization(heatmap): # type: (np.ndarray) -> bool """Return whether ``heatmap`` values are in the interval [0, 1].""" @@ -311,4 +306,4 @@ def _normalize_heatmap(h, epsilon=1e-07): # https://datascience.stackexchange.com/questions/5885/how-to-scale-an-array-of-signed-integers-to-range-from-0-to-1 # add eps to avoid division by zero in case heatmap is all 0's # this also means that lmap max will be slightly less than the 'true' max - return (h - h.min()) / (h.max() - h.min() + epsilon) + return (h - h.min()) / (h.max() - h.min() + epsilon) \ No newline at end of file diff --git a/eli5/keras/explain_prediction.py b/eli5/keras/explain_prediction.py index 5368e506..724f27f8 100644 --- a/eli5/keras/explain_prediction.py +++ b/eli5/keras/explain_prediction.py @@ -552,7 +552,7 @@ def _validate_params(model, # type: Model doc, # type: np.ndarray ): # type: (...) -> None - """Helper for validating all explanation function parameters.""" + """Helper for validating explanation function parameters.""" _validate_model(model) _validate_doc(doc) diff --git a/eli5/nn/gradcam.py b/eli5/nn/gradcam.py index c92630eb..ec6af934 100644 --- a/eli5/nn/gradcam.py +++ b/eli5/nn/gradcam.py @@ -195,4 +195,13 @@ def _validate_classification_target(target, output_shape): if not (0 <= target < output_nodes): raise ValueError('Prediction target index is ' 'outside the required range [0, {}). ', - 'Got {}'.format(output_nodes, target)) \ No newline at end of file + 'Got {}'.format(output_nodes, target)) + + +def _validate_heatmap(heatmap): + # type: (np.ndarray) -> None + """Utility function to check that the ``heatmap`` + argument has the right type.""" + if not isinstance(heatmap, np.ndarray): + raise TypeError('heatmap must be a numpy.ndarray instance. ' + 'Got: "{}" (type "{}").'.format(heatmap, type(heatmap))) \ No newline at end of file diff --git a/eli5/nn/text.py b/eli5/nn/text.py index 7316f4db..81f69155 100644 --- a/eli5/nn/text.py +++ b/eli5/nn/text.py @@ -8,6 +8,9 @@ WeightedSpans, DocWeightedSpans, ) +from eli5.nn.gradcam import ( + _validate_heatmap, +) def gradcam_spans(heatmap, # type: np.ndarray @@ -30,9 +33,16 @@ def gradcam_spans(heatmap, # type: np.ndarray **Should be rank 1 (no batch dimension).** + + :raises TypeError: if ``heatmap`` is wrong type. 
+ tokens : numpy.ndarray or list Tokens that will be highlighted using weights from ``heatmap``. + + :raises TypeError: if ``tokens`` is wrong type. + :raises ValueError: if ``tokens`` contents are unexpected. + doc: numpy.ndarray Original input to the network, from which ``heatmap`` was created. @@ -53,9 +63,14 @@ def gradcam_spans(heatmap, # type: np.ndarray ``tokens`` and ``heatmap`` optionally cut from padding. A :class:`eli5.base.WeightedSpans` object with a weight for each token. """ - # FIXME: might want to do this when formatting the explanation? + # We call this before returning the explanation, NOT when formatting the explanation + # Because WeightedSpans, etc are attributes of a returned explanation # TODO: might want to add validation for heatmap and other arguments? - _validate_tokens(doc, tokens) + _validate_tokens(tokens) + _validate_tokens_value(tokens, doc) + if isinstance(tokens, list): + # convert to a common data type + tokens = np.array(tokens) length = len(tokens) heatmap = resize_1d(heatmap, length, interpolation_kind=interpolation_kind) @@ -64,14 +79,16 @@ def gradcam_spans(heatmap, # type: np.ndarray if pad_value is not None or pad_token is not None: # remove padding pad_indices = _find_padding(pad_value=pad_value, pad_token=pad_token, doc=doc, tokens=tokens) - # If pad_value is not the actual padding value, behaviour is unknown + # If passed padding argument is not the actual padding token/value, behaviour is unknown tokens, heatmap = _trim_padding(pad_indices, tokens, heatmap) + document = _construct_document(tokens) spans = _build_spans(tokens, heatmap, document) weighted_spans = WeightedSpans([ DocWeightedSpans(document, spans=spans) - ]) # why list? - for each vectorized - don't need multiple vectorizers? - # multiple highlights? - could do positive and negative expl? + ]) + # why do we have a list of WeightedSpans? One for each vectorizer? + # But we do not use multiple vectorizers? return tokens, heatmap, weighted_spans @@ -89,6 +106,9 @@ def resize_1d(heatmap, length, interpolation_kind='linear'): heatmap : numpy.ndarray Heatmap to be resized. + + :raises TypeError: if ``heatmap`` is wrong type. + length : int Required width. @@ -104,6 +124,8 @@ def resize_1d(heatmap, length, interpolation_kind='linear'): heatmap : numpy.ndarray The heatmap resized. """ + _validate_heatmap(heatmap) + _validate_length(length) if len(heatmap.shape) == 1 and heatmap.shape[0] == 1: # single weight, no batch heatmap = heatmap.repeat(length) @@ -146,7 +168,7 @@ def _build_spans(tokens, # type: Union[np.ndarray, list] def _construct_document(tokens): # type: (Union[list, np.ndarray]) -> str - """Create a document string by joining ``tokens``.""" + """Create a document string by joining ``tokens`` sequence.""" if _is_character_tokenization(tokens): sep = '' else: @@ -156,10 +178,7 @@ def _construct_document(tokens): def _is_character_tokenization(tokens): # type: (Union[list, np.ndarray]) -> bool - """ - Check whether tokenization is character-level - (returns True) or word-level (returns False). - """ + """Check whether tokenization is character-level (True) or word-level (False).""" return any(' ' in t for t in tokens) @@ -180,27 +199,27 @@ def _find_padding(pad_value=None, # type: Union[int, float] else: raise TypeError('Pass "doc" and "pad_value", ' 'or "tokens" and "pad_token".') - # TODO: warn if indices is empty - passed wrong padding char/value? 
def _find_padding_values(pad_value, doc):
     # type: (Union[int, float], np.ndarray) -> np.ndarray
     if not isinstance(pad_value, (int, float)):
         raise TypeError('"pad_value" must be int or float. Got "{}"'.format(type(pad_value)))
+    _validate_doc(doc)
     values, indices = np.where(doc == pad_value)
     return indices


 def _find_padding_tokens(pad_token, tokens):
-    # type: (str, Union[list, np.ndarray]) -> np.ndarray
+    # type: (str, np.ndarray) -> np.ndarray
     if not isinstance(pad_token, str):
         raise TypeError('"pad_token" must be str. Got "{}"'.format(type(pad_token)))
-    indices = [idx for idx, token in enumerate(tokens) if token == pad_token]
-    return np.array(indices)
+    # unpack the single-axis result so the return type is an ndarray, as annotated
+    indices, = np.where(tokens == pad_token)
+    return indices


 def _trim_padding(pad_indices,  # type: np.ndarray
-                  tokens,  # type: Union[list, np.ndarray]
+                  tokens,  # type: np.ndarray
                   heatmap,  # type: np.ndarray
                   ):
     # type: (...) -> Tuple[Union[list, np.ndarray], np.ndarray]
@@ -217,37 +236,59 @@ def _trim_padding(pad_indices,  # type: np.ndarray
     return tokens, heatmap


+def _validate_doc(doc):
+    if not isinstance(doc, np.ndarray):
+        raise TypeError('"doc" must be an instance of numpy.ndarray. '
+                        'Got "{}" (type "{}")'.format(doc, type(doc)))
+
+
+def _validate_length(length):
+    if not isinstance(length, int):
+        raise TypeError('"length" must be an integer. Got "{}" '
+                        '(type "{}")'.format(length, type(length)))
+    if length < 0:
+        raise ValueError('"length" must be a non-negative integer. '
+                         'Got "{}"'.format(length))
+
+
+# TODO:
+# docs for raises in here
+# coverage tests for new validation
+
+
 # FIXME: break this function up
-def _validate_tokens(doc, tokens):
-    # type: (np.ndarray, Union[np.ndarray, list]) -> None
+def _validate_tokens(tokens):
+    # type: (Union[np.ndarray, list]) -> None
     """Check that ``tokens`` contains correct items and matches ``doc``."""
     if not isinstance(tokens, (list, np.ndarray)):
         # wrong type
         raise TypeError('"tokens" must be list or numpy.ndarray. '
                         'Got "{}".'.format(tokens))
-
-    batch_size, doc_len = doc.shape[0], doc.shape[1]
     if len(tokens) == 0:
         # empty list
         raise ValueError('"tokens" is empty: {}'.format(tokens))


+def _validate_tokens_value(tokens, doc):
+    # type: (Union[np.ndarray, list], np.ndarray) -> None
+    doc_batch, doc_len = doc.shape[0], doc.shape[1]
     an_entry = tokens[0]
     if isinstance(an_entry, str):
         # no batch
-        if batch_size != 1:
+        if doc_batch != 1:
             # doc is batched but tokens is not
             raise ValueError('If passing "tokens" without batch dimension, '
                              '"doc" must have batch size = 1.'
-                             'Got "doc" with batch size = %d.' % batch_size)
+                             'Got "doc" with batch size = %d.' % doc_batch)
         tokens_len = len(tokens)
     elif isinstance(an_entry, (list, np.ndarray)):
         # batched
-        tokens_batch_size = len(tokens)
-        if tokens_batch_size != batch_size:
+        tokens_batch = len(tokens)
+        if tokens_batch != doc_batch:
             # batch lengths do not match
             raise ValueError('"tokens" must have same number of samples '
                              'as in doc batch. 
Got: "tokens" samples: %d, '
-                             'doc samples: %d' % (tokens_batch_size, batch_size))
+                             'doc samples: %d' % (tokens_batch, doc_batch))

         a_token = an_entry[0]
         if not isinstance(a_token, str):
             # actual contents are not strings
             raise TypeError('Second axis in "tokens" must contain strings. '
                             'Found "{}" (type "{}")'.format(a_token, type(a_token)))

-    # https://stackoverflow.com/a/35791116/11555448
+    # a way to check that all elements match some condition
+    # https://stackoverflow.com/a/35791116/11555448
     it = iter(tokens)
     the_len = len(next(it))
     if not all(len(l) == the_len for l in it):
-        raise ValueError('"tokens" samples do not have the same length.')
+        raise ValueError('"tokens" samples do not all have the same length.')
         tokens_len = the_len
     else:
         raise TypeError('"tokens" must be an array of strings, '
diff --git a/tests/test_formatters_image.py b/tests/test_formatters_image.py
index bc395afd..c287062f 100644
--- a/tests/test_formatters_image.py
+++ b/tests/test_formatters_image.py
@@ -15,7 +15,6 @@
     _cap_alpha,
     _overlay_heatmap,
     _validate_image,
-    _validate_heatmap,
 )
 from .utils_image import assert_pixel_by_pixel_equal
 import eli5
@@ -153,12 +152,6 @@ def test_validate_image():
         _validate_image(np.zeros((2, 2, 4,)))


-def test_validate_heatmap():
-    with pytest.raises(TypeError):
-        # heatmap must be a numpy array, not a Pillow image
-        _validate_heatmap(PIL.Image.new('L', (2, 2,)))
-
-
 def test_format_as_image_notransparency(catdog_rgba):
     # heatmap with full transparency
     expl = Explanation('mock',
diff --git a/tests/test_keras.py b/tests/test_keras.py
index bfdd1f8e..b765abc1 100644
--- a/tests/test_keras.py
+++ b/tests/test_keras.py
@@ -38,7 +38,6 @@
     _autoget_layer_text,
 )
 from eli5.keras.gradcam import (
-    _autoget_target_prediction,
     _calc_gradient,
 )

@@ -188,12 +187,12 @@ def test_calc_gradient(differentiable_model):

 def test_calc_gradient_nondifferentiable(nondifferentiable_model):
     with pytest.raises(ValueError):
-        grads = _calc_gradient(nondifferentiable_model.output,
-                               [nondifferentiable_model.input])
+        _calc_gradient(nondifferentiable_model.output,
+                       [nondifferentiable_model.input])


-# TODO: test_autoget_target_prediction with multiple maximum values, etc
+# TODO: test choosing multiple targets from multiple maximum values, etc


 def test_import():
diff --git a/tests/test_nn_gradcam.py b/tests/test_nn_gradcam.py
index fb8c81e5..1516a61c 100644
--- a/tests/test_nn_gradcam.py
+++ b/tests/test_nn_gradcam.py
@@ -3,11 +3,13 @@

 import pytest
 import numpy as np
+PIL = pytest.importorskip('PIL')

 from eli5.nn.gradcam import (
     gradcam_heatmap,
     _validate_targets,
     _validate_classification_target,
+    _validate_heatmap,
 )


@@ -70,4 +72,13 @@ def test_validate_classification_target():
         _validate_classification_target(2, (1, 2,))
     with pytest.raises(ValueError):
         # one less
-        _validate_classification_target(-1, (1, 1,))
\ No newline at end of file
+        _validate_classification_target(-1, (1, 1,))
+
+
+def test_validate_heatmap():
+    with pytest.raises(TypeError):
+        # heatmap must be a numpy array, not a Pillow image
+        _validate_heatmap(PIL.Image.new('L', (2, 2,)))
+    with pytest.raises(TypeError):
+        # heatmap must not be a Python list
+        _validate_heatmap([2, 3])
\ No newline at end of file
diff --git a/tests/test_nn_text.py b/tests/test_nn_text.py
index bbea4540..1beac020 100644
--- a/tests/test_nn_text.py
+++ b/tests/test_nn_text.py
@@ -14,6 +14,7 @@
     _find_padding_tokens,
     _trim_padding,
     _validate_tokens,
+    _validate_tokens_value,
 )
 from eli5.base import (
     WeightedSpans,
@@ -67,11 +68,11 @@ def test_find_padding_values():


 def test_find_padding_tokens():
-    indices = _find_padding_tokens('<PAD>', ['the', 'test', '<PAD>', '<PAD>'])
+    indices = _find_padding_tokens('<PAD>', np.array(['the', 'test', '<PAD>', '<PAD>']))
     np.array_equal(indices, np.array([2, 3]))

     with pytest.raises(TypeError):
-        
_find_padding_tokens(0, ['<PAD>'])
+        _find_padding_tokens(0, np.array(['<PAD>']))
@@ -106,37 +107,36 @@ def test_gradcam_spans():
     )])


-def test_validate_tokens():
-    _validate_tokens(np.zeros((1, 3)), ['a', 'b', 'c'])
-    _validate_tokens(np.zeros((2, 2)), [['a', 'b'], ['c', 'd']])
-
-
 def test_validate_tokens_invalid():
+    # should be in a list or numpy array
     with pytest.raises(TypeError):
-        # should be in a list
-        _validate_tokens(np.zeros((1, 1)), 'a')
+        _validate_tokens('a')
+
+    # empty list
     with pytest.raises(ValueError):
-        # empty list
-        _validate_tokens(np.zeros((1, 1)), [])
+        _validate_tokens([])
+
+
+def test_validate_tokens_value_invalid():
     with pytest.raises(ValueError):
-        # single list but multiple samples in batch
-        _validate_tokens(np.zeros((3, 2)), ['a', 'b'])
+        # single tokens list but multiple samples in input doc
+        _validate_tokens_value(['a', 'b'], np.zeros((3, 2)))

     # list doesn't contain strings
     with pytest.raises(TypeError):
-        _validate_tokens(np.zeros((1, 1)), [0])
+        _validate_tokens_value([0], np.zeros((1, 1)))
     with pytest.raises(TypeError):
-        _validate_tokens(np.zeros((1, 1)), [[0]])
+        _validate_tokens_value([[0]], np.zeros((1, 1)))

     with pytest.raises(ValueError):
         # not enough samples in batched list
-        _validate_tokens(np.zeros((3, 1)), np.array([['a'], ['b']]))
+        _validate_tokens_value(np.array([['a'], ['b']]), np.zeros((3, 1)))
     with pytest.raises(ValueError):
         # tokens lengths vary
-        _validate_tokens(np.zeros((2, 2)), [['a', 'b'], ['c']])
+        _validate_tokens_value([['a', 'b'], ['c']], np.zeros((2, 2)))
     with pytest.raises(ValueError):
         # tokens sample lengths do not match
-        _validate_tokens(np.zeros((1, 1)), ['a', 'b'])
+        _validate_tokens_value(['a', 'b'], np.zeros((1, 1)))
     with pytest.raises(TypeError):
         # too many axes
-        _validate_tokens(np.zeros((1, 1,)), [[['a']]])
\ No newline at end of file
+        _validate_tokens_value([[['a']]], np.zeros((1, 1,)))
\ No newline at end of file

From c0d9d58ad9c539a6ebd9ba5f158ade36eb6d8696 Mon Sep 17 00:00:00 2001
From: Karol Szepietowski
Date: Mon, 9 Dec 2019 20:30:35 +0100
Subject: [PATCH 10/23] `random_state` is deprecated in `OneClassSVM`

---
 tests/test_sklearn_explain_prediction.py | 2 +-
 tests/test_sklearn_explain_weights.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_sklearn_explain_prediction.py b/tests/test_sklearn_explain_prediction.py
index 646c8c7d..277fee46 100644
--- a/tests/test_sklearn_explain_prediction.py
+++ b/tests/test_sklearn_explain_prediction.py
@@ -379,7 +379,7 @@ def test_explain_linear_binary(newsgroups_train_binary, clf):

 def test_explain_one_class_svm():
     X = np.array([[0, 0], [0, 1], [5, 3], [93, 94], [90, 91]])
-    clf = OneClassSVM(kernel='linear', random_state=42).fit(X)
+    clf = OneClassSVM(kernel='linear').fit(X)
     res = explain_prediction(clf, X[0])
     assert res.targets[0].score < 0
     for expl in format_as_all(res, clf):
diff --git a/tests/test_sklearn_explain_weights.py b/tests/test_sklearn_explain_weights.py
index 6ca5f519..93df0881 100644
--- a/tests/test_sklearn_explain_weights.py
+++ b/tests/test_sklearn_explain_weights.py
@@ -210,7 +210,7 @@ def test_explain_linear_unsupported_multiclass(clf, newsgroups_train):

 def test_explain_one_class_svm():
     X = np.array([[0,0], [0, 1], [5, 3], [93, 94], [90, 91]])
-    clf = OneClassSVM(kernel='linear', random_state=42).fit(X)
+    clf = OneClassSVM(kernel='linear').fit(X)
     res = explain_weights(clf)
     assert len(res.targets) == 1
     target = res.targets[0]
From 
320119609c615560fb84491985b9a63dde0e247c Mon Sep 17 00:00:00 2001 From: Karol Szepietowski Date: Mon, 9 Dec 2019 20:41:08 +0100 Subject: [PATCH 11/23] `VectorizerMixin` is no longer base class for `CountVectorizer` --- eli5/sklearn/text.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/eli5/sklearn/text.py b/eli5/sklearn/text.py index 57296a6f..632799f6 100644 --- a/eli5/sklearn/text.py +++ b/eli5/sklearn/text.py @@ -59,6 +59,10 @@ def _get_doc_weighted_spans(doc, if hasattr(vec, 'get_doc_weighted_spans'): return vec.get_doc_weighted_spans(doc, feature_weights, feature_fn) + try: + from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin + except ImportError: # Changed in scikit-learn 0.22 + from sklearn.feature_extraction.text import VectorizerMixin if not isinstance(vec, VectorizerMixin): return None From 7aa3dcb4ce065203f7a2f0e268728585544e8c61 Mon Sep 17 00:00:00 2001 From: Karol Szepietowski Date: Mon, 9 Dec 2019 21:00:01 +0100 Subject: [PATCH 12/23] Default value of `gamma` in `sklearn.svm.*` changed from `auto` to `scale` --- tests/test_permutation_importance.py | 2 +- tests/test_sklearn_explain_weights.py | 2 +- tests/test_sklearn_permutation_importance.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_permutation_importance.py b/tests/test_permutation_importance.py index effb4ff9..f4b95233 100644 --- a/tests/test_permutation_importance.py +++ b/tests/test_permutation_importance.py @@ -41,7 +41,7 @@ def is_shuffled(X, X_sh, col): def test_get_feature_importances(boston_train): X, y, feat_names = boston_train - svr = SVR(C=20).fit(X, y) + svr = SVR(C=20, gamma='auto').fit(X, y) score, importances = get_score_importances(svr.score, X, y) assert score > 0.7 importances = dict(zip(feat_names, np.mean(importances, axis=0))) diff --git a/tests/test_sklearn_explain_weights.py b/tests/test_sklearn_explain_weights.py index 93df0881..21fc3333 100644 --- a/tests/test_sklearn_explain_weights.py +++ b/tests/test_sklearn_explain_weights.py @@ -210,7 +210,7 @@ def test_explain_linear_unsupported_multiclass(clf, newsgroups_train): def test_explain_one_class_svm(): X = np.array([[0,0], [0, 1], [5, 3], [93, 94], [90, 91]]) - clf = OneClassSVM(kernel='linear').fit(X) + clf = OneClassSVM(kernel='linear', gamma='auto').fit(X) res = explain_weights(clf) assert len(res.targets) == 1 target = res.targets[0] diff --git a/tests/test_sklearn_permutation_importance.py b/tests/test_sklearn_permutation_importance.py index 19e54e2d..4fe942fd 100644 --- a/tests/test_sklearn_permutation_importance.py +++ b/tests/test_sklearn_permutation_importance.py @@ -73,7 +73,7 @@ def test_cv(boston_train): *boston_train, noise_ratio=0.99) reg = PermutationImportance( - SVR(C=100), + SVR(C=100, gamma='auto'), random_state=42, cv=None, n_iter=50, # use the same number of experiments as with cv=10 @@ -86,7 +86,7 @@ def test_cv(boston_train): # CV feature importances reg = PermutationImportance( - SVR(C=100), + SVR(C=100, gamma='auto'), random_state=42, cv=10, ).fit(X_test, y_test) @@ -132,9 +132,9 @@ def test_feature_selection(boston_train): ), threshold=0.1, ) - pipe = make_pipeline(sel, SVR(C=10)) + pipe = make_pipeline(sel, SVR(C=10, gamma='auto')) score1 = cross_val_score(pipe, X, y).mean() - score2 = cross_val_score(SVR(C=10), X, y).mean() + score2 = cross_val_score(SVR(C=10, gamma='auto'), X, y).mean() print(score1, score2) assert score1 > score2 From f625fa9d9e2cc09712e4168cad549335056efc6e Mon Sep 17 00:00:00 2001 From: Karol Szepietowski Date: Mon, 9 
Dec 2019 21:31:37 +0100
Subject: [PATCH 13/23] New default arguments for `LogisticRegression`, `LogisticRegressionCV` and `RFECV`

---
 eli5/sklearn/text.py                  | 2 +-
 tests/test_sklearn_explain_weights.py | 2 +-
 tests/test_sklearn_transform.py       | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/eli5/sklearn/text.py b/eli5/sklearn/text.py
index 632799f6..9f3c2fdb 100644
--- a/eli5/sklearn/text.py
+++ b/eli5/sklearn/text.py
@@ -62,7 +62,7 @@ def _get_doc_weighted_spans(doc,
     try:
         from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin
     except ImportError:  # Changed in scikit-learn 0.22
-        from sklearn.feature_extraction.text import VectorizerMixin
+        from sklearn.feature_extraction.text import VectorizerMixin  # type: ignore
     if not isinstance(vec, VectorizerMixin):
         return None

diff --git a/tests/test_sklearn_explain_weights.py b/tests/test_sklearn_explain_weights.py
index 21fc3333..7f5469c2 100644
--- a/tests/test_sklearn_explain_weights.py
+++ b/tests/test_sklearn_explain_weights.py
@@ -451,7 +451,7 @@ def test_explain_random_forest_and_tree_feature_filter(newsgroups_train, clf):

 def test_explain_empty(newsgroups_train):
-    clf = LogisticRegression(C=0.01, penalty='l1', random_state=42)
+    clf = LogisticRegression(C=0.01, penalty='l1', solver='liblinear', random_state=42)
     docs, y, target_names = newsgroups_train
     vec = TfidfVectorizer()

diff --git a/tests/test_sklearn_transform.py b/tests/test_sklearn_transform.py
index aa200937..0c7123f5 100644
--- a/tests/test_sklearn_transform.py
+++ b/tests/test_sklearn_transform.py
@@ -81,19 +81,19 @@ def selection_score_func(X, y):
     (VarianceThreshold(1.0), ['']),
     (GenericUnivariateSelect(), ['']),
     (GenericUnivariateSelect(mode='k_best', param=2), ['', '']),
-    (SelectFromModel(LogisticRegression('l1', C=0.01, random_state=42)),
+    (SelectFromModel(LogisticRegression('l1', C=0.01, solver='liblinear', random_state=42, multi_class='ovr')),
      ['', '']),
     (SelectFromModel(
         PermutationImportance(
-            LogisticRegression(random_state=42),
+            LogisticRegression(solver='liblinear', random_state=42),
             cv=5, random_state=42, refit=False,
         ),
         threshold=0.1,
     ), ['', '']),
-    (RFE(LogisticRegression(random_state=42), 2),
+    (RFE(LogisticRegression(solver='liblinear', random_state=42, multi_class='ovr'), 2),
      ['', '']),
-    (RFECV(LogisticRegression(random_state=42)),
+    (RFECV(LogisticRegression(solver='liblinear', random_state=42, multi_class='ovr'), cv=3),
      ['', '', '', '']),
 ] + _additional_test_cases)
 def test_transform_feature_names_iris(transformer, expected, iris_train):

From b150387a4fe3ac4ea0f7a44ea9bfa489d300b12b Mon Sep 17 00:00:00 2001
From: Karol Szepietowski
Date: Wed, 11 Dec 2019 17:11:21 +0100
Subject: [PATCH 14/23] Move `VectorizerMixin` import to the top of file. 
--- eli5/sklearn/text.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/eli5/sklearn/text.py b/eli5/sklearn/text.py index 9f3c2fdb..20da0f48 100644 --- a/eli5/sklearn/text.py +++ b/eli5/sklearn/text.py @@ -1,8 +1,11 @@ from __future__ import absolute_import from typing import Any, Union, Callable, Dict, List, Optional, Set, Tuple -from sklearn.feature_extraction.text import VectorizerMixin # type: ignore from sklearn.pipeline import FeatureUnion # type: ignore +try: + from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin +except ImportError: # Changed in scikit-learn 0.22 + from sklearn.feature_extraction.text import VectorizerMixin # type: ignore from eli5.base import ( DocWeightedSpans, WeightedSpans, FeatureWeights, FeatureWeight, @@ -59,10 +62,6 @@ def _get_doc_weighted_spans(doc, if hasattr(vec, 'get_doc_weighted_spans'): return vec.get_doc_weighted_spans(doc, feature_weights, feature_fn) - try: - from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin - except ImportError: # Changed in scikit-learn 0.22 - from sklearn.feature_extraction.text import VectorizerMixin # type: ignore if not isinstance(vec, VectorizerMixin): return None From fc9c2d737f7316a360c5fe8e2d3e6946009022c0 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 12 Dec 2019 10:22:48 +0300 Subject: [PATCH 15/23] xfail a test of lightning with pandas dataframes lightning prediction does not work with pandas dataframes any more --- tests/test_lightning.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_lightning.py b/tests/test_lightning.py index 6519d9d5..cebbb7a8 100644 --- a/tests/test_lightning.py +++ b/tests/test_lightning.py @@ -73,6 +73,7 @@ def test_explain_weights_regressors(boston_train, reg): has_bias=False) +@pytest.mark.xfail(reason='lightning does not work with pandas dataframes any more') @pytest.mark.parametrize(['reg'], _instances(_REGRESSORS)[:2]) def test_explain_prediction_pandas(reg, boston_train): _check_explain_prediction_pandas(reg, boston_train) From e6750a51a58830951deef4d6949fab02cdb36b58 Mon Sep 17 00:00:00 2001 From: Karol Szepietowski Date: Sun, 15 Dec 2019 19:40:19 +0100 Subject: [PATCH 16/23] Upgrade version of mypy. 
Fix typing errors --- eli5/formatters/text.py | 2 +- eli5/formatters/trees.py | 2 ++ eli5/formatters/utils.py | 4 ++-- eli5/keras/gradcam.py | 2 +- eli5/lime/_vectorizer.py | 4 ++-- eli5/lime/lime.py | 4 ++-- tox.ini | 2 +- 7 files changed, 11 insertions(+), 9 deletions(-) diff --git a/eli5/formatters/text.py b/eli5/formatters/text.py index a6269ed3..138dcdc2 100644 --- a/eli5/formatters/text.py +++ b/eli5/formatters/text.py @@ -159,7 +159,7 @@ def _transition_features_lines(explanation): return [ "", "Transition features:", - tabulate(tf.coef, headers=tf.class_names, showindex=tf.class_names, + tabulate(tf.coef, headers=tf.class_names, showindex=tf.class_names, # type: ignore floatfmt="0.3f"), "" ] diff --git a/eli5/formatters/trees.py b/eli5/formatters/trees.py index 7be1e9c3..5cfbee10 100644 --- a/eli5/formatters/trees.py +++ b/eli5/formatters/trees.py @@ -24,6 +24,8 @@ def p(*args): else: assert node.left is not None assert node.right is not None + assert node.threshold is not None + feat_name = node.feature_name if depth > 0: diff --git a/eli5/formatters/utils.py b/eli5/formatters/utils.py index 542402d1..b461f0ee 100644 --- a/eli5/formatters/utils.py +++ b/eli5/formatters/utils.py @@ -144,7 +144,7 @@ def tabulate(data, # type: List[List[Any]] def format_weight(value): # type: (Real) -> str - return '{:+.3f}'.format(value) + return '{:+.3f}'.format(value) # type: ignore def format_value(value): @@ -154,4 +154,4 @@ def format_value(value): elif np.isnan(value): return 'Missing' else: - return '{:.3f}'.format(value) + return '{:.3f}'.format(value) # type: ignore diff --git a/eli5/keras/gradcam.py b/eli5/keras/gradcam.py index 1878d642..31c80f4c 100644 --- a/eli5/keras/gradcam.py +++ b/eli5/keras/gradcam.py @@ -199,7 +199,7 @@ def _validate_target(target, output_shape): output_nodes = output_shape[1:][0] if not (0 <= target < output_nodes): raise ValueError('Prediction target index is ' - 'outside the required range [0, {}). ', + 'outside the required range [0, {}). ' 'Got {}'.format(output_nodes, target)) else: raise TypeError('Prediction target must be int. ' diff --git a/eli5/lime/_vectorizer.py b/eli5/lime/_vectorizer.py index d5168454..4d40ab31 100644 --- a/eli5/lime/_vectorizer.py +++ b/eli5/lime/_vectorizer.py @@ -13,8 +13,8 @@ class SingleDocumentVectorizer(BaseEstimator, TransformerMixin): """ Fake vectorizer which converts document just to a vector of ones """ - def __init__(self, token_pattern=None): - # type: (Optional[str]) -> None + def __init__(self, token_pattern): + # type: (str) -> None self.token_pattern = token_pattern def fit(self, X, y=None): diff --git a/eli5/lime/lime.py b/eli5/lime/lime.py index 924675bb..c9c9aa78 100644 --- a/eli5/lime/lime.py +++ b/eli5/lime/lime.py @@ -148,7 +148,7 @@ def __init__(self, rbf_sigma=None, # type: float random_state=None, expand_factor=10, # type: Optional[int] - token_pattern=None, # type: str + token_pattern=None, # type: Optional[str] ): # type: (...) 
-> None self.n_samples = n_samples @@ -162,7 +162,7 @@ def __init__(self, if char_based is None: if token_pattern is None: self.char_based = False # type: Optional[bool] - self.token_pattern = DEFAULT_TOKEN_PATTERN + self.token_pattern = DEFAULT_TOKEN_PATTERN # type: str else: self.char_based = None self.token_pattern = token_pattern diff --git a/tox.ini b/tox.ini index 38e65a4c..45c686b4 100644 --- a/tox.ini +++ b/tox.ini @@ -85,7 +85,7 @@ commands={[testenv:py35-extra]commands} basepython=python3.6 deps= {[testenv]deps} - mypy == 0.641 + mypy == 0.750 lxml commands= mypy --html-report ./mypy-cov --check-untyped-defs eli5 From e508937502bfad33c2f438552cfe0e477bf07976 Mon Sep 17 00:00:00 2001 From: Karol Szepietowski Date: Mon, 16 Dec 2019 17:32:34 +0100 Subject: [PATCH 17/23] Move `tabulate` import to the top of file --- eli5/formatters/text.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/eli5/formatters/text.py b/eli5/formatters/text.py index 138dcdc2..fd8938a7 100644 --- a/eli5/formatters/text.py +++ b/eli5/formatters/text.py @@ -2,6 +2,7 @@ from __future__ import absolute_import from itertools import chain import six +from tabulate import tabulate from typing import List, Optional, Iterator from eli5.base import Explanation, FeatureImportances @@ -9,7 +10,8 @@ from .features import FormattedFeatureName from .utils import ( format_signed, format_value, format_weight, has_any_values_for_weights, - replace_spaces, should_highlight_spaces, tabulate) + replace_spaces, should_highlight_spaces) +from .utils import tabulate as eli5_tabulate from .trees import tree2text @@ -153,7 +155,6 @@ def _decision_tree_lines(explanation): def _transition_features_lines(explanation): # type: (Explanation) -> List[str] - from tabulate import tabulate # type: ignore tf = explanation.transition_features assert tf is not None return [ @@ -203,7 +204,7 @@ def _targets_lines(explanation, # type: Explanation w = target.feature_weights assert w is not None - table = tabulate( + table = eli5_tabulate( [table_line(fw) for fw in chain(w.pos, reversed(w.neg))], header=table_header, col_align=col_align, From cc90bafde4f065412a8cbfa892e114b2aec9a59c Mon Sep 17 00:00:00 2001 From: Karol Szepietowski Date: Mon, 16 Dec 2019 17:36:20 +0100 Subject: [PATCH 18/23] Fix typing in formatters.utils --- eli5/formatters/text.py | 2 +- eli5/formatters/utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/eli5/formatters/text.py b/eli5/formatters/text.py index fd8938a7..e6abb286 100644 --- a/eli5/formatters/text.py +++ b/eli5/formatters/text.py @@ -160,7 +160,7 @@ def _transition_features_lines(explanation): return [ "", "Transition features:", - tabulate(tf.coef, headers=tf.class_names, showindex=tf.class_names, # type: ignore + tabulate(tf.coef, headers=tf.class_names, showindex=tf.class_names, floatfmt="0.3f"), "" ] diff --git a/eli5/formatters/utils.py b/eli5/formatters/utils.py index b461f0ee..244eb15a 100644 --- a/eli5/formatters/utils.py +++ b/eli5/formatters/utils.py @@ -143,15 +143,15 @@ def tabulate(data, # type: List[List[Any]] def format_weight(value): - # type: (Real) -> str - return '{:+.3f}'.format(value) # type: ignore + # type: (float) -> str + return '{:+.3f}'.format(value) def format_value(value): - # type: (Optional[Real]) -> str + # type: (Optional[float]) -> str if value is None: return '' elif np.isnan(value): return 'Missing' else: - return '{:.3f}'.format(value) # type: ignore + return '{:.3f}'.format(value) From 
From 0ea24fc1ec2a63f3cfda97bc804e7bc4677d3858 Mon Sep 17 00:00:00 2001
From: Karol Szepietowski
Date: Mon, 16 Dec 2019 19:01:19 +0100
Subject: [PATCH 19/23] Remove unnecessary `# type: ignore` comments in code

---
 eli5/_feature_names.py                   |  4 ++--
 eli5/_feature_weights.py                 |  2 +-
 eli5/_graphviz.py                        |  2 +-
 eli5/base.py                             |  2 +-
 eli5/base_utils.py                       |  4 ++--
 eli5/catboost.py                         |  4 ++--
 eli5/formatters/as_dataframe.py          |  2 +-
 eli5/formatters/as_dict.py               |  4 ++--
 eli5/formatters/html.py                  |  4 ++--
 eli5/formatters/image.py                 |  6 +++---
 eli5/formatters/text_helpers.py          |  2 +-
 eli5/formatters/utils.py                 |  3 +--
 eli5/ipython.py                          |  4 ++--
 eli5/keras/explain_prediction.py         | 18 +++++++++---------
 eli5/keras/gradcam.py                    | 10 +++++-----
 eli5/lightgbm.py                         |  4 ++--
 eli5/lightning.py                        |  6 +++---
 eli5/lime/_vectorizer.py                 |  4 ++--
 eli5/lime/lime.py                        | 12 ++++++------
 eli5/lime/samplers.py                    | 14 +++++++-------
 eli5/lime/textutils.py                   |  4 ++--
 eli5/lime/utils.py                       | 12 ++++++------
 eli5/permutation_importance.py           |  4 ++--
 eli5/sklearn/explain_prediction.py       | 16 ++++++++--------
 eli5/sklearn/explain_weights.py          | 18 +++++++++---------
 eli5/sklearn/permutation_importance.py   | 14 +++++++-------
 eli5/sklearn/text.py                     |  4 ++--
 eli5/sklearn/transform.py                | 10 +++++-----
 eli5/sklearn/treeinspect.py              |  4 ++--
 eli5/sklearn/unhashing.py                |  8 ++++----
 eli5/sklearn/utils.py                    | 10 +++++-----
 eli5/sklearn_crfsuite/explain_weights.py |  6 +++---
 eli5/utils.py                            |  4 ++--
 eli5/xgboost.py                          |  6 +++---
 tox.ini                                  |  2 +-
 35 files changed, 116 insertions(+), 117 deletions(-)

diff --git a/eli5/_feature_names.py b/eli5/_feature_names.py
index fecb820f..ff1fd80c 100644
--- a/eli5/_feature_names.py
+++ b/eli5/_feature_names.py
@@ -5,8 +5,8 @@
     Union, Callable, Pattern
 )
 
-import numpy as np  # type: ignore
-import scipy.sparse as sp  # type: ignore
+import numpy as np
+import scipy.sparse as sp
 
 
 class FeatureNames(Sized, Iterable):
diff --git a/eli5/_feature_weights.py b/eli5/_feature_weights.py
index 0b737795..1c096e3a 100644
--- a/eli5/_feature_weights.py
+++ b/eli5/_feature_weights.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 
-import numpy as np  # type: ignore
+import numpy as np
 
 from eli5.base import FeatureWeights, FeatureWeight
 from .utils import argsort_k_largest_positive, argsort_k_smallest, mask
diff --git a/eli5/_graphviz.py b/eli5/_graphviz.py
index 4632a925..36b26a17 100644
--- a/eli5/_graphviz.py
+++ b/eli5/_graphviz.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-import graphviz  # type: ignore
+import graphviz
 
 
 def is_supported():
diff --git a/eli5/base.py b/eli5/base.py
index f6e4c66b..3bac3b5b 100644
--- a/eli5/base.py
+++ b/eli5/base.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from typing import Any, List, Tuple, Union, Optional
 
-import numpy as np  # type: ignore
+import numpy as np
 
 from .base_utils import attrs
 from .formatters.features import FormattedFeatureName
diff --git a/eli5/base_utils.py b/eli5/base_utils.py
index 779c6d64..f1081c3b 100644
--- a/eli5/base_utils.py
+++ b/eli5/base_utils.py
@@ -1,9 +1,9 @@
 import inspect
 
-import attr  # type: ignore
+import attr
 
 try:
-    from functools import singledispatch  # type: ignore
+    from functools import singledispatch
 except ImportError:
     from singledispatch import singledispatch  # type: ignore
diff --git a/eli5/catboost.py b/eli5/catboost.py
index 2e495cb6..56abfa9e 100644
--- a/eli5/catboost.py
+++ b/eli5/catboost.py
@@ -1,7 +1,7 @@
 from __future__ import absolute_import, division
 
-import numpy as np  # type: ignore
-import catboost  # type: ignore
+import numpy as np
+import catboost
 
 from eli5.explain import explain_weights
 from eli5._feature_importances import get_feature_importance_explanation
diff --git a/eli5/formatters/as_dataframe.py b/eli5/formatters/as_dataframe.py
index 6f2fc302..5b801e75 100644
--- a/eli5/formatters/as_dataframe.py
+++ b/eli5/formatters/as_dataframe.py
@@ -2,7 +2,7 @@
 from typing import Any, Dict, List, Optional
 import warnings
 
-import pandas as pd  # type: ignore
+import pandas as pd
 
 import eli5
 from eli5.base import (
diff --git a/eli5/formatters/as_dict.py b/eli5/formatters/as_dict.py
index 1878d8b0..fbad5ee5 100644
--- a/eli5/formatters/as_dict.py
+++ b/eli5/formatters/as_dict.py
@@ -1,7 +1,7 @@
 import six
 
-import attr  # type: ignore
-import numpy as np  # type: ignore
+import attr
+import numpy as np
 
 from .features import FormattedFeatureName
diff --git a/eli5/formatters/html.py b/eli5/formatters/html.py
index 54cf2e6c..0167b33e 100644
--- a/eli5/formatters/html.py
+++ b/eli5/formatters/html.py
@@ -3,8 +3,8 @@
 from itertools import groupby
 from typing import List, Optional, Tuple
 
-import numpy as np  # type: ignore
-from jinja2 import Environment, PackageLoader  # type: ignore
+import numpy as np
+from jinja2 import Environment, PackageLoader
 
 from eli5 import _graphviz
 from eli5.base import (Explanation, TargetExplanation, FeatureWeights,
diff --git a/eli5/formatters/image.py b/eli5/formatters/image.py
index 59a77fa2..f776b2c2 100644
--- a/eli5/formatters/image.py
+++ b/eli5/formatters/image.py
@@ -2,9 +2,9 @@ from __future__ import absolute_import
 from typing import Union, Optional, Callable
 
-import numpy as np  # type: ignore
-from PIL import Image  # type: ignore
-import matplotlib.cm  # type: ignore
+import numpy as np
+from PIL import Image
+import matplotlib.cm
 
 from eli5.base import Explanation
diff --git a/eli5/formatters/text_helpers.py b/eli5/formatters/text_helpers.py
index dc5ff28a..c63c66b2 100644
--- a/eli5/formatters/text_helpers.py
+++ b/eli5/formatters/text_helpers.py
@@ -1,7 +1,7 @@
 from collections import Counter
 from typing import List, Optional
 
-import numpy as np  # type: ignore
+import numpy as np
 
 from eli5.base import TargetExplanation, WeightedSpans, DocWeightedSpans
 from eli5.base_utils import attrs
diff --git a/eli5/formatters/utils.py b/eli5/formatters/utils.py
index 244eb15a..2e6d2d39 100644
--- a/eli5/formatters/utils.py
+++ b/eli5/formatters/utils.py
@@ -2,10 +2,9 @@
 from itertools import chain
 import re
 import six
-from numbers import Real
 from typing import Any, Union, List, Dict, Callable, Match, Optional
 
-import numpy as np  # type: ignore
+import numpy as np
 
 from eli5.base import Explanation
 from .features import FormattedFeatureName
diff --git a/eli5/ipython.py b/eli5/ipython.py
index 0039c39e..033a840c 100644
--- a/eli5/ipython.py
+++ b/eli5/ipython.py
@@ -3,7 +3,7 @@
 from typing import Any, Dict, Tuple
 import warnings
 
-from IPython.display import HTML, Image  # type: ignore
+from IPython.display import HTML, Image
 
 from .explain import explain_weights, explain_prediction
 from .formatters import format_as_html, fields
@@ -11,7 +11,7 @@
 try:
     from .formatters.image import format_as_image
 except ImportError as e:
     # missing dependencies
-    format_as_image = e # type: ignore
+    format_as_image = e  # type: ignore
 
 FORMAT_KWARGS = {'include_styles', 'force_weights',
diff --git a/eli5/keras/explain_prediction.py b/eli5/keras/explain_prediction.py
index d4928276..73deb25b 100644
--- a/eli5/keras/explain_prediction.py
+++ b/eli5/keras/explain_prediction.py
@@ -2,21 +2,21 @@ from __future__ import absolute_import
 from typing import Union, Optional, Callable, Tuple, List, TYPE_CHECKING
 if TYPE_CHECKING:
-    import PIL  # type: ignore
-
-import numpy as np  # type: ignore
-import keras  # type: ignore
-import keras.backend as K  # type: ignore
-from keras.models import Model  # type: ignore
-from keras.layers import Layer  # type: ignore
-from keras.layers import (  # type: ignore
+    import PIL
+
+import numpy as np
+import keras
+import keras.backend as K
+from keras.models import Model
+from keras.layers import Layer
+from keras.layers import (
     Conv2D,
     MaxPooling2D,
     AveragePooling2D,
     GlobalMaxPooling2D,
     GlobalAveragePooling2D,
 )
-from keras.preprocessing.image import array_to_img  # type: ignore
+from keras.preprocessing.image import array_to_img
 
 from eli5.base import Explanation, TargetExplanation
 from eli5.explain import explain_prediction
diff --git a/eli5/keras/gradcam.py b/eli5/keras/gradcam.py
index 31c80f4c..c8bb5922 100644
--- a/eli5/keras/gradcam.py
+++ b/eli5/keras/gradcam.py
@@ -2,11 +2,11 @@ from __future__ import absolute_import
 from typing import Union, Optional, Tuple, List
 
-import numpy as np  # type: ignore
-import keras  # type: ignore
-import keras.backend as K  # type: ignore
-from keras.models import Model  # type: ignore
-from keras.layers import Layer  # type: ignore
+import numpy as np
+import keras
+import keras.backend as K
+from keras.models import Model
+from keras.layers import Layer
 
 
 def gradcam(weights, activations):
diff --git a/eli5/lightgbm.py b/eli5/lightgbm.py
index b4510912..c54236b7 100644
--- a/eli5/lightgbm.py
+++ b/eli5/lightgbm.py
@@ -3,8 +3,8 @@
 from collections import defaultdict
 from typing import DefaultDict, Optional
 
-import numpy as np  # type: ignore
-import lightgbm  # type: ignore
+import numpy as np
+import lightgbm
 
 from eli5.explain import explain_weights, explain_prediction
 from eli5._feature_importances import get_feature_importance_explanation
diff --git a/eli5/lightning.py b/eli5/lightning.py
index 417d4f15..2f648064 100644
--- a/eli5/lightning.py
+++ b/eli5/lightning.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 
-from lightning.impl.base import BaseEstimator  # type: ignore
-from lightning import classification, regression  # type: ignore
-from sklearn.multiclass import OneVsRestClassifier  # type: ignore
+from lightning.impl.base import BaseEstimator
+from lightning import classification, regression
+from sklearn.multiclass import OneVsRestClassifier
 
 from eli5.base import Explanation
 from eli5.base_utils import singledispatch
diff --git a/eli5/lime/_vectorizer.py b/eli5/lime/_vectorizer.py
index 4d40ab31..5356d6cd 100644
--- a/eli5/lime/_vectorizer.py
+++ b/eli5/lime/_vectorizer.py
@@ -2,8 +2,8 @@ from __future__ import absolute_import
 from typing import Tuple, Callable, Dict, Optional, List
 
-import numpy as np  # type: ignore
-from sklearn.base import BaseEstimator, TransformerMixin  # type: ignore
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
 
 from eli5.base import DocWeightedSpans, FeatureWeights
 from eli5.sklearn.text import _get_feature_weights_dict
diff --git a/eli5/lime/lime.py b/eli5/lime/lime.py
index c9c9aa78..2968da04 100644
--- a/eli5/lime/lime.py
+++ b/eli5/lime/lime.py
@@ -6,12 +6,12 @@
 from __future__ import absolute_import
 from typing import Any, Callable, Dict, Optional
 
-import numpy as np  # type: ignore
-from sklearn.feature_extraction.text import CountVectorizer  # type: ignore
-from sklearn.linear_model import SGDClassifier  # type: ignore
-from sklearn.model_selection import train_test_split  # type: ignore
-from sklearn.utils import check_random_state  # type: ignore
-from sklearn.base import clone, BaseEstimator  # type: ignore
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import SGDClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.utils import check_random_state
+from sklearn.base import clone, BaseEstimator
 
 import eli5
 from eli5.sklearn.utils import sklearn_version
diff --git a/eli5/lime/samplers.py b/eli5/lime/samplers.py
index ff72f568..d079b471 100644
--- a/eli5/lime/samplers.py
+++ b/eli5/lime/samplers.py
@@ -5,13 +5,13 @@
 from typing import List, Tuple, Any, Union, Dict, Optional
 
 import six
-import numpy as np  # type: ignore
-from scipy.stats import itemfreq  # type: ignore
-from sklearn.base import BaseEstimator, clone  # type: ignore
-from sklearn.neighbors import KernelDensity  # type: ignore
-from sklearn.metrics import pairwise_distances  # type: ignore
-from sklearn.model_selection import GridSearchCV, KFold  # type: ignore
-from sklearn.utils import check_random_state  # type: ignore
+import numpy as np
+from scipy.stats import itemfreq
+from sklearn.base import BaseEstimator, clone
+from sklearn.neighbors import KernelDensity
+from sklearn.metrics import pairwise_distances
+from sklearn.model_selection import GridSearchCV, KFold
+from sklearn.utils import check_random_state
 
 from eli5.utils import vstack
 from eli5.lime.utils import rbf
diff --git a/eli5/lime/textutils.py b/eli5/lime/textutils.py
index e896f347..98da0428 100644
--- a/eli5/lime/textutils.py
+++ b/eli5/lime/textutils.py
@@ -7,8 +7,8 @@
 import math
 from typing import List, Tuple, Union, Optional
 
-import numpy as np  # type: ignore
-from sklearn.utils import check_random_state  # type: ignore
+import numpy as np
+from sklearn.utils import check_random_state
 
 from eli5.utils import indices_to_bool_mask, vstack
diff --git a/eli5/lime/utils.py b/eli5/lime/utils.py
index ee271a65..120dbfbd 100644
--- a/eli5/lime/utils.py
+++ b/eli5/lime/utils.py
@@ -2,12 +2,12 @@ from __future__ import absolute_import
 from typing import List, Any
 
-import numpy as np  # type: ignore
-from scipy.stats import entropy  # type: ignore
-from sklearn.pipeline import Pipeline  # type: ignore
-from sklearn.utils import check_random_state, issparse  # type: ignore
-from sklearn.utils.metaestimators import if_delegate_has_method  # type: ignore
-from sklearn.utils import shuffle as _shuffle  # type: ignore
+import numpy as np
+from scipy.stats import entropy
+from sklearn.pipeline import Pipeline
+from sklearn.utils import check_random_state, issparse
+from sklearn.utils.metaestimators import if_delegate_has_method
+from sklearn.utils import shuffle as _shuffle
 
 from eli5.utils import vstack
 from eli5.sklearn.utils import sklearn_version
diff --git a/eli5/permutation_importance.py b/eli5/permutation_importance.py
index b5c4a3f0..8ff5d40d 100644
--- a/eli5/permutation_importance.py
+++ b/eli5/permutation_importance.py
@@ -13,8 +13,8 @@
 from __future__ import absolute_import
 from typing import Tuple, List, Callable, Any
 
-import numpy as np  # type: ignore
-from sklearn.utils import check_random_state  # type: ignore
+import numpy as np
+from sklearn.utils import check_random_state
 
 
 def iter_shuffled(X, columns_to_shuffle=None, pre_shuffle=False,
diff --git a/eli5/sklearn/explain_prediction.py b/eli5/sklearn/explain_prediction.py
index 88586f05..18dcc36f 100644
--- a/eli5/sklearn/explain_prediction.py
+++ b/eli5/sklearn/explain_prediction.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 from functools import partial
 
-import numpy as np  # type: ignore
-import scipy.sparse as sp  # type: ignore
-from sklearn.base import BaseEstimator  # type: ignore
-from sklearn.ensemble import (  # type: ignore
+import numpy as np
+import scipy.sparse as sp
+from sklearn.base import BaseEstimator
+from sklearn.ensemble import (
     ExtraTreesClassifier,
     ExtraTreesRegressor,
     GradientBoostingClassifier,
@@ -12,7 +12,7 @@
     RandomForestClassifier,
     RandomForestRegressor,
 )
-from sklearn.linear_model import (  # type: ignore
+from sklearn.linear_model import (
     ElasticNet,  # includes Lasso, MultiTaskElasticNet, etc.
     ElasticNetCV,
     HuberRegressor,
@@ -34,7 +34,7 @@
     SGDRegressor,
     TheilSenRegressor,
 )
-from sklearn.svm import (  # type: ignore
+from sklearn.svm import (
     LinearSVC,
     LinearSVR,
     SVC,
@@ -43,8 +43,8 @@
     NuSVR,
     OneClassSVM,
 )
-from sklearn.multiclass import OneVsRestClassifier  # type: ignore
-from sklearn.tree import (  # type: ignore
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.tree import (
     DecisionTreeClassifier,
     DecisionTreeRegressor
 )
diff --git a/eli5/sklearn/explain_weights.py b/eli5/sklearn/explain_weights.py
index 019140a6..49010fb4 100644
--- a/eli5/sklearn/explain_weights.py
+++ b/eli5/sklearn/explain_weights.py
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 
-import numpy as np  # type: ignore
+import numpy as np
 
-from sklearn.base import BaseEstimator, RegressorMixin  # type: ignore
-from sklearn.pipeline import Pipeline  # type: ignore
-from sklearn.linear_model import (  # type: ignore
+from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import (
     ElasticNet,  # includes Lasso, MultiTaskElasticNet, etc.
     ElasticNetCV,
     HuberRegressor,
@@ -27,8 +27,8 @@
     SGDRegressor,
     TheilSenRegressor,
 )
-from sklearn.multiclass import OneVsRestClassifier  # type: ignore
-from sklearn.svm import (  # type: ignore
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.svm import (
     LinearSVC,
     LinearSVR,
     SVC,
@@ -38,8 +38,8 @@
     OneClassSVM,
 )
 # TODO: see https://github.com/scikit-learn/scikit-learn/pull/2250
-from sklearn.naive_bayes import BernoulliNB, MultinomialNB  # type: ignore
-from sklearn.ensemble import (  # type: ignore
+from sklearn.naive_bayes import BernoulliNB, MultinomialNB
+from sklearn.ensemble import (
     GradientBoostingClassifier,
     GradientBoostingRegressor,
     AdaBoostClassifier,
@@ -49,7 +49,7 @@
     ExtraTreesClassifier,
     ExtraTreesRegressor,
 )
-from sklearn.tree import (  # type: ignore
+from sklearn.tree import (
     DecisionTreeClassifier,
     DecisionTreeRegressor,
 )
diff --git a/eli5/sklearn/permutation_importance.py b/eli5/sklearn/permutation_importance.py
index 987343c5..5a963880 100644
--- a/eli5/sklearn/permutation_importance.py
+++ b/eli5/sklearn/permutation_importance.py
@@ -2,23 +2,23 @@
 from functools import partial
 from typing import List
 
-import numpy as np  # type: ignore
-from sklearn.model_selection import check_cv  # type: ignore
-from sklearn.utils.metaestimators import if_delegate_has_method  # type: ignore
-from sklearn.utils import check_array, check_random_state  # type: ignore
-from sklearn.base import (  # type: ignore
+import numpy as np
+from sklearn.model_selection import check_cv
+from sklearn.utils.metaestimators import if_delegate_has_method
+from sklearn.utils import check_array, check_random_state
+from sklearn.base import (
     BaseEstimator,
     MetaEstimatorMixin,
     clone,
     is_classifier
 )
-from sklearn.metrics.scorer import check_scoring  # type: ignore
+from sklearn.metrics.scorer import check_scoring
 
 from eli5.permutation_importance import get_score_importances
 from eli5.sklearn.utils import pandas_available
 
 if pandas_available:
-    import pandas as pd  # type: ignore
+    import pandas as pd
 
 CAVEATS_CV_NONE = """
 Feature importances are computed on the same data as used for training,
diff --git a/eli5/sklearn/text.py b/eli5/sklearn/text.py
index 20da0f48..fb2748bf 100644
--- a/eli5/sklearn/text.py
+++ b/eli5/sklearn/text.py
@@ -1,11 +1,11 @@
 from __future__ import absolute_import
 from typing import Any, Union, Callable, Dict, List, Optional, Set, Tuple
 
-from sklearn.pipeline import FeatureUnion  # type: ignore
+from sklearn.pipeline import FeatureUnion
 try:
     from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin
 except ImportError:  # Changed in scikit-learn 0.22
-    from sklearn.feature_extraction.text import VectorizerMixin  # type: ignore
+    from sklearn.feature_extraction.text import VectorizerMixin
 
 from eli5.base import (
     DocWeightedSpans, WeightedSpans, FeatureWeights, FeatureWeight,
diff --git a/eli5/sklearn/transform.py b/eli5/sklearn/transform.py
index 2d431f8c..8d79f555 100644
--- a/eli5/sklearn/transform.py
+++ b/eli5/sklearn/transform.py
@@ -1,11 +1,11 @@
 """transform_feature_names implementations for scikit-learn transformers
 """
 
-import numpy as np  # type: ignore
-from sklearn.pipeline import Pipeline, FeatureUnion  # type: ignore
-from sklearn.feature_selection.base import SelectorMixin  # type: ignore
+import numpy as np
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.feature_selection.base import SelectorMixin
 
-from sklearn.preprocessing import (  # type: ignore
+from sklearn.preprocessing import (
     MinMaxScaler,
     StandardScaler,
     MaxAbsScaler,
@@ -26,7 +26,7 @@ def _select_names(est, in_names=None):
     return [in_names[i] for i in np.flatnonzero(mask)]
 
 try:
-    from sklearn.linear_model import (  # type: ignore
+    from sklearn.linear_model import (
         RandomizedLogisticRegression,
         RandomizedLasso,
     )
diff --git a/eli5/sklearn/treeinspect.py b/eli5/sklearn/treeinspect.py
index 373397bd..d0e9f76a 100644
--- a/eli5/sklearn/treeinspect.py
+++ b/eli5/sklearn/treeinspect.py
@@ -7,8 +7,8 @@
 """
 from __future__ import absolute_import, division
 
-from sklearn.base import ClassifierMixin  # type: ignore
-from sklearn.tree import _tree, export_graphviz  # type: ignore
+from sklearn.base import ClassifierMixin
+from sklearn.tree import _tree, export_graphviz
 
 from eli5.base import TreeInfo, NodeInfo
diff --git a/eli5/sklearn/unhashing.py b/eli5/sklearn/unhashing.py
index 64b44d86..f4f79b8d 100644
--- a/eli5/sklearn/unhashing.py
+++ b/eli5/sklearn/unhashing.py
@@ -7,14 +7,14 @@
 from itertools import chain
 from typing import List, Iterable, Any, Dict, Tuple, Union
 
-import numpy as np  # type: ignore
+import numpy as np
 import six
-from sklearn.base import BaseEstimator, TransformerMixin  # type: ignore
-from sklearn.feature_extraction.text import (  # type: ignore
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.feature_extraction.text import (
     HashingVectorizer,
     FeatureHasher,
 )
-from sklearn.pipeline import FeatureUnion  # type: ignore
+from sklearn.pipeline import FeatureUnion
 
 from eli5._feature_names import FeatureNames
diff --git a/eli5/sklearn/utils.py b/eli5/sklearn/utils.py
index ba3680aa..286d078a 100644
--- a/eli5/sklearn/utils.py
+++ b/eli5/sklearn/utils.py
@@ -3,9 +3,9 @@
 from distutils.version import LooseVersion
 from typing import Any, Optional, List, Tuple
 
-import numpy as np  # type: ignore
-import scipy.sparse as sp  # type: ignore
-from sklearn.multiclass import OneVsRestClassifier  # type: ignore
+import numpy as np
+import scipy.sparse as sp
+from sklearn.multiclass import OneVsRestClassifier
 
 from eli5.sklearn.unhashing import invert_hashing_and_fit, handle_hashing_vec
 from eli5._feature_names import FeatureNames
@@ -214,7 +214,7 @@ def get_num_features(estimator):
 
 try:
-    import pandas as pd  # type: ignore
+    import pandas as pd
     pandas_available = True
 except ImportError:
     pandas_available = False
@@ -277,5 +277,5 @@ def sklearn_version():
     >>> sklearn_version() > '0.17'
     True
     """
-    from sklearn import __version__  # type: ignore
+    from sklearn import __version__
     return LooseVersion(__version__)
diff --git a/eli5/sklearn_crfsuite/explain_weights.py b/eli5/sklearn_crfsuite/explain_weights.py
index e2a9435d..6007efd7 100644
--- a/eli5/sklearn_crfsuite/explain_weights.py
+++ b/eli5/sklearn_crfsuite/explain_weights.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 
-import numpy as np  # type: ignore
-from scipy import sparse as sp  # type: ignore
-from sklearn_crfsuite import CRF  # type: ignore
+import numpy as np
+from scipy import sparse as sp
+from sklearn_crfsuite import CRF
 
 from eli5.base import Explanation, TargetExplanation, TransitionFeatureWeights
 from eli5.explain import explain_weights
diff --git a/eli5/utils.py b/eli5/utils.py
index f1ea32c4..e5f669f8 100644
--- a/eli5/utils.py
+++ b/eli5/utils.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-import numpy as np  # type: ignore
-from scipy import sparse as sp  # type: ignore
+import numpy as np
+from scipy import sparse as sp
 
 
 def argsort_k_largest(x, k):
diff --git a/eli5/xgboost.py b/eli5/xgboost.py
index 86d3ad58..ce6a079a 100644
--- a/eli5/xgboost.py
+++ b/eli5/xgboost.py
@@ -4,9 +4,9 @@
 import re
 from typing import Any, Dict, List, Tuple, Optional, Pattern
 
-import numpy as np  # type: ignore
-import scipy.sparse as sp  # type: ignore
-from xgboost import (  # type: ignore
+import numpy as np
+import scipy.sparse as sp
+from xgboost import (
     XGBClassifier,
     XGBRegressor,
     Booster,
diff --git a/tox.ini b/tox.ini
index 45c686b4..b0592901 100644
--- a/tox.ini
+++ b/tox.ini
@@ -88,7 +88,7 @@ deps=
     mypy == 0.750
     lxml
 commands=
-    mypy --html-report ./mypy-cov --check-untyped-defs eli5
+    mypy --html-report ./mypy-cov --check-untyped-defs --ignore-missing-imports eli5
 
 
 [testenv:docs]
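The two halves of patch 19 belong together: the per-import `# type: ignore` comments existed only to silence mypy's missing-stub errors for untyped third-party packages, and the `--ignore-missing-imports` flag added to the tox.ini invocation silences those globally instead, so the comments can go. A sketch of the two styles (the config-file form is an equivalent alternative, not what this repo uses):

    import numpy as np  # type: ignore   # old style: waive missing stubs per import

    # new style: plain imports, suppression moved to the invocation:
    #     mypy --check-untyped-defs --ignore-missing-imports eli5
    # or equivalently, in a mypy config file:
    #     [mypy]
    #     ignore_missing_imports = True
    import scipy.sparse as sp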
From 6bc7f8d183cf847363b916404a3c613a7a9c6039 Mon Sep 17 00:00:00 2001
From: Rafael Fernandes
Date: Fri, 17 Jan 2020 14:56:40 -0300
Subject: [PATCH 20/23] Slice sample weight to work with cross validation

---
 eli5/sklearn/permutation_importance.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/eli5/sklearn/permutation_importance.py b/eli5/sklearn/permutation_importance.py
index 5a963880..7cb77bff 100644
--- a/eli5/sklearn/permutation_importance.py
+++ b/eli5/sklearn/permutation_importance.py
@@ -214,8 +214,12 @@ def _cv_scores_importances(self, X, y, groups=None, **fit_params):
         cv = check_cv(self.cv, y, is_classifier(self.estimator))
         feature_importances = []  # type: List
         base_scores = []  # type: List[float]
+        weights = fit_params.get('sample_weight', None)
+        if weights is None:
+            weights = np.ones(len(y))
+        fit_params.pop('sample_weight', None)
         for train, test in cv.split(X, y, groups):
-            est = clone(self.estimator).fit(X[train], y[train], **fit_params)
+            est = clone(self.estimator).fit(X[train], y[train], sample_weight=weights[train], **fit_params)
             score_func = partial(self.scorer_, est)
             _base_score, _importances = self._get_score_importances(
                 score_func, X[test], y[test])

From 64bee305c2d309122305044e98fdd3bca767e1be Mon Sep 17 00:00:00 2001
From: Rafael Fernandes
Date: Mon, 20 Jan 2020 13:19:50 -0300
Subject: [PATCH 21/23] Pass sample weight only when it is present

---
 eli5/sklearn/permutation_importance.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/eli5/sklearn/permutation_importance.py b/eli5/sklearn/permutation_importance.py
index 7cb77bff..ac139801 100644
--- a/eli5/sklearn/permutation_importance.py
+++ b/eli5/sklearn/permutation_importance.py
@@ -215,11 +215,12 @@ def _cv_scores_importances(self, X, y, groups=None, **fit_params):
         feature_importances = []  # type: List
         base_scores = []  # type: List[float]
         weights = fit_params.get('sample_weight', None)
-        if weights is None:
-            weights = np.ones(len(y))
         fit_params.pop('sample_weight', None)
         for train, test in cv.split(X, y, groups):
-            est = clone(self.estimator).fit(X[train], y[train], sample_weight=weights[train], **fit_params)
+            if weights is None:
+                est = clone(self.estimator).fit(X[train], y[train], **fit_params)
+            else:
+                est = clone(self.estimator).fit(X[train], y[train], sample_weight=weights[train], **fit_params)
             score_func = partial(self.scorer_, est)
             _base_score, _importances = self._get_score_importances(
                 score_func, X[test], y[test])

From f587bfa21f860d7d16e209028886d69e5bc2ee2a Mon Sep 17 00:00:00 2001
From: Rafael Fernandes
Date: Mon, 20 Jan 2020 13:21:05 -0300
Subject: [PATCH 22/23] Add test for sklearn permutation importance with CV
 and sample weight

---
 tests/test_sklearn_permutation_importance.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tests/test_sklearn_permutation_importance.py b/tests/test_sklearn_permutation_importance.py
index 4fe942fd..4ffec3ba 100644
--- a/tests/test_sklearn_permutation_importance.py
+++ b/tests/test_sklearn_permutation_importance.py
@@ -3,7 +3,7 @@
 import numpy as np
 from sklearn.base import is_classifier, is_regressor
 from sklearn.svm import SVR, SVC
-from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
 from sklearn.model_selection import train_test_split, cross_val_score
 from sklearn.pipeline import make_pipeline
 from sklearn.feature_selection import SelectFromModel
@@ -165,6 +165,7 @@ def test_explain_weights(iris_train):
     for _expl in res:
         assert "petal width (cm)" in _expl
 
+
 def test_pandas_xgboost_support(iris_train):
     xgboost = pytest.importorskip('xgboost')
     pd = pytest.importorskip('pandas')
@@ -175,3 +176,17 @@ def test_pandas_xgboost_support(iris_train):
     est.fit(X, y)
     # we expect no exception to be raised here when using xgboost with pd.DataFrame
     perm = PermutationImportance(est).fit(X, y)
+
+
+def test_cv_sample_weight(iris_train):
+    X, y, feature_names, target_names = iris_train
+    weights_ones = np.ones(len(y))
+    model = RandomForestClassifier(random_state=42)
+
+    # we expect no exception to be raised when passing weights with a CV
+    perm_weights = PermutationImportance(model, cv=5, random_state=42).\
+        fit(X, y, sample_weight=weights_ones)
+    perm = PermutationImportance(model, cv=5, random_state=42).fit(X, y)
+
+    # passing a vector of weights filled with ones should be the same as passing no weights
+    assert (perm.feature_importances_ == perm_weights.feature_importances_).all()
\ No newline at end of file
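For reference, the behaviour the new test pins down, as a minimal usage sketch (`load_iris` stands in for the test suite's `iris_train` fixture):

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from eli5.sklearn import PermutationImportance

    X, y = load_iris(return_X_y=True)
    model = RandomForestClassifier(random_state=42)

    # With cv set, each fold's estimator receives the matching slice of
    # sample_weight; all-ones weights must reproduce the unweighted result.
    perm_w = PermutationImportance(model, cv=5, random_state=42).fit(
        X, y, sample_weight=np.ones(len(y)))
    perm = PermutationImportance(model, cv=5, random_state=42).fit(X, y)
    assert (perm.feature_importances_ == perm_w.feature_importances_).all()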
From 729e5579ca770068b313829f70b33ad43e0f17ad Mon Sep 17 00:00:00 2001
From: Rafael Fernandes
Date: Tue, 21 Jan 2020 19:01:12 -0300
Subject: [PATCH 23/23] Apply changes suggested in PR review

---
 eli5/sklearn/permutation_importance.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/eli5/sklearn/permutation_importance.py b/eli5/sklearn/permutation_importance.py
index ac139801..370be8be 100644
--- a/eli5/sklearn/permutation_importance.py
+++ b/eli5/sklearn/permutation_importance.py
@@ -214,13 +214,12 @@ def _cv_scores_importances(self, X, y, groups=None, **fit_params):
         cv = check_cv(self.cv, y, is_classifier(self.estimator))
         feature_importances = []  # type: List
         base_scores = []  # type: List[float]
-        weights = fit_params.get('sample_weight', None)
-        fit_params.pop('sample_weight', None)
+        weights = fit_params.pop('sample_weight', None)
+        fold_fit_params = fit_params.copy()
         for train, test in cv.split(X, y, groups):
-            if weights is None:
-                est = clone(self.estimator).fit(X[train], y[train], **fit_params)
-            else:
-                est = clone(self.estimator).fit(X[train], y[train], sample_weight=weights[train], **fit_params)
+            if weights is not None:
+                fold_fit_params['sample_weight'] = weights[train]
+            est = clone(self.estimator).fit(X[train], y[train], **fold_fit_params)
             score_func = partial(self.scorer_, est)
             _base_score, _importances = self._get_score_importances(
                 score_func, X[test], y[test])
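The final shape of the fix, extracted as a standalone sketch; `fit_folds` and `estimator_factory` are hypothetical names for illustration, not eli5 API:

    import numpy as np

    def fit_folds(estimator_factory, X, y, splits, **fit_params):
        # Pop the weight vector once, then hand each fold only its training
        # slice; every other fit parameter passes through unchanged.
        weights = fit_params.pop('sample_weight', None)
        fold_fit_params = fit_params.copy()
        fitted = []
        for train, _test in splits:
            if weights is not None:
                fold_fit_params['sample_weight'] = weights[train]
            fitted.append(estimator_factory().fit(X[train], y[train],
                                                  **fold_fit_params))
        return fitted

Keeping a single `fit` call, as the review suggested, means the presence of `sample_weight` changes only the per-fold fit_params dict, not the control flow.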