diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py
index f465ee3ac..eb4296d8f 100644
--- a/orangecontrib/text/widgets/owstatistics.py
+++ b/orangecontrib/text/widgets/owstatistics.py
@@ -3,7 +3,7 @@
 from copy import copy
 from itertools import groupby
 from string import punctuation
-from typing import Callable, List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple, Union, Iterator, Dict
 
 import numpy as np
 from AnyQt.QtWidgets import QComboBox, QGridLayout, QLabel, QLineEdit, QSizePolicy
@@ -14,19 +14,18 @@
 from Orange.widgets.utils.widgetpreview import WidgetPreview
 from Orange.widgets.widget import Input, Output, OWWidget
 from nltk import tokenize
+from orangecanvas.gui.utils import disconnected
 from orangewidget.widget import Msg
 
 from orangecontrib.text import Corpus
 
-# those functions are implemented here since they are used in more statistics
-from orangecontrib.text.preprocess import (
-    LowercaseTransformer,
-    RegexpTokenizer,
-    PreprocessorList
-)
+class Sources:
+    DOCUMENTS = "Documents"
+    TOKENS = "Preprocessed tokens"  # tokens or ngrams - depending on statistic
 
-def num_words(document: str, callback: Callable) -> int:
+
+def num_words(document: Union[str, List], callback: Callable) -> int:
     """
     Return number of words in document-string. Word is every entity divided
     by space, tab, newline.
@@ -35,11 +34,13 @@ def num_words(document: str, callback: Callable) -> int:
     return len(document.split())
 
 
-def char_count(document: str, callback: Callable) -> int:
+def char_count(document: Union[str, List], callback: Callable) -> int:
     """
     Count number of alpha-numerical in document/string.
     """
     callback()
+    if isinstance(document, List):
+        document = "".join(document)
     return sum(c.isalnum() for c in document)
 
 
@@ -52,37 +53,32 @@ def digit_count(document: str, callback: Callable) -> int:
 
 
 def count_appearances(
-    document: str, characters: List[str], callback: Callable
+    document: Union[str, List], characters: List[str], callback: Callable
 ) -> int:
     """
     Count number of appearances of chars from `characters` list.
     """
     callback()
     # I think it supports the majority of main languages
     # Y can be vowel too sometimes - it is not possible to distinguish
-    return sum(document.lower().count(c) for c in characters)
+    if isinstance(document, str):
+        return sum(document.lower().count(c) for c in characters)
+    else:
+        return sum(d.lower().count(c) for c in characters for d in document)
 
 
-def preprocess_only_words(corpus: Corpus) -> Corpus:
+def get_source(corpus: Corpus, source: str) -> Union[List[str], Iterator[List[str]]]:
     """
-    Apply the preprocessor that splits words, transforms them to lower case
-    (and removes punctuations).
-
-    Parameters
-    ----------
-    corpus
-        Corpus on which the preprocessor will be applied.
-
-    Returns
-    -------
-    Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams.
+    Extract source from corpus according to the source argument:
+    - if source == Sources.DOCUMENTS return documents
+    - if source == Sources.TOKENS return ngrams
     """
-    p = PreprocessorList(
-        [LowercaseTransformer(),
-         # by default regexp keeps only words (no punctuations, no spaces)
-         RegexpTokenizer()]
-    )
-    return p(corpus)
+    if source == Sources.DOCUMENTS:
+        return corpus.documents
+    elif source == Sources.TOKENS:
+        return corpus.ngrams
+    else:
+        raise ValueError(f"Wrong source {source}")
 
 
 # every statistic returns a np.ndarray with statistics
@@ -91,38 +87,34 @@ def preprocess_only_words(corpus: Corpus) -> Corpus:
 def words_count(
-    corpus: Corpus, _: str, callback: Callable
+    corpus: Corpus, _: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Count number of words in each document.
     """
-    corpus = preprocess_only_words(corpus)
+    assert source == Sources.DOCUMENTS
     # np.c_ makes column vector (ndarray) out of the list
     # [1, 2, 3] -> [[1], [2], [3]]
-    return (
-        np.c_[[num_words(d, callback) for d in corpus.documents]],
-        ["Word count"],
-    )
+    return np.c_[[num_words(d, callback) for d in corpus.documents]], ["Word count"]
 
 
 def characters_count(
-    corpus: Corpus, _: str, callback: Callable
+    corpus: Corpus, _: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Count number of characters without spaces, newlines, tabs, ...
     """
-    return (
-        np.c_[[char_count(d, callback) for d in corpus.documents]],
-        ["Character count"],
-    )
+    source = get_source(corpus, source)
+    return np.c_[[char_count(d, callback) for d in source]], ["Character count"]
 
 
 def n_gram_count(
-    corpus: Corpus, _: str, callback: Callable
+    corpus: Corpus, _: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Count number of n-grams in every document
     """
+    assert source == Sources.TOKENS
 
     def ng_count(n_gram: List[str]):
         callback()
@@ -132,11 +124,12 @@ def ng_count(n_gram: List[str]):
 
 
 def average_word_len(
-    corpus: Corpus, _: str, callback: Callable
+    corpus: Corpus, _: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Computes word density as: word count / character count + 1
     """
+    assert source == Sources.DOCUMENTS
     return (
         np.c_[
             [
@@ -149,11 +142,12 @@ def average_word_len(
 
 
 def punctuation_count(
-    corpus: Corpus, _: str, callback: Callable
+    corpus: Corpus, _: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Count number of punctuation signs
     """
+    assert source == Sources.DOCUMENTS
 
     def num_punctuation(document: str):
         callback()
@@ -166,11 +160,12 @@ def num_punctuation(document: str):
 
 
 def capital_count(
-    corpus: Corpus, _: str, callback: Callable
+    corpus: Corpus, _: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Count number of capital letters in documents
     """
+    assert source == Sources.DOCUMENTS
 
     def num_capitals(document: str):
         callback()
@@ -183,11 +178,13 @@ def num_capitals(document: str):
 
 
 def vowel_count(
-    corpus: Corpus, vowels: str, callback: Callable
+    corpus: Corpus, vowels: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Count number of vowels in documents
     """
+    assert source == Sources.DOCUMENTS
+
     # comma separated string of vowels to list
     vowels = [v.strip() for v in vowels.split(",")]
     return (
@@ -199,12 +196,14 @@ def vowel_count(
 
 
 def consonant_count(
-    corpus: Corpus, consonants: str, callback: Callable
+    corpus: Corpus, consonants: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Count number of
     consonants in documents. Consonants are all alnum characters except
     vowels and numbers
     """
+    assert source == Sources.DOCUMENTS
+
     # comma separated string of consonants to list
     consonants = [v.strip() for v in consonants.split(",")]
     return (
@@ -219,12 +218,12 @@ def consonant_count(
 
 
 def per_cent_unique_words(
-    corpus: Corpus, _: str, callback: Callable
+    corpus: Corpus, _: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Ratio between unique words count and all words count
     """
-    corpus = preprocess_only_words(corpus)
+    assert source == Sources.TOKENS
 
     def perc_unique(tokens: str):
         callback()
@@ -232,83 +231,84 @@ def perc_unique(tokens: str):
             return np.nan
         return len(set(tokens)) / len(tokens)
 
-    return np.c_[list(map(perc_unique, corpus.tokens))], ["% unique words"]
+    return np.c_[list(map(perc_unique, corpus.ngrams))], ["% unique words"]
 
 
 def starts_with(
-    corpus: Corpus, prefix: str, callback: Callable
+    corpus: Corpus, prefix: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Number of words that starts with the string in `prefix`.
     """
-    corpus = preprocess_only_words(corpus)
+    assert source == Sources.TOKENS
 
     def number_starts_with(tokens: List[str]):
         callback()
         return sum(t.startswith(prefix) for t in tokens)
 
     return (
-        np.c_[list(map(number_starts_with, corpus.tokens))],
+        np.c_[list(map(number_starts_with, corpus.ngrams))],
        [f"Starts with {prefix}"],
     )
 
 
 def ends_with(
-    corpus: Corpus, postfix: str, callback: Callable
+    corpus: Corpus, postfix: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Number of words that ends with the string in `postfix`.
     """
-    corpus = preprocess_only_words(corpus)
+    assert source == Sources.TOKENS
 
     def number_ends_with(tokens: List[str]):
         callback()
         return sum(t.endswith(postfix) for t in tokens)
 
     return (
-        np.c_[list(map(number_ends_with, corpus.tokens))],
+        np.c_[list(map(number_ends_with, corpus.ngrams))],
         [f"Ends with {postfix}"],
     )
 
 
 def contains(
-    corpus: Corpus, text: str, callback: Callable
+    corpus: Corpus, text: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Number of words that contains string in `text`.
     """
+    source = get_source(corpus, source)
     return (
-        np.c_[
-            [count_appearances(d, [text], callback) for d in corpus.documents]
-        ],
+        np.c_[[count_appearances(d, [text], callback) for d in source]],
         [f"Contains {text}"],
     )
 
 
 def regex(
-    corpus: Corpus, expression: str, callback: Callable
+    corpus: Corpus, expression: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Count occurrences of pattern in `expression`.
""" pattern = re.compile(expression) - def number_regex(tokens: List[str]): + def regex_matches(text: Union[str, List]): callback() - return sum(bool(pattern.match(t)) for t in tokens) + if isinstance(text, str): + return len(re.findall(pattern, text)) + else: + return sum(len(re.findall(pattern, ngram)) for ngram in text) - return ( - np.c_[list(map(number_regex, corpus.tokens))], - [f"Regex {expression}"], - ) + source = get_source(corpus, source) + return np.c_[list(map(regex_matches, source))], [f"Regex {expression}"] def pos_tags( - corpus: Corpus, pos_tags: str, callback: Callable + corpus: Corpus, pos_tags: str, source: str, callback: Callable ) -> Optional[Tuple[np.ndarray, List[str]]]: """ Count number of specified pos tags in corpus """ + assert source == Sources.TOKENS p_tags = [v.strip().lower() for v in pos_tags.split(",")] def cust_count(tags): @@ -325,7 +325,7 @@ def cust_count(tags): def yule( - corpus: Corpus, _: str, callback: Callable + corpus: Corpus, _: str, source: str, callback: Callable ) -> Optional[Tuple[np.ndarray, List[str]]]: """ Yule's I measure: higher number is higher diversity - richer vocabulary @@ -333,6 +333,7 @@ def yule( Mathematical Proceedings of the Cambridge Philosophical Society, 42(2), B1-B2. doi:10.1017/S0305004100022799 """ + assert source == Sources.TOKENS if corpus.pos_tags is None: return None @@ -354,13 +355,13 @@ def yules_i(tags): def lix( - corpus: Corpus, _: str, callback: Callable + corpus: Corpus, _: str, source: str, callback: Callable ) -> Optional[Tuple[np.ndarray, List[str]]]: """ Readability index LIX https://en.wikipedia.org/wiki/Lix_(readability_test) """ - corpus = preprocess_only_words(corpus) + assert source == Sources.TOKENS tokenizer = tokenize.PunktSentenceTokenizer() def lix_index(document, tokens): @@ -393,18 +394,21 @@ class ComputeValue: pattern Some statistics need additional parameter with the pattern (e.g. starts with), for others it is set to empty string. + source + Part of the corpus used for computation: either tokens/ngrams or whole documents """ - def __init__(self, function: Callable, pattern: str) -> None: + def __init__(self, function: Callable, pattern: str, source: str) -> None: self.function = function self.pattern = pattern + self.source = source def __call__(self, data: Corpus) -> np.ndarray: """ This function compute values on new table. """ # lambda is added as a placeholder for a callback. 
-        return self.function(data, self.pattern, lambda: True)[0]
+        return self.function(data, self.pattern, self.source, lambda: True)[0]
 
     def __eq__(self, other):
         return self.function == other.function and self.pattern == other.pattern
@@ -419,30 +423,32 @@ def __hash__(self):
 STATISTICS = [
-    # (name of the statistics, function to compute, default value)
+    # (name of the statistic, function to compute, default value, supported sources)
     # if default value is None - text box is not required
-    ("Word count", words_count, None),
-    ("Character count", characters_count, None),
-    ("N-gram count", n_gram_count, None),
-    ("Average word length", average_word_len, None),
-    ("Punctuation count", punctuation_count, None),
-    ("Capital letter count", capital_count, None),
-    ("Vowel count", vowel_count, "a,e,i,o,u"),
+    ("Word count", words_count, None, (Sources.DOCUMENTS,)),
+    ("Character count", characters_count, None, (Sources.DOCUMENTS, Sources.TOKENS)),
+    ("N-gram count", n_gram_count, None, (Sources.TOKENS,)),
+    ("Average word length", average_word_len, None, (Sources.DOCUMENTS,)),
+    ("Punctuation count", punctuation_count, None, (Sources.DOCUMENTS,)),
+    ("Capital letter count", capital_count, None, (Sources.DOCUMENTS,)),
+    ("Vowel count", vowel_count, "a,e,i,o,u", (Sources.DOCUMENTS,)),
     (
         "Consonant count",
         consonant_count,
         "b,c,d,f,g,h,j,k,l,m,n,p,q,r,s,t,v,w,x,y,z",
+        (Sources.DOCUMENTS,),
     ),
-    ("Per cent unique words", per_cent_unique_words, None),
-    ("Starts with", starts_with, ""),
-    ("Ends with", ends_with, ""),
-    ("Contains", contains, ""),
-    ("Regex", regex, ""),
-    ("POS tag", pos_tags, "NN,VV,JJ"),
-    ("Yule's I", yule, None),
-    ("LIX index", lix, None),
+    ("Per cent unique terms", per_cent_unique_words, None, (Sources.TOKENS,)),
+    ("Starts with", starts_with, "", (Sources.TOKENS,)),
+    ("Ends with", ends_with, "", (Sources.TOKENS,)),
+    ("Contains", contains, "", (Sources.DOCUMENTS, Sources.TOKENS)),
+    ("Regex", regex, "", (Sources.DOCUMENTS, Sources.TOKENS)),
+    ("POS tag", pos_tags, "NN,VV,JJ", (Sources.TOKENS,)),
+    ("Yule's I", yule, None, (Sources.TOKENS,)),
+    ("LIX index", lix, None, (Sources.TOKENS,)),
 ]
 STATISTICS_NAMES = list(list(zip(*STATISTICS))[0])
 STATISTICS_FUNCTIONS = list(list(zip(*STATISTICS))[1])
 STATISTICS_DEFAULT_VALUE = list(list(zip(*STATISTICS))[2])
+STATISTICS_DEFAULT_SOURCES = list(list(zip(*STATISTICS))[3])
 
 
-def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None:
+def run(corpus: Corpus, statistics: List[Tuple[int, str, str]], state: TaskState) -> None:
@@ -466,12 +472,12 @@
     def advance():
         state.set_progress_value(next(tick_values))
 
-    for s, patern in statistics:
+    for s, pattern, source in statistics:
         fun = STATISTICS_FUNCTIONS[s]
-        result = fun(corpus, patern, advance)
+        result = fun(corpus, pattern, source, advance)
         if result is not None:
-            result = result + (ComputeValue(fun, patern),)
-            state.set_partial_result((s, patern, result))
+            result = result + (ComputeValue(fun, pattern, source),)
+            state.set_partial_result((s, pattern, source, result))
 
 
 class OWStatistics(OWWidget, ConcurrentWidgetMixin):
@@ -491,12 +497,14 @@ class Warning(OWWidget.Warning):
            "{} statistics cannot be computed and is omitted from results."
         )
 
+    # todo: update settings version and migration
     want_main_area = False
     mainArea_width_height_ratio = None
 
-    # settings
-    default_rules = [(0, ""), (1, "")]  # rules used to reset the active rules
-    active_rules: List[Tuple[int, str]] = Setting(default_rules[:])
+    settings_version = 2
+    # rules used to reset the active rules
+    default_rules = [(0, "", STATISTICS[0][-1][0]), (1, "", STATISTICS[0][-1][0])]
+    active_rules: List[Tuple[int, str, str]] = Setting(default_rules[:])
     # rules active at time of apply clicked
     applied_rules: Optional[List[Tuple[int, str]]] = None
@@ -507,12 +515,14 @@ def __init__(self) -> None:
         ConcurrentWidgetMixin.__init__(self)
         self.corpus = None
 
-        # the list with combos from the widget
-        self.combos = []
+        # the list with combos for selecting statistics from the widget
+        self.statistics_combos = []
         # the list with line edits from the widget
         self.line_edits = []
         # the list of buttons in front of controls that removes them
         self.remove_buttons = []
+        # the list with combos for selecting what the statistic is computed on
+        self.source_combos = []
 
         self._init_controls()
@@ -542,6 +552,7 @@ def _init_statistics_box(self) -> None:
         grid.setColumnStretch(2, 100)
         grid.addWidget(QLabel("Feature"), 0, 1)
         grid.addWidget(QLabel("Pattern"), 0, 2)
+        grid.addWidget(QLabel("Compute for"), 0, 3)
 
         gui.button(
             box,
@@ -562,7 +573,7 @@ def adjust_n_rule_rows(self) -> None:
         """
 
         def _add_line():
-            n_lines = len(self.combos) + 1
+            n_lines = len(self.statistics_combos) + 1
 
             # add delete symbol
             button = gui.button(
@@ -577,23 +588,29 @@ def _add_line():
             combo.addItems(STATISTICS_NAMES)
             combo.currentIndexChanged.connect(self._sync_edit_combo)
             self.rules_grid.addWidget(combo, n_lines, 1)
-            self.combos.append(combo)
+            self.statistics_combos.append(combo)
 
-            # add line edit for patern
+            # add line edit for pattern
             line_edit = QLineEdit()
             self.rules_grid.addWidget(line_edit, n_lines, 2)
             line_edit.textChanged.connect(self._sync_edit_line)
             self.line_edits.append(line_edit)
 
+            # add statistics type dropdown
+            combo = QComboBox()
+            combo.currentIndexChanged.connect(self._sync_edit_source_combo)
+            self.rules_grid.addWidget(combo, n_lines, 3)
+            self.source_combos.append(combo)
+
         def _remove_line():
-            self.combos.pop().deleteLater()
+            self.statistics_combos.pop().deleteLater()
             self.line_edits.pop().deleteLater()
+            self.source_combos.pop().deleteLater()
             self.remove_buttons.pop().deleteLater()
 
         def _fix_tab_order():
-            # TODO: write it differently - check create class
-            for i, (r, c, l) in enumerate(
-                zip(self.active_rules, self.combos, self.line_edits)
+            for i, (r, c, l, s) in enumerate(
+                zip(self.active_rules, self.statistics_combos, self.line_edits, self.source_combos)
             ):
                 c.setCurrentIndex(r[0])  # update combo
                 l.setText(r[1])  # update line edit
@@ -601,17 +618,21 @@ def _fix_tab_order():
                     l.setVisible(True)
                 else:
                     l.setVisible(False)
+                with disconnected(s.currentIndexChanged, self._sync_edit_source_combo):
+                    s.clear()
+                    s.addItems(STATISTICS_DEFAULT_SOURCES[r[0]])
+                    s.setCurrentText(r[2])
 
         n = len(self.active_rules)
-        while n > len(self.combos):
+        while n > len(self.statistics_combos):
             _add_line()
-        while len(self.combos) > n:
+        while len(self.statistics_combos) > n:
             _remove_line()
         _fix_tab_order()
 
     def _add_row(self) -> None:
         """ Add a new row to the statistic box """
-        self.active_rules.append((0, ""))
+        self.active_rules.append((0, "", STATISTICS_DEFAULT_SOURCES[0][0]))
         self.adjust_n_rule_rows()
 
     def _remove_row(self) -> None:
@@ -623,20 +644,27 @@ def _remove_row(self) -> None:
     def _sync_edit_combo(self) -> None:
""" Update rules when combo value changed """ combo = self.sender() - edit_index = self.combos.index(combo) + edit_index = self.statistics_combos.index(combo) selected_i = combo.currentIndex() - default_value = STATISTICS_DEFAULT_VALUE[selected_i] - self.active_rules[edit_index] = (selected_i, default_value) + default_value = STATISTICS_DEFAULT_VALUE[selected_i] or "" + default_source = STATISTICS_DEFAULT_SOURCES[selected_i][0] + self.active_rules[edit_index] = (selected_i, default_value, default_source) self.adjust_n_rule_rows() def _sync_edit_line(self) -> None: """ Update rules when line edit value changed """ line_edit = self.sender() edit_index = self.line_edits.index(line_edit) - self.active_rules[edit_index] = ( - self.active_rules[edit_index][0], - line_edit.text(), - ) + arules = self.active_rules[edit_index] + self.active_rules[edit_index] = (arules[0], line_edit.text(), arules[2]) + + def _sync_edit_source_combo(self) -> None: + """ Update rules when source value change """ + combo = self.sender() + edit_index = self.source_combos.index(combo) + value = combo.currentText() + arules = self.active_rules[edit_index] + self.active_rules[edit_index] = (arules[0], arules[1], value) @Inputs.corpus def set_data(self, corpus) -> None: @@ -666,10 +694,10 @@ def on_exception(self, exception: Exception) -> None: raise exception def on_partial_result( - self, result: Tuple[int, str, Tuple[np.ndarray, List[str], Callable]] + self, result: Tuple[int, str, str, Tuple[np.ndarray, List[str], Callable]] ) -> None: - statistic, patern, result = result - self.result_dict[(statistic, patern)] = result + statistic, patern, source, result = result + self.result_dict[(statistic, patern, source)] = result def on_done(self, result: None) -> None: # join results @@ -707,6 +735,21 @@ def output_results(self) -> None: ) self.Outputs.corpus.send(new_corpus) + @classmethod + def migrate_settings(cls, settings: Dict, version: int): + def def_source(idx): + """Return source that behaviour is the most similar to previous version""" + if STATISTICS_NAMES[idx] == "Regex": + # regex was working on tokens in the previous version + return Sources.TOKENS + # others that allow both sources were working on documents + return STATISTICS_DEFAULT_SOURCES[idx][0] + + if version < 2: + if "active_rules" in settings: + new_rules = [(r, v, def_source(r)) for r, v in settings["active_rules"]] + settings["active_rules"] = new_rules + if __name__ == "__main__": WidgetPreview(OWStatistics).run(Corpus.from_file("book-excerpts")) diff --git a/orangecontrib/text/widgets/tests/test_owstatistics.py b/orangecontrib/text/widgets/tests/test_owstatistics.py index ad1820413..e3082e406 100644 --- a/orangecontrib/text/widgets/tests/test_owstatistics.py +++ b/orangecontrib/text/widgets/tests/test_owstatistics.py @@ -5,11 +5,18 @@ from Orange.data import Domain, StringVariable from Orange.widgets.tests.base import WidgetTest +from Orange.widgets.tests.utils import simulate from orangecontrib.text import Corpus +from orangecontrib.text.preprocess import ( + PreprocessorList, + LowercaseTransformer, + RegexpTokenizer, + StopwordsFilter, +) from orangecontrib.text.tag import AveragedPerceptronTagger from orangecontrib.text.widgets.owstatistics import ( STATISTICS_NAMES, - OWStatistics, + OWStatistics, Sources, ) @@ -40,7 +47,9 @@ def _create_simple_data(self) -> None: text_features=[text_var], ) - def _set_feature(self, feature_name: str, value: str = ""): + def _set_feature( + self, feature_name: str, value: str = "", source: str = 
+    ):
         """
         Set statistic which need to be computed by widget. It sets only one
         statistics.
@@ -52,11 +61,15 @@ def _set_feature(self, feature_name: str, value: str = ""):
         value
             If statistic need a value (e.g. prefix) it is passed here.
         """
-        feature_index = STATISTICS_NAMES.index(feature_name)
-        self.widget.active_rules = [(feature_index, value)]
-        self.widget.adjust_n_rule_rows()
-
-    def _compute_features(self, feature_name: str, value: str = "") -> Corpus:
+        simulate.combobox_activate_item(self.widget.statistics_combos[0], feature_name)
+        self.widget.line_edits[0].setText(value)
+        simulate.combobox_activate_item(self.widget.source_combos[0], source)
+        for button in self.widget.remove_buttons[1:]:
+            button.click()
+
+    def _compute_features(
+        self, feature_name: str, value: str = "", source: str = Sources.DOCUMENTS
+    ) -> Corpus:
         """
         Send `self.corpus` to widget, set statistic which need to be computed,
         run the computation, and return widget output.
@@ -74,7 +87,7 @@ def _compute_features(self, feature_name: str, value: str = "") -> Corpus:
         """
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         self.wait_until_finished()
-        self._set_feature(feature_name, value)
+        self._set_feature(feature_name, value, source)
         self.widget.apply()
         self.wait_until_finished()
         res = self.get_output(self.widget.Outputs.corpus)
@@ -101,7 +114,10 @@ def test_words_count(self):
 
     def test_characters_count(self):
         """ Test characters count statistic """
-        data = self._compute_features("Character count")
+        data = self._compute_features("Character count", source=Sources.DOCUMENTS)
+        np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51])
+
+        data = self._compute_features("Character count", source=Sources.TOKENS)
         np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51])
 
         self.send_signal(self.widget.Inputs.corpus, None)
@@ -109,7 +125,7 @@ def test_characters_count(self):
 
     def test_n_gram_count(self):
         """ Test n-grams count statistic """
-        data = self._compute_features("N-gram count")
+        data = self._compute_features("N-gram count", source=Sources.TOKENS)
         np.testing.assert_array_equal(data.X.flatten(), [10, 12, 13, 12])
 
         self.send_signal(self.widget.Inputs.corpus, None)
@@ -161,16 +177,16 @@ def test_consonants_count(self):
 
     def test_per_cent_unique_words(self):
         """ Test per-cent unique words statistic """
-        data = self._compute_features("Per cent unique words")
+        data = self._compute_features("Per cent unique terms", source=Sources.TOKENS)
         np.testing.assert_array_almost_equal(
-            data.X.flatten(), [1, 1, 0.909091, 1]
+            data.X.flatten(), [1, 1, 0.84615, 1], decimal=5
         )
 
         with self.corpus.unlocked():
-            self.corpus[1][-1] = ""
-        data = self._compute_features("Per cent unique words")
+            self.corpus[1][-1] = " "
+        data = self._compute_features("Per cent unique terms", source=Sources.TOKENS)
         np.testing.assert_array_almost_equal(
-            data.X.flatten(), [1, np.nan, 0.909091, 1]
+            data.X.flatten(), [1, np.nan, 0.84615, 1], decimal=5
         )
 
         self.send_signal(self.widget.Inputs.corpus, None)
@@ -178,10 +194,10 @@ def test_per_cent_unique_words(self):
 
     def test_starts_with(self):
         """ Test starts with count statistic """
-        data = self._compute_features("Starts with", "a")
+        data = self._compute_features("Starts with", "a", Sources.TOKENS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 2, 2])
 
-        data = self._compute_features("Starts with", "ap")
+        data = self._compute_features("Starts with", "ap", Sources.TOKENS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
 
         self.send_signal(self.widget.Inputs.corpus, None)
@@ -189,10 +205,10 @@ def test_starts_with(self):
 
     def test_ends_with(self):
         """ Test ends with count statistic """
-        data = self._compute_features("Ends with", "t")
+        data = self._compute_features("Ends with", "t", Sources.TOKENS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [3, 3, 1, 2])
 
-        data = self._compute_features("Ends with", "et")
+        data = self._compute_features("Ends with", "et", Sources.TOKENS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 0, 0])
 
         self.send_signal(self.widget.Inputs.corpus, None)
@@ -200,28 +216,50 @@ def test_ends_with(self):
 
     def test_contains(self):
         """ Test contains count statistic """
-        data = self._compute_features("Contains", "t")
+        data = self._compute_features("Contains", "t", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9])
 
-        data = self._compute_features("Contains", "et")
+        data = self._compute_features("Contains", "et", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [2, 1, 0, 0])
 
-        data = self._compute_features("Contains", "is")
+        data = self._compute_features("Contains", "is", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
 
+        data = self._compute_features("Contains", "t", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9])
+
+        data = self._compute_features("Contains", " ", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
         self.send_signal(self.widget.Inputs.corpus, None)
         self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
 
     def test_regex(self):
         """ Test regex statistic """
-        # words that contains digit
-        data = self._compute_features("Regex", "\w*\d\w*")
+        # words that contain digit
+        data = self._compute_features("Regex", r"\w*\d\w*", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
 
-        # words that contains digit
-        data = self._compute_features("Regex", "\w*is\w*")
+        # words that contain is
+        data = self._compute_features("Regex", r"\w*is\w*", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
 
+        # count specific n-gram
+        data = self._compute_features("Regex", r"ipsum\ dolor", Sources.DOCUMENTS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [1, 0, 0, 0])
+
+        # words that contain digit
+        data = self._compute_features("Regex", r"\w*\d\w*", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
+
+        # words that contain is
+        data = self._compute_features("Regex", r"\w*is\w*", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+
+        # count specific n-gram
+        data = self._compute_features("Regex", r"ipsum\ dolor", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
         self.send_signal(self.widget.Inputs.corpus, None)
         self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
@@ -232,7 +270,7 @@ def test_pos(self):
         - test with corpus that has pos tags
         """
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self._set_feature("POS tag", "NN")
+        self._set_feature("POS tag", "NN", Sources.TOKENS)
         self.widget.apply()
         self.wait_until_finished()
         res = self.get_output(self.widget.Outputs.corpus)
@@ -243,7 +281,7 @@ def test_pos(self):
 
         result = tagger(self.corpus)
         self.send_signal(self.widget.Inputs.corpus, result)
-        self._set_feature("POS tag", "NN")
+        self._set_feature("POS tag", "NN", Sources.TOKENS)
         self.widget.apply()
         self.wait_until_finished()
         res = self.get_output(self.widget.Outputs.corpus)
@@ -258,7 +296,7 @@ def test_yule(self):
         - test with corpus that has pos tags
         """
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self._set_feature("Yule's I")
+        self._set_feature("Yule's I", source=Sources.TOKENS)
         self.widget.apply()
         self.wait_until_finished()
         res = self.get_output(self.widget.Outputs.corpus)
@@ -271,7 +309,7 @@ def test_yule(self):
 
         result = tagger(self.corpus)
         self.send_signal(self.widget.Inputs.corpus, result)
-        self._set_feature("Yule's I")
+        self._set_feature("Yule's I", source=Sources.TOKENS)
         self.widget.apply()
         self.wait_until_finished()
         res = self.get_output(self.widget.Outputs.corpus)
@@ -287,7 +325,7 @@ def test_lix(self):
         with self.corpus.unlocked():
             self.corpus[1][-1] = "simple. simple."
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self._set_feature("LIX index")
+        self._set_feature("LIX index", source=Sources.TOKENS)
         self.widget.apply()
         self.wait_until_finished()
         res = self.get_output(self.widget.Outputs.corpus)
@@ -295,6 +333,40 @@ def test_lix(self):
         # the second document will have lower complexity than the first one
         self.assertLess(res[1][0], res[0][0])
 
+    def test_stats_different_preprocessing(self):
+        pp = [LowercaseTransformer(), RegexpTokenizer(), StopwordsFilter(language="en")]
+        pp = PreprocessorList(pp)
+        self.corpus = pp(self.corpus)
+
+        data = self._compute_features("Character count", "", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [47, 44, 46, 51])
+
+        data = self._compute_features("N-gram count", "", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [8, 9, 9, 9])
+
+        data = self._compute_features("Per cent unique terms", "", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 1, 1])
+
+        # none start with a capital because of the Lowercase preprocessor
+        data = self._compute_features("Starts with", "L", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
+        data = self._compute_features("Starts with", "a", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 0, 2])
+
+        data = self._compute_features("Ends with", "a", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [0, 1, 2, 1])
+
+        # none contain a comma since we use the Regexp tokenizer
+        data = self._compute_features("Contains", ",", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
+        data = self._compute_features("Contains", "a", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [2, 2, 6, 5])
+
+        data = self._compute_features("Regex", "{e", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
     def test_statistics_combination(self):
         """
         Testing three statistics at same time and see if column concatenated
@@ -306,9 +378,9 @@ def test_statistics_combination(self):
         starts_with_index = STATISTICS_NAMES.index("Starts with")
         capital_counts_index = STATISTICS_NAMES.index("Capital letter count")
         self.widget.active_rules = [
-            (wc_index, ""),
-            (starts_with_index, "a"),
-            (capital_counts_index, ""),
+            (wc_index, "", Sources.DOCUMENTS),
+            (starts_with_index, "a", Sources.TOKENS),
+            (capital_counts_index, "", Sources.DOCUMENTS),
         ]
         self.widget.adjust_n_rule_rows()
@@ -333,43 +405,44 @@ def test_dictionary_statistics(self):
         """
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
 
-        self.widget.active_rules = [
-            (1, ""),
-        ]
+        self.widget.active_rules = [(1, "", Sources.DOCUMENTS)]
         self.widget.adjust_n_rule_rows()
         self.widget.apply()
         self.wait_until_finished()
-        self.assertListEqual([(1, None)], list(self.widget.result_dict.keys()))
+        expected = [(1, "", Sources.DOCUMENTS)]
+        self.assertListEqual(expected, list(self.widget.result_dict.keys()))
 
-        self.widget.active_rules = [(1, ""), (2, "")]
+        self.widget.active_rules = [(1, "", Sources.DOCUMENTS), (2, "", Sources.TOKENS)]
         self.widget.adjust_n_rule_rows()
         self.widget.apply()
         self.wait_until_finished()
-        self.assertListEqual(
-            [(1, ""), (2, None)], list(self.widget.result_dict.keys())
-        )
+        expected = [(1, "", Sources.DOCUMENTS), (2, "", Sources.TOKENS)]
+        self.assertListEqual(expected, list(self.widget.result_dict.keys()))
 
-        self.widget.active_rules = [(2, "")]
+        self.widget.active_rules = [(2, "", Sources.TOKENS)]
         self.widget.adjust_n_rule_rows()
         self.widget.apply()
         self.wait_until_finished()
-        self.assertListEqual([(2, None)], list(self.widget.result_dict.keys()))
+        expected = [(2, "", Sources.TOKENS)]
+        self.assertListEqual(expected, list(self.widget.result_dict.keys()))
 
         # dict should empty on new data
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         self.assertListEqual([], list(self.widget.result_dict.keys()))
 
     def test_settings(self):
-        """ Test whether context correctly restore rules """
-        rules = [(0, ""), (1, ""), (2, None)]
+        """Test whether context correctly restores rules"""
+        doc, tk = Sources.DOCUMENTS, Sources.TOKENS
+        rules = [(0, "", doc), (1, "", doc), (2, "", tk)]
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         self.widget.active_rules = rules[:]
 
         self.send_signal(self.widget.Inputs.corpus, self.book_data)
-        self.assertListEqual([(0, ""), (1, ""), (2, None)], self.widget.active_rules)
+        expected = [(0, "", doc), (1, "", doc), (2, "", tk)]
+        self.assertListEqual(expected, self.widget.active_rules)
 
     def test_compute_values(self):
         """ Test compute values on new data """
@@ -401,13 +474,13 @@ def test_add_row(self):
             if x.text() == "+"
         ][0]
         add_button.click()
-        self.assertListEqual([(0, "")], self.widget.active_rules)
+        self.assertListEqual([(0, "", Sources.DOCUMENTS)], self.widget.active_rules)
 
     def test_remove_row(self):
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self.widget.active_rules = [(0, "")]
+        self.widget.active_rules = [(0, "", Sources.DOCUMENTS)]
         self.widget.adjust_n_rule_rows()
-        self.assertListEqual([(0, "")], self.widget.active_rules)
+        self.assertListEqual([(0, "", Sources.DOCUMENTS)], self.widget.active_rules)
 
         remove_button = [
             x
@@ -417,6 +490,32 @@ def test_remove_row(self):
         remove_button.click()
         self.assertListEqual([], self.widget.active_rules)
 
+    def test_migrate_settings(self):
+        vals = [""] * 6 + ["a,e", "b,c", "", "a", "b", "c", r"\w*is", "NN,VV", "", ""]
+        settings = {"__version__": 1, "active_rules": list(zip(range(17), vals))}
+        widget = self.create_widget(OWStatistics, stored_settings=settings)
+        self.send_signal(self.widget.Inputs.corpus, self.corpus, widget=widget)
+
+        expected = [
+            (0, "", Sources.DOCUMENTS),
+            (1, "", Sources.DOCUMENTS),
+            (2, "", Sources.TOKENS),
+            (3, "", Sources.DOCUMENTS),
+            (4, "", Sources.DOCUMENTS),
+            (5, "", Sources.DOCUMENTS),
+            (6, "a,e", Sources.DOCUMENTS),
+            (7, "b,c", Sources.DOCUMENTS),
+            (8, "", Sources.TOKENS),
+            (9, "a", Sources.TOKENS),
+            (10, "b", Sources.TOKENS),
+            (11, "c", Sources.DOCUMENTS),
+            (12, r"\w*is", Sources.TOKENS),
+            (13, "NN,VV", Sources.TOKENS),
+            (14, "", Sources.TOKENS),
+            (15, "", Sources.TOKENS),
+        ]
+        self.assertListEqual(expected, widget.active_rules)
+
 
 if __name__ == "__main__":
     unittest.main()
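
A minimal usage sketch of the API this patch introduces, assuming orange3-text (with its Qt dependencies) and this change applied; it uses the bundled "book-excerpts" corpus and a no-op lambda in place of the progress callback. It is an illustration, not part of the diff:

    from orangecontrib.text import Corpus
    from orangecontrib.text.widgets.owstatistics import Sources, contains, characters_count

    corpus = Corpus.from_file("book-excerpts")

    # "Contains" supports both sources; here it counts occurrences of "the"
    # in the raw document text and returns (column vector, column names).
    counts, labels = contains(corpus, "the", Sources.DOCUMENTS, lambda: None)
    print(labels, counts.shape)  # ['Contains the'] (n_documents, 1)

    # "Character count" on preprocessed tokens joins each document's tokens
    # first, so whitespace and anything the tokenizer dropped is not counted
    # and the result can differ from the document-based count.
    counts, labels = characters_count(corpus, "", Sources.TOKENS, lambda: None)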
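And a minimal sketch of the version-1 to version-2 settings migration added above, under the same assumptions; the settings dict here is hypothetical. Each old (statistic_index, pattern) rule gains a source entry, with "Regex" (index 12) mapped to tokens to mirror the old behaviour:

    from orangecontrib.text.widgets.owstatistics import OWStatistics, Sources

    # Version-1 rules were (statistic_index, pattern) pairs.
    settings = {"__version__": 1, "active_rules": [(0, ""), (12, r"\w*is")]}
    OWStatistics.migrate_settings(settings, version=1)
    assert settings["active_rules"] == [
        (0, "", Sources.DOCUMENTS),      # Word count -> documents
        (12, r"\w*is", Sources.TOKENS),  # Regex -> tokens (old behaviour)
    ]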