diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py index fe12ae29b..dbe931dc5 100644 --- a/orangecontrib/text/widgets/owstatistics.py +++ b/orangecontrib/text/widgets/owstatistics.py @@ -3,7 +3,7 @@ from copy import copy from itertools import groupby from string import punctuation -from typing import Callable, List, Optional, Tuple +from typing import Callable, List, Optional, Tuple, Union, Generator, Iterator import numpy as np from AnyQt.QtWidgets import QComboBox, QGridLayout, QLabel, QLineEdit, QSizePolicy @@ -26,7 +26,12 @@ ) -def num_words(document: str, callback: Callable) -> int: +class Sources: + DOCUMENTS = "Documents" + TOKENS = "Preprocessed tokens" # tokens or ngrams - depending on statistic + + +def num_words(document: Union[str, List], callback: Callable) -> int: """ Return number of words in document-string. Word is every entity divided by space, tab, newline. @@ -35,11 +40,13 @@ def num_words(document: str, callback: Callable) -> int: return len(document.split()) -def char_count(document: str, callback: Callable) -> int: +def char_count(document: Union[str, List], callback: Callable) -> int: """ Count number of alpha-numerical in document/string. """ callback() + if isinstance(document, List): + document = "".join(document) return sum(c.isalnum() for c in document) @@ -52,15 +59,18 @@ def digit_count(document: str, callback: Callable) -> int: def count_appearances( - document: str, characters: List[str], callback: Callable + document: Union[str, List], characters: List[str], callback: Callable ) -> int: """ Count number of appearances of chars from `characters` list. 
 """
     callback()
     # I think it supports the majority of main languages
-    # Y can be vowel too sometimes - it is not possible to distinguish
-    return sum(document.lower().count(c) for c in characters)
+    # Y can be vowel too sometimes - it is not possible to distinguish
+    if isinstance(document, str):
+        return sum(document.lower().count(c) for c in characters)
+    else:
+        return sum(d.lower().count(c) for c in characters for d in document)
 
 
 def preprocess_only_words(corpus: Corpus) -> Corpus:
@@ -85,44 +95,54 @@ def preprocess_only_words(corpus: Corpus) -> Corpus:
     return p(corpus)
 
 
+def get_source(corpus: Corpus, source: str) -> Union[List[str], Iterator[List[str]]]:
+    """
+    Extract source from corpus according to source variable:
+    - if source == Sources.DOCUMENTS return documents
+    - if source == Sources.TOKENS return ngrams
+    """
+    if source == "Documents":
+        return corpus.documents
+    elif source == "Preprocessed tokens":
+        return corpus.ngrams
+    else:
+        raise ValueError(f"Wrong source {source}")
+
+
 # every statistic returns a np.ndarray with statistics
 # and list with variables names - it must be implemented here since some
 # statistics in the future will have more variables
 def words_count(
-    corpus: Corpus, _: str, callback: Callable
+    corpus: Corpus, _: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Count number of words in each document.
     """
-    corpus = preprocess_only_words(corpus)
+    assert source == Sources.DOCUMENTS
     # np.c_ makes column vector (ndarray) out of the list
     # [1, 2, 3] -> [[1], [2], [3]]
-    return (
-        np.c_[[num_words(d, callback) for d in corpus.documents]],
-        ["Word count"],
-    )
+    return np.c_[[num_words(d, callback) for d in corpus.documents]], ["Word count"]
 
 
 def characters_count(
-    corpus: Corpus, _: str, callback: Callable
+    corpus: Corpus, _: str, source: str, callback: Callable
 ) -> Tuple[np.ndarray, List[str]]:
     """
     Count number of characters without spaces, newlines, tabs, ...
""" - return ( - np.c_[[char_count(d, callback) for d in corpus.documents]], - ["Character count"], - ) + source = get_source(corpus, source) + return np.c_[[char_count(d, callback) for d in source]], ["Character count"] def n_gram_count( - corpus: Corpus, _: str, callback: Callable + corpus: Corpus, _: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Count number of n-grams in every document """ + assert source == Sources.TOKENS def ng_count(n_gram: List[str]): callback() @@ -132,11 +152,12 @@ def ng_count(n_gram: List[str]): def average_word_len( - corpus: Corpus, _: str, callback: Callable + corpus: Corpus, _: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Computes word density as: word count / character count + 1 """ + assert source == Sources.DOCUMENTS return ( np.c_[ [ @@ -149,11 +170,12 @@ def average_word_len( def punctuation_count( - corpus: Corpus, _: str, callback: Callable + corpus: Corpus, _: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Count number of punctuation signs """ + assert source == Sources.DOCUMENTS def num_punctuation(document: str): callback() @@ -166,11 +188,12 @@ def num_punctuation(document: str): def capital_count( - corpus: Corpus, _: str, callback: Callable + corpus: Corpus, _: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Count number of capital letters in documents """ + assert source == Sources.DOCUMENTS def num_capitals(document: str): callback() @@ -183,11 +206,13 @@ def num_capitals(document: str): def vowel_count( - corpus: Corpus, vowels: str, callback: Callable + corpus: Corpus, vowels: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Count number of vowels in documents """ + assert source == Sources.DOCUMENTS + # comma separated string of vowels to list vowels = [v.strip() for v in vowels.split(",")] return ( @@ -199,12 +224,14 @@ def vowel_count( def consonant_count( - corpus: 
Corpus, consonants: str, callback: Callable + corpus: Corpus, consonants: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Count number of consonants in documents. Consonants are all alnum characters except vowels and numbers """ + assert source == Sources.DOCUMENTS + # comma separated string of consonants to list consonants = [v.strip() for v in consonants.split(",")] return ( @@ -219,11 +246,12 @@ def consonant_count( def per_cent_unique_words( - corpus: Corpus, _: str, callback: Callable + corpus: Corpus, _: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Ratio between unique words count and all words count """ + assert source == Sources.TOKENS corpus = preprocess_only_words(corpus) def perc_unique(tokens: str): @@ -232,15 +260,16 @@ def perc_unique(tokens: str): return np.nan return len(set(tokens)) / len(tokens) - return np.c_[list(map(perc_unique, corpus.tokens))], ["% unique words"] + return np.c_[list(map(perc_unique, corpus.ngrams))], ["% unique words"] def starts_with( - corpus: Corpus, prefix: str, callback: Callable + corpus: Corpus, prefix: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Number of words that starts with the string in `prefix`. """ + assert source == Sources.TOKENS corpus = preprocess_only_words(corpus) def number_starts_with(tokens: List[str]): @@ -248,17 +277,18 @@ def number_starts_with(tokens: List[str]): return sum(t.startswith(prefix) for t in tokens) return ( - np.c_[list(map(number_starts_with, corpus.tokens))], + np.c_[list(map(number_starts_with, corpus.ngrams))], [f"Starts with {prefix}"], ) def ends_with( - corpus: Corpus, postfix: str, callback: Callable + corpus: Corpus, postfix: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Number of words that ends with the string in `postfix`. 
""" + assert source == Sources.TOKENS corpus = preprocess_only_words(corpus) def number_ends_with(tokens: List[str]): @@ -266,46 +296,50 @@ def number_ends_with(tokens: List[str]): return sum(t.endswith(postfix) for t in tokens) return ( - np.c_[list(map(number_ends_with, corpus.tokens))], + np.c_[list(map(number_ends_with, corpus.ngrams))], [f"Ends with {postfix}"], ) def contains( - corpus: Corpus, text: str, callback: Callable + corpus: Corpus, text: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Number of words that contains string in `text`. """ + source = get_source(corpus, source) return ( - np.c_[ - [count_appearances(d, [text], callback) for d in corpus.documents] - ], + np.c_[[count_appearances(d, [text], callback) for d in source]], [f"Contains {text}"], ) def regex( - corpus: Corpus, expression: str, callback: Callable + corpus: Corpus, expression: str, source: str, callback: Callable ) -> Tuple[np.ndarray, List[str]]: """ Count occurrences of pattern in `expression`. 
""" pattern = re.compile(expression) - def regex_matches(text: str): + def regex_matches(text: Union[str, List]): callback() - return len(re.findall(pattern, text)) + if isinstance(text, str): + return len(re.findall(pattern, text)) + else: + return sum(len(re.findall(pattern, ngram)) for ngram in text) - return np.c_[list(map(regex_matches, corpus.documents))], [f"Regex {expression}"] + source = get_source(corpus, source) + return np.c_[list(map(regex_matches, source))], [f"Regex {expression}"] def pos_tags( - corpus: Corpus, pos_tags: str, callback: Callable + corpus: Corpus, pos_tags: str, source: str, callback: Callable ) -> Optional[Tuple[np.ndarray, List[str]]]: """ Count number of specified pos tags in corpus """ + assert source == Sources.TOKENS p_tags = [v.strip().lower() for v in pos_tags.split(",")] def cust_count(tags): @@ -322,7 +356,7 @@ def cust_count(tags): def yule( - corpus: Corpus, _: str, callback: Callable + corpus: Corpus, _: str, source: str, callback: Callable ) -> Optional[Tuple[np.ndarray, List[str]]]: """ Yule's I measure: higher number is higher diversity - richer vocabulary @@ -330,6 +364,7 @@ def yule( Mathematical Proceedings of the Cambridge Philosophical Society, 42(2), B1-B2. 
doi:10.1017/S0305004100022799 """ + assert source == Sources.TOKENS if corpus.pos_tags is None: return None @@ -351,12 +386,13 @@ def yules_i(tags): def lix( - corpus: Corpus, _: str, callback: Callable + corpus: Corpus, _: str, source: str, callback: Callable ) -> Optional[Tuple[np.ndarray, List[str]]]: """ Readability index LIX https://en.wikipedia.org/wiki/Lix_(readability_test) """ + assert source == Sources.TOKENS corpus = preprocess_only_words(corpus) tokenizer = tokenize.PunktSentenceTokenizer() @@ -416,30 +452,32 @@ def __hash__(self): STATISTICS = [ # (name of the statistics, function to compute, default value) # if default value is None - text box is not required - ("Word count", words_count, None), - ("Character count", characters_count, None), - ("N-gram count", n_gram_count, None), - ("Average word length", average_word_len, None), - ("Punctuation count", punctuation_count, None), - ("Capital letter count", capital_count, None), - ("Vowel count", vowel_count, "a,e,i,o,u"), + ("Word count", words_count, None, (Sources.DOCUMENTS,)), + ("Character count", characters_count, None, (Sources.DOCUMENTS, Sources.TOKENS)), + ("N-gram count", n_gram_count, None, (Sources.TOKENS,)), + ("Average word length", average_word_len, None, (Sources.DOCUMENTS,)), # todo: discuss + ("Punctuation count", punctuation_count, None, (Sources.DOCUMENTS,)), + ("Capital letter count", capital_count, None, (Sources.DOCUMENTS,)), + ("Vowel count", vowel_count, "a,e,i,o,u", (Sources.DOCUMENTS,)), ( "Consonant count", consonant_count, "b,c,d,f,g,h,j,k,l,m,n,p,q,r,s,t,v,w,x,y,z", + (Sources.DOCUMENTS,), ), - ("Per cent unique words", per_cent_unique_words, None), - ("Starts with", starts_with, ""), - ("Ends with", ends_with, ""), - ("Contains", contains, ""), - ("Regex", regex, ""), - ("POS tag", pos_tags, "NN,VV,JJ"), - ("Yule's I", yule, None), - ("LIX index", lix, None), + ("Per cent unique terms", per_cent_unique_words, None, (Sources.TOKENS,)), + ("Starts with", starts_with, "", 
(Sources.TOKENS,)), + ("Ends with", ends_with, "", (Sources.TOKENS,)), + ("Contains", contains, "", (Sources.DOCUMENTS, Sources.TOKENS)), + ("Regex", regex, "", (Sources.DOCUMENTS, Sources.TOKENS)), + ("POS tag", pos_tags, "NN,VV,JJ", (Sources.TOKENS,)), + ("Yule's I", yule, None, (Sources.TOKENS,)), + ("LIX index", lix, None, (Sources.TOKENS,)), ] STATISTICS_NAMES = list(list(zip(*STATISTICS))[0]) STATISTICS_FUNCTIONS = list(list(zip(*STATISTICS))[1]) STATISTICS_DEFAULT_VALUE = list(list(zip(*STATISTICS))[2]) +STATISTICS_DEFAULT_SOURCES = list(list(zip(*STATISTICS))[3]) def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None: @@ -463,12 +501,12 @@ def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None: def advance(): state.set_progress_value(next(tick_values)) - for s, patern in statistics: + for s, patern, source in statistics: fun = STATISTICS_FUNCTIONS[s] - result = fun(corpus, patern, advance) + result = fun(corpus, patern, source, advance) if result is not None: result = result + (ComputeValue(fun, patern),) - state.set_partial_result((s, patern, result)) + state.set_partial_result((s, patern, source, result)) class OWStatistics(OWWidget, ConcurrentWidgetMixin): @@ -488,12 +526,13 @@ class Warning(OWWidget.Warning): "{} statistics cannot be computed and is omitted from results." 
) + # todo: update settings version and migration want_main_area = False mainArea_width_height_ratio = None - # settings - default_rules = [(0, ""), (1, "")] # rules used to reset the active rules - active_rules: List[Tuple[int, str]] = Setting(default_rules[:]) + # rules used to reset the active rules + default_rules = [(0, "", STATISTICS[0][-1][0]), (1, "", STATISTICS[0][-1][0])] + active_rules: List[Tuple[int, str, str]] = Setting(default_rules[:]) # rules active at time of apply clicked applied_rules: Optional[List[Tuple[int, str]]] = None @@ -504,12 +543,14 @@ def __init__(self) -> None: ConcurrentWidgetMixin.__init__(self) self.corpus = None - # the list with combos from the widget - self.combos = [] + # the list with combos for selecting statistics from the widget + self.statistics_combos = [] # the list with line edits from the widget self.line_edits = [] # the list of buttons in front of controls that removes them self.remove_buttons = [] + # the list with combos for selecting on what statistics computes + self.source_combos = [] self._init_controls() @@ -539,6 +580,7 @@ def _init_statistics_box(self) -> None: grid.setColumnStretch(2, 100) grid.addWidget(QLabel("Feature"), 0, 1) grid.addWidget(QLabel("Pattern"), 0, 2) + grid.addWidget(QLabel("Compute for"), 0, 3) gui.button( box, @@ -559,7 +601,7 @@ def adjust_n_rule_rows(self) -> None: """ def _add_line(): - n_lines = len(self.combos) + 1 + n_lines = len(self.statistics_combos) + 1 # add delete symbol button = gui.button( @@ -574,23 +616,29 @@ def _add_line(): combo.addItems(STATISTICS_NAMES) combo.currentIndexChanged.connect(self._sync_edit_combo) self.rules_grid.addWidget(combo, n_lines, 1) - self.combos.append(combo) + self.statistics_combos.append(combo) - # add line edit for patern + # add line edit for pattern line_edit = QLineEdit() self.rules_grid.addWidget(line_edit, n_lines, 2) line_edit.textChanged.connect(self._sync_edit_line) self.line_edits.append(line_edit) + # add statistics type dropdown 
+ combo = QComboBox() + combo.currentIndexChanged.connect(self._sync_edit_source_combo) + self.rules_grid.addWidget(combo, n_lines, 3) + self.source_combos.append(combo) + def _remove_line(): - self.combos.pop().deleteLater() + self.statistics_combos.pop().deleteLater() self.line_edits.pop().deleteLater() self.remove_buttons.pop().deleteLater() def _fix_tab_order(): # TODO: write it differently - check create class - for i, (r, c, l) in enumerate( - zip(self.active_rules, self.combos, self.line_edits) + for i, (r, c, l, s) in enumerate( + zip(self.active_rules, self.statistics_combos, self.line_edits, self.source_combos) ): c.setCurrentIndex(r[0]) # update combo l.setText(r[1]) # update line edit @@ -598,17 +646,20 @@ def _fix_tab_order(): l.setVisible(True) else: l.setVisible(False) + s.clear() + s.addItems(STATISTICS_DEFAULT_SOURCES[r[0]]) + s.setCurrentText(r[2]) n = len(self.active_rules) - while n > len(self.combos): + while n > len(self.statistics_combos): _add_line() - while len(self.combos) > n: + while len(self.statistics_combos) > n: _remove_line() _fix_tab_order() def _add_row(self) -> None: """ Add a new row to the statistic box """ - self.active_rules.append((0, "")) + self.active_rules.append((0, "", STATISTICS_DEFAULT_SOURCES[0][0])) self.adjust_n_rule_rows() def _remove_row(self) -> None: @@ -620,10 +671,11 @@ def _remove_row(self) -> None: def _sync_edit_combo(self) -> None: """ Update rules when combo value changed """ combo = self.sender() - edit_index = self.combos.index(combo) + edit_index = self.statistics_combos.index(combo) selected_i = combo.currentIndex() default_value = STATISTICS_DEFAULT_VALUE[selected_i] - self.active_rules[edit_index] = (selected_i, default_value) + default_source = STATISTICS_DEFAULT_SOURCES[selected_i][0] + self.active_rules[edit_index] = (selected_i, default_value, default_source) self.adjust_n_rule_rows() def _sync_edit_line(self) -> None: @@ -633,8 +685,18 @@ def _sync_edit_line(self) -> None: 
self.active_rules[edit_index] = ( self.active_rules[edit_index][0], line_edit.text(), + STATISTICS_DEFAULT_SOURCES[edit_index][0] ) + def _sync_edit_source_combo(self) -> None: + """ Update rules when line edit value changed """ + combo = self.sender() + edit_index = self.source_combos.index(combo) + value = combo.currentText() + print(value) + arules = self.active_rules[edit_index] + self.active_rules[edit_index] = (arules[0], arules[1], value) + @Inputs.corpus def set_data(self, corpus) -> None: self.corpus = corpus @@ -663,10 +725,10 @@ def on_exception(self, exception: Exception) -> None: raise exception def on_partial_result( - self, result: Tuple[int, str, Tuple[np.ndarray, List[str], Callable]] + self, result: Tuple[int, str, str, Tuple[np.ndarray, List[str], Callable]] ) -> None: - statistic, patern, result = result - self.result_dict[(statistic, patern)] = result + statistic, patern, source, result = result + self.result_dict[(statistic, patern, source)] = result def on_done(self, result: None) -> None: # join results diff --git a/orangecontrib/text/widgets/tests/test_owstatistics.py b/orangecontrib/text/widgets/tests/test_owstatistics.py index 79807da66..e802d9622 100644 --- a/orangecontrib/text/widgets/tests/test_owstatistics.py +++ b/orangecontrib/text/widgets/tests/test_owstatistics.py @@ -5,11 +5,12 @@ from Orange.data import Domain, StringVariable from Orange.widgets.tests.base import WidgetTest +from Orange.widgets.tests.utils import simulate from orangecontrib.text import Corpus from orangecontrib.text.tag import AveragedPerceptronTagger from orangecontrib.text.widgets.owstatistics import ( STATISTICS_NAMES, - OWStatistics, + OWStatistics, Sources, ) @@ -40,7 +41,9 @@ def _create_simple_data(self) -> None: text_features=[text_var], ) - def _set_feature(self, feature_name: str, value: str = ""): + def _set_feature( + self, feature_name: str, value: str = "", source: str = Sources.DOCUMENTS + ): """ Set statistic which need to be computed by widget. 
It sets only one statistics. @@ -52,11 +55,17 @@ def _set_feature(self, feature_name: str, value: str = ""): value If statistic need a value (e.g. prefix) it is passed here. """ - feature_index = STATISTICS_NAMES.index(feature_name) - self.widget.active_rules = [(feature_index, value)] - self.widget.adjust_n_rule_rows() - - def _compute_features(self, feature_name: str, value: str = "") -> Corpus: + simulate.combobox_activate_item(self.widget.statistics_combos[0], feature_name) + self.widget.line_edits[0].setText(value) + print(self.widget.active_rules, feature_name, value, source) + simulate.combobox_activate_item(self.widget.source_combos[0], source) + print(self.widget.active_rules) + for button in self.widget.remove_buttons[1:]: + button.click() + + def _compute_features( + self, feature_name: str, value: str = "", source: str = Sources.DOCUMENTS + ) -> Corpus: """ Send `self.corpus` to widget, set statistic which need bo be computed, run the computation, and return widget output. @@ -74,7 +83,7 @@ def _compute_features(self, feature_name: str, value: str = "") -> Corpus: """ self.send_signal(self.widget.Inputs.corpus, self.corpus) self.wait_until_finished() - self._set_feature(feature_name, value) + self._set_feature(feature_name, value, source) self.widget.apply() self.wait_until_finished() res = self.get_output(self.widget.Outputs.corpus) @@ -101,15 +110,19 @@ def test_words_count(self): def test_characters_count(self): """ Test characters count statistic """ - data = self._compute_features("Character count") + data = self._compute_features("Character count", source=Sources.DOCUMENTS) + np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51]) + + data = self._compute_features("Character count", source=Sources.TOKENS) np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51]) self.send_signal(self.widget.Inputs.corpus, None) self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + # todo: make different preprocessing and the test all 
tokens statistics again def test_n_gram_count(self): """ Test n-grams count statistic """ - data = self._compute_features("N-gram count") + data = self._compute_features("N-gram count", source=Sources.TOKENS) np.testing.assert_array_equal(data.X.flatten(), [10, 12, 13, 12]) self.send_signal(self.widget.Inputs.corpus, None) @@ -161,14 +174,14 @@ def test_consonants_count(self): def test_per_cent_unique_words(self): """ Test per-cent unique words statistic """ - data = self._compute_features("Per cent unique words") + data = self._compute_features("Per cent unique terms", source=Sources.TOKENS) np.testing.assert_array_almost_equal( data.X.flatten(), [1, 1, 0.909091, 1] ) with self.corpus.unlocked(): self.corpus[1][-1] = "" - data = self._compute_features("Per cent unique words") + data = self._compute_features("Per cent unique terms", source=Sources.TOKENS) np.testing.assert_array_almost_equal( data.X.flatten(), [1, np.nan, 0.909091, 1] ) @@ -178,10 +191,10 @@ def test_per_cent_unique_words(self): def test_starts_with(self): """ Test starts with count statistic """ - data = self._compute_features("Starts with", "a") + data = self._compute_features("Starts with", "a", Sources.TOKENS) np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 2, 2]) - data = self._compute_features("Starts with", "ap") + data = self._compute_features("Starts with", "ap", Sources.TOKENS) np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1]) self.send_signal(self.widget.Inputs.corpus, None)