diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py index f465ee3ac..fe12ae29b 100644 --- a/orangecontrib/text/widgets/owstatistics.py +++ b/orangecontrib/text/widgets/owstatistics.py @@ -293,14 +293,11 @@ def regex( """ pattern = re.compile(expression) - def number_regex(tokens: List[str]): + def regex_matches(text: str): callback() - return sum(bool(pattern.match(t)) for t in tokens) + return len(re.findall(pattern, text)) - return ( - np.c_[list(map(number_regex, corpus.tokens))], - [f"Regex {expression}"], - ) + return np.c_[list(map(regex_matches, corpus.documents))], [f"Regex {expression}"] def pos_tags( diff --git a/orangecontrib/text/widgets/tests/test_owstatistics.py b/orangecontrib/text/widgets/tests/test_owstatistics.py index ad1820413..79807da66 100644 --- a/orangecontrib/text/widgets/tests/test_owstatistics.py +++ b/orangecontrib/text/widgets/tests/test_owstatistics.py @@ -214,14 +214,18 @@ def test_contains(self): def test_regex(self): """ Test regex statistic """ - # words that contains digit - data = self._compute_features("Regex", "\w*\d\w*") + # words that contain digit + data = self._compute_features("Regex", r"\w*\d\w*") np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1]) - # words that contains digit - data = self._compute_features("Regex", "\w*is\w*") + # words that contain "is" + data = self._compute_features("Regex", r"\w*is\w*") np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0]) + # count specific n-gram + data = self._compute_features("Regex", r"ipsum\ dolor") + np.testing.assert_array_almost_equal(data.X.flatten(), [1, 0, 0, 0]) + self.send_signal(self.widget.Inputs.corpus, None) self.assertIsNone(self.get_output(self.widget.Outputs.corpus))