Skip to content

Commit

Permalink
Statistics - Regex: count matches in the whole document instead of only in tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Oct 24, 2023
1 parent 9e81052 commit 6d4b9ea
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 10 deletions.
9 changes: 3 additions & 6 deletions orangecontrib/text/widgets/owstatistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,14 +293,11 @@ def regex(
"""
pattern = re.compile(expression)

def number_regex(tokens: List[str]):
def regex_matches(text: str):
callback()
return sum(bool(pattern.match(t)) for t in tokens)
return len(re.findall(pattern, text))

return (
np.c_[list(map(number_regex, corpus.tokens))],
[f"Regex {expression}"],
)
return np.c_[list(map(regex_matches, corpus.documents))], [f"Regex {expression}"]


def pos_tags(
Expand Down
12 changes: 8 additions & 4 deletions orangecontrib/text/widgets/tests/test_owstatistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,18 @@ def test_contains(self):

def test_regex(self):
""" Test regex statistic """
# words that contains digit
data = self._compute_features("Regex", "\w*\d\w*")
# words that contain a digit
data = self._compute_features("Regex", r"\w*\d\w*")
np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])

# words that contains digit
data = self._compute_features("Regex", "\w*is\w*")
# words that contain "is"
data = self._compute_features("Regex", r"\w*is\w*")
np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])

# count specific n-gram
data = self._compute_features("Regex", r"ipsum\ dolor")
np.testing.assert_array_almost_equal(data.X.flatten(), [1, 0, 0, 0])

self.send_signal(self.widget.Inputs.corpus, None)
self.assertIsNone(self.get_output(self.widget.Outputs.corpus))

Expand Down

0 comments on commit 6d4b9ea

Please sign in to comment.