From 5bc8cbf6864c2c7ad66a992af2e1295079aa2081 Mon Sep 17 00:00:00 2001
From: Ajda
Date: Fri, 5 Jul 2024 13:47:37 +0200
Subject: [PATCH] Spacy tests

---
Reviewer notes (this section is not part of the commit message):
* test_preprocess.py: adds TestPOSTagging. test_no_tokens applies
  SpacyPOSTagger to the untokenized "deerwester" corpus and expects two
  entries in used_preprocessor.preprocessors plus POS tags. test_pos_tagger
  runs WordPunctTokenizer + SpacyPOSTagger, then AveragedPerceptronTagger on
  the raw corpus, and asserts that the two taggers' pos_tags differ.
* test_owpreprocess.py: the POS-tagging editor tests now cover a third
  button (POSTaggingModule.Spacy) and a "spacy_language" parameter.
* NOTE(review): this patch was restored from a whitespace-collapsed copy.
  Line breaks were placed so that every hunk matches its @@ line counts, but
  the leading whitespace of context and continuation lines, and the position
  of one blank context line in the -884,20 hunk, are a best guess. Confirm
  against the target tree, or apply with `git apply --ignore-whitespace`.
* NOTE(review): the From: header carries no e-mail address; presumably it was
  stripped. `git am` may need the author supplied explicitly.

 orangecontrib/text/tests/test_preprocess.py   | 29 +++++++++++++++++++
 .../text/widgets/tests/test_owpreprocess.py   | 23 +++++++++++----
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index 0a91a49aa..8a2970fff 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -721,5 +721,34 @@ def test_can_pickle(self):
         self.assertEqual(loaded._NGrams__range, self.pp._NGrams__range)
 
 
+class TestPOSTagging(unittest.TestCase):
+    def setUp(self):
+        self.corpus = Corpus.from_file("deerwester")
+        self.pp = [preprocess.WordPunctTokenizer(),
+                   tag.SpacyPOSTagger()]
+
+    def test_no_tokens(self):
+        self.assertFalse(self.corpus.has_tokens())
+        tagger = tag.SpacyPOSTagger()
+        corpus = tagger(self.corpus)
+        self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
+        self.assertTrue(corpus.has_tags())
+
+    def test_pos_tagger(self):
+        corpus = self.corpus
+        for pp in self.pp:
+            corpus = pp(corpus)
+        self.assertTrue(corpus.has_tokens())
+        self.assertTrue(corpus.has_tags())
+        self.assertEqual(len(corpus.pos_tags), len(corpus.tokens))
+        spacy_tags = corpus.pos_tags
+        tagger = tag.AveragedPerceptronTagger()
+        corpus = tagger(self.corpus)
+        self.assertEqual(len(corpus.pos_tags), len(corpus.tokens))
+        self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
+        apt_tags = corpus.pos_tags
+        self.assertFalse(bool(np.array_equal(spacy_tags, apt_tags)))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index 49ccb1229..5c077d1ce 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -11,7 +11,8 @@
 from orangecontrib.text.preprocess import RegexpTokenizer, WhitespaceTokenizer, \
     LowercaseTransformer, HtmlTransformer, PorterStemmer, SnowballStemmer, \
     UDPipeLemmatizer, StopwordsFilter, MostFrequentTokensFilter, NGrams
-from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger
+from orangecontrib.text.tag import (AveragedPerceptronTagger, MaxEntTagger,
+                                    SpacyPOSTagger)
 from orangecontrib.text.tests.test_preprocess import SF_LIST, SERVER_FILES
 from orangecontrib.text.widgets.owpreprocess import (
     OWPreprocess,
@@ -884,20 +885,21 @@ def buttons(self):
 
     def test_init(self):
         self.assertTrue(self.buttons[0].isChecked())
-        for i in range(1, 2):
+        for i in range(1, 3):
             self.assertFalse(self.buttons[i].isChecked())
 
     def test_parameters(self):
-        params = {"method": POSTaggingModule.Averaged}
+        params = {"method": POSTaggingModule.Averaged, "spacy_language":
+                  POSTaggingModule.DEFAULT_LANGUAGE}
         self.assertDictEqual(self.editor.parameters(), params)
 
     def test_set_parameters(self):
-        params = {"method": POSTaggingModule.MaxEnt}
+        params = {"method": POSTaggingModule.Spacy, "spacy_language": "sl"}
         self.editor.setParameters(params)
         self.assertDictEqual(self.editor.parameters(), params)
 
-        self.assertTrue(self.buttons[1].isChecked())
-        for i in range(1):
+        self.assertTrue(self.buttons[2].isChecked())
+        for i in range(0, 2):
             self.assertFalse(self.buttons[i].isChecked())
 
     def test_createinstance(self):
@@ -907,9 +909,18 @@ def test_createinstance(self):
         pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt})
         self.assertIsInstance(pp, MaxEntTagger)
 
+        pp = self.editor.createinstance({"method": POSTaggingModule.Spacy})
+        self.assertIsInstance(pp, SpacyPOSTagger)
+
     def test_repr(self):
         self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")
 
+        params = {"method": POSTaggingModule.Spacy, "spacy_language":
+                  POSTaggingModule.DEFAULT_LANGUAGE}
+        self.editor.setParameters(params)
+        self.assertEqual(str(self.editor),
+                         f"Spacy POS Tagger ({params['spacy_language']})")
+
 
 class TestLanguageComboBox(WidgetTest):
     def test_basic_setup(self):