From 8ce3c8b615297ef73f210f734b675cc4a85ca42d Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 26 Jan 2024 13:56:57 +0100
Subject: [PATCH] Document Embedding - Store ISO language in settings

---
Review note: in migrate_settings the "version < 3" guard is parenthesised as
`(version is None or version < 3) and "language" in settings`. Without the
parentheses `and` binds tighter than `or`, so settings with no stored version
and no "language" key reached `settings["language"]` and raised KeyError.

 .../text/widgets/owdocumentembedding.py       | 27 +++++++++--------
 .../widgets/tests/test_owdocumentembedding.py | 30 ++++++++++++-------
 2 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/orangecontrib/text/widgets/owdocumentembedding.py b/orangecontrib/text/widgets/owdocumentembedding.py
index c9cf39f01..5445e3db8 100644
--- a/orangecontrib/text/widgets/owdocumentembedding.py
+++ b/orangecontrib/text/widgets/owdocumentembedding.py
@@ -8,7 +8,9 @@
 from Orange.widgets.widget import Msg, Output, OWWidget
 
 from orangecontrib.text.corpus import Corpus
-from orangecontrib.text.language import ISO2LANG, LANG2ISO
+from orangecontrib.text.language import (
+    ISO2LANG, DEFAULT_LANGUAGE, LanguageModel, LANG2ISO
+)
 from orangecontrib.text.vectorization.document_embedder import (
     AGGREGATORS,
     AGGREGATORS_ITEMS,
@@ -39,10 +41,9 @@ class OWDocumentEmbedding(OWBaseVectorizer):
     priority = 300
 
     buttons_area_orientation = Qt.Vertical
-    settings_version = 2
+    settings_version = 3
 
     Methods = [SBERT, DocumentEmbedder]
-    DEFAULT_LANGUAGE = "English"
 
     class Outputs(OWBaseVectorizer.Outputs):
         skipped = Output("Skipped documents", Corpus)
@@ -84,7 +85,7 @@ def create_configuration_layout(self):
             ibox,
             self,
             "language",
-            items=[ISO2LANG[lg] for lg in LANGUAGES],
+            model=LanguageModel(languages=LANGUAGES),
             label="Language:",
             sendSelectedValue=True,  # value is actual string not index
             orientation=Qt.Horizontal,
@@ -108,10 +109,10 @@ def create_configuration_layout(self):
     def set_data(self, corpus):
         # set language from corpus as selected language
         if corpus and corpus.language in LANGUAGES:
-            self.language = ISO2LANG[corpus.language]
+            self.language = corpus.language
         else:
             # if Corpus's language not supported use default language
-            self.language = self.DEFAULT_LANGUAGE
+            self.language = DEFAULT_LANGUAGE
 
         # when workflow loaded use language saved in workflow
         if self.__pending_language is not None:
@@ -127,9 +128,7 @@ def update_method(self):
         self.vectorizer = EmbeddingVectorizer(self.init_method(), self.corpus)
 
     def init_method(self):
-        params = dict(
-            language=LANG2ISO[self.language], aggregator=self.aggregator
-        )
+        params = dict(language=self.language, aggregator=self.aggregator)
         kwargs = ({}, params)[self.method]
         return self.Methods[self.method](**kwargs)
 
@@ -170,6 +169,9 @@ def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]):
                 settings["language"] = LANGUAGES[settings["language"]]
             if "aggregator" in settings:
                 settings["aggregator"] = AGGREGATORS[settings["aggregator"]]
+        if (version is None or version < 3) and "language" in settings:
+            # before version 3 language settings were language names, transform to ISO
+            settings["language"] = LANG2ISO[settings["language"]]
 
     def send_report(self):
         if self.method == 0:
@@ -177,11 +179,12 @@ def send_report(self):
                 ("Embedder", "Multilingual SBERT"),
             ))
         if self.method == 1:
-            self.report_items((
+            items = (
                 ("Embedder", "fastText"),
-                ("Language", self.language),
+                ("Language", ISO2LANG[self.language]),
                 ("Aggregator", self.aggregator),
-            ))
+            )
+            self.report_items(items)
 
 
 if __name__ == "__main__":
diff --git a/orangecontrib/text/widgets/tests/test_owdocumentembedding.py b/orangecontrib/text/widgets/tests/test_owdocumentembedding.py
index 95cbbf5bb..d79333a6b 100644
--- a/orangecontrib/text/widgets/tests/test_owdocumentembedding.py
+++ b/orangecontrib/text/widgets/tests/test_owdocumentembedding.py
@@ -7,8 +7,12 @@
 from Orange.widgets.tests.utils import simulate
 from Orange.misc.utils.embedder_utils import EmbeddingConnectionError
 
+from orangecontrib.text.language import DEFAULT_LANGUAGE, ISO2LANG
 from orangecontrib.text.tests.test_documentembedder import PATCH_METHOD, make_dummy_post
-from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder
+from orangecontrib.text.vectorization.document_embedder import (
+    DocumentEmbedder,
+    LANGUAGES,
+)
 from orangecontrib.text.vectorization.sbert import EMB_DIM, SBERT
 from orangecontrib.text.widgets.owdocumentembedding import OWDocumentEmbedding
 from orangecontrib.text import Corpus
@@ -157,14 +161,14 @@ def test_corpus_name_preserved(self):
     def test_fasttext_language(self):
         # english corpus
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self.assertEqual("English", self.widget.language)
+        self.assertEqual("en", self.widget.language)
         result = self.get_output(self.widget.Outputs.corpus)
         self.assertEqual(9, len(result))
 
         # slovenian corpus
         self.corpus.attributes["language"] = "sl"
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self.assertEqual("Slovenian", self.widget.language)
+        self.assertEqual("sl", self.widget.language)
         result = self.get_output(self.widget.Outputs.corpus)
         self.assertEqual(9, len(result))
 
@@ -172,7 +176,7 @@ def test_fasttext_language(self):
         self.corpus.attributes["language"] = None
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         # use widgets default language English
-        self.assertEqual(self.widget.DEFAULT_LANGUAGE, self.widget.language)
+        self.assertEqual(DEFAULT_LANGUAGE, self.widget.language)
         result = self.get_output(self.widget.Outputs.corpus)
         self.assertEqual(9, len(result))
 
@@ -180,14 +184,14 @@ def test_fasttext_language(self):
         self.corpus.attributes["language"] = "be"
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         # use widgets default language English
-        self.assertEqual(self.widget.DEFAULT_LANGUAGE, self.widget.language)
+        self.assertEqual(DEFAULT_LANGUAGE, self.widget.language)
         result = self.get_output(self.widget.Outputs.corpus)
         self.assertEqual(9, len(result))
 
         # language english
         self.corpus.attributes["language"] = "en"
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self.assertEqual("English", self.widget.language)
+        self.assertEqual("en", self.widget.language)
         result = self.get_output(self.widget.Outputs.corpus)
         self.assertEqual(9, len(result))
 
@@ -195,25 +199,25 @@ def test_fasttext_language(self):
         simulate.combobox_activate_item(
             self.widget.controlArea.findChildren(QComboBox)[0], "French"
         )
-        self.assertEqual("French", self.widget.language)
+        self.assertEqual("fr", self.widget.language)
         result = self.get_output(self.widget.Outputs.corpus)
         self.assertEqual(9, len(result))
 
         # providing new corpus should reset language
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self.assertEqual("English", self.widget.language)
+        self.assertEqual("en", self.widget.language)
 
     def test_language_from_settings(self):
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         simulate.combobox_activate_item(
             self.widget.controlArea.findChildren(QComboBox)[0], "French"
         )
-        self.assertEqual("French", self.widget.language)
+        self.assertEqual("fr", self.widget.language)
 
         settings = self.widget.settingsHandler.pack_data(self.widget)
         widget = self.create_widget(OWDocumentEmbedding, stored_settings=settings)
         self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
-        self.assertEqual("French", widget.language)
+        self.assertEqual("fr", widget.language)
 
     @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [1.3, 1]}'))
     @patch("orangecontrib.text.widgets.owdocumentembedding.OWDocumentEmbedding.report_items")
@@ -232,6 +236,12 @@ def test_report(self, mocked_items: Mock):
         mocked_items.assert_called_once()
         mocked_items.reset_mock()
 
+    def test_migrate_settings(self):
+        for iso_lang in LANGUAGES:
+            settings = {"__version__": 2, "language": ISO2LANG[iso_lang]}
+            widget = self.create_widget(OWDocumentEmbedding, stored_settings=settings)
+            self.assertEqual(iso_lang, widget.language)
+
 
 if __name__ == "__main__":
     unittest.main()