From 8ce3c8b615297ef73f210f734b675cc4a85ca42d Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 26 Jan 2024 13:56:57 +0100
Subject: [PATCH] Document Embedding - Store ISO language in settings
---
.../text/widgets/owdocumentembedding.py | 27 +++++++++--------
.../widgets/tests/test_owdocumentembedding.py | 30 ++++++++++++-------
2 files changed, 35 insertions(+), 22 deletions(-)
diff --git a/orangecontrib/text/widgets/owdocumentembedding.py b/orangecontrib/text/widgets/owdocumentembedding.py
index c9cf39f01..5445e3db8 100644
--- a/orangecontrib/text/widgets/owdocumentembedding.py
+++ b/orangecontrib/text/widgets/owdocumentembedding.py
@@ -8,7 +8,9 @@
from Orange.widgets.widget import Msg, Output, OWWidget
from orangecontrib.text.corpus import Corpus
-from orangecontrib.text.language import ISO2LANG, LANG2ISO
+from orangecontrib.text.language import (
+ ISO2LANG, DEFAULT_LANGUAGE, LanguageModel, LANG2ISO
+)
from orangecontrib.text.vectorization.document_embedder import (
AGGREGATORS,
AGGREGATORS_ITEMS,
@@ -39,10 +41,9 @@ class OWDocumentEmbedding(OWBaseVectorizer):
priority = 300
buttons_area_orientation = Qt.Vertical
- settings_version = 2
+ settings_version = 3
Methods = [SBERT, DocumentEmbedder]
- DEFAULT_LANGUAGE = "English"
class Outputs(OWBaseVectorizer.Outputs):
skipped = Output("Skipped documents", Corpus)
@@ -84,7 +85,7 @@ def create_configuration_layout(self):
ibox,
self,
"language",
- items=[ISO2LANG[lg] for lg in LANGUAGES],
+ model=LanguageModel(languages=LANGUAGES),
label="Language:",
sendSelectedValue=True, # value is actual string not index
orientation=Qt.Horizontal,
@@ -108,10 +109,10 @@ def create_configuration_layout(self):
def set_data(self, corpus):
# set language from corpus as selected language
if corpus and corpus.language in LANGUAGES:
- self.language = ISO2LANG[corpus.language]
+ self.language = corpus.language
else:
# if Corpus's language not supported use default language
- self.language = self.DEFAULT_LANGUAGE
+ self.language = DEFAULT_LANGUAGE
# when workflow loaded use language saved in workflow
if self.__pending_language is not None:
@@ -127,9 +128,7 @@ def update_method(self):
self.vectorizer = EmbeddingVectorizer(self.init_method(), self.corpus)
def init_method(self):
- params = dict(
- language=LANG2ISO[self.language], aggregator=self.aggregator
- )
+ params = dict(language=self.language, aggregator=self.aggregator)
kwargs = ({}, params)[self.method]
return self.Methods[self.method](**kwargs)
@@ -170,6 +169,9 @@ def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]):
settings["language"] = LANGUAGES[settings["language"]]
if "aggregator" in settings:
settings["aggregator"] = AGGREGATORS[settings["aggregator"]]
+        if (version is None or version < 3) and "language" in settings:
+            # before version 3 language was stored as a name; convert to ISO code
+            settings["language"] = LANG2ISO[settings["language"]]
def send_report(self):
if self.method == 0:
@@ -177,11 +179,12 @@ def send_report(self):
("Embedder", "Multilingual SBERT"),
))
if self.method == 1:
- self.report_items((
+ items = (
("Embedder", "fastText"),
- ("Language", self.language),
+ ("Language", ISO2LANG[self.language]),
("Aggregator", self.aggregator),
- ))
+ )
+ self.report_items(items)
if __name__ == "__main__":
diff --git a/orangecontrib/text/widgets/tests/test_owdocumentembedding.py b/orangecontrib/text/widgets/tests/test_owdocumentembedding.py
index 95cbbf5bb..d79333a6b 100644
--- a/orangecontrib/text/widgets/tests/test_owdocumentembedding.py
+++ b/orangecontrib/text/widgets/tests/test_owdocumentembedding.py
@@ -7,8 +7,12 @@
from Orange.widgets.tests.utils import simulate
from Orange.misc.utils.embedder_utils import EmbeddingConnectionError
+from orangecontrib.text.language import DEFAULT_LANGUAGE, ISO2LANG
from orangecontrib.text.tests.test_documentembedder import PATCH_METHOD, make_dummy_post
-from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder
+from orangecontrib.text.vectorization.document_embedder import (
+ DocumentEmbedder,
+ LANGUAGES,
+)
from orangecontrib.text.vectorization.sbert import EMB_DIM, SBERT
from orangecontrib.text.widgets.owdocumentembedding import OWDocumentEmbedding
from orangecontrib.text import Corpus
@@ -157,14 +161,14 @@ def test_corpus_name_preserved(self):
def test_fasttext_language(self):
# english corpus
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.assertEqual("English", self.widget.language)
+ self.assertEqual("en", self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
# slovenian corpus
self.corpus.attributes["language"] = "sl"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.assertEqual("Slovenian", self.widget.language)
+ self.assertEqual("sl", self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
@@ -172,7 +176,7 @@ def test_fasttext_language(self):
self.corpus.attributes["language"] = None
self.send_signal(self.widget.Inputs.corpus, self.corpus)
# use widgets default language English
- self.assertEqual(self.widget.DEFAULT_LANGUAGE, self.widget.language)
+ self.assertEqual(DEFAULT_LANGUAGE, self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
@@ -180,14 +184,14 @@ def test_fasttext_language(self):
self.corpus.attributes["language"] = "be"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
# use widgets default language English
- self.assertEqual(self.widget.DEFAULT_LANGUAGE, self.widget.language)
+ self.assertEqual(DEFAULT_LANGUAGE, self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
# language english
self.corpus.attributes["language"] = "en"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.assertEqual("English", self.widget.language)
+ self.assertEqual("en", self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
@@ -195,25 +199,25 @@ def test_fasttext_language(self):
simulate.combobox_activate_item(
self.widget.controlArea.findChildren(QComboBox)[0], "French"
)
- self.assertEqual("French", self.widget.language)
+ self.assertEqual("fr", self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
# providing new corpus should reset language
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.assertEqual("English", self.widget.language)
+ self.assertEqual("en", self.widget.language)
def test_language_from_settings(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
simulate.combobox_activate_item(
self.widget.controlArea.findChildren(QComboBox)[0], "French"
)
- self.assertEqual("French", self.widget.language)
+ self.assertEqual("fr", self.widget.language)
settings = self.widget.settingsHandler.pack_data(self.widget)
widget = self.create_widget(OWDocumentEmbedding, stored_settings=settings)
self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
- self.assertEqual("French", widget.language)
+ self.assertEqual("fr", widget.language)
@patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [1.3, 1]}'))
@patch("orangecontrib.text.widgets.owdocumentembedding.OWDocumentEmbedding.report_items")
@@ -232,6 +236,12 @@ def test_report(self, mocked_items: Mock):
mocked_items.assert_called_once()
mocked_items.reset_mock()
+ def test_migrate_settings(self):
+ for iso_lang in LANGUAGES:
+ settings = {"__version__": 2, "language": ISO2LANG[iso_lang]}
+ widget = self.create_widget(OWDocumentEmbedding, stored_settings=settings)
+ self.assertEqual(iso_lang, widget.language)
+
if __name__ == "__main__":
unittest.main()