From 00e49033374aedf526e03990fdb58ff2b71cd35d Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 26 Jan 2024 12:00:11 +0100
Subject: [PATCH 1/6] Language - Update LanguageModel to support ISO settings,
language migration
---
orangecontrib/text/language.py | 51 +++++++++++++++++++++++++++-------
1 file changed, 41 insertions(+), 10 deletions(-)
diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py
index d65c03d29..1250c9757 100644
--- a/orangecontrib/text/language.py
+++ b/orangecontrib/text/language.py
@@ -1,5 +1,5 @@
from collections import Counter
-from typing import Optional
+from typing import Optional, Sequence
from AnyQt.QtCore import Qt
from langdetect import DetectorFactory, detect
@@ -41,7 +41,7 @@
"ga": "Irish",
"gl": "Galician",
"got": "Gothic",
- "grc": "Ancient greek",
+ "grc": "Ancient Greek",
"gu": "Gujarati",
"he": "Hebrew",
"hi": "Hindi",
@@ -104,21 +104,38 @@
None: None,
}
LANG2ISO = {lang: code for code, lang in ISO2LANG.items()}
-DEFAULT_LANGUAGE = "English"
+DEFAULT_LANGUAGE = "en"
class LanguageModel(PyListModel):
"""Model for language selection dropdowns in the widgets"""
- def __init__(self):
- languages = sorted(filter(None, ISO2LANG.values()))
- super().__init__(iterable=[None] + languages)
+ def __init__(
+ self, include_none: bool = False, languages: Optional[Sequence[str]] = None
+ ):
+ """
+ Parameters
+ ----------
+ include_none
+ Whether the "(no language)" item (None) is placed at the top of the list
+ languages
+ ISO codes of the languages available in the dropdown.
+ If None, all add-on supported languages are available.
+ """
+ if languages is None:
+ # languages not provided: take all ISO codes, sorted by language name
+ languages = sorted(filter(None, ISO2LANG), key=ISO2LANG.get)
+ if include_none:
+ languages = [None, *languages]  # unpack: works for any Sequence
+ super().__init__(iterable=languages)
def data(self, index, role=Qt.DisplayRole):
- if index.row() == 0 and role == Qt.DisplayRole:
- return "(no language)"
- else:
- return super().data(index, role)
+ if role == Qt.DisplayRole:
+ value = super().data(index, role)
+ if value is None:
+ return "(no language)"
+ return ISO2LANG[value]
+ return super().data(index, role)
DetectorFactory.seed = 0
@@ -167,3 +184,17 @@ def infer_language_from_variable(variable: DiscreteVariable) -> Optional[str]:
Language ISO code if all documents have the same language, None otherwise
"""
return variable.values[0] if len(variable.values) == 1 else None
+
+
+# this dictionary holds all changes in language names
+LANGUAGE_MIGRATIONS = {
+ "Ancient greek": "Ancient Greek"
+}
+
+
+def migrate_language_name(language: str) -> str:
+ """
+ Some language names were changed after they were introduced in the add-on.
+ This function transforms a language name to its new name, if one exists.
+ """
+ return LANGUAGE_MIGRATIONS.get(language, language)
From a0091e6c63c0c8b67ace34e92d7316c1bf951241 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 26 Jan 2024 12:01:12 +0100
Subject: [PATCH 2/6] Corpus widget - Store ISO language in settings
---
orangecontrib/text/widgets/owcorpus.py | 21 +++++---
.../text/widgets/tests/test_owcorpus.py | 51 ++++++++++++++-----
2 files changed, 53 insertions(+), 19 deletions(-)
diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py
index 6706f3b05..6baa1690d 100644
--- a/orangecontrib/text/widgets/owcorpus.py
+++ b/orangecontrib/text/widgets/owcorpus.py
@@ -18,10 +18,10 @@
from orangecontrib.text.corpus import Corpus, get_sample_corpora_dir
from orangecontrib.text.language import (
- LANG2ISO,
detect_language,
- ISO2LANG,
LanguageModel,
+ LANG2ISO,
+ migrate_language_name,
)
from orangecontrib.text.widgets.utils import widgets, QSize
@@ -106,6 +106,7 @@ class Outputs:
key=list(FileFormat.readers.values()).index)))
settingsHandler = CorpusContextHandler()
+ settings_version = 2
recent_files = Setting([
"book-excerpts.tab",
@@ -116,7 +117,7 @@ class Outputs:
])
used_attrs = ContextSetting([])
title_variable = ContextSetting("")
- language: str = ContextSetting("English")
+ language: str = ContextSetting("en")
class Error(OWWidget.Error):
read_file = Msg("Can't read file ({})")
@@ -163,7 +164,7 @@ def __init__(self):
self,
"language",
label="Language",
- model=LanguageModel(),
+ model=LanguageModel(include_none=True),
sendSelectedValue=True,
**common_settings
)
@@ -253,7 +254,7 @@ def on_done(self, corpus: Corpus) -> None:
return
# set language on Corpus's language (when corpus with already defined
# language opened) or guess language
- self.language = ISO2LANG[corpus.language or detect_language(corpus)]
+ self.language = corpus.language or detect_language(corpus)
self.openContext(self.corpus)
self.used_attrs_model.extend(self.used_attrs)
self.unused_attrs_model.extend(
@@ -341,7 +342,7 @@ def remove_duplicates(l):
self.Error.no_text_features_used()
corpus.set_title_variable(self.title_variable)
- corpus.attributes["language"] = LANG2ISO[self.language]
+ corpus.attributes["language"] = self.language
# prevent sending "empty" corpora
dom = corpus.domain
empty = (
@@ -369,6 +370,14 @@ def describe(features):
('Target', describe(domain.class_vars)),
))
+ @classmethod
+ def migrate_context(cls, context, version):
+ if version < 2:
+ if "language" in context.values:
+ language, type_ = context.values["language"]
+ language = LANG2ISO[migrate_language_name(language)]
+ context.values["language"] = (language, type_)
+
if __name__ == '__main__':
from orangewidget.utils.widgetpreview import WidgetPreview
diff --git a/orangecontrib/text/widgets/tests/test_owcorpus.py b/orangecontrib/text/widgets/tests/test_owcorpus.py
index cef65c2e3..6c7c327e0 100644
--- a/orangecontrib/text/widgets/tests/test_owcorpus.py
+++ b/orangecontrib/text/widgets/tests/test_owcorpus.py
@@ -258,30 +258,30 @@ def test_context(self):
data.attributes["language"] = "sl"
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_finished()
- self.assertEqual("Slovenian", self.widget.language)
+ self.assertEqual("sl", self.widget.language)
self.assertEqual("sl", self.get_output(self.widget.Outputs.corpus).language)
# change language to see if context work later when reopened
simulate.combobox_activate_item(self.widget.controls.language, "Dutch")
- self.assertEqual("Dutch", self.widget.language)
+ self.assertEqual("nl", self.widget.language)
self.assertEqual("nl", self.get_output(self.widget.Outputs.corpus).language)
data1 = Table(Corpus.from_file("deerwester"))
self.send_signal(self.widget.Inputs.data, data1)
self.wait_until_finished()
- self.assertEqual("English", self.widget.language)
+ self.assertEqual("en", self.widget.language)
self.assertEqual("en", self.get_output(self.widget.Outputs.corpus).language)
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_finished()
- self.assertEqual("Dutch", self.widget.language)
+ self.assertEqual("nl", self.widget.language)
self.assertEqual("nl", self.get_output(self.widget.Outputs.corpus).language)
# when corpus on input in different language do not match
data.attributes["language"] = "sk"
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_finished()
- self.assertEqual("Slovak", self.widget.language)
+ self.assertEqual("sk", self.widget.language)
self.assertEqual("sk", self.get_output(self.widget.Outputs.corpus).language)
# different documents in corpus (should not match the context)
@@ -289,7 +289,7 @@ def test_context(self):
data2.attributes["language"] = "sl"
self.send_signal(self.widget.Inputs.data, data2)
self.wait_until_finished()
- self.assertEqual("Slovenian", self.widget.language)
+ self.assertEqual("sl", self.widget.language)
self.assertEqual("sl", self.get_output(self.widget.Outputs.corpus).language)
def test_guess_language(self):
@@ -298,26 +298,26 @@ def test_guess_language(self):
# drop it
data.attributes = {}
# change default to something that is not corpus's language
- self.widget.language = "Slovenian"
+ self.widget.language = "sl"
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_finished()
- self.assertEqual("English", self.widget.language)
+ self.assertEqual("en", self.widget.language)
self.assertEqual("en", self.get_output(self.widget.Outputs.corpus).language)
# change language to see if context work later when reopened
simulate.combobox_activate_item(self.widget.controls.language, "Dutch")
- self.assertEqual("Dutch", self.widget.language)
+ self.assertEqual("nl", self.widget.language)
self.assertEqual("nl", self.get_output(self.widget.Outputs.corpus).language)
data1 = Table(Corpus.from_file("deerwester"))
self.send_signal(self.widget.Inputs.data, data1)
self.wait_until_finished()
- self.assertEqual("English", self.widget.language)
+ self.assertEqual("en", self.widget.language)
self.assertEqual("en", self.get_output(self.widget.Outputs.corpus).language)
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_finished()
- self.assertEqual("Dutch", self.widget.language)
+ self.assertEqual("nl", self.widget.language)
self.assertEqual("nl", self.get_output(self.widget.Outputs.corpus).language)
# different documents in corpus (should not match the context)
@@ -325,7 +325,7 @@ def test_guess_language(self):
data2.attributes["language"] = None
self.send_signal(self.widget.Inputs.data, data2)
self.wait_until_finished()
- self.assertEqual("English", self.widget.language)
+ self.assertEqual("en", self.widget.language)
self.assertEqual("en", self.get_output(self.widget.Outputs.corpus).language)
def test_language_unpickle(self):
@@ -335,7 +335,7 @@ def test_language_unpickle(self):
corpus = Corpus.from_file(file)
self.send_signal(self.widget.Inputs.data, corpus)
self.wait_until_finished()
- self.assertEqual(self.widget.language, "English")
+ self.assertEqual(self.widget.language, "en")
def test_preserve_preprocessing(self):
"""When preprocessed corpus on input preprocessing should be retained"""
@@ -381,6 +381,31 @@ def test_preserve_preprocessing_from_file(self):
res = self.get_output(self.widget.Outputs.corpus)
self.assertTrue(res.has_tokens())
+ def test_migrate_settings(self):
+ corpus = Corpus.from_file("book-excerpts")
+ self.send_signal(self.widget.Inputs.data, corpus)
+ self.wait_until_finished()
+ packed_data = self.widget.settingsHandler.pack_data(self.widget)
+ packed_data["context_settings"][0].values["language"] = ("French", -2)
+ packed_data["context_settings"][0].values["__version__"] = 1
+
+ widget = self.create_widget(OWCorpus, stored_settings=packed_data)
+ self.send_signal(self.widget.Inputs.data, corpus, widget=widget)
+ self.wait_until_finished(widget=widget)
+ self.assertEqual("fr", widget.language)
+
+ packed_data["context_settings"][0].values["language"] = ("Ancient greek", -2)
+ widget = self.create_widget(OWCorpus, stored_settings=packed_data)
+ self.send_signal(self.widget.Inputs.data, corpus, widget=widget)
+ self.wait_until_finished(widget=widget)
+ self.assertEqual("grc", widget.language)
+
+ packed_data["context_settings"][0].values["language"] = (None, -2)
+ widget = self.create_widget(OWCorpus, stored_settings=packed_data)
+ self.send_signal(self.widget.Inputs.data, corpus, widget=widget)
+ self.wait_until_finished(widget=widget)
+ self.assertIsNone(widget.language)
+
if __name__ == "__main__":
unittest.main()
From 476009b4f28e314e8758715a43c5b214b51a04bf Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 26 Jan 2024 12:18:15 +0100
Subject: [PATCH 3/6] Create Corpus - Store ISO language in settings
---
orangecontrib/text/widgets/owcreatecorpus.py | 16 +++++++++++++---
.../text/widgets/tests/test_owcreatecorpus.py | 13 +++++++++++++
2 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/orangecontrib/text/widgets/owcreatecorpus.py b/orangecontrib/text/widgets/owcreatecorpus.py
index 6b759c319..4cbce7ebd 100644
--- a/orangecontrib/text/widgets/owcreatecorpus.py
+++ b/orangecontrib/text/widgets/owcreatecorpus.py
@@ -16,7 +16,9 @@
from orangewidget.settings import Setting
from orangecontrib.text import Corpus
-from orangecontrib.text.language import LANG2ISO, DEFAULT_LANGUAGE, LanguageModel
+from orangecontrib.text.language import (
+ DEFAULT_LANGUAGE, LanguageModel, LANG2ISO, migrate_language_name
+)
class EditorsVerticalScrollArea(gui.VerticalScrollArea):
@@ -78,6 +80,7 @@ class Outputs:
want_main_area = False
+ settings_version = 2
language: str = Setting(DEFAULT_LANGUAGE)
texts: List[Tuple[str, str]] = Setting([("", "")] * 3)
auto_commit: bool = Setting(True)
@@ -90,7 +93,7 @@ def __init__(self):
self.controlArea,
self,
"language",
- model=LanguageModel(),
+ model=LanguageModel(include_none=True),
box="Language",
orientation=Qt.Horizontal,
callback=self.commit.deferred,
@@ -157,7 +160,7 @@ def commit(self):
np.empty((len(self.texts), 0)),
metas=np.array(self.texts),
text_features=[doc_var],
- language=LANG2ISO[self.language],
+ language=self.language,
)
corpus.set_title_variable(title_var)
self.Outputs.corpus.send(corpus)
@@ -165,6 +168,13 @@ def commit(self):
def sizeHint(self) -> QSize:
return QSize(600, 650)
+ @classmethod
+ def migrate_settings(cls, settings, version):
+ if version is None or version < 2:
+ if "language" in settings:
+ language = migrate_language_name(settings["language"])
+ settings["language"] = LANG2ISO[language]
+
if __name__ == "__main__":
from orangewidget.utils.widgetpreview import WidgetPreview
diff --git a/orangecontrib/text/widgets/tests/test_owcreatecorpus.py b/orangecontrib/text/widgets/tests/test_owcreatecorpus.py
index 14dc2e9b9..36d4ca534 100644
--- a/orangecontrib/text/widgets/tests/test_owcreatecorpus.py
+++ b/orangecontrib/text/widgets/tests/test_owcreatecorpus.py
@@ -207,6 +207,19 @@ def test_language(self):
corpus = self.get_output(self.widget.Outputs.corpus)
self.assertEqual("am", corpus.language)
+ def test_migrate_settings(self):
+ settings = {"__version__": 1, "language": "French"}
+ widget = self.create_widget(OWCreateCorpus, stored_settings=settings)
+ self.assertEqual("fr", widget.language)
+
+ settings = {"__version__": 1, "language": "Ancient greek"}
+ widget = self.create_widget(OWCreateCorpus, stored_settings=settings)
+ self.assertEqual("grc", widget.language)
+
+ settings = {"__version__": 1, "language": None}
+ widget = self.create_widget(OWCreateCorpus, stored_settings=settings)
+ self.assertIsNone(widget.language)
+
if __name__ == "__main__":
unittest.main()
From be97bda503096597094fbd7944a9612050c720f3 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 26 Jan 2024 12:50:46 +0100
Subject: [PATCH 4/6] Import Document - Store ISO language in settings
---
.../text/widgets/owimportdocuments.py | 21 ++++++----
.../widgets/tests/test_owimportdocuments.py | 40 +++++++++++++++----
2 files changed, 45 insertions(+), 16 deletions(-)
diff --git a/orangecontrib/text/widgets/owimportdocuments.py b/orangecontrib/text/widgets/owimportdocuments.py
index 478c68920..e70dc81d0 100644
--- a/orangecontrib/text/widgets/owimportdocuments.py
+++ b/orangecontrib/text/widgets/owimportdocuments.py
@@ -47,10 +47,7 @@
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.import_documents import ImportDocuments, NoDocumentsException
from orangecontrib.text.language import (
- ISO2LANG,
- detect_language,
- LANG2ISO,
- LanguageModel,
+ detect_language, LanguageModel, DEFAULT_LANGUAGE, LANG2ISO, migrate_language_name
)
# domain for skipped images output
@@ -124,6 +121,7 @@ class Outputs:
skipped_documents = Output("Skipped documents", Table)
settingsHandler = ImportDocumentContextHandler()
+ settings_version = 2
LOCAL_FILE, URL = range(2)
source = settings.Setting(LOCAL_FILE)
@@ -134,7 +132,7 @@ class Outputs:
lemma_cb = settings.Setting(True)
pos_cb = settings.Setting(False)
ner_cb = settings.Setting(False)
- language: str = settings.ContextSetting("English")
+ language: str = settings.ContextSetting(DEFAULT_LANGUAGE)
want_main_area = False
resizing_enabled = False
@@ -253,7 +251,7 @@ def __init__(self):
self,
"language",
box="Language",
- model=LanguageModel(),
+ model=LanguageModel(include_none=True),
sendSelectedValue=True,
searchable=True,
callback=self.commit,
@@ -665,7 +663,7 @@ def __onRunFinished(self):
self.n_text_data = len(corpus)
self.n_text_categories = len(corpus.domain.class_var.values) \
if corpus.domain.class_var else 0
- self.language = ISO2LANG[corpus.language or detect_language(corpus)]
+ self.language = corpus.language or detect_language(corpus)
self.openContext(corpus)
else:
self.language = None
@@ -727,7 +725,7 @@ def commit(self):
if self.is_conllu:
self.add_features()
if self.corpus:
- self.corpus.attributes["language"] = LANG2ISO[self.language]
+ self.corpus.attributes["language"] = self.language
self.Outputs.data.send(self.corpus)
if self.skipped_documents:
skipped_table = (
@@ -791,6 +789,13 @@ def send_report(self):
items += [('Number of skipped', len(self.skipped_documents))]
self.report_items(items, )
+ @classmethod
+ def migrate_context(cls, context, version):
+ if version < 2:
+ if "language" in context.values:
+ language = LANG2ISO[migrate_language_name(context.values["language"])]
+ context.values["language"] = language
+
class UserInterruptError(BaseException):
"""
diff --git a/orangecontrib/text/widgets/tests/test_owimportdocuments.py b/orangecontrib/text/widgets/tests/test_owimportdocuments.py
index d4997039b..8807f9c6b 100644
--- a/orangecontrib/text/widgets/tests/test_owimportdocuments.py
+++ b/orangecontrib/text/widgets/tests/test_owimportdocuments.py
@@ -16,14 +16,13 @@
class TestOWImportDocuments(WidgetTest):
def setUp(self) -> None:
self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
- path = os.path.join(os.path.dirname(__file__), DATA_PATH)
- self.widget.setCurrentPath(path)
+ self.path = os.path.join(os.path.dirname(__file__), DATA_PATH)
+ self.widget.setCurrentPath(self.path)
self.widget.reload()
self.wait_until_finished()
def test_current_path(self):
- path = os.path.join(os.path.dirname(__file__), DATA_PATH)
- self.assertEqual(path, self.widget.currentPath)
+ self.assertEqual(self.path, self.widget.currentPath)
def test_no_skipped(self):
path = os.path.join(DATA_PATH, "good")
@@ -132,7 +131,7 @@ def test_load_empty_folder(self):
def tests_context(self):
self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
# change default to something else to see if language is changed
- self.widget.language = "Slovenian"
+ self.widget.language = "sl"
path = os.path.join(DATA_PATH, "good")
self.widget.setCurrentPath(path)
@@ -140,11 +139,11 @@ def tests_context(self):
self.wait_until_finished()
# english is recognized for selected documents
- self.assertEqual(self.widget.language, "English")
+ self.assertEqual(self.widget.language, "en")
self.assertEqual("en", self.get_output(self.widget.Outputs.data).language)
simulate.combobox_activate_item(self.widget.controls.language, "Dutch")
- self.assertEqual(self.widget.language, "Dutch")
+ self.assertEqual(self.widget.language, "nl")
self.assertEqual("nl", self.get_output(self.widget.Outputs.data).language)
# read something else
@@ -157,9 +156,34 @@ def tests_context(self):
self.widget.setCurrentPath(path)
self.widget.reload()
self.wait_until_finished()
- self.assertEqual(self.widget.language, "Dutch")
+ self.assertEqual(self.widget.language, "nl")
self.assertEqual("nl", self.get_output(self.widget.Outputs.data).language)
+ def test_migrate_settings(self):
+ packed_data = self.widget.settingsHandler.pack_data(self.widget)
+ packed_data["context_settings"][0].values["language"] = "French"
+ packed_data["context_settings"][0].values["__version__"] = 1
+
+ widget = self.create_widget(OWImportDocuments, stored_settings=packed_data)
+ widget.setCurrentPath(self.path)
+ widget.reload()
+ self.wait_until_finished(widget=widget)
+ self.assertEqual("fr", widget.language)
+
+ packed_data["context_settings"][0].values["language"] = "Ancient greek"
+ widget = self.create_widget(OWImportDocuments, stored_settings=packed_data)
+ widget.setCurrentPath(self.path)
+ widget.reload()
+ self.wait_until_finished(widget=widget)
+ self.assertEqual("grc", widget.language)
+
+ packed_data["context_settings"][0].values["language"] = None
+ widget = self.create_widget(OWImportDocuments, stored_settings=packed_data)
+ widget.setCurrentPath(self.path)
+ widget.reload()
+ self.wait_until_finished(widget=widget)
+ self.assertIsNone(widget.language)
+
if __name__ == "__main__":
unittest.main()
From 8ce3c8b615297ef73f210f734b675cc4a85ca42d Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 26 Jan 2024 13:56:57 +0100
Subject: [PATCH 5/6] Document Embedding - Store ISO language in settings
---
.../text/widgets/owdocumentembedding.py | 27 +++++++++--------
.../widgets/tests/test_owdocumentembedding.py | 30 ++++++++++++-------
2 files changed, 35 insertions(+), 22 deletions(-)
diff --git a/orangecontrib/text/widgets/owdocumentembedding.py b/orangecontrib/text/widgets/owdocumentembedding.py
index c9cf39f01..5445e3db8 100644
--- a/orangecontrib/text/widgets/owdocumentembedding.py
+++ b/orangecontrib/text/widgets/owdocumentembedding.py
@@ -8,7 +8,9 @@
from Orange.widgets.widget import Msg, Output, OWWidget
from orangecontrib.text.corpus import Corpus
-from orangecontrib.text.language import ISO2LANG, LANG2ISO
+from orangecontrib.text.language import (
+ ISO2LANG, DEFAULT_LANGUAGE, LanguageModel, LANG2ISO
+)
from orangecontrib.text.vectorization.document_embedder import (
AGGREGATORS,
AGGREGATORS_ITEMS,
@@ -39,10 +41,9 @@ class OWDocumentEmbedding(OWBaseVectorizer):
priority = 300
buttons_area_orientation = Qt.Vertical
- settings_version = 2
+ settings_version = 3
Methods = [SBERT, DocumentEmbedder]
- DEFAULT_LANGUAGE = "English"
class Outputs(OWBaseVectorizer.Outputs):
skipped = Output("Skipped documents", Corpus)
@@ -84,7 +85,7 @@ def create_configuration_layout(self):
ibox,
self,
"language",
- items=[ISO2LANG[lg] for lg in LANGUAGES],
+ model=LanguageModel(languages=LANGUAGES),
label="Language:",
sendSelectedValue=True, # value is actual string not index
orientation=Qt.Horizontal,
@@ -108,10 +109,10 @@ def create_configuration_layout(self):
def set_data(self, corpus):
# set language from corpus as selected language
if corpus and corpus.language in LANGUAGES:
- self.language = ISO2LANG[corpus.language]
+ self.language = corpus.language
else:
# if Corpus's language not supported use default language
- self.language = self.DEFAULT_LANGUAGE
+ self.language = DEFAULT_LANGUAGE
# when workflow loaded use language saved in workflow
if self.__pending_language is not None:
@@ -127,9 +128,7 @@ def update_method(self):
self.vectorizer = EmbeddingVectorizer(self.init_method(), self.corpus)
def init_method(self):
- params = dict(
- language=LANG2ISO[self.language], aggregator=self.aggregator
- )
+ params = dict(language=self.language, aggregator=self.aggregator)
kwargs = ({}, params)[self.method]
return self.Methods[self.method](**kwargs)
@@ -170,6 +169,9 @@ def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]):
settings["language"] = LANGUAGES[settings["language"]]
if "aggregator" in settings:
settings["aggregator"] = AGGREGATORS[settings["aggregator"]]
+ if (version is None or version < 3) and "language" in settings:
+ # pre-v3 (or unversioned) settings store language names; convert to ISO
+ settings["language"] = LANG2ISO[settings["language"]]
def send_report(self):
if self.method == 0:
@@ -177,11 +179,12 @@ def send_report(self):
("Embedder", "Multilingual SBERT"),
))
if self.method == 1:
- self.report_items((
+ items = (
("Embedder", "fastText"),
- ("Language", self.language),
+ ("Language", ISO2LANG[self.language]),
("Aggregator", self.aggregator),
- ))
+ )
+ self.report_items(items)
if __name__ == "__main__":
diff --git a/orangecontrib/text/widgets/tests/test_owdocumentembedding.py b/orangecontrib/text/widgets/tests/test_owdocumentembedding.py
index 95cbbf5bb..d79333a6b 100644
--- a/orangecontrib/text/widgets/tests/test_owdocumentembedding.py
+++ b/orangecontrib/text/widgets/tests/test_owdocumentembedding.py
@@ -7,8 +7,12 @@
from Orange.widgets.tests.utils import simulate
from Orange.misc.utils.embedder_utils import EmbeddingConnectionError
+from orangecontrib.text.language import DEFAULT_LANGUAGE, ISO2LANG
from orangecontrib.text.tests.test_documentembedder import PATCH_METHOD, make_dummy_post
-from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder
+from orangecontrib.text.vectorization.document_embedder import (
+ DocumentEmbedder,
+ LANGUAGES,
+)
from orangecontrib.text.vectorization.sbert import EMB_DIM, SBERT
from orangecontrib.text.widgets.owdocumentembedding import OWDocumentEmbedding
from orangecontrib.text import Corpus
@@ -157,14 +161,14 @@ def test_corpus_name_preserved(self):
def test_fasttext_language(self):
# english corpus
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.assertEqual("English", self.widget.language)
+ self.assertEqual("en", self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
# slovenian corpus
self.corpus.attributes["language"] = "sl"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.assertEqual("Slovenian", self.widget.language)
+ self.assertEqual("sl", self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
@@ -172,7 +176,7 @@ def test_fasttext_language(self):
self.corpus.attributes["language"] = None
self.send_signal(self.widget.Inputs.corpus, self.corpus)
# use widgets default language English
- self.assertEqual(self.widget.DEFAULT_LANGUAGE, self.widget.language)
+ self.assertEqual(DEFAULT_LANGUAGE, self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
@@ -180,14 +184,14 @@ def test_fasttext_language(self):
self.corpus.attributes["language"] = "be"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
# use widgets default language English
- self.assertEqual(self.widget.DEFAULT_LANGUAGE, self.widget.language)
+ self.assertEqual(DEFAULT_LANGUAGE, self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
# language english
self.corpus.attributes["language"] = "en"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.assertEqual("English", self.widget.language)
+ self.assertEqual("en", self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
@@ -195,25 +199,25 @@ def test_fasttext_language(self):
simulate.combobox_activate_item(
self.widget.controlArea.findChildren(QComboBox)[0], "French"
)
- self.assertEqual("French", self.widget.language)
+ self.assertEqual("fr", self.widget.language)
result = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(9, len(result))
# providing new corpus should reset language
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.assertEqual("English", self.widget.language)
+ self.assertEqual("en", self.widget.language)
def test_language_from_settings(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
simulate.combobox_activate_item(
self.widget.controlArea.findChildren(QComboBox)[0], "French"
)
- self.assertEqual("French", self.widget.language)
+ self.assertEqual("fr", self.widget.language)
settings = self.widget.settingsHandler.pack_data(self.widget)
widget = self.create_widget(OWDocumentEmbedding, stored_settings=settings)
self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
- self.assertEqual("French", widget.language)
+ self.assertEqual("fr", widget.language)
@patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [1.3, 1]}'))
@patch("orangecontrib.text.widgets.owdocumentembedding.OWDocumentEmbedding.report_items")
@@ -232,6 +236,12 @@ def test_report(self, mocked_items: Mock):
mocked_items.assert_called_once()
mocked_items.reset_mock()
+ def test_migrate_settings(self):
+ for iso_lang in LANGUAGES:
+ settings = {"__version__": 2, "language": ISO2LANG[iso_lang]}
+ widget = self.create_widget(OWDocumentEmbedding, stored_settings=settings)
+ self.assertEqual(iso_lang, widget.language)
+
if __name__ == "__main__":
unittest.main()
From c6a70828cdc87d7e7c567252f627235381bf7966 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 26 Jan 2024 14:42:02 +0100
Subject: [PATCH 6/6] Sentiment - Store ISO language in settings
---
.../text/widgets/owsentimentanalysis.py | 56 +++++++++----------
.../widgets/tests/test_owsentimentanalysis.py | 38 +++++++++----
2 files changed, 53 insertions(+), 41 deletions(-)
diff --git a/orangecontrib/text/widgets/owsentimentanalysis.py b/orangecontrib/text/widgets/owsentimentanalysis.py
index 16efb9a49..de06f898d 100644
--- a/orangecontrib/text/widgets/owsentimentanalysis.py
+++ b/orangecontrib/text/widgets/owsentimentanalysis.py
@@ -3,12 +3,14 @@
from AnyQt.QtCore import Qt
from AnyQt.QtWidgets import QGridLayout, QLabel
-from Orange.widgets import gui, settings
+from Orange.widgets import gui
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
from Orange.widgets.utils.signals import Input, Output
from Orange.widgets.widget import OWWidget, Msg
+from orangewidget.settings import Setting
+
from orangecontrib.text import Corpus, preprocess
-from orangecontrib.text.language import ISO2LANG, LANG2ISO
+from orangecontrib.text.language import LanguageModel, LANG2ISO
from orangecontrib.text.sentiment import (
VaderSentiment,
LiuHuSentiment,
@@ -37,24 +39,16 @@ class Inputs:
class Outputs:
corpus = Output("Corpus", Corpus)
- settings_version = 1
+ settings_version = 2
want_main_area = False
resizing_enabled = False
- method_idx: int = settings.Setting(1)
- autocommit: bool = settings.Setting(True)
- liu_language: str = settings.Setting(
- ISO2LANG[LiuHuSentiment.DEFAULT_LANG], schema_only=True
- )
- multi_language: str = settings.Setting(
- ISO2LANG[MultiSentiment.DEFAULT_LANG], schema_only=True
- )
- senti_language: str = settings.Setting(
- ISO2LANG[SentiArt.DEFAULT_LANG], schema_only=True
- )
- lilah_language: str = settings.Setting(
- ISO2LANG[LilahSentiment.DEFAULT_LANG], schema_only=True
- )
+ method_idx: int = Setting(1)
+ autocommit: bool = Setting(True)
+ liu_language: str = Setting(LiuHuSentiment.DEFAULT_LANG, schema_only=True)
+ multi_language: str = Setting(MultiSentiment.DEFAULT_LANG, schema_only=True)
+ senti_language: str = Setting(SentiArt.DEFAULT_LANG, schema_only=True)
+ lilah_language: str = Setting(LilahSentiment.DEFAULT_LANG, schema_only=True)
METHODS = [
LiuHuSentiment,
@@ -99,9 +93,8 @@ def __init__(self):
None,
self,
"liu_language",
- sendSelectedValue=True,
contentsLength=10,
- items=[ISO2LANG[lg] for lg in LiuHuSentiment.LANGUAGES],
+ model=LanguageModel(languages=LiuHuSentiment.LANGUAGES),
callback=self._method_changed,
)
self.vader = gui.appendRadioButton(box, "Vader", addToLayout=False)
@@ -112,9 +105,8 @@ def __init__(self):
None,
self,
"multi_language",
- sendSelectedValue=True,
contentsLength=10,
- items=[ISO2LANG[lg] for lg in MultiSentiment.LANGUAGES],
+ model=LanguageModel(languages=MultiSentiment.LANGUAGES),
callback=self._method_changed,
)
self.senti_art = gui.appendRadioButton(box, "SentiArt", addToLayout=False)
@@ -124,7 +116,7 @@ def __init__(self):
"senti_language",
sendSelectedValue=True,
contentsLength=10,
- items=[ISO2LANG[lg] for lg in SentiArt.LANGUAGES],
+ model=LanguageModel(languages=SentiArt.LANGUAGES),
callback=self._method_changed,
)
self.lilah_sent = gui.appendRadioButton(
@@ -134,9 +126,8 @@ def __init__(self):
None,
self,
"lilah_language",
- sendSelectedValue=True,
contentsLength=10,
- items=[ISO2LANG[lg] for lg in LilahSentiment.LANGUAGES],
+ model=LanguageModel(languages=LilahSentiment.LANGUAGES),
callback=self._method_changed,
)
self.custom_list = gui.appendRadioButton(
@@ -228,10 +219,10 @@ def __set_language_settings(self):
for l_pending, l_setting, model in settings_:
if self.pp_corpus and self.pp_corpus.language in model.LANGUAGES:
- setattr(self, l_setting, ISO2LANG[self.pp_corpus.language])
+ setattr(self, l_setting, self.pp_corpus.language)
else:
# if Corpus's language not supported use default language
- setattr(self, l_setting, ISO2LANG[model.DEFAULT_LANG])
+ setattr(self, l_setting, model.DEFAULT_LANG)
# when workflow loaded use language saved in workflow
if l_pending is not None:
@@ -249,13 +240,13 @@ def _compute_sentiment(self):
method = self.METHODS[self.method_idx]
kwargs = {}
if method.name == "Liu Hu":
- kwargs = dict(language=LANG2ISO[self.liu_language])
+ kwargs = dict(language=self.liu_language)
elif method.name == "Multilingual Sentiment":
- kwargs = dict(language=LANG2ISO[self.multi_language])
+ kwargs = dict(language=self.multi_language)
elif method.name == "SentiArt":
- kwargs = dict(language=LANG2ISO[self.senti_language])
+ kwargs = dict(language=self.senti_language)
elif method.name == "LiLaH Sentiment":
- kwargs = dict(language=LANG2ISO[self.lilah_language])
+ kwargs = dict(language=self.lilah_language)
elif method.name == "Custom Dictionaries":
kwargs = dict(pos=self.pos_file, neg=self.neg_file)
if bool(self.pos_file) != bool(self.neg_file): # xor: one of them None
@@ -313,6 +304,11 @@ def migrate_settings(cls, settings, version):
method_idx = settings["method_idx"]
if method_idx == 4:
settings["metric_idx"] = 5
+ if version is None or version < 2:
+ s = ("liu_language", "lilah_language", "multi_language", "senti_language")
+ for lang_set in s:
+ if lang_set in settings:
+ settings[lang_set] = LANG2ISO[settings[lang_set]]
if __name__ == '__main__':
diff --git a/orangecontrib/text/widgets/tests/test_owsentimentanalysis.py b/orangecontrib/text/widgets/tests/test_owsentimentanalysis.py
index 1ff2dddc0..8d51f237b 100644
--- a/orangecontrib/text/widgets/tests/test_owsentimentanalysis.py
+++ b/orangecontrib/text/widgets/tests/test_owsentimentanalysis.py
@@ -12,7 +12,9 @@
from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.language import ISO2LANG
-from orangecontrib.text.sentiment import DictionaryNotFound
+from orangecontrib.text.sentiment import (
+ DictionaryNotFound, LiuHuSentiment, MultiSentiment, SentiArt, LilahSentiment
+)
from orangecontrib.text.widgets.owsentimentanalysis import OWSentimentAnalysis
MS_FILES = [
@@ -164,6 +166,20 @@ def test_migrates_settings(self):
OWSentimentAnalysis.migrate_settings(settings, version=None)
self.assertTrue(settings.get("method_idx", 5))
+    def test_migrate_language_settings(self):
+        """Version-1 settings that store language names must load as ISO codes."""
+        methods = (
+            ("liu_language", LiuHuSentiment),
+            ("multi_language", MultiSentiment),
+            ("senti_language", SentiArt),
+            ("lilah_language", LilahSentiment),
+        )
+        for setting, method in methods:
+            for lang in method.LANGUAGES:  # no hasattr guard: a missing list must fail
+                se = {setting: ISO2LANG[lang], "__version__": 1}
+                widget = self.create_widget(OWSentimentAnalysis, stored_settings=se)
+                self.assertEqual(lang, getattr(widget, setting), (setting, lang))
+
def test_preprocessed(self):
widget = self.create_widget(OWSentimentAnalysis)
corpus = self.corpus.copy()
@@ -184,7 +200,7 @@ def test_language_from_corpus(self):
w = self.widget
settings = [
w.liu_language,
- "English",
+ "en",
w.multi_language,
w.senti_language,
w.lilah_language,
@@ -198,7 +214,7 @@ def test_language_from_corpus(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.widget.findChildren(QRadioButton)[i].click()
self.assertIsNotNone(self.get_output(self.widget.Outputs.corpus))
- self.assertEqual(ISO2LANG[s], sett)
+ self.assertEqual(s, sett)
# try with unsupported language - use default language istead
self.corpus.attributes["language"] = ns
@@ -213,18 +229,18 @@ def test_language_from_settings(self):
simulate.combobox_activate_item(self.widget.senti_box, "German")
simulate.combobox_activate_item(self.widget.lilah_box, "Croatian")
- self.assertEqual("Slovenian", self.widget.liu_language)
- self.assertEqual("Spanish", self.widget.multi_language)
- self.assertEqual("German", self.widget.senti_language)
- self.assertEqual("Croatian", self.widget.lilah_language)
+ self.assertEqual("sl", self.widget.liu_language)
+ self.assertEqual("es", self.widget.multi_language)
+ self.assertEqual("de", self.widget.senti_language)
+ self.assertEqual("hr", self.widget.lilah_language)
settings = self.widget.settingsHandler.pack_data(self.widget)
widget = self.create_widget(OWSentimentAnalysis, stored_settings=settings)
self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
- self.assertEqual("Slovenian", widget.liu_language)
- self.assertEqual("Spanish", widget.multi_language)
- self.assertEqual("German", widget.senti_language)
- self.assertEqual("Croatian", widget.lilah_language)
+ self.assertEqual("sl", widget.liu_language)
+ self.assertEqual("es", widget.multi_language)
+ self.assertEqual("de", widget.senti_language)
+ self.assertEqual("hr", widget.lilah_language)
def test_dictionary_offline(self):
"""Test case when offline and dictionary not found locally"""