From 00e49033374aedf526e03990fdb58ff2b71cd35d Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 26 Jan 2024 12:00:11 +0100 Subject: [PATCH 1/6] Language - Update LanguageModel to support ISO settings, language migration --- orangecontrib/text/language.py | 51 +++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py index d65c03d29..1250c9757 100644 --- a/orangecontrib/text/language.py +++ b/orangecontrib/text/language.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import Optional +from typing import Optional, Sequence from AnyQt.QtCore import Qt from langdetect import DetectorFactory, detect @@ -41,7 +41,7 @@ "ga": "Irish", "gl": "Galician", "got": "Gothic", - "grc": "Ancient greek", + "grc": "Ancient Greek", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", @@ -104,21 +104,38 @@ None: None, } LANG2ISO = {lang: code for code, lang in ISO2LANG.items()} -DEFAULT_LANGUAGE = "English" +DEFAULT_LANGUAGE = "en" class LanguageModel(PyListModel): """Model for language selection dropdowns in the widgets""" - def __init__(self): - languages = sorted(filter(None, ISO2LANG.values())) - super().__init__(iterable=[None] + languages) + def __init__( + self, include_none: bool = False, languages: Optional[Sequence[str]] = None + ): + """ + Parameters + ---------- + include_none + Indicates if "(no language)" value is available on the top of the list + languages + List of languages available in the dropdown. + If None all add-on supported languages are available. 
+ """ + if languages is None: + # if languages not provided take all available languages + languages = sorted(filter(None, ISO2LANG), key=ISO2LANG.get) + if include_none: + languages = [None] + languages + super().__init__(iterable=languages) def data(self, index, role=Qt.DisplayRole): - if index.row() == 0 and role == Qt.DisplayRole: - return "(no language)" - else: - return super().data(index, role) + if role == Qt.DisplayRole: + value = super().data(index, role) + if value is None: + return "(no language)" + return ISO2LANG[value] + return super().data(index, role) DetectorFactory.seed = 0 @@ -167,3 +184,17 @@ def infer_language_from_variable(variable: DiscreteVariable) -> Optional[str]: Language ISO code if all documents have the same language, None otherwise """ return variable.values[0] if len(variable.values) == 1 else None + + +# this dictionary holds all changes in language names +LANGUAGE_MIGRATIONS = { + "Ancient greek": "Ancient Greek" +} + + +def migrate_language_name(language: str) -> str: + """ + We changed some language names after they were introduced in the add-on. + This function transforms any language name to its new name if one exists. 
+ """ + return LANGUAGE_MIGRATIONS.get(language, language) From a0091e6c63c0c8b67ace34e92d7316c1bf951241 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 26 Jan 2024 12:01:12 +0100 Subject: [PATCH 2/6] Corpus widget - Store ISO language in settings --- orangecontrib/text/widgets/owcorpus.py | 21 +++++--- .../text/widgets/tests/test_owcorpus.py | 51 ++++++++++++++----- 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py index 6706f3b05..6baa1690d 100644 --- a/orangecontrib/text/widgets/owcorpus.py +++ b/orangecontrib/text/widgets/owcorpus.py @@ -18,10 +18,10 @@ from orangecontrib.text.corpus import Corpus, get_sample_corpora_dir from orangecontrib.text.language import ( - LANG2ISO, detect_language, - ISO2LANG, LanguageModel, + LANG2ISO, + migrate_language_name, ) from orangecontrib.text.widgets.utils import widgets, QSize @@ -106,6 +106,7 @@ class Outputs: key=list(FileFormat.readers.values()).index))) settingsHandler = CorpusContextHandler() + settings_version = 2 recent_files = Setting([ "book-excerpts.tab", @@ -116,7 +117,7 @@ class Outputs: ]) used_attrs = ContextSetting([]) title_variable = ContextSetting("") - language: str = ContextSetting("English") + language: str = ContextSetting("en") class Error(OWWidget.Error): read_file = Msg("Can't read file ({})") @@ -163,7 +164,7 @@ def __init__(self): self, "language", label="Language", - model=LanguageModel(), + model=LanguageModel(include_none=True), sendSelectedValue=True, **common_settings ) @@ -253,7 +254,7 @@ def on_done(self, corpus: Corpus) -> None: return # set language on Corpus's language (when corpus with already defined # language opened) or guess language - self.language = ISO2LANG[corpus.language or detect_language(corpus)] + self.language = corpus.language or detect_language(corpus) self.openContext(self.corpus) self.used_attrs_model.extend(self.used_attrs) self.unused_attrs_model.extend( @@ -341,7 
+342,7 @@ def remove_duplicates(l): self.Error.no_text_features_used() corpus.set_title_variable(self.title_variable) - corpus.attributes["language"] = LANG2ISO[self.language] + corpus.attributes["language"] = self.language # prevent sending "empty" corpora dom = corpus.domain empty = ( @@ -369,6 +370,14 @@ def describe(features): ('Target', describe(domain.class_vars)), )) + @classmethod + def migrate_context(cls, context, version): + if version < 2: + if "language" in context.values: + language, type_ = context.values["language"] + language = LANG2ISO[migrate_language_name(language)] + context.values["language"] = (language, type_) + if __name__ == '__main__': from orangewidget.utils.widgetpreview import WidgetPreview diff --git a/orangecontrib/text/widgets/tests/test_owcorpus.py b/orangecontrib/text/widgets/tests/test_owcorpus.py index cef65c2e3..6c7c327e0 100644 --- a/orangecontrib/text/widgets/tests/test_owcorpus.py +++ b/orangecontrib/text/widgets/tests/test_owcorpus.py @@ -258,30 +258,30 @@ def test_context(self): data.attributes["language"] = "sl" self.send_signal(self.widget.Inputs.data, data) self.wait_until_finished() - self.assertEqual("Slovenian", self.widget.language) + self.assertEqual("sl", self.widget.language) self.assertEqual("sl", self.get_output(self.widget.Outputs.corpus).language) # change language to see if context work later when reopened simulate.combobox_activate_item(self.widget.controls.language, "Dutch") - self.assertEqual("Dutch", self.widget.language) + self.assertEqual("nl", self.widget.language) self.assertEqual("nl", self.get_output(self.widget.Outputs.corpus).language) data1 = Table(Corpus.from_file("deerwester")) self.send_signal(self.widget.Inputs.data, data1) self.wait_until_finished() - self.assertEqual("English", self.widget.language) + self.assertEqual("en", self.widget.language) self.assertEqual("en", self.get_output(self.widget.Outputs.corpus).language) self.send_signal(self.widget.Inputs.data, data) 
self.wait_until_finished() - self.assertEqual("Dutch", self.widget.language) + self.assertEqual("nl", self.widget.language) self.assertEqual("nl", self.get_output(self.widget.Outputs.corpus).language) # when corpus on input in different language do not match data.attributes["language"] = "sk" self.send_signal(self.widget.Inputs.data, data) self.wait_until_finished() - self.assertEqual("Slovak", self.widget.language) + self.assertEqual("sk", self.widget.language) self.assertEqual("sk", self.get_output(self.widget.Outputs.corpus).language) # different documents in corpus (should not match the context) @@ -289,7 +289,7 @@ def test_context(self): data2.attributes["language"] = "sl" self.send_signal(self.widget.Inputs.data, data2) self.wait_until_finished() - self.assertEqual("Slovenian", self.widget.language) + self.assertEqual("sl", self.widget.language) self.assertEqual("sl", self.get_output(self.widget.Outputs.corpus).language) def test_guess_language(self): @@ -298,26 +298,26 @@ def test_guess_language(self): # drop it data.attributes = {} # change default to something that is not corpus's language - self.widget.language = "Slovenian" + self.widget.language = "sl" self.send_signal(self.widget.Inputs.data, data) self.wait_until_finished() - self.assertEqual("English", self.widget.language) + self.assertEqual("en", self.widget.language) self.assertEqual("en", self.get_output(self.widget.Outputs.corpus).language) # change language to see if context work later when reopened simulate.combobox_activate_item(self.widget.controls.language, "Dutch") - self.assertEqual("Dutch", self.widget.language) + self.assertEqual("nl", self.widget.language) self.assertEqual("nl", self.get_output(self.widget.Outputs.corpus).language) data1 = Table(Corpus.from_file("deerwester")) self.send_signal(self.widget.Inputs.data, data1) self.wait_until_finished() - self.assertEqual("English", self.widget.language) + self.assertEqual("en", self.widget.language) self.assertEqual("en", 
self.get_output(self.widget.Outputs.corpus).language) self.send_signal(self.widget.Inputs.data, data) self.wait_until_finished() - self.assertEqual("Dutch", self.widget.language) + self.assertEqual("nl", self.widget.language) self.assertEqual("nl", self.get_output(self.widget.Outputs.corpus).language) # different documents in corpus (should not match the context) @@ -325,7 +325,7 @@ def test_guess_language(self): data2.attributes["language"] = None self.send_signal(self.widget.Inputs.data, data2) self.wait_until_finished() - self.assertEqual("English", self.widget.language) + self.assertEqual("en", self.widget.language) self.assertEqual("en", self.get_output(self.widget.Outputs.corpus).language) def test_language_unpickle(self): @@ -335,7 +335,7 @@ def test_language_unpickle(self): corpus = Corpus.from_file(file) self.send_signal(self.widget.Inputs.data, corpus) self.wait_until_finished() - self.assertEqual(self.widget.language, "English") + self.assertEqual(self.widget.language, "en") def test_preserve_preprocessing(self): """When preprocessed corpus on input preprocessing should be retained""" @@ -381,6 +381,31 @@ def test_preserve_preprocessing_from_file(self): res = self.get_output(self.widget.Outputs.corpus) self.assertTrue(res.has_tokens()) + def test_migrate_settings(self): + corpus = Corpus.from_file("book-excerpts") + self.send_signal(self.widget.Inputs.data, corpus) + self.wait_until_finished() + packed_data = self.widget.settingsHandler.pack_data(self.widget) + packed_data["context_settings"][0].values["language"] = ("French", -2) + packed_data["context_settings"][0].values["__version__"] = 1 + + widget = self.create_widget(OWCorpus, stored_settings=packed_data) + self.send_signal(self.widget.Inputs.data, corpus, widget=widget) + self.wait_until_finished(widget=widget) + self.assertEqual("fr", widget.language) + + packed_data["context_settings"][0].values["language"] = ("Ancient greek", -2) + widget = self.create_widget(OWCorpus, 
stored_settings=packed_data) + self.send_signal(self.widget.Inputs.data, corpus, widget=widget) + self.wait_until_finished(widget=widget) + self.assertEqual("grc", widget.language) + + packed_data["context_settings"][0].values["language"] = (None, -2) + widget = self.create_widget(OWCorpus, stored_settings=packed_data) + self.send_signal(self.widget.Inputs.data, corpus, widget=widget) + self.wait_until_finished(widget=widget) + self.assertIsNone(widget.language) + if __name__ == "__main__": unittest.main() From 476009b4f28e314e8758715a43c5b214b51a04bf Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 26 Jan 2024 12:18:15 +0100 Subject: [PATCH 3/6] Create Corpus - Store ISO language in settings --- orangecontrib/text/widgets/owcreatecorpus.py | 16 +++++++++++++--- .../text/widgets/tests/test_owcreatecorpus.py | 13 +++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/orangecontrib/text/widgets/owcreatecorpus.py b/orangecontrib/text/widgets/owcreatecorpus.py index 6b759c319..4cbce7ebd 100644 --- a/orangecontrib/text/widgets/owcreatecorpus.py +++ b/orangecontrib/text/widgets/owcreatecorpus.py @@ -16,7 +16,9 @@ from orangewidget.settings import Setting from orangecontrib.text import Corpus -from orangecontrib.text.language import LANG2ISO, DEFAULT_LANGUAGE, LanguageModel +from orangecontrib.text.language import ( + DEFAULT_LANGUAGE, LanguageModel, LANG2ISO, migrate_language_name +) class EditorsVerticalScrollArea(gui.VerticalScrollArea): @@ -78,6 +80,7 @@ class Outputs: want_main_area = False + settings_version = 2 language: str = Setting(DEFAULT_LANGUAGE) texts: List[Tuple[str, str]] = Setting([("", "")] * 3) auto_commit: bool = Setting(True) @@ -90,7 +93,7 @@ def __init__(self): self.controlArea, self, "language", - model=LanguageModel(), + model=LanguageModel(include_none=True), box="Language", orientation=Qt.Horizontal, callback=self.commit.deferred, @@ -157,7 +160,7 @@ def commit(self): np.empty((len(self.texts), 0)), 
metas=np.array(self.texts), text_features=[doc_var], - language=LANG2ISO[self.language], + language=self.language, ) corpus.set_title_variable(title_var) self.Outputs.corpus.send(corpus) @@ -165,6 +168,13 @@ def commit(self): def sizeHint(self) -> QSize: return QSize(600, 650) + @classmethod + def migrate_settings(cls, settings, version): + if version is None or version < 2: + if "language" in settings: + language = migrate_language_name(settings["language"]) + settings["language"] = LANG2ISO[language] + if __name__ == "__main__": from orangewidget.utils.widgetpreview import WidgetPreview diff --git a/orangecontrib/text/widgets/tests/test_owcreatecorpus.py b/orangecontrib/text/widgets/tests/test_owcreatecorpus.py index 14dc2e9b9..36d4ca534 100644 --- a/orangecontrib/text/widgets/tests/test_owcreatecorpus.py +++ b/orangecontrib/text/widgets/tests/test_owcreatecorpus.py @@ -207,6 +207,19 @@ def test_language(self): corpus = self.get_output(self.widget.Outputs.corpus) self.assertEqual("am", corpus.language) + def test_migrate_settings(self): + settings = {"__version__": 1, "language": "French"} + widget = self.create_widget(OWCreateCorpus, stored_settings=settings) + self.assertEqual("fr", widget.language) + + settings = {"__version__": 1, "language": "Ancient greek"} + widget = self.create_widget(OWCreateCorpus, stored_settings=settings) + self.assertEqual("grc", widget.language) + + settings = {"__version__": 1, "language": None} + widget = self.create_widget(OWCreateCorpus, stored_settings=settings) + self.assertIsNone(widget.language) + if __name__ == "__main__": unittest.main() From be97bda503096597094fbd7944a9612050c720f3 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 26 Jan 2024 12:50:46 +0100 Subject: [PATCH 4/6] Import Document - Store ISO language in settings --- .../text/widgets/owimportdocuments.py | 21 ++++++---- .../widgets/tests/test_owimportdocuments.py | 40 +++++++++++++++---- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git 
a/orangecontrib/text/widgets/owimportdocuments.py b/orangecontrib/text/widgets/owimportdocuments.py index 478c68920..e70dc81d0 100644 --- a/orangecontrib/text/widgets/owimportdocuments.py +++ b/orangecontrib/text/widgets/owimportdocuments.py @@ -47,10 +47,7 @@ from orangecontrib.text.corpus import Corpus from orangecontrib.text.import_documents import ImportDocuments, NoDocumentsException from orangecontrib.text.language import ( - ISO2LANG, - detect_language, - LANG2ISO, - LanguageModel, + detect_language, LanguageModel, DEFAULT_LANGUAGE, LANG2ISO, migrate_language_name ) # domain for skipped images output @@ -124,6 +121,7 @@ class Outputs: skipped_documents = Output("Skipped documents", Table) settingsHandler = ImportDocumentContextHandler() + settings_version = 2 LOCAL_FILE, URL = range(2) source = settings.Setting(LOCAL_FILE) @@ -134,7 +132,7 @@ class Outputs: lemma_cb = settings.Setting(True) pos_cb = settings.Setting(False) ner_cb = settings.Setting(False) - language: str = settings.ContextSetting("English") + language: str = settings.ContextSetting(DEFAULT_LANGUAGE) want_main_area = False resizing_enabled = False @@ -253,7 +251,7 @@ def __init__(self): self, "language", box="Language", - model=LanguageModel(), + model=LanguageModel(include_none=True), sendSelectedValue=True, searchable=True, callback=self.commit, @@ -665,7 +663,7 @@ def __onRunFinished(self): self.n_text_data = len(corpus) self.n_text_categories = len(corpus.domain.class_var.values) \ if corpus.domain.class_var else 0 - self.language = ISO2LANG[corpus.language or detect_language(corpus)] + self.language = corpus.language or detect_language(corpus) self.openContext(corpus) else: self.language = None @@ -727,7 +725,7 @@ def commit(self): if self.is_conllu: self.add_features() if self.corpus: - self.corpus.attributes["language"] = LANG2ISO[self.language] + self.corpus.attributes["language"] = self.language self.Outputs.data.send(self.corpus) if self.skipped_documents: skipped_table = ( @@ 
-791,6 +789,13 @@ def send_report(self): items += [('Number of skipped', len(self.skipped_documents))] self.report_items(items, ) + @classmethod + def migrate_context(cls, context, version): + if version < 2: + if "language" in context.values: + language = LANG2ISO[migrate_language_name(context.values["language"])] + context.values["language"] = language + class UserInterruptError(BaseException): """ diff --git a/orangecontrib/text/widgets/tests/test_owimportdocuments.py b/orangecontrib/text/widgets/tests/test_owimportdocuments.py index d4997039b..8807f9c6b 100644 --- a/orangecontrib/text/widgets/tests/test_owimportdocuments.py +++ b/orangecontrib/text/widgets/tests/test_owimportdocuments.py @@ -16,14 +16,13 @@ class TestOWImportDocuments(WidgetTest): def setUp(self) -> None: self.widget: OWImportDocuments = self.create_widget(OWImportDocuments) - path = os.path.join(os.path.dirname(__file__), DATA_PATH) - self.widget.setCurrentPath(path) + self.path = os.path.join(os.path.dirname(__file__), DATA_PATH) + self.widget.setCurrentPath(self.path) self.widget.reload() self.wait_until_finished() def test_current_path(self): - path = os.path.join(os.path.dirname(__file__), DATA_PATH) - self.assertEqual(path, self.widget.currentPath) + self.assertEqual(self.path, self.widget.currentPath) def test_no_skipped(self): path = os.path.join(DATA_PATH, "good") @@ -132,7 +131,7 @@ def test_load_empty_folder(self): def tests_context(self): self.widget: OWImportDocuments = self.create_widget(OWImportDocuments) # change default to something else to see if language is changed - self.widget.language = "Slovenian" + self.widget.language = "sl" path = os.path.join(DATA_PATH, "good") self.widget.setCurrentPath(path) @@ -140,11 +139,11 @@ def tests_context(self): self.wait_until_finished() # english is recognized for selected documents - self.assertEqual(self.widget.language, "English") + self.assertEqual(self.widget.language, "en") self.assertEqual("en", 
self.get_output(self.widget.Outputs.data).language) simulate.combobox_activate_item(self.widget.controls.language, "Dutch") - self.assertEqual(self.widget.language, "Dutch") + self.assertEqual(self.widget.language, "nl") self.assertEqual("nl", self.get_output(self.widget.Outputs.data).language) # read something else @@ -157,9 +156,34 @@ def tests_context(self): self.widget.setCurrentPath(path) self.widget.reload() self.wait_until_finished() - self.assertEqual(self.widget.language, "Dutch") + self.assertEqual(self.widget.language, "nl") self.assertEqual("nl", self.get_output(self.widget.Outputs.data).language) + def test_migrate_settings(self): + packed_data = self.widget.settingsHandler.pack_data(self.widget) + packed_data["context_settings"][0].values["language"] = "French" + packed_data["context_settings"][0].values["__version__"] = 1 + + widget = self.create_widget(OWImportDocuments, stored_settings=packed_data) + widget.setCurrentPath(self.path) + widget.reload() + self.wait_until_finished(widget=widget) + self.assertEqual("fr", widget.language) + + packed_data["context_settings"][0].values["language"] = "Ancient greek" + widget = self.create_widget(OWImportDocuments, stored_settings=packed_data) + widget.setCurrentPath(self.path) + widget.reload() + self.wait_until_finished(widget=widget) + self.assertEqual("grc", widget.language) + + packed_data["context_settings"][0].values["language"] = None + widget = self.create_widget(OWImportDocuments, stored_settings=packed_data) + widget.setCurrentPath(self.path) + widget.reload() + self.wait_until_finished(widget=widget) + self.assertIsNone(widget.language) + if __name__ == "__main__": unittest.main() From 8ce3c8b615297ef73f210f734b675cc4a85ca42d Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 26 Jan 2024 13:56:57 +0100 Subject: [PATCH 5/6] Document Embedding - Store ISO language in settings --- .../text/widgets/owdocumentembedding.py | 27 +++++++++-------- .../widgets/tests/test_owdocumentembedding.py | 30 
++++++++++++------- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/orangecontrib/text/widgets/owdocumentembedding.py b/orangecontrib/text/widgets/owdocumentembedding.py index c9cf39f01..5445e3db8 100644 --- a/orangecontrib/text/widgets/owdocumentembedding.py +++ b/orangecontrib/text/widgets/owdocumentembedding.py @@ -8,7 +8,9 @@ from Orange.widgets.widget import Msg, Output, OWWidget from orangecontrib.text.corpus import Corpus -from orangecontrib.text.language import ISO2LANG, LANG2ISO +from orangecontrib.text.language import ( + ISO2LANG, DEFAULT_LANGUAGE, LanguageModel, LANG2ISO +) from orangecontrib.text.vectorization.document_embedder import ( AGGREGATORS, AGGREGATORS_ITEMS, @@ -39,10 +41,9 @@ class OWDocumentEmbedding(OWBaseVectorizer): priority = 300 buttons_area_orientation = Qt.Vertical - settings_version = 2 + settings_version = 3 Methods = [SBERT, DocumentEmbedder] - DEFAULT_LANGUAGE = "English" class Outputs(OWBaseVectorizer.Outputs): skipped = Output("Skipped documents", Corpus) @@ -84,7 +85,7 @@ def create_configuration_layout(self): ibox, self, "language", - items=[ISO2LANG[lg] for lg in LANGUAGES], + model=LanguageModel(languages=LANGUAGES), label="Language:", sendSelectedValue=True, # value is actual string not index orientation=Qt.Horizontal, @@ -108,10 +109,10 @@ def create_configuration_layout(self): def set_data(self, corpus): # set language from corpus as selected language if corpus and corpus.language in LANGUAGES: - self.language = ISO2LANG[corpus.language] + self.language = corpus.language else: # if Corpus's language not supported use default language - self.language = self.DEFAULT_LANGUAGE + self.language = DEFAULT_LANGUAGE # when workflow loaded use language saved in workflow if self.__pending_language is not None: @@ -127,9 +128,7 @@ def update_method(self): self.vectorizer = EmbeddingVectorizer(self.init_method(), self.corpus) def init_method(self): - params = dict( - language=LANG2ISO[self.language], 
aggregator=self.aggregator - ) + params = dict(language=self.language, aggregator=self.aggregator) kwargs = ({}, params)[self.method] return self.Methods[self.method](**kwargs) @@ -170,6 +169,9 @@ def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]): settings["language"] = LANGUAGES[settings["language"]] if "aggregator" in settings: settings["aggregator"] = AGGREGATORS[settings["aggregator"]] + if version is None or version < 3 and "language" in settings: + # before version 3 language settings were language names, transform to ISO + settings["language"] = LANG2ISO[settings["language"]] def send_report(self): if self.method == 0: @@ -177,11 +179,12 @@ def send_report(self): ("Embedder", "Multilingual SBERT"), )) if self.method == 1: - self.report_items(( + items = ( ("Embedder", "fastText"), - ("Language", self.language), + ("Language", ISO2LANG[self.language]), ("Aggregator", self.aggregator), - )) + ) + self.report_items(items) if __name__ == "__main__": diff --git a/orangecontrib/text/widgets/tests/test_owdocumentembedding.py b/orangecontrib/text/widgets/tests/test_owdocumentembedding.py index 95cbbf5bb..d79333a6b 100644 --- a/orangecontrib/text/widgets/tests/test_owdocumentembedding.py +++ b/orangecontrib/text/widgets/tests/test_owdocumentembedding.py @@ -7,8 +7,12 @@ from Orange.widgets.tests.utils import simulate from Orange.misc.utils.embedder_utils import EmbeddingConnectionError +from orangecontrib.text.language import DEFAULT_LANGUAGE, ISO2LANG from orangecontrib.text.tests.test_documentembedder import PATCH_METHOD, make_dummy_post -from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder +from orangecontrib.text.vectorization.document_embedder import ( + DocumentEmbedder, + LANGUAGES, +) from orangecontrib.text.vectorization.sbert import EMB_DIM, SBERT from orangecontrib.text.widgets.owdocumentembedding import OWDocumentEmbedding from orangecontrib.text import Corpus @@ -157,14 +161,14 @@ def 
test_corpus_name_preserved(self): def test_fasttext_language(self): # english corpus self.send_signal(self.widget.Inputs.corpus, self.corpus) - self.assertEqual("English", self.widget.language) + self.assertEqual("en", self.widget.language) result = self.get_output(self.widget.Outputs.corpus) self.assertEqual(9, len(result)) # slovenian corpus self.corpus.attributes["language"] = "sl" self.send_signal(self.widget.Inputs.corpus, self.corpus) - self.assertEqual("Slovenian", self.widget.language) + self.assertEqual("sl", self.widget.language) result = self.get_output(self.widget.Outputs.corpus) self.assertEqual(9, len(result)) @@ -172,7 +176,7 @@ def test_fasttext_language(self): self.corpus.attributes["language"] = None self.send_signal(self.widget.Inputs.corpus, self.corpus) # use widgets default language English - self.assertEqual(self.widget.DEFAULT_LANGUAGE, self.widget.language) + self.assertEqual(DEFAULT_LANGUAGE, self.widget.language) result = self.get_output(self.widget.Outputs.corpus) self.assertEqual(9, len(result)) @@ -180,14 +184,14 @@ def test_fasttext_language(self): self.corpus.attributes["language"] = "be" self.send_signal(self.widget.Inputs.corpus, self.corpus) # use widgets default language English - self.assertEqual(self.widget.DEFAULT_LANGUAGE, self.widget.language) + self.assertEqual(DEFAULT_LANGUAGE, self.widget.language) result = self.get_output(self.widget.Outputs.corpus) self.assertEqual(9, len(result)) # language english self.corpus.attributes["language"] = "en" self.send_signal(self.widget.Inputs.corpus, self.corpus) - self.assertEqual("English", self.widget.language) + self.assertEqual("en", self.widget.language) result = self.get_output(self.widget.Outputs.corpus) self.assertEqual(9, len(result)) @@ -195,25 +199,25 @@ def test_fasttext_language(self): simulate.combobox_activate_item( self.widget.controlArea.findChildren(QComboBox)[0], "French" ) - self.assertEqual("French", self.widget.language) + self.assertEqual("fr", 
self.widget.language) result = self.get_output(self.widget.Outputs.corpus) self.assertEqual(9, len(result)) # providing new corpus should reset language self.send_signal(self.widget.Inputs.corpus, self.corpus) - self.assertEqual("English", self.widget.language) + self.assertEqual("en", self.widget.language) def test_language_from_settings(self): self.send_signal(self.widget.Inputs.corpus, self.corpus) simulate.combobox_activate_item( self.widget.controlArea.findChildren(QComboBox)[0], "French" ) - self.assertEqual("French", self.widget.language) + self.assertEqual("fr", self.widget.language) settings = self.widget.settingsHandler.pack_data(self.widget) widget = self.create_widget(OWDocumentEmbedding, stored_settings=settings) self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget) - self.assertEqual("French", widget.language) + self.assertEqual("fr", widget.language) @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [1.3, 1]}')) @patch("orangecontrib.text.widgets.owdocumentembedding.OWDocumentEmbedding.report_items") @@ -232,6 +236,12 @@ def test_report(self, mocked_items: Mock): mocked_items.assert_called_once() mocked_items.reset_mock() + def test_migrate_settings(self): + for iso_lang in LANGUAGES: + settings = {"__version__": 2, "language": ISO2LANG[iso_lang]} + widget = self.create_widget(OWDocumentEmbedding, stored_settings=settings) + self.assertEqual(iso_lang, widget.language) + if __name__ == "__main__": unittest.main() From c6a70828cdc87d7e7c567252f627235381bf7966 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 26 Jan 2024 14:42:02 +0100 Subject: [PATCH 6/6] Sentiment - Store ISO language in settings --- .../text/widgets/owsentimentanalysis.py | 56 +++++++++---------- .../widgets/tests/test_owsentimentanalysis.py | 38 +++++++++---- 2 files changed, 53 insertions(+), 41 deletions(-) diff --git a/orangecontrib/text/widgets/owsentimentanalysis.py b/orangecontrib/text/widgets/owsentimentanalysis.py index 16efb9a49..de06f898d 100644 --- 
a/orangecontrib/text/widgets/owsentimentanalysis.py +++ b/orangecontrib/text/widgets/owsentimentanalysis.py @@ -3,12 +3,14 @@ from AnyQt.QtCore import Qt from AnyQt.QtWidgets import QGridLayout, QLabel -from Orange.widgets import gui, settings +from Orange.widgets import gui from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState from Orange.widgets.utils.signals import Input, Output from Orange.widgets.widget import OWWidget, Msg +from orangewidget.settings import Setting + from orangecontrib.text import Corpus, preprocess -from orangecontrib.text.language import ISO2LANG, LANG2ISO +from orangecontrib.text.language import LanguageModel, LANG2ISO from orangecontrib.text.sentiment import ( VaderSentiment, LiuHuSentiment, @@ -37,24 +39,16 @@ class Inputs: class Outputs: corpus = Output("Corpus", Corpus) - settings_version = 1 + settings_version = 2 want_main_area = False resizing_enabled = False - method_idx: int = settings.Setting(1) - autocommit: bool = settings.Setting(True) - liu_language: str = settings.Setting( - ISO2LANG[LiuHuSentiment.DEFAULT_LANG], schema_only=True - ) - multi_language: str = settings.Setting( - ISO2LANG[MultiSentiment.DEFAULT_LANG], schema_only=True - ) - senti_language: str = settings.Setting( - ISO2LANG[SentiArt.DEFAULT_LANG], schema_only=True - ) - lilah_language: str = settings.Setting( - ISO2LANG[LilahSentiment.DEFAULT_LANG], schema_only=True - ) + method_idx: int = Setting(1) + autocommit: bool = Setting(True) + liu_language: str = Setting(LiuHuSentiment.DEFAULT_LANG, schema_only=True) + multi_language: str = Setting(MultiSentiment.DEFAULT_LANG, schema_only=True) + senti_language: str = Setting(SentiArt.DEFAULT_LANG, schema_only=True) + lilah_language: str = Setting(LilahSentiment.DEFAULT_LANG, schema_only=True) METHODS = [ LiuHuSentiment, @@ -99,9 +93,8 @@ def __init__(self): None, self, "liu_language", - sendSelectedValue=True, contentsLength=10, - items=[ISO2LANG[lg] for lg in LiuHuSentiment.LANGUAGES], + 
model=LanguageModel(languages=LiuHuSentiment.LANGUAGES), callback=self._method_changed, ) self.vader = gui.appendRadioButton(box, "Vader", addToLayout=False) @@ -112,9 +105,8 @@ def __init__(self): None, self, "multi_language", - sendSelectedValue=True, contentsLength=10, - items=[ISO2LANG[lg] for lg in MultiSentiment.LANGUAGES], + model=LanguageModel(languages=MultiSentiment.LANGUAGES), callback=self._method_changed, ) self.senti_art = gui.appendRadioButton(box, "SentiArt", addToLayout=False) @@ -124,7 +116,7 @@ def __init__(self): "senti_language", sendSelectedValue=True, contentsLength=10, - items=[ISO2LANG[lg] for lg in SentiArt.LANGUAGES], + model=LanguageModel(languages=SentiArt.LANGUAGES), callback=self._method_changed, ) self.lilah_sent = gui.appendRadioButton( @@ -134,9 +126,8 @@ def __init__(self): None, self, "lilah_language", - sendSelectedValue=True, contentsLength=10, - items=[ISO2LANG[lg] for lg in LilahSentiment.LANGUAGES], + model=LanguageModel(languages=LilahSentiment.LANGUAGES), callback=self._method_changed, ) self.custom_list = gui.appendRadioButton( @@ -228,10 +219,10 @@ def __set_language_settings(self): for l_pending, l_setting, model in settings_: if self.pp_corpus and self.pp_corpus.language in model.LANGUAGES: - setattr(self, l_setting, ISO2LANG[self.pp_corpus.language]) + setattr(self, l_setting, self.pp_corpus.language) else: # if Corpus's language not supported use default language - setattr(self, l_setting, ISO2LANG[model.DEFAULT_LANG]) + setattr(self, l_setting, model.DEFAULT_LANG) # when workflow loaded use language saved in workflow if l_pending is not None: @@ -249,13 +240,13 @@ def _compute_sentiment(self): method = self.METHODS[self.method_idx] kwargs = {} if method.name == "Liu Hu": - kwargs = dict(language=LANG2ISO[self.liu_language]) + kwargs = dict(language=self.liu_language) elif method.name == "Multilingual Sentiment": - kwargs = dict(language=LANG2ISO[self.multi_language]) + kwargs = dict(language=self.multi_language) 
elif method.name == "SentiArt": - kwargs = dict(language=LANG2ISO[self.senti_language]) + kwargs = dict(language=self.senti_language) elif method.name == "LiLaH Sentiment": - kwargs = dict(language=LANG2ISO[self.lilah_language]) + kwargs = dict(language=self.lilah_language) elif method.name == "Custom Dictionaries": kwargs = dict(pos=self.pos_file, neg=self.neg_file) if bool(self.pos_file) != bool(self.neg_file): # xor: one of them None @@ -313,6 +304,11 @@ def migrate_settings(cls, settings, version): method_idx = settings["method_idx"] if method_idx == 4: settings["metric_idx"] = 5 + if version is None or version < 2: + s = ("liu_language", "lilah_language", "multi_language", "senti_language") + for lang_set in s: + if lang_set in settings: + settings[lang_set] = LANG2ISO[settings[lang_set]] if __name__ == '__main__': diff --git a/orangecontrib/text/widgets/tests/test_owsentimentanalysis.py b/orangecontrib/text/widgets/tests/test_owsentimentanalysis.py index 1ff2dddc0..8d51f237b 100644 --- a/orangecontrib/text/widgets/tests/test_owsentimentanalysis.py +++ b/orangecontrib/text/widgets/tests/test_owsentimentanalysis.py @@ -12,7 +12,9 @@ from orangecontrib.text import preprocess from orangecontrib.text.corpus import Corpus from orangecontrib.text.language import ISO2LANG -from orangecontrib.text.sentiment import DictionaryNotFound +from orangecontrib.text.sentiment import ( + DictionaryNotFound, LiuHuSentiment, MultiSentiment, SentiArt, LilahSentiment +) from orangecontrib.text.widgets.owsentimentanalysis import OWSentimentAnalysis MS_FILES = [ @@ -164,6 +166,20 @@ def test_migrates_settings(self): OWSentimentAnalysis.migrate_settings(settings, version=None) self.assertTrue(settings.get("method_idx", 5)) + def test_migrate_language_settings(self): + methods = ( + ("liu_language", LiuHuSentiment), + ("multi_language", MultiSentiment), + ("senti_language", SentiArt), + ("lilah_language", LilahSentiment), + ) + for setting, method in methods: + if hasattr(method, 
"LANGUAGES"): + for lang in getattr(method, "LANGUAGES"): + se = {setting: ISO2LANG[lang], "__version__": 1} + widget = self.create_widget(OWSentimentAnalysis, stored_settings=se) + self.assertEqual(lang, getattr(widget, setting)) + def test_preprocessed(self): widget = self.create_widget(OWSentimentAnalysis) corpus = self.corpus.copy() @@ -184,7 +200,7 @@ def test_language_from_corpus(self): w = self.widget settings = [ w.liu_language, - "English", + "en", w.multi_language, w.senti_language, w.lilah_language, @@ -198,7 +214,7 @@ def test_language_from_corpus(self): self.send_signal(self.widget.Inputs.corpus, self.corpus) self.widget.findChildren(QRadioButton)[i].click() self.assertIsNotNone(self.get_output(self.widget.Outputs.corpus)) - self.assertEqual(ISO2LANG[s], sett) + self.assertEqual(s, sett) # try with unsupported language - use default language istead self.corpus.attributes["language"] = ns @@ -213,18 +229,18 @@ def test_language_from_settings(self): simulate.combobox_activate_item(self.widget.senti_box, "German") simulate.combobox_activate_item(self.widget.lilah_box, "Croatian") - self.assertEqual("Slovenian", self.widget.liu_language) - self.assertEqual("Spanish", self.widget.multi_language) - self.assertEqual("German", self.widget.senti_language) - self.assertEqual("Croatian", self.widget.lilah_language) + self.assertEqual("sl", self.widget.liu_language) + self.assertEqual("es", self.widget.multi_language) + self.assertEqual("de", self.widget.senti_language) + self.assertEqual("hr", self.widget.lilah_language) settings = self.widget.settingsHandler.pack_data(self.widget) widget = self.create_widget(OWSentimentAnalysis, stored_settings=settings) self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget) - self.assertEqual("Slovenian", widget.liu_language) - self.assertEqual("Spanish", widget.multi_language) - self.assertEqual("German", widget.senti_language) - self.assertEqual("Croatian", widget.lilah_language) + self.assertEqual("sl", 
widget.liu_language) + self.assertEqual("es", widget.multi_language) + self.assertEqual("de", widget.senti_language) + self.assertEqual("hr", widget.lilah_language) def test_dictionary_offline(self): """Test case when offline and dictionary not found locally"""