diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py
index 07f85f761..cf58a5b5b 100644
--- a/orangecontrib/text/preprocess/normalize.py
+++ b/orangecontrib/text/preprocess/normalize.py
@@ -1,4 +1,4 @@
-from typing import List, Callable
+from typing import List, Callable, Dict, Tuple, Optional
 import os
 import ufal.udpipe as udpipe
 from lemmagen3 import Lemmatizer
@@ -84,55 +84,109 @@ def __init__(self, language='en'):
         self.normalizer = stem.SnowballStemmer(ISO2LANG[language].lower()).stem
 
 
-def language_to_name(language):
-    return language.lower().replace(' ', '') + 'ud'
-
-
-def file_to_name(file):
-    return file.replace('-', '').replace('_', '')
-
-
-def file_to_language(file):
-    return file[:file.find('ud') - 1] \
-        .replace('-', ' ').replace('_', ' ').capitalize()
-
-
 class UDPipeModels:
     server_url = "https://file.biolab.si/files/udpipe/"
 
+    # some languages differ between udpipe and iso standard
+    UDPIPE2LANG = {"Norwegian Bokmaal": "Norwegian Bokmål"}
+
     def __init__(self):
         self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/')
         self.serverfiles = serverfiles.ServerFiles(self.server_url)
         self.localfiles = serverfiles.LocalFiles(self.local_data,
                                                  serverfiles=self.serverfiles)
 
-    def __getitem__(self, language):
-        file_name = self._find_file(language_to_name(language))
+    def __getitem__(self, language: str) -> str:
+        file_name = self._find_file(language)
         return self.localfiles.localpath_download(file_name)
 
     @property
-    def model_files(self):
+    def model_files(self) -> Dict[str, Tuple[str, str]]:
         try:
-            return self.serverfiles.listfiles()
+            files = self.serverfiles.listfiles()
         except ConnectionError:
-            return self.localfiles.listfiles()
+            files = self.localfiles.listfiles()
+        return self.__files_to_dict(files)
+
+    def _find_file(self, language: str) -> str:
+        return self.model_files[language][1]
 
-    def _find_file(self, language):
-        return next(filter(lambda f: file_to_name(f).startswith(language),
-                           map(lambda f: f[0], self.model_files)))
+    def __files_to_dict(self, files: List[Tuple[str]]) -> Dict[str, Tuple[str, str]]:
+        iso2lang = {}
+        for f in files:
+            language, iso = self.__file_to_language(f[0])
+            iso2lang[iso] = (language, f[0])
+        return iso2lang
 
     @property
-    def supported_languages(self):
-        return list(map(lambda f: file_to_language(f[0]), self.model_files))
+    def supported_languages(self) -> List[Tuple[str, str]]:
+        return [(name, iso) for iso, (name, _) in self.model_files.items()]
 
     @property
-    def online(self):
+    def online(self) -> bool:
         try:
             self.serverfiles.listfiles()
             return True
         except ConnectionError:
             return False
 
+    def __file_to_language(self, file: str) -> Tuple[str, str]:
+        """
+        Transform a model file name into a language name and an ISO code.
+        The language name has the format "Language (Model)".
+        The ISO code is the real ISO code with the model variation appended,
+        for example "en_lines" for the lines English model.
+        """
+        # language and potential model variation are delimited with -
+        name_split = file[: file.find("ud") - 1].split("-")
+        # capitalize multi-word languages separated by _
+        lg = name_split[0].replace("_", " ").title()
+        # fix wrong spelling for Norwegian Bokmål
+        lg = self.UDPIPE2LANG.get(lg, lg)
+
+        if len(name_split) > 1:
+            # languages with multiple models have the model name as the second item
+            return f"{lg} ({name_split[1]})", self.__lang2iso(lg, name_split[1])
+        return lg, self.__lang2iso(lg, None)
+
+    @staticmethod
+    def __lang2iso(language: str, model: Optional[str]) -> str:
+        language = [LANG2ISO[language]]
+        if model:
+            language.append(model)
+        return "_".join(language)
+
+    def language_to_iso(self, language: str) -> Optional[str]:
+        """Migrate an old widget language setting to its ISO code."""
+        # The UDPipe language format changed when migrating from language names
+        # to ISO codes: previously the second word of two-word languages started
+        # with a lowercase letter, and different models of the same language were
+        # separated from it by a space; now the model is written in parentheses.
+        migration = {
+            "Ancient greek proiel": "Ancient Greek (proiel)",
+            "Ancient greek": "Ancient Greek",
+            "Czech cac": "Czech (cac)",
+            "Czech cltt": "Czech (cltt)",
+            "Dutch lassysmall": "Dutch (lassysmall)",
+            "English lines": "English (lines)",
+            "English partut": "English (partut)",
+            "Finnish ftb": "Finnish (ftb)",
+            "French partut": "French (partut)",
+            "French sequoia": "French (sequoia)",
+            "Galician treegal": "Galician (treegal)",
+            "Latin ittb": "Latin (ittb)",
+            "Latin proiel": "Latin (proiel)",
+            "Norwegian bokmaal": "Norwegian Bokmål",
+            "Norwegian nynorsk": "Norwegian Nynorsk",
+            "Old church slavonic": "Old Church Slavonic",
+            "Portuguese br": "Portuguese (br)",
+            "Russian syntagrus": "Russian (syntagrus)",
+            "Slovenian sst": "Slovenian (sst)",
+            "Spanish ancora": "Spanish (ancora)",
+            "Swedish lines": "Swedish (lines)",
+        }
+        return dict(self.supported_languages).get(migration.get(language, language))
+
 
 class UDPipeStopIteration(StopIteration):
     pass
@@ -141,7 +195,7 @@ class UDPipeStopIteration(StopIteration):
     pass
 
 
 class UDPipeLemmatizer(BaseNormalizer):
     name = 'UDPipe Lemmatizer'
 
-    def __init__(self, language='English', use_tokenizer=False):
+    def __init__(self, language="en", use_tokenizer=False):
         super().__init__()
         self.__language = language
         self.__use_tokenizer = use_tokenizer
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index 516c2627c..0a91a49aa 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -20,12 +20,7 @@
     PreprocessorList,
     StopwordsFilter,
 )
-from orangecontrib.text.preprocess.normalize import (
-    file_to_language,
-    file_to_name,
-    language_to_name,
-    UDPipeModels,
-)
+from orangecontrib.text.preprocess.normalize import UDPipeModels
 
 SF_LIST = "orangecontrib.text.preprocess.normalize.serverfiles.ServerFiles.listfiles"
 
@@ -270,7 +265,7 @@ def test_call_word_net(self):
         self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
 
     def test_call_UDPipe(self):
-        pp = preprocess.UDPipeLemmatizer(language="Lithuanian")
+        pp = preprocess.UDPipeLemmatizer(language="lt")
         self.assertFalse(self.corpus.has_tokens())
         corpus = pp(self.corpus)
         self.assertTrue(corpus.has_tokens())
@@ -304,7 +299,7 @@ def test_snowball_all_langs(self):
 
     def test_udpipe(self):
         """Test udpipe token lemmatization"""
-        normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
+        normalizer = preprocess.UDPipeLemmatizer("lt")
         with self.corpus.unlocked():
             self.corpus.metas[0, 0] = "esu"
         corpus = normalizer(self.corpus)
@@ -313,7 +308,7 @@ def test_udpipe_doc(self):
         """Test udpipe lemmatization with its own tokenization"""
-        normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
+        normalizer = preprocess.UDPipeLemmatizer("lt", True)
         with self.corpus.unlocked():
             self.corpus.metas[0, 0] = "Ant kalno dega namas"
         corpus = normalizer(self.corpus)
@@ -321,7 +316,7 @@
         self.assertEqual(len(corpus.used_preprocessor.preprocessors), 1)
 
     def test_udpipe_pickle(self):
-        normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
+        normalizer = preprocess.UDPipeLemmatizer("lt", True)
         # udpipe store model after first call - model is not picklable
         normalizer(self.corpus)
         loaded = pickle.loads(pickle.dumps(normalizer))
@@ -336,7 +331,7 @@
         )
 
     def test_udpipe_deepcopy(self):
-        normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
+        normalizer = preprocess.UDPipeLemmatizer("lt", True)
         copied = copy.deepcopy(normalizer)
         self.assertEqual(normalizer._UDPipeLemmatizer__language,
                          copied._UDPipeLemmatizer__language)
@@ -370,7 +365,7 @@ def test_normalizers_picklable(self):
         for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}:
             normalizer = getattr(preprocess.normalize, nm)
             normalizer = (
-                normalizer(language="Lithuanian")
+                normalizer(language="lt")
                 if normalizer is preprocess.UDPipeLemmatizer
                 else normalizer()
             )
@@ -379,7 +374,7 @@
             loaded(self.corpus)
 
     def test_cache(self):
-        normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
+        normalizer = preprocess.UDPipeLemmatizer("lt")
         with self.corpus.unlocked():
             self.corpus.metas[0, 0] = "esu"
         normalizer(self.corpus)
@@ -391,25 +386,38 @@
         self.assertEqual(0, len(loaded_normalizer._normalization_cache))
 
 
+class TokenNormalizerNotPatched(unittest.TestCase):
+    def setUp(self):
+        self.corpus = Corpus.from_file('deerwester')
+
+    @unittest.skip("Slow tests")
+    def test_udpipe_all_langs(self):
+        for _, language in UDPipeModels().supported_languages:
+            normalizer = preprocess.UDPipeLemmatizer(language)
+            tokens = normalizer(self.corpus).tokens
+            self.assertEqual(len(self.corpus), len(tokens))
+            self.assertTrue(all(tokens))
+
+
 @patch(SF_LIST, return_value=SERVER_FILES)
 class UDPipeModelsTests(unittest.TestCase):
     def test_label_transform(self, _):
         """Test helper functions for label transformation"""
-        self.assertEqual(file_to_language('slovenian-sst-ud-2.0-170801.udpipe'),
-                         'Slovenian sst')
-        self.assertEqual(file_to_name('slovenian-sst-ud-2.0-170801.udpipe'),
-                         'sloveniansstud2.0170801.udpipe')
-        self.assertEqual(language_to_name('Slovenian sst'), 'sloveniansstud')
+        fun = UDPipeModels()._UDPipeModels__file_to_language
+        res = fun("slovenian-sst-ud-2.0-170801.udpipe")
+        self.assertTupleEqual(res, ("Slovenian (sst)", "sl_sst"))
+        res = fun("norwegian_bokmaal-sst-ud-2.0-170801.udpipe")
+        self.assertTupleEqual(res, ("Norwegian Bokmål (sst)", "nb_sst"))
 
     @patch(SF_DOWNLOAD, download_patch)
     def test_udpipe_model(self, _):
         """Test udpipe models loading from server"""
         models = UDPipeModels()
-        self.assertIn("Lithuanian", models.supported_languages)
+        self.assertIn(('Lithuanian', 'lt'), models.supported_languages)
         self.assertEqual(7, len(models.supported_languages))
 
         local_file = os.path.join(models.local_data,
                                   "lithuanian-ud-2.0-170801.udpipe")
-        model = models["Lithuanian"]
+        model = models["lt"]
         self.assertEqual(model, local_file)
         self.assertTrue(os.path.isfile(local_file))
 
@@ -419,10 +427,11 @@ def test_udpipe_local_models(self, sf_mock):
         models = UDPipeModels()
         [models.localfiles.remove(f[0]) for f in models.localfiles.listfiles()]
         # use Uyghur, it is the smallest model, we can have it in the repository
-        _ = models["Lithuanian"]
+        _ = models["lt"]
         sf_mock.side_effect = ConnectionError()
-        self.assertIn("Lithuanian", UDPipeModels().supported_languages)
-        self.assertEqual(1, len(UDPipeModels().supported_languages))
+        exp = {"lt": ('Lithuanian', 'lithuanian-ud-2.0-170801.udpipe')}
+        self.assertDictEqual(exp, models.model_files)
+        self.assertListEqual([('Lithuanian', 'lt')], models.supported_languages)
 
     def test_udpipe_offline(self, sf_mock):
         """Test if UDPipe works offline"""
@@ -430,6 +439,11 @@
         sf_mock.side_effect = ConnectionError()
         self.assertFalse(UDPipeModels().online)
 
+    def test_language_to_iso(self, _):
+        models = UDPipeModels()
+        self.assertEqual("en", models.language_to_iso("English"))
+        self.assertEqual("en_lines", models.language_to_iso("English (lines)"))
+
 
 class FilteringTests(unittest.TestCase):
     def setUp(self):
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index 7be2eed77..8f939612b 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -27,7 +27,7 @@
 from orangecontrib.text.language import ISO2LANG, LANG2ISO
 from orangecontrib.text.misc import nltk_data_dir
 from orangecontrib.text.preprocess import *
-from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
+from orangecontrib.text.preprocess.normalize import UDPipeStopIteration, UDPipeModels
 from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger, \
     POSTagger
 
@@ -87,19 +87,20 @@ def __init__(
             Boxs initial value (as an ISO code).
""" super().__init__(parent) - self.setMinimumWidth(80) - self.__add_items(items, include_none) - self.set_current_language(value) - self.currentIndexChanged.connect(self.__index_changed) self.callback = callback + self.setMinimumWidth(80) + items = [(ISO2LANG[itm], itm) for itm in items] + self.add_items(items, include_none, value) + self.currentIndexChanged.connect(self.index_changed) - def __add_items(self, items: Iterable[str], include_non: bool): - if include_non: + def add_items(self, items: Iterable[Tuple[str, str]], include_none: bool, language: str): + if include_none: self.addItem(_DEFAULT_NONE, None) - for itm in sorted(items, key=ISO2LANG.get): - self.addItem(ISO2LANG[itm], itm) + for itm in sorted(items): + self.addItem(*itm) + self.set_current_language(language) - def __index_changed(self, index: QModelIndex): + def index_changed(self, index: QModelIndex): self.callback(self.itemData(index)) def set_current_language(self, iso_language: Optional[str]): @@ -115,34 +116,35 @@ def set_current_language(self, iso_language: Optional[str]): self.setCurrentIndex(index) -class UDPipeComboBox(QComboBox): - def __init__(self, master: BaseEditor, value: str, default: str, - callback: Callable): - super().__init__(master) - self.__items = [] # type: List +class UDPipeComboBox(LanguageComboBox): + def __init__( + self, master: BaseEditor, value: str, default: str, callback: Callable + ): + self.__items: List = [] self.__default_lang = default - self.add_items(value) - self.currentTextChanged.connect(callback) - self.setMinimumWidth(80) + super().__init__(master, [], value, False, callback) @property def items(self) -> List: - return UDPipeLemmatizer().models.supported_languages + return UDPipeModels().supported_languages - def add_items(self, value: str): + def add_items(self, _, include_none: bool, language: str): self.__items = self.items - self.addItems(self.__items) - if value in self.__items: - self.setCurrentText(value) - elif self.__default_lang in self.__items: - self.setCurrentText(self.__default_lang) + super().add_items(self.__items, include_none, language) + + def set_current_language(self, iso_language: Optional[str]): + iso_items = {iso for _, iso in self.__items} + if iso_language in iso_items: + super().set_current_language(iso_language) + elif self.__default_lang in iso_items: + super().set_current_language(self.__default_lang) elif self.__items: self.setCurrentIndex(0) def showPopup(self): if self.__items != self.items: self.clear() - self.add_items(self.currentText()) + self.add_items(None, False, self.itemData(self.currentIndex())) super().showPopup() @@ -475,14 +477,13 @@ class NormalizationModule(SingleMethodModule): UDPipe: UDPipeLemmatizer, Lemmagen: LemmagenLemmatizer} DEFAULT_METHOD = Porter - DEFAULT_UDPIPE_LANG = "English" # todo: remove when udpipe use iso DEFAULT_LANGUAGE = "en" DEFAULT_USE_TOKE = False def __init__(self, parent=None, **kwargs): super().__init__(parent, **kwargs) self.__snowball_lang = self.DEFAULT_LANGUAGE - self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG + self.__udpipe_lang = self.DEFAULT_LANGUAGE self.__lemmagen_lang = self.DEFAULT_LANGUAGE self.__use_tokenizer = self.DEFAULT_USE_TOKE @@ -494,7 +495,7 @@ def __init__(self, parent=None, **kwargs): self.__set_snowball_lang ) self.__combo_udl = UDPipeComboBox( - self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang + self, self.__udpipe_lang, self.DEFAULT_LANGUAGE, self.__set_udpipe_lang ) self.__check_use = QCheckBox("UDPipe tokenizer", checked=self.DEFAULT_USE_TOKE) @@ 
         super().setParameters(params)
         snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
         self.__set_snowball_lang(snowball_lang)
-        udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG)
+        udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE)
         self.__set_udpipe_lang(udpipe_lang)
         use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
         self.__set_use_tokenizer(use_tokenizer)
@@ -560,7 +561,7 @@ def __set_snowball_lang(self, language: str):
     def __set_udpipe_lang(self, language: str):
         if self.__udpipe_lang != language:
             self.__udpipe_lang = language
-            self.__combo_udl.setCurrentText(language)
+            self.__combo_udl.set_current_language(language)
             self.changed.emit()
             if self.method == self.UDPipe:
                 self.edited.emit()
@@ -593,13 +594,12 @@ def parameters(self) -> Dict:
     def createinstance(params: Dict) -> BaseNormalizer:
         method = params.get("method", NormalizationModule.DEFAULT_METHOD)
         args = {}
-        def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG
         def_lang = NormalizationModule.DEFAULT_LANGUAGE
         if method == NormalizationModule.Snowball:
             args = {"language": params.get("snowball_language", def_lang)}
         elif method == NormalizationModule.UDPipe:
             def_use = NormalizationModule.DEFAULT_USE_TOKE
-            args = {"language": params.get("udpipe_language", def_udpipe),
+            args = {"language": params.get("udpipe_language", def_lang),
                     "use_tokenizer": params.get("udpipe_tokenizer", def_use)}
         elif method == NormalizationModule.Lemmagen:
             args = {"language": params.get("lemmagen_language", def_lang)}
@@ -1395,6 +1395,9 @@ def str_into_paths(label):
         for key in ("lemmagen_language", "snowball_language"):
             if key in pp:
                 pp[key] = LANG2ISO[pp[key]]
+        up_lang = "udpipe_language"
+        if up_lang in pp:
+            pp[up_lang] = UDPipeModels().language_to_iso(pp[up_lang])
 
 
 if __name__ == "__main__":
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index 713d67b43..49ccb1229 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -23,6 +23,7 @@
     POSTaggingModule,
     LanguageComboBox,
     _DEFAULT_NONE,
+    UDPipeComboBox,
 )
 
 
@@ -127,7 +128,7 @@ def test_udpipe_offline(self):
     @patch("orangecontrib.text.preprocess.normalize.UDPipeModels.online",
            PropertyMock(return_value=False))
     @patch("orangecontrib.text.preprocess.normalize.UDPipeModels.model_files",
-           PropertyMock(return_value=[]))
+           PropertyMock(return_value={}))
     @patch("orangecontrib.text.widgets.owpreprocess.OWPreprocess.start", Mock())
     def test_udpipe_no_models(self):
         widget = self.create_widget(OWPreprocess)
@@ -201,12 +202,12 @@ def test_migrate_settings_normalize(self):
         settings = {"__version__": 1,
                     "normalizer": {"enabled": True, "method_index": 2,
                                    "snowball_language": "French",
-                                   "udpipe_language": "German",
+                                   "udpipe_language": "Portuguese",
                                    "udpipe_tokenizer": True}}
         widget = self.create_widget(OWPreprocess, stored_settings=settings)
         params = [("preprocess.normalize",
                    {"method": 2, "snowball_language": "fr",
-                    "udpipe_language": "German", "udpipe_tokenizer": True})]
+                    "udpipe_language": "pt", "udpipe_tokenizer": True})]
         self.assertEqual(widget.storedsettings["preprocessors"], params)
 
     def test_migrate_settings_filter(self):
@@ -358,6 +359,133 @@ def test_migrate_snowball_language_settings(self):
         normalize_settings = widget.storedsettings["preprocessors"][0][1]
         self.assertEqual("en", normalize_settings["snowball_language"])
 
+    def test_migrate_udpipe_language_settings(self):
+        """Test migration to iso language codes"""
+        settings = {
+            "__version__": 3,
+            "storedsettings": {
+                "preprocessors": [
+                    ("preprocess.normalize", {"udpipe_language": "Slovenian"}),
+                ]
+            },
+        }
+        widget = self.create_widget(OWPreprocess, stored_settings=settings)
+        normalize_settings = widget.storedsettings["preprocessors"][0][1]
+        self.assertEqual("sl", normalize_settings["udpipe_language"])
+
+        settings = {
+            "__version__": 3,
+            "storedsettings": {
+                "preprocessors": [
+                    ("preprocess.normalize", {"udpipe_language": "English (lines)"}),
+                ]
+            },
+        }
+        widget = self.create_widget(OWPreprocess, stored_settings=settings)
+        normalize_settings = widget.storedsettings["preprocessors"][0][1]
+        self.assertEqual("en_lines", normalize_settings["udpipe_language"])
+
+        settings = {
+            "__version__": 3,
+            "storedsettings": {
+                "preprocessors": [
+                    ("preprocess.normalize", {"udpipe_language": "Abc"}),
+                ]
+            },
+        }
+        widget = self.create_widget(OWPreprocess, stored_settings=settings)
+        normalize_settings = widget.storedsettings["preprocessors"][0][1]
+        self.assertIsNone(normalize_settings["udpipe_language"])
+
+    @unittest.skip("Very slow test")
+    def test_migrate_udpipe_language_settings_slow(self):
+        """
+        Test migration to iso language codes. To run it successfully, remove
+        the patch on the TestOWPreprocessMigrateSettings class.
+        """
+        migrations = [
+            ("Ancient greek proiel", "grc_proiel"),
+            ("Ancient greek", "grc"),
+            ("Arabic", "ar"),
+            ("Basque", "eu"),
+            ("Belarusian", "be"),
+            ("Bulgarian", "bg"),
+            ("Catalan", "ca"),
+            ("Chinese", "zh"),
+            ("Coptic", "cop"),
+            ("Croatian", "hr"),
+            ("Czech cac", "cs_cac"),
+            ("Czech cltt", "cs_cltt"),
+            ("Czech", "cs"),
+            ("Danish", "da"),
+            ("Dutch lassysmall", "nl_lassysmall"),
+            ("Dutch", "nl"),
+            ("English lines", "en_lines"),
+            ("English partut", "en_partut"),
+            ("English", "en"),
+            ("Estonian", "et"),
+            ("Finnish ftb", "fi_ftb"),
+            ("Finnish", "fi"),
+            ("French partut", "fr_partut"),
+            ("French sequoia", "fr_sequoia"),
+            ("French", "fr"),
+            ("Galician treegal", "gl_treegal"),
+            ("Galician", "gl"),
+            ("German", "de"),
+            ("Gothic", "got"),
+            ("Greek", "el"),
+            ("Hebrew", "he"),
+            ("Hindi", "hi"),
+            ("Hungarian", "hu"),
+            ("Indonesian", "id"),
+            ("Irish", "ga"),
+            ("Italian", "it"),
+            ("Japanese", "ja"),
+            ("Kazakh", "kk"),
+            ("Korean", "ko"),
+            ("Latin ittb", "la_ittb"),
+            ("Latin proiel", "la_proiel"),
+            ("Latin", "la"),
+            ("Latvian", "lv"),
+            ("Lithuanian", "lt"),
+            ("Norwegian bokmaal", "nb"),
+            ("Norwegian nynorsk", "nn"),
+            ("Old church slavonic", "cu"),
+            ("Persian", "fa"),
+            ("Polish", "pl"),
+            ("Portuguese br", "pt_br"),
+            ("Portuguese", "pt"),
+            ("Romanian", "ro"),
+            ("Russian syntagrus", "ru_syntagrus"),
+            ("Russian", "ru"),
+            ("Sanskrit", "sa"),
+            ("Slovak", "sk"),
+            ("Slovenian sst", "sl_sst"),
+            ("Slovenian", "sl"),
+            ("Spanish ancora", "es_ancora"),
+            ("Spanish", "es"),
+            ("Swedish lines", "sv_lines"),
+            ("Swedish", "sv"),
+            ("Tamil", "ta"),
+            ("Turkish", "tr"),
+            ("Ukrainian", "uk"),
+            ("Urdu", "ur"),
+            ("Uyghur", "ug"),
+            ("Vietnamese", "vi"),
+        ]
+        for old_value, new_value in migrations:
+            settings = {
+                "__version__": 3,
+                "storedsettings": {
+                    "preprocessors": [
+                        ("preprocess.normalize", {"udpipe_language": old_value}),
+                    ]
+                },
+            }
+            widget = self.create_widget(OWPreprocess, stored_settings=settings)
+            normalize_settings = widget.storedsettings["preprocessors"][0][1]
+            self.assertEqual(new_value, normalize_settings["udpipe_language"])
+
 
 class TestTransformationModule(WidgetTest):
     def setUp(self):
@@ -500,7 +628,7 @@ def test_parameters(self):
         params = {
             "method": NormalizationModule.Porter,
             "snowball_language": "en",
-            "udpipe_language": "English",
+            "udpipe_language": "en",
             "lemmagen_language": "en",
             "udpipe_tokenizer": False,
         }
@@ -510,7 +638,7 @@ def test_set_parameters(self):
         params = {
             "method": NormalizationModule.UDPipe,
             "snowball_language": "nl",
-            "udpipe_language": "Slovenian",
+            "udpipe_language": "sl",
             "lemmagen_language": "bg",
             "udpipe_tokenizer": True,
         }
@@ -549,7 +677,7 @@ def test_repr(self):
     @patch("orangecontrib.text.preprocess.normalize.UDPipeModels.online",
            PropertyMock(return_value=False))
     @patch("orangecontrib.text.preprocess.normalize.UDPipeModels.model_files",
-           PropertyMock(return_value=[]))
+           PropertyMock(return_value={}))
     def test_udpipe_no_models(self):
         editor = NormalizationModule()
         button = editor._SingleMethodModule__group.button(editor.UDPipe)
@@ -837,5 +965,41 @@ def test_change_item(self):
         mock.assert_called_once_with(None)
 
 
+@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
+class TestUDPipeComboBox(WidgetTest):
+    ITEMS = ["English", "English (lines)", "English (partut)", "Lithuanian",
+             "Portuguese", "Slovenian", "Slovenian (sst)"]
+
+    def test_basic_setup(self):
+        mock = Mock()
+        cb = UDPipeComboBox(None, "pt", "en", mock)
+        self.assertEqual(7, cb.count())
+        self.assertEqual(self.ITEMS, [cb.itemText(i) for i in range(cb.count())])
+        self.assertEqual("Portuguese", cb.currentText())
+
+    def test_set_current_language(self):
+        mock = Mock()
+        cb = UDPipeComboBox(None, "pt", "en", mock)
+        self.assertEqual("Portuguese", cb.currentText())
+        cb.set_current_language("sl")
+        self.assertEqual("Slovenian", cb.currentText())
+        cb.set_current_language("abc")  # should set to default
+        self.assertEqual("English", cb.currentText())
+        # when no default language in the dropdown set to first
+        cb.removeItem(0)
+        x = cb._UDPipeComboBox__items
+        cb._UDPipeComboBox__items = x[:3] + x[4:]
+        cb.set_current_language("abc")
+        self.assertEqual("English (lines)", cb.currentText())
+
+    def test_change_item(self):
+        mock = Mock()
+        cb = UDPipeComboBox(None, "pt", "en", mock)
+        self.assertEqual(self.ITEMS, [cb.itemText(i) for i in range(cb.count())])
+        mock.assert_not_called()
+        simulate.combobox_activate_item(cb, "Slovenian")
+        mock.assert_called_once_with("sl")
+
+
 if __name__ == "__main__":
     unittest.main()
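Illustration (not part of the patch): the file-name-to-language mapping that UDPipeModels.__file_to_language implements above can be sketched as a standalone function. LANG2ISO below is a hypothetical three-entry stand-in for the real table in orangecontrib.text.language; the expected outputs match the cases exercised in test_label_transform.

# Standalone sketch of the filename -> (language name, ISO code) transformation.
# LANG2ISO here is an assumed subset; the real mapping lives in orangecontrib.text.language.
from typing import Optional, Tuple

LANG2ISO = {"English": "en", "Slovenian": "sl", "Norwegian Bokmål": "nb"}  # assumed subset
UDPIPE2LANG = {"Norwegian Bokmaal": "Norwegian Bokmål"}  # udpipe spelling -> ISO spelling


def file_to_language_iso(file: str) -> Tuple[str, str]:
    """E.g. 'english-lines-ud-2.0-170801.udpipe' -> ('English (lines)', 'en_lines')."""
    # language and optional model variation are delimited with "-"
    name_split = file[: file.find("ud") - 1].split("-")
    # capitalize multi-word languages separated by "_"
    language = name_split[0].replace("_", " ").title()
    language = UDPIPE2LANG.get(language, language)
    # a second item, if present, is the model variation (e.g. "lines", "sst")
    model: Optional[str] = name_split[1] if len(name_split) > 1 else None
    iso = LANG2ISO[language] + (f"_{model}" if model else "")
    return (f"{language} ({model})" if model else language), iso


if __name__ == "__main__":
    print(file_to_language_iso("english-lines-ud-2.0-170801.udpipe"))        # ('English (lines)', 'en_lines')
    print(file_to_language_iso("slovenian-sst-ud-2.0-170801.udpipe"))        # ('Slovenian (sst)', 'sl_sst')
    print(file_to_language_iso("norwegian_bokmaal-ud-2.0-170801.udpipe"))    # ('Norwegian Bokmål', 'nb')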