Skip to content

Commit

Permalink
Preprocess
Browse files: browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 21, 2023
1 parent f058b65 commit 5e61092
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 33 deletions.
8 changes: 4 additions & 4 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,12 +393,12 @@ def test_label_transform(self, _):
model.file_to_language("slovenian-sst-ud-2.0-170801.udpipe"),
"Slovenian (sst)"
)
self.assertEqual(model.iso_to_file("sl_sst"), "slovenian-sst-ud")
self.assertEqual(model.__iso_to_file("sl_sst"), "slovenian-sst-ud")
self.assertEqual(
model.file_to_language("norwegian_bokmaal-sst-ud-2.0-170801.udpipe"),
"Norwegian Bokmål (sst)",
)
self.assertEqual(model.iso_to_file("nb_sst"), "norwegian_bokmaal-sst-ud")
self.assertEqual(model.__iso_to_file("nb_sst"), "norwegian_bokmaal-sst-ud")

@patch(SF_DOWNLOAD, download_patch)
def test_udpipe_model(self, _):
Expand Down Expand Up @@ -431,8 +431,8 @@ def test_udpipe_offline(self, sf_mock):
self.assertFalse(UDPipeModels().online)

def test_language_to_iso(self, _):
self.assertEqual("en", UDPipeModels.language_to_iso("English"))
self.assertEqual("en_lines", UDPipeModels.language_to_iso("English (lines)"))
self.assertEqual("en", UDPipeModels.lang2iso("English"))
self.assertEqual("en_lines", UDPipeModels.lang2iso("English (lines)"))

def test_iso_to_language(self, _):
self.assertEqual("English", UDPipeModels.iso_to_language("en"))
Expand Down
49 changes: 24 additions & 25 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,16 +88,17 @@ def __init__(
"""
super().__init__(parent)
self.setMinimumWidth(80)
self.__add_items(items, include_none)
self.set_current_language(value)
items = [(ISO2LANG[itm], itm) for itm in items]
self.add_items(items, include_none, value)
self.currentIndexChanged.connect(self.__index_changed)
self.callback = callback

def __add_items(self, items: Iterable[str], include_non: bool):
def add_items(self, items: Iterable[Tuple[str, str]], include_non: bool, language: str):
if include_non:
self.addItem(_DEFAULT_NONE, None)
for itm in sorted(items, key=ISO2LANG.get):
self.addItem(ISO2LANG[itm], itm)
for itm in sorted(items):
self.addItem(*itm)
self.set_current_language(language)

def __index_changed(self, index: QModelIndex):
self.callback(self.itemData(index))
Expand All @@ -115,34 +116,34 @@ def set_current_language(self, iso_language: Optional[str]):
self.setCurrentIndex(index)


class UDPipeComboBox(QComboBox):
class UDPipeComboBox(LanguageComboBox):
def __init__(self, master: BaseEditor, value: str, default: str,
callback: Callable):
super().__init__(master)
self.__items = [] # type: List
self.__items: List = []
self.__default_lang = default
self.add_items(value)
self.currentTextChanged.connect(callback)
self.setMinimumWidth(80)
super().__init__(master, [], value, False, callback)

@property
def items(self) -> List:
return UDPipeLemmatizer().models.supported_languages

def add_items(self, value: str):
def add_items(self, _, include_non: bool, language: str):
self.__items = self.items
self.addItems(self.__items)
if value in self.__items:
self.setCurrentText(value)
elif self.__default_lang in self.__items:
self.setCurrentText(self.__default_lang)
super().add_items(self.__items, include_non, language)

def set_current_language(self, iso_language: Optional[str]):
iso_items = {iso for _, iso in self.__items}
if iso_language in iso_items:
super().set_current_language(iso_language)
elif self.__default_lang in iso_items:
super().set_current_language(self.__default_lang)
elif self.__items:
self.setCurrentIndex(0)

def showPopup(self):
if self.__items != self.items:
self.clear()
self.add_items(self.currentText())
self.add_items(None, False, self.itemData(self.currentIndex()))
super().showPopup()


Expand Down Expand Up @@ -475,14 +476,13 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_UDPIPE_LANG = "English" # todo: remove when udpipe use iso
DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False

def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG
self.__udpipe_lang = self.DEFAULT_LANGUAGE
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE

Expand All @@ -494,7 +494,7 @@ def __init__(self, parent=None, **kwargs):
self.__set_snowball_lang
)
self.__combo_udl = UDPipeComboBox(
self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang
self, self.__udpipe_lang, self.DEFAULT_LANGUAGE, self.__set_udpipe_lang
)
self.__check_use = QCheckBox("UDPipe tokenizer",
checked=self.DEFAULT_USE_TOKE)
Expand Down Expand Up @@ -538,7 +538,7 @@ def setParameters(self, params: Dict):
super().setParameters(params)
snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
self.__set_snowball_lang(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE)
self.__set_udpipe_lang(udpipe_lang)
use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
self.__set_use_tokenizer(use_tokenizer)
Expand All @@ -560,7 +560,7 @@ def __set_snowball_lang(self, language: str):
def __set_udpipe_lang(self, language: str):
if self.__udpipe_lang != language:
self.__udpipe_lang = language
self.__combo_udl.setCurrentText(language)
self.__combo_udl.set_current_language(language)
self.changed.emit()
if self.method == self.UDPipe:
self.edited.emit()
Expand Down Expand Up @@ -593,13 +593,12 @@ def parameters(self) -> Dict:
def createinstance(params: Dict) -> BaseNormalizer:
method = params.get("method", NormalizationModule.DEFAULT_METHOD)
args = {}
def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG
def_lang = NormalizationModule.DEFAULT_LANGUAGE
if method == NormalizationModule.Snowball:
args = {"language": params.get("snowball_language", def_lang)}
elif method == NormalizationModule.UDPipe:
def_use = NormalizationModule.DEFAULT_USE_TOKE
args = {"language": params.get("udpipe_language", def_udpipe),
args = {"language": params.get("udpipe_language", def_lang),
"use_tokenizer": params.get("udpipe_tokenizer", def_use)}
elif method == NormalizationModule.Lemmagen:
args = {"language": params.get("lemmagen_language", def_lang)}
Expand Down
8 changes: 4 additions & 4 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def test_udpipe_offline(self):
@patch("orangecontrib.text.preprocess.normalize.UDPipeModels.online",
PropertyMock(return_value=False))
@patch("orangecontrib.text.preprocess.normalize.UDPipeModels.model_files",
PropertyMock(return_value=[]))
PropertyMock(return_value={}))
@patch("orangecontrib.text.widgets.owpreprocess.OWPreprocess.start", Mock())
def test_udpipe_no_models(self):
widget = self.create_widget(OWPreprocess)
Expand Down Expand Up @@ -500,7 +500,7 @@ def test_parameters(self):
params = {
"method": NormalizationModule.Porter,
"snowball_language": "en",
"udpipe_language": "English",
"udpipe_language": "en",
"lemmagen_language": "en",
"udpipe_tokenizer": False,
}
Expand All @@ -510,7 +510,7 @@ def test_set_parameters(self):
params = {
"method": NormalizationModule.UDPipe,
"snowball_language": "nl",
"udpipe_language": "Slovenian",
"udpipe_language": "sl",
"lemmagen_language": "bg",
"udpipe_tokenizer": True,
}
Expand Down Expand Up @@ -549,7 +549,7 @@ def test_repr(self):
@patch("orangecontrib.text.preprocess.normalize.UDPipeModels.online",
PropertyMock(return_value=False))
@patch("orangecontrib.text.preprocess.normalize.UDPipeModels.model_files",
PropertyMock(return_value=[]))
PropertyMock(return_value={}))
def test_udpipe_no_models(self):
editor = NormalizationModule()
button = editor._SingleMethodModule__group.button(editor.UDPipe)
Expand Down

0 comments on commit 5e61092

Please sign in to comment.