Skip to content

Commit

Permalink
Normalize - Use iso languages for UDPipe
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Feb 5, 2024
1 parent f519388 commit e733e96
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 50 deletions.
2 changes: 1 addition & 1 deletion orangecontrib/text/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"ga": "Irish",
"gl": "Galician",
"got": "Gothic",
"grc": "Ancient greek",
"grc": "Ancient Greek",
"gu": "Gujarati",
"he": "Hebrew",
"hi": "Hindi",
Expand Down
106 changes: 80 additions & 26 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Callable
from typing import List, Callable, Dict, Tuple, Optional
import os
import ufal.udpipe as udpipe
from lemmagen3 import Lemmatizer
Expand Down Expand Up @@ -84,55 +84,109 @@ def __init__(self, language='en'):
self.normalizer = stem.SnowballStemmer(ISO2LANG[language].lower()).stem


def language_to_name(language):
return language.lower().replace(' ', '') + 'ud'


def file_to_name(file):
return file.replace('-', '').replace('_', '')


def file_to_language(file):
return file[:file.find('ud') - 1] \
.replace('-', ' ').replace('_', ' ').capitalize()


class UDPipeModels:
server_url = "https://file.biolab.si/files/udpipe/"

# some languages differ between udpipe and iso standard
UDPIPE2LANG = {"Norwegian Bokmaal": "Norwegian Bokmål"}

def __init__(self):
self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/')
self.serverfiles = serverfiles.ServerFiles(self.server_url)
self.localfiles = serverfiles.LocalFiles(self.local_data,
serverfiles=self.serverfiles)

def __getitem__(self, language):
file_name = self._find_file(language_to_name(language))
def __getitem__(self, language: str) -> str:
file_name = self._find_file(language)
return self.localfiles.localpath_download(file_name)

@property
def model_files(self):
def model_files(self) -> Dict[str, Tuple[str, str]]:
try:
return self.serverfiles.listfiles()
files = self.serverfiles.listfiles()
except ConnectionError:
return self.localfiles.listfiles()
files = self.localfiles.listfiles()
return self.__files_to_dict(files)

def _find_file(self, language: str) -> str:
return self.model_files[language][1]

def _find_file(self, language):
return next(filter(lambda f: file_to_name(f).startswith(language),
map(lambda f: f[0], self.model_files)))
def __files_to_dict(self, files: List[Tuple[str]]) -> Dict[str, Tuple[str, str]]:
    """
    Index the listed model files by ISO code.

    The first item of every entry in ``files`` is taken as the model's
    filename. The result maps the ISO code derived from that filename to a
    ``(language name, filename)`` pair; when two files yield the same ISO
    code the later one wins.
    """
    parsed = ((entry[0], self.__file_to_language(entry[0])) for entry in files)
    return {iso: (name, file_name) for file_name, (name, iso) in parsed}

@property
def supported_languages(self):
return list(map(lambda f: file_to_language(f[0]), self.model_files))
def supported_languages(self) -> List[Tuple[str, str]]:
return [(name, iso) for iso, (name, _) in self.model_files.items()]

@property
def online(self):
def online(self) -> bool:
try:
self.serverfiles.listfiles()
return True
except ConnectionError:
return False

def __file_to_language(self, file: str) -> Tuple[str, str]:
    """
    Transform a model filename to a language name and an ISO code.

    The language name has the format "Language (model)" when the filename
    carries a model variation, and just "Language" otherwise. The ISO code
    is the language's ISO code with the model variation appended after an
    underscore, for example "slovenian-sst-ud-2.0-170801.udpipe" becomes
    ("Slovenian (sst)", "sl_sst").

    A language name that has no ISO code in LANG2ISO makes the lookup in
    __lang2iso raise KeyError.
    """
    # Cut at the "-ud-" separator instead of the first "ud" substring, so a
    # language or model name that happens to contain "ud" is not truncated.
    stem = file.partition("-ud-")[0]
    # language and potential model variation are delimited with -
    name_parts = stem.split("-")
    # capitalize every word of multi-word languages, which are separated by _
    lg = name_parts[0].replace("_", " ").title()
    # fix wrong spelling for Norwegian Bokmål
    lg = self.UDPIPE2LANG.get(lg, lg)

    if len(name_parts) > 1:
        # languages with multiple models have model name as second item in split
        model = name_parts[1]
        return f"{lg} ({model})", self.__lang2iso(lg, model)
    return lg, self.__lang2iso(lg, None)

@staticmethod
def __lang2iso(language: str, model: Optional[str]) -> str:
    """
    Build the ISO identifier of a model.

    Looks the language name up in LANG2ISO and, when a model variation is
    given, appends it after an underscore (e.g. "en" and "lines" give
    "en_lines"). Without a model the plain ISO code is returned.
    """
    iso = LANG2ISO[language]
    return f"{iso}_{model}" if model else iso

def language_to_iso(self, language: str) -> Optional[str]:
    """
    Migrate a language name from old widget settings to its ISO code.

    Old settings stored UDPipe languages as words: the second word of
    two-word languages started with a lowercase letter, and models of the
    same language were written with just a space between the language and
    the model name. Names are now capitalized per word and the model is
    written in parentheses, so old names are first translated to the new
    form and then looked up among the supported languages.

    Returns the ISO code, or None when the (translated) name is not among
    the currently supported languages.
    """
    # old-style name -> current name; names that did not change are absent
    migration = {
        "Ancient greek proiel": "Ancient Greek (proiel)",
        "Ancient greek": "Ancient Greek",
        "Czech cac": "Czech (cac)",
        "Czech cltt": "Czech (cltt)",
        "Dutch lassysmall": "Dutch (lassysmall)",
        "English lines": "English (lines)",
        "English partut": "English (partut)",
        "Finnish ftb": "Finnish (ftb)",
        "French partut": "French (partut)",
        "French sequoia": "French (sequoia)",
        "Galician treegal": "Galician (treegal)",
        "Latin ittb": "Latin (ittb)",
        "Latin proiel": "Latin (proiel)",
        "Norwegian bokmaal": "Norwegian Bokmål",
        "Norwegian nynorsk": "Norwegian Nynorsk",
        "Old church slavonic": "Old Church Slavonic",
        "Portuguese br": "Portuguese (br)",
        "Russian syntagrus": "Russian (syntagrus)",
        "Slovenian sst": "Slovenian (sst)",
        "Spanish ancora": "Spanish (ancora)",
        "Swedish lines": "Swedish (lines)",
    }
    # supported_languages yields (name, iso) pairs, so dict() maps name -> iso
    name_to_iso = dict(self.supported_languages)
    return name_to_iso.get(migration.get(language, language))


class UDPipeStopIteration(StopIteration):
pass
Expand All @@ -141,7 +195,7 @@ class UDPipeStopIteration(StopIteration):
class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'

def __init__(self, language='English', use_tokenizer=False):
def __init__(self, language="en", use_tokenizer=False):
super().__init__()
self.__language = language
self.__use_tokenizer = use_tokenizer
Expand Down
60 changes: 37 additions & 23 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,7 @@
PreprocessorList,
StopwordsFilter,
)
from orangecontrib.text.preprocess.normalize import (
file_to_language,
file_to_name,
language_to_name,
UDPipeModels,
)
from orangecontrib.text.preprocess.normalize import UDPipeModels


SF_LIST = "orangecontrib.text.preprocess.normalize.serverfiles.ServerFiles.listfiles"
Expand Down Expand Up @@ -270,7 +265,7 @@ def test_call_word_net(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_call_UDPipe(self):
pp = preprocess.UDPipeLemmatizer(language="Lithuanian")
pp = preprocess.UDPipeLemmatizer(language="lt")
self.assertFalse(self.corpus.has_tokens())
corpus = pp(self.corpus)
self.assertTrue(corpus.has_tokens())
Expand Down Expand Up @@ -304,7 +299,7 @@ def test_snowball_all_langs(self):

def test_udpipe(self):
"""Test udpipe token lemmatization"""
normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
normalizer = preprocess.UDPipeLemmatizer("lt")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "esu"
corpus = normalizer(self.corpus)
Expand All @@ -313,15 +308,15 @@ def test_udpipe(self):

def test_udpipe_doc(self):
"""Test udpipe lemmatization with its own tokenization"""
normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
normalizer = preprocess.UDPipeLemmatizer("lt", True)
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "Ant kalno dega namas"
corpus = normalizer(self.corpus)
self.assertListEqual(list(corpus.tokens[0]), ["ant", "kalno", "degas", "namas"])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 1)

def test_udpipe_pickle(self):
normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
normalizer = preprocess.UDPipeLemmatizer("lt", True)
# udpipe stores the model after the first call - the model is not picklable
normalizer(self.corpus)
loaded = pickle.loads(pickle.dumps(normalizer))
Expand All @@ -336,7 +331,7 @@ def test_udpipe_pickle(self):
)

def test_udpipe_deepcopy(self):
normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
normalizer = preprocess.UDPipeLemmatizer("lt", True)
copied = copy.deepcopy(normalizer)
self.assertEqual(normalizer._UDPipeLemmatizer__language,
copied._UDPipeLemmatizer__language)
Expand Down Expand Up @@ -370,7 +365,7 @@ def test_normalizers_picklable(self):
for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}:
normalizer = getattr(preprocess.normalize, nm)
normalizer = (
normalizer(language="Lithuanian")
normalizer(language="lt")
if normalizer is preprocess.UDPipeLemmatizer
else normalizer()
)
Expand All @@ -379,7 +374,7 @@ def test_normalizers_picklable(self):
loaded(self.corpus)

def test_cache(self):
normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
normalizer = preprocess.UDPipeLemmatizer("lt")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "esu"
normalizer(self.corpus)
Expand All @@ -391,25 +386,38 @@ def test_cache(self):
self.assertEqual(0, len(loaded_normalizer._normalization_cache))


class TokenNormalizerNotPatched(unittest.TestCase):
    """Normalizer tests that have no patch decorator applied to the class."""

    def setUp(self):
        self.corpus = Corpus.from_file('deerwester')

    @unittest.skip("Slow tests")
    def test_udpipe_all_langs(self):
        """Lemmatize the corpus with every model UDPipeModels lists."""
        # supported_languages yields (name, iso) pairs; only the ISO code is used
        for _name, iso in UDPipeModels().supported_languages:
            lemmatizer = preprocess.UDPipeLemmatizer(iso)
            result = lemmatizer(self.corpus)
            tokens = result.tokens
            self.assertEqual(len(self.corpus), len(tokens))
            self.assertTrue(all(tokens))


@patch(SF_LIST, return_value=SERVER_FILES)
class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self, _):
"""Test helper functions for label transformation"""
self.assertEqual(file_to_language('slovenian-sst-ud-2.0-170801.udpipe'),
'Slovenian sst')
self.assertEqual(file_to_name('slovenian-sst-ud-2.0-170801.udpipe'),
'sloveniansstud2.0170801.udpipe')
self.assertEqual(language_to_name('Slovenian sst'), 'sloveniansstud')
fun = UDPipeModels()._UDPipeModels__file_to_language
res = fun("slovenian-sst-ud-2.0-170801.udpipe")
self.assertTupleEqual(res, ("Slovenian (sst)", "sl_sst"))
res = fun("norwegian_bokmaal-sst-ud-2.0-170801.udpipe")
self.assertTupleEqual(res, ("Norwegian Bokmål (sst)", "nb_sst"))

@patch(SF_DOWNLOAD, download_patch)
def test_udpipe_model(self, _):
"""Test udpipe models loading from server"""
models = UDPipeModels()
self.assertIn("Lithuanian", models.supported_languages)
self.assertIn(('Lithuanian', 'lt'), models.supported_languages)
self.assertEqual(7, len(models.supported_languages))

local_file = os.path.join(models.local_data, "lithuanian-ud-2.0-170801.udpipe")
model = models["Lithuanian"]
model = models["lt"]
self.assertEqual(model, local_file)
self.assertTrue(os.path.isfile(local_file))

Expand All @@ -419,17 +427,23 @@ def test_udpipe_local_models(self, sf_mock):
models = UDPipeModels()
[models.localfiles.remove(f[0]) for f in models.localfiles.listfiles()]
# use Lithuanian, it is a small model that we can have in the repository
_ = models["Lithuanian"]
_ = models["lt"]
sf_mock.side_effect = ConnectionError()
self.assertIn("Lithuanian", UDPipeModels().supported_languages)
self.assertEqual(1, len(UDPipeModels().supported_languages))
exp = {"lt": ('Lithuanian', 'lithuanian-ud-2.0-170801.udpipe')}
self.assertDictEqual(exp, models.model_files)
self.assertListEqual([('Lithuanian', 'lt')], models.supported_languages)

def test_udpipe_offline(self, sf_mock):
"""Test if UDPipe works offline"""
self.assertTrue(UDPipeModels().online)
sf_mock.side_effect = ConnectionError()
self.assertFalse(UDPipeModels().online)

def test_language_to_iso(self, _):
models = UDPipeModels()
self.assertEqual("en", models.language_to_iso("English"))
self.assertEqual("en_lines", models.language_to_iso("English (lines)"))


class FilteringTests(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit e733e96

Please sign in to comment.