Skip to content

Commit

Permalink
Normalize - Use language from Corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 21, 2023
1 parent 02b1892 commit f058b65
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 46 deletions.
2 changes: 1 addition & 1 deletion orangecontrib/text/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"ga": "Irish",
"gl": "Galician",
"got": "Gothic",
"grc": "Ancient greek",
"grc": "Ancient Greek",
"gu": "Gujarati",
"he": "Hebrew",
"hi": "Hindi",
Expand Down
64 changes: 43 additions & 21 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List, Callable
import warnings
from typing import List, Callable, Optional
import os
import ufal.udpipe as udpipe
from lemmagen3 import Lemmatizer
Expand Down Expand Up @@ -84,46 +85,44 @@ def __init__(self, language='en'):
self.normalizer = stem.SnowballStemmer(ISO2LANG[language].lower()).stem


def language_to_name(language):
    """Turn a display language label into its udpipe model-name prefix.

    Spaces are removed and the result is lowercased with an ``'ud'``
    suffix, e.g. ``'Slovenian sst'`` -> ``'sloveniansstud'``.
    """
    compact = language.replace(" ", "")
    return compact.lower() + "ud"


def file_to_name(file):
    """Normalize a udpipe model file name by stripping ``-`` and ``_``.

    One C-level pass via str.translate instead of chained replace calls.
    """
    return file.translate(str.maketrans("", "", "-_"))


def file_to_language(file):
    """Recover a human-readable language label from a udpipe model file name.

    Keeps everything before the ``ud`` marker, turns ``-``/``_`` into
    spaces, and capitalizes only the first word, e.g.
    ``'slovenian-sst-ud-2.0-170801.udpipe'`` -> ``'Slovenian sst'``.
    """
    stem = file[: file.find("ud") - 1]
    spaced = stem.replace("-", " ").replace("_", " ")
    return spaced.capitalize()


class UDPipeModels:
server_url = "https://file.biolab.si/files/udpipe/"

# some languages differ between udpipe and iso standard
UDPIPE2LANG = {"Norwegian Bokmaal": "Norwegian Bokmål"}

def __init__(self):
    """Set up the local udpipe model cache backed by the remote server.

    Models live under ``<data_dir>/udpipe/``; ``localfiles`` is a cache
    over ``serverfiles`` pointed at ``server_url``.
    """
    self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/')
    self.serverfiles = serverfiles.ServerFiles(self.server_url)
    # local cache is filled on demand from the server file list
    self.localfiles = serverfiles.LocalFiles(self.local_data,
                                             serverfiles=self.serverfiles)

def __getitem__(self, language):
file_name = self._find_file(language_to_name(language))
file_name = self._find_file(language)
return self.localfiles.localpath_download(file_name)

@property
def model_files(self):
try:
return self.serverfiles.listfiles()
files = self.serverfiles.listfiles()
except ConnectionError:
return self.localfiles.listfiles()
files = self.localfiles.listfiles()
return self.__files_to_dict(files)

def _find_file(self, language):
return next(filter(lambda f: file_to_name(f).startswith(language),
map(lambda f: f[0], self.model_files)))
return self.model_files[language][1]

def __files_to_dict(self, files):
    """Index model file entries by ISO language tag.

    Returns ``{iso: (display_language_name, file_entry)}`` where
    ``file_entry`` is the raw tuple from ``listfiles()`` (file name first).
    """
    iso2lang = {}
    for f in files:
        # f[0] is the model file name, e.g. 'slovenian-sst-ud-2.0-170801.udpipe'
        language_name = self.__file_to_language(f[0])
        iso = self.__lang2iso(language_name)
        iso2lang[iso] = (language_name, f)
    return iso2lang

@property
def supported_languages(self):
return list(map(lambda f: file_to_language(f[0]), self.model_files))
return [(name, iso) for iso, (name, _) in self.model_files.items()]

@property
def online(self):
Expand All @@ -133,6 +132,29 @@ def online(self):
except ConnectionError:
return False

# use _ since - is already used in iso standard
VARIATION_DELIMITER = "_"

# TODO: improve — depends on the "Language (variation)" label format
def __lang2iso(self, language):
    """Map a display label to an ISO tag, e.g. 'English (lines)' -> 'en_lines'.

    A label without a "(variation)" suffix maps straight through LANG2ISO
    (e.g. 'English' -> 'en').
    """
    if "(" in language:
        # split off the "(variation)" suffix and append it with "_"
        language, model = language.split("(")
        language = LANG2ISO[language.strip()]
        return UDPipeModels.VARIATION_DELIMITER.join((language, model.strip(")")))
    return LANG2ISO[language]

def __file_to_language(self, file):
    """Rebuild the display label ('Language (variation)') from a model file name.

    E.g. 'slovenian-sst-ud-2.0-170801.udpipe' -> 'Slovenian (sst)' and
    'norwegian_bokmaal-sst-ud-2.0-170801.udpipe' -> 'Norwegian Bokmål (sst)'.
    """
    # keep everything before the 'ud' marker, e.g. 'slovenian-sst'
    lg = file[: file.find("ud") - 1].split("-")
    # if filename includes "-" then variation is part of the name
    lg, model_variation = lg if len(lg) == 2 else (lg[0], "")
    # capitalize multi-word languages separated by _
    lg = " ".join(map(lambda x: x.capitalize(), lg.split("_")))
    # fix wrong spelling for Norwegian Bokmål
    lg = self.UDPIPE2LANG.get(lg, lg)
    if model_variation:
        model_variation = f"({model_variation})"
    return " ".join((lg, model_variation)).strip()


class UDPipeStopIteration(StopIteration):
    """Distinct StopIteration subclass for UDPipe-specific handling."""
Expand All @@ -141,7 +163,7 @@ class UDPipeStopIteration(StopIteration):
class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'

def __init__(self, language='English', use_tokenizer=False):
def __init__(self, language="en", use_tokenizer=False):
super().__init__()
self.__language = language
self.__use_tokenizer = use_tokenizer
Expand Down
56 changes: 32 additions & 24 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,7 @@
PreprocessorList,
StopwordsFilter,
)
from orangecontrib.text.preprocess.normalize import (
file_to_language,
file_to_name,
language_to_name,
UDPipeModels,
)
from orangecontrib.text.preprocess.normalize import UDPipeModels


SF_LIST = "orangecontrib.text.preprocess.normalize.serverfiles.ServerFiles.listfiles"
Expand Down Expand Up @@ -270,7 +265,7 @@ def test_call_word_net(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_call_UDPipe(self):
pp = preprocess.UDPipeLemmatizer(language="Lithuanian")
pp = preprocess.UDPipeLemmatizer(language="lt")
self.assertFalse(self.corpus.has_tokens())
corpus = pp(self.corpus)
self.assertTrue(corpus.has_tokens())
Expand Down Expand Up @@ -304,7 +299,7 @@ def test_snowball_all_langs(self):

def test_udpipe(self):
"""Test udpipe token lemmatization"""
normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
normalizer = preprocess.UDPipeLemmatizer("lt")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "esu"
corpus = normalizer(self.corpus)
Expand All @@ -313,20 +308,19 @@ def test_udpipe(self):

def test_udpipe_doc(self):
"""Test udpipe lemmatization with its own tokenization"""
normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
normalizer = preprocess.UDPipeLemmatizer("lt", True)
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "Ant kalno dega namas"
corpus = normalizer(self.corpus)
self.assertListEqual(list(corpus.tokens[0]), ["ant", "kalno", "degas", "namas"])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 1)

def test_udpipe_pickle(self):
normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
normalizer = preprocess.UDPipeLemmatizer("lt", True)
# udpipe store model after first call - model is not picklable
normalizer(self.corpus)
loaded = pickle.loads(pickle.dumps(normalizer))
self.assertEqual(normalizer._UDPipeLemmatizer__language,
loaded._UDPipeLemmatizer__language)
self.assertEqual(normalizer._language, loaded._language)
self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer,
loaded._UDPipeLemmatizer__use_tokenizer)
with self.corpus.unlocked():
Expand All @@ -336,10 +330,9 @@ def test_udpipe_pickle(self):
)

def test_udpipe_deepcopy(self):
normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
normalizer = preprocess.UDPipeLemmatizer("lt", True)
copied = copy.deepcopy(normalizer)
self.assertEqual(normalizer._UDPipeLemmatizer__language,
copied._UDPipeLemmatizer__language)
self.assertEqual(normalizer._language, copied._language)
self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer,
copied._UDPipeLemmatizer__use_tokenizer)
with self.corpus.unlocked():
Expand Down Expand Up @@ -395,21 +388,27 @@ def test_cache(self):
class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self, _):
"""Test helper functions for label transformation"""
self.assertEqual(file_to_language('slovenian-sst-ud-2.0-170801.udpipe'),
'Slovenian sst')
self.assertEqual(file_to_name('slovenian-sst-ud-2.0-170801.udpipe'),
'sloveniansstud2.0170801.udpipe')
self.assertEqual(language_to_name('Slovenian sst'), 'sloveniansstud')
model = UDPipeModels()
self.assertEqual(
model.file_to_language("slovenian-sst-ud-2.0-170801.udpipe"),
"Slovenian (sst)"
)
self.assertEqual(model.iso_to_file("sl_sst"), "slovenian-sst-ud")
self.assertEqual(
model.file_to_language("norwegian_bokmaal-sst-ud-2.0-170801.udpipe"),
"Norwegian Bokmål (sst)",
)
self.assertEqual(model.iso_to_file("nb_sst"), "norwegian_bokmaal-sst-ud")

@patch(SF_DOWNLOAD, download_patch)
def test_udpipe_model(self, _):
"""Test udpipe models loading from server"""
models = UDPipeModels()
self.assertIn("Lithuanian", models.supported_languages)
self.assertIn("lt", models.supported_languages_iso())
self.assertEqual(7, len(models.supported_languages))

local_file = os.path.join(models.local_data, "lithuanian-ud-2.0-170801.udpipe")
model = models["Lithuanian"]
model = models["lt"]
self.assertEqual(model, local_file)
self.assertTrue(os.path.isfile(local_file))

Expand All @@ -419,17 +418,26 @@ def test_udpipe_local_models(self, sf_mock):
models = UDPipeModels()
[models.localfiles.remove(f[0]) for f in models.localfiles.listfiles()]
# NOTE(review): this comment says Uyghur, but the code below loads the Lithuanian model ("lt") — confirm which small model is intended
_ = models["Lithuanian"]
_ = models["lt"]
sf_mock.side_effect = ConnectionError()
self.assertIn("lt", UDPipeModels().supported_languages_iso())
self.assertIn("Lithuanian", UDPipeModels().supported_languages)
self.assertEqual(1, len(UDPipeModels().supported_languages))
self.assertEqual(1, len(UDPipeModels().supported_languages_iso()))

def test_udpipe_offline(self, sf_mock):
    """Test if UDPipe works offline"""
    # server reachable -> online reports True
    self.assertTrue(UDPipeModels().online)
    # simulate a dropped connection -> online reports False
    sf_mock.side_effect = ConnectionError()
    self.assertFalse(UDPipeModels().online)

def test_language_to_iso(self, _):
    """language_to_iso maps display labels (with optional variation) to ISO tags."""
    self.assertEqual("en", UDPipeModels.language_to_iso("English"))
    # a "(variation)" suffix becomes an "_"-joined tag component
    self.assertEqual("en_lines", UDPipeModels.language_to_iso("English (lines)"))

def test_iso_to_language(self, _):
    """iso_to_language is the inverse mapping: ISO tag back to display label."""
    self.assertEqual("English", UDPipeModels.iso_to_language("en"))
    # an "_" tag component is rendered back as a "(variation)" suffix
    self.assertEqual("English (lines)", UDPipeModels.iso_to_language("en_lines"))


class FilteringTests(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit f058b65

Please sign in to comment.