From d9b12b68680e475d4834d1dd927d80022d36bc02 Mon Sep 17 00:00:00 2001
From: Ajda
Date: Wed, 10 Jul 2024 15:12:11 +0200
Subject: [PATCH 1/2] Read stopwords list in utf-8-sig

---
 orangecontrib/text/preprocess/filter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index ae8ccfc77..e08700d3b 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -58,7 +58,7 @@ def from_file(path):
         if not path:
             return set()
 
-        for encoding in ('utf-8', None, detect_encoding(path)):
+        for encoding in ('utf-8-sig', None, detect_encoding(path)):
             try:
                 with open(path, encoding=encoding) as f:
                     return set(line.strip() for line in f)

From 52cb0e4823fa946dd8d4dfdb87c782107d50c3d1 Mon Sep 17 00:00:00 2001
From: Ajda
Date: Wed, 10 Jul 2024 15:12:30 +0200
Subject: [PATCH 2/2] Test BOM removal in custom stopwords

---
 orangecontrib/text/tests/test_preprocess.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index 0a91a49aa..8f63e2f36 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -501,6 +501,24 @@ def test_lang_to_iso(self):
         self.assertEqual("en", StopwordsFilter.lang_to_iso("English"))
         self.assertEqual("sl", StopwordsFilter.lang_to_iso("Slovene"))
 
+    def test_custom_list(self):
+        f = tempfile.NamedTemporaryFile("w", delete=False,
+                                        encoding='utf-8-sig')
+        # test if BOM removed
+        f.write('human\n')
+        f.write('user\n')
+        f.flush()
+        f.close()
+        stopwords = preprocess.StopwordsFilter(None, f.name)
+        self.assertIn('human', stopwords._lexicon)
+        self.assertIn('user', stopwords._lexicon)
+        with self.corpus.unlocked():
+            self.corpus.metas[0, 0] = 'human user baz'
+        processed = stopwords(self.corpus)
+        self.assertEqual(["baz"], processed.tokens[0])
+        f.close()
+        os.unlink(f.name)
+
     def test_lexicon(self):
         f = tempfile.NamedTemporaryFile(delete=False)
         f.write(b'filter\n')