From 02f6c3c389df0ec70954247c7f82030e292a8d18 Mon Sep 17 00:00:00 2001
From: Ajda
Date: Tue, 9 Jul 2024 15:07:33 +0200
Subject: [PATCH 1/2] Bag of Words: store tokens if internally processed

---
 orangecontrib/text/vectorization/bagofwords.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/orangecontrib/text/vectorization/bagofwords.py b/orangecontrib/text/vectorization/bagofwords.py
index ed51ed2eb..f44ca2883 100644
--- a/orangecontrib/text/vectorization/bagofwords.py
+++ b/orangecontrib/text/vectorization/bagofwords.py
@@ -73,7 +73,11 @@ def _transform(self, corpus, source_dict=None, callback=dummy_callback):
         if len(corpus) == 0:
             return corpus
         temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
-        dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
+        if not source_dict:
+            corpus.store_tokens(temp_corpus)
+            dic = corpora.Dictionary(temp_corpus, prune_at=None)
+        else:
+            dic = source_dict
         if len(dic) == 0:
             return corpus
         callback(0.3)

From d9132d6870f76bb7149c19e6c117f6bd92f4c511 Mon Sep 17 00:00:00 2001
From: Ajda
Date: Tue, 9 Jul 2024 15:07:51 +0200
Subject: [PATCH 2/2] Test storing internally processed tokens

---
 orangecontrib/text/tests/test_bowvectorizer.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/orangecontrib/text/tests/test_bowvectorizer.py b/orangecontrib/text/tests/test_bowvectorizer.py
index cdc38d550..5eb0aa6f2 100644
--- a/orangecontrib/text/tests/test_bowvectorizer.py
+++ b/orangecontrib/text/tests/test_bowvectorizer.py
@@ -31,6 +31,13 @@ def test_empty_tokens(self):
 
         self.assertIs(corpus, bag_of_words)
 
+    def test_store_tokens(self):
+        corpus = Corpus.from_file('deerwester')
+        self.assertFalse(corpus.has_tokens())
+
+        bag_of_words = BowVectorizer().transform(corpus, copy=False)
+        self.assertTrue(bag_of_words.has_tokens())
+
     def test_domain(self):
         vect = BowVectorizer()
         corpus = Corpus.from_file('deerwester')