Skip to content

Commit

Permalink
Merge pull request #1071 from ajdapretnar/bow-store-tokens
Browse files: browse the repository at this point in the history
Bag of words: Store internally processed tokens
  • Loading branch information
ajdapretnar authored Aug 29, 2024
2 parents cd05648 + d9132d6 commit ca8d903
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 1 deletion.
7 changes: 7 additions & 0 deletions orangecontrib/text/tests/test_bowvectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ def test_empty_tokens(self):

self.assertIs(corpus, bag_of_words)

def test_store_tokens(self):
    """Check that a bag-of-words transform leaves tokens stored on the result."""
    raw = Corpus.from_file('deerwester')
    # The corpus reports no tokens right after loading.
    self.assertFalse(raw.has_tokens())

    vectorized = BowVectorizer().transform(raw, copy=False)
    self.assertTrue(vectorized.has_tokens())

def test_domain(self):
vect = BowVectorizer()
corpus = Corpus.from_file('deerwester')
Expand Down
6 changes: 5 additions & 1 deletion orangecontrib/text/vectorization/bagofwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ def _transform(self, corpus, source_dict=None, callback=dummy_callback):
if len(corpus) == 0:
return corpus
temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
if not source_dict:
corpus.store_tokens(temp_corpus)
dic = corpora.Dictionary(temp_corpus, prune_at=None)
else:
dic = source_dict
if len(dic) == 0:
return corpus
callback(0.3)
Expand Down

0 comments on commit ca8d903

Please sign in to comment.