From 13f71c6d6024c4c11dbdfb9ee54f39e5ba9637d4 Mon Sep 17 00:00:00 2001 From: Ajda Date: Thu, 12 Dec 2024 12:09:54 +0100 Subject: [PATCH 1/2] Topic Modeling: handle POS tags --- orangecontrib/text/topics/topics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/orangecontrib/text/topics/topics.py b/orangecontrib/text/topics/topics.py index a96162179..33314e566 100644 --- a/orangecontrib/text/topics/topics.py +++ b/orangecontrib/text/topics/topics.py @@ -64,6 +64,7 @@ def infer_ngrams_corpus(corpus, return_dict=False): (i, attribute.name) for i, attribute in enumerate(corpus.domain.attributes) if 'bow-feature' in attribute.attributes ] + if len(bow_features) == 0: corpus = BowVectorizer().transform(corpus) bow_features = [ @@ -74,7 +75,8 @@ def infer_ngrams_corpus(corpus, return_dict=False): feature_presence = corpus.X.sum(axis=0) keep = [(i, a) for i, a in bow_features if feature_presence[0, i] > 0] # sort features by the order in the dictionary - dictionary = Dictionary(corpus.ngrams_iterator(include_postags=True), prune_at=None) + dictionary = Dictionary(corpus.ngrams_iterator(include_postags=False), + prune_at=None) idx_of_keep = np.argsort([dictionary.token2id[a] for _, a in keep]) keep = [keep[i][0] for i in idx_of_keep] result = [] From be65623de6076f63198569af8e8c73198565e0e3 Mon Sep 17 00:00:00 2001 From: Ajda Date: Thu, 12 Dec 2024 12:10:05 +0100 Subject: [PATCH 2/2] Topic Modeling: POS tag tests --- orangecontrib/text/tests/test_topic_modeling.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/orangecontrib/text/tests/test_topic_modeling.py b/orangecontrib/text/tests/test_topic_modeling.py index 94cd906d2..16b06cff5 100644 --- a/orangecontrib/text/tests/test_topic_modeling.py +++ b/orangecontrib/text/tests/test_topic_modeling.py @@ -6,6 +6,7 @@ from orangecontrib.text.topics import LdaWrapper, HdpWrapper, LsiWrapper, NmfWrapper from orangecontrib.text.corpus import Corpus from orangecontrib.text import preprocess +from orangecontrib.text.tag import AveragedPerceptronTagger class BaseTests: @@ -82,6 +83,17 @@ def test_existing_attributes(self): self.assertEqual(self.model.doc_topic.shape[1], self.model.actual_topics) + def test_pos_tags(self): + corpus = Corpus.from_file('deerwester') + pp_list = [preprocess.WordPunctTokenizer(), + AveragedPerceptronTagger(), + preprocess.PosTagFilter("NN")] + for pp in pp_list: + corpus = pp(corpus) + self.model.fit_transform(corpus) + self.assertTrue(all("_NN" in word for word in + self.model.get_top_words_by_id(0, 10)[0])) + class LDATests(unittest.TestCase, BaseTests): def setUp(self):