From 505a6931bb1675fe5ad6af963536c7b972ca91eb Mon Sep 17 00:00:00 2001 From: "kody.moodley@gmail.com" Date: Wed, 10 Jan 2024 19:53:19 +0100 Subject: [PATCH 1/2] fixes bugs with tagging; updates word dictionaries; adds test code for dutch sentiment transformer model --- .../storynavigation/modules/actionanalysis.py | 18 ++++++++--- .../storynavigation/modules/actoranalysis.py | 20 ++++++++++-- .../resources/dutchstopwords.txt | 31 ------------------- .../resources/false_positive_verbs.txt | 11 +++++++ .../resources/past_tense_verbs_dutch.txt | 1 + .../resources/present_tense_verbs_dutch.txt | 4 +++ .../widgets/OWSNActorAnalysis.py | 2 +- .../widgets/OWSNNarrativeNetwork.py | 31 ++++++++++++------- requirements.txt | 4 ++- requirements_dev.txt | 4 ++- 10 files changed, 74 insertions(+), 52 deletions(-) diff --git a/orangecontrib/storynavigation/modules/actionanalysis.py b/orangecontrib/storynavigation/modules/actionanalysis.py index a31a3fb..86efa5e 100644 --- a/orangecontrib/storynavigation/modules/actionanalysis.py +++ b/orangecontrib/storynavigation/modules/actionanalysis.py @@ -2,6 +2,7 @@ """ import sys +import os import pandas as pd from operator import itemgetter import storynavigation.modules.constants as constants @@ -43,11 +44,18 @@ class ActionTagger: ) def __init__(self, model): - self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8") - self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8") - self.past_tense_verbs = self.NL_PAST_TENSE_FILE.read_text(encoding="utf-8") - self.present_tense_verbs = self.NL_PRESENT_TENSE_FILE.read_text(encoding="utf-8") - self.false_positive_verbs = self.NL_FALSE_POSITIVE_VERB_FILE.read_text(encoding="utf-8") + self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8").split(os.linesep) + self.stopwords = [item for item in self.stopwords if len(item) > 0] + self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8").split(os.linesep) + self.pronouns = [item for item in self.pronouns if len(item) > 0] + + self.past_tense_verbs = self.NL_PAST_TENSE_FILE.read_text(encoding="utf-8").split(os.linesep) + self.past_tense_verbs = [item for item in self.past_tense_verbs if len(item) > 0] + self.present_tense_verbs = self.NL_PRESENT_TENSE_FILE.read_text(encoding="utf-8").split(os.linesep) + self.present_tense_verbs = [item for item in self.present_tense_verbs if len(item) > 0] + self.false_positive_verbs = self.NL_FALSE_POSITIVE_VERB_FILE.read_text(encoding="utf-8").split(os.linesep) + self.false_positive_verbs = [item for item in self.false_positive_verbs if len(item) > 0] + self.html_result = "" # Other counts initialisation diff --git a/orangecontrib/storynavigation/modules/actoranalysis.py b/orangecontrib/storynavigation/modules/actoranalysis.py index fa6b9af..7f27ee7 100644 --- a/orangecontrib/storynavigation/modules/actoranalysis.py +++ b/orangecontrib/storynavigation/modules/actoranalysis.py @@ -2,6 +2,7 @@ """ import sys +import os import pandas as pd from operator import itemgetter import storynavigation.modules.constants as constants @@ -37,8 +38,11 @@ class ActorTagger: ) def __init__(self, model): - self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8") - self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8") + self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8").split(os.linesep) + self.stopwords = [item for item in self.stopwords if len(item) > 0] + self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8").split(os.linesep) + self.pronouns = [item for item in self.pronouns if len(item) > 0] + self.html_result = "" # Other counts initialisation @@ -333,8 +337,14 @@ def postag_text( # identify and tag POS / NER tokens in the story text for tag, span in zip(tags, spans): + # print() + # print('tag: ', tag) + # print() normalised_token, is_valid_token = self.__is_valid_token(tag) if is_valid_token: + # print() + # print('tag: ', tag) + # print() is_subj, subj_type = self.__is_subject(tag) if is_subj: p_score_greater_than_min = self.__update_postagging_metrics( @@ -426,6 +436,8 @@ def __is_valid_token(self, token): """ word = util.get_normalized_token(token) + + # return word, (word not in list(self.stopwords)) and len(word) > 1 return word, (word not in self.stopwords) and len(word) > 1 def __calculate_word_type_count(self, sents, sent_models): @@ -445,6 +457,10 @@ def __calculate_word_type_count(self, sents, sent_models): if is_valid_token: is_subj, subj_type = self.__is_subject(tag) if is_subj: + if token.text.lower().strip() in ['dit', 'het', 'die']: + print() + print('wtf') + print() if token.text.lower().strip() in self.num_occurences_as_subject: self.num_occurences_as_subject[ token.text.lower().strip() diff --git a/orangecontrib/storynavigation/resources/dutchstopwords.txt b/orangecontrib/storynavigation/resources/dutchstopwords.txt index 5f2450c..d3419c1 100644 --- a/orangecontrib/storynavigation/resources/dutchstopwords.txt +++ b/orangecontrib/storynavigation/resources/dutchstopwords.txt @@ -64,9 +64,6 @@ de deden deed der -derde -derhalve -dertig deze dhr die @@ -74,12 +71,8 @@ dikwijls dit doch doe -doen -doet door doorgaand -drie -duizend dus echter een @@ -88,8 +81,6 @@ eer eerdat eerder eerlang -eerst -eerste eigen eigenlijk elk @@ -111,11 +102,9 @@ eveneens evenwel gauw ge -gedurende geen gehad gekund -geleden gelijk gemoeten gemogen @@ -131,7 +120,6 @@ hebt hedden heeft heel -hen het hetzelfde hier @@ -142,7 +130,6 @@ hierna hierom hoe hoewel -honderd ieder iedere iemand @@ -168,11 +155,8 @@ later liever lijken lijkt -maakte -maakten maar mag -me meer meest meestal @@ -203,9 +187,7 @@ nadat nam namelijk nee -neem negen -nemen nergens net niemand @@ -279,8 +261,6 @@ toenmalig tot totdat tussen -twee -tweede uit uitgezonderd vaak @@ -292,17 +272,11 @@ vanuit vanwege veel veeleer -veertig verder verscheidene verschillende vervolgens via -vier -vierde -vijf -vijfde -vijftig vol volgend volgens @@ -328,7 +302,6 @@ want waren was wat -we wederom weer weg @@ -357,8 +330,6 @@ zei zeker zelf zelfde -zes -zeven zo zoals zodat @@ -377,5 +348,3 @@ the to and that -gaan -gaat diff --git a/orangecontrib/storynavigation/resources/false_positive_verbs.txt b/orangecontrib/storynavigation/resources/false_positive_verbs.txt index afc2a98..1044b05 100644 --- a/orangecontrib/storynavigation/resources/false_positive_verbs.txt +++ b/orangecontrib/storynavigation/resources/false_positive_verbs.txt @@ -9,3 +9,14 @@ de tijd met en +dagen +puberjaren +liefkozend +maatschappelijk +te +ze +positie +en +ernstig +geleidelijk +corona \ No newline at end of file diff --git a/orangecontrib/storynavigation/resources/past_tense_verbs_dutch.txt b/orangecontrib/storynavigation/resources/past_tense_verbs_dutch.txt index 661e6d7..77bad40 100644 --- a/orangecontrib/storynavigation/resources/past_tense_verbs_dutch.txt +++ b/orangecontrib/storynavigation/resources/past_tense_verbs_dutch.txt @@ -92,6 +92,7 @@ vloog binnen gedrongen uitgewezen verzocht +stem steeg af sprong terug wees af diff --git a/orangecontrib/storynavigation/resources/present_tense_verbs_dutch.txt b/orangecontrib/storynavigation/resources/present_tense_verbs_dutch.txt index 0cea7a7..cae3cd3 100644 --- a/orangecontrib/storynavigation/resources/present_tense_verbs_dutch.txt +++ b/orangecontrib/storynavigation/resources/present_tense_verbs_dutch.txt @@ -1388,8 +1388,10 @@ dichtzitten opvatten sacraliseren vergallen +bevriest argumenteren nippen +vast vastpakken vastroesten woekeren @@ -3884,6 +3886,7 @@ waterskiën opbergen opraken herontdekken +absorbeert weeromkomen Iemand tot bedaren brengen terugwinnen @@ -7583,6 +7586,7 @@ voortleven knielde ontlenen smashen +aankeek aantikken op elkaar inwerken groeven diff --git a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py index 4411283..626af39 100644 --- a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py +++ b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py @@ -861,7 +861,7 @@ def search_features_changed(self): def display_features_changed(self): self.display_features = self.__get_selected_rows(self.display_listbox) - self.show_docs() + # self.show_docs() def regenerate_docs(self) -> List[str]: self.Warning.no_feats_search.clear() diff --git a/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py b/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py index a0ebc8b..3444e8e 100644 --- a/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py +++ b/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py @@ -22,6 +22,10 @@ import pandas as pd from textblob import TextBlob from textblob_nl import PatternTagger, PatternAnalyzer +from transformers import pipeline + +# Load the sentiment analysis pipeline +sentiment_analysis = pipeline("sentiment-analysis", model="DTAI-KULeuven/robbert-v2-dutch-sentiment") class OWSNNarrativeNetwork(OWWidget, ConcurrentWidgetMixin): name = '8) Narrative Network' @@ -29,7 +33,7 @@ class OWSNNarrativeNetwork(OWWidget, ConcurrentWidgetMixin): icon = 'icons/narrative_network_icon.png' priority = 6430 - NL_SPACY_MODEL = "nl_core_news_lg" + NL_SPACY_MODEL = "nl_core_news_sm" class Inputs: corpus = Input("Corpus", Corpus, replaces=["Data"]) @@ -265,7 +269,12 @@ def _generate_network(self, texts): txt = str(texts[i, 'content']) # print(len(str(txt))) sents = sent_tokenize(txt, language='dutch') + print() + print() for sent in sents: + result = sentiment_analysis(sent) + print(f"Sentence: '{sent}' | Sentiment: {result[0]['label']} | Score: {result[0]['score']:.4f}") + # blob = TextBlob(sent) blob = TextBlob(sent, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer()) sentiment_scores = blob.sentiment @@ -278,22 +287,23 @@ def _generate_network(self, texts): sv_tuples, vo_tuples = self._get_tuples(tagged_sentence, token) svo_tuples = self._merge_binary_tuplelsts_into_ternary_tuplelst(sv_tuples, vo_tuples) elif ('N' in token.tag_.split('|')) or ('pron' in token.tag_.split('|')) or ('ik' in token.text.lower()): - print('here!! ', token.text) - print(token.tag_) + # print('here!! ', token.text) + # print(token.tag_) nouns.append((token, token.idx)) else: - print('sdasds:', token.text) - print(token.tag_) + print() + # print('sdasds:', token.text) + # print(token.tag_) for item in svo_tuples: tmp_data.append([text_id, "'" + sent + "'", item[0].lower().strip()+'_subj', item[1].lower().strip(), item[2].lower().strip()+'_obj']) - print() - print(nouns) - print() + # print() + # print(nouns) + # print() nouns = self.sort_tuple(nouns) - print(nouns) - print() + # print(nouns) + # print() if len(nouns) > 0: sentiment_subject = nouns[0][0].text sentiment_object = nouns[len(nouns)-1][0].text @@ -305,7 +315,6 @@ def _generate_network(self, texts): # filter only for top 10 prominence scores for subjects # sentiment_network_tuples = self.filter_top_n_lists(sentiment_network_tuples, 7) - print() print() print('dictionary!') diff --git a/requirements.txt b/requirements.txt index a0cbb36..68095c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,6 @@ thefuzz beautifulsoup4 coverage sphinx -recommonmark \ No newline at end of file +recommonmark +transformers +torch \ No newline at end of file diff --git a/requirements_dev.txt b/requirements_dev.txt index 24667a0..6212eef 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -17,4 +17,6 @@ coverage pytest tox sphinx -recommonmark \ No newline at end of file +recommonmark +transformers +torch \ No newline at end of file From acbec06866dd34fec1b023e01c0e7b7a09f62aaf Mon Sep 17 00:00:00 2001 From: "kody.moodley@gmail.com" Date: Thu, 11 Jan 2024 07:33:13 +0100 Subject: [PATCH 2/2] attempts dataset-level computation; moves some preprocessing code to util package --- .../storynavigation/modules/actoranalysis.py | 39 ++++++++----------- orangecontrib/storynavigation/modules/util.py | 12 +++--- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/orangecontrib/storynavigation/modules/actoranalysis.py b/orangecontrib/storynavigation/modules/actoranalysis.py index 7f27ee7..718207e 100644 --- a/orangecontrib/storynavigation/modules/actoranalysis.py +++ b/orangecontrib/storynavigation/modules/actoranalysis.py @@ -14,7 +14,6 @@ from thefuzz import fuzz from statistics import median - if sys.version_info < (3, 9): # importlib.resources either doesn't exist or lacks the files() # function, so use the PyPI version: @@ -43,7 +42,13 @@ def __init__(self, model): self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8").split(os.linesep) self.pronouns = [item for item in self.pronouns if len(item) > 0] - self.html_result = "" + self.story_collection = [] # list of story texts that are processed in a session + self.dataset_level_df_header = [] # column names of dataset (story collection) level dataframe + self.dataset_level_df = pd.DataFrame() # complete dataset (story collection) level dataframe + self.sentence_nlp_models = [] # nlp tagging results for each sentence + # self.sentences = [] # sentences in a specific story + + self.html_result = "" # Other counts initialisation self.word_count = 0 @@ -66,7 +71,7 @@ def __init__(self, model): # Index of word prominence scores for each word in story self.word_prominence_scores = {} - self.sentence_nlp_models = [] + # POS counts initialisation self.noun_count = 0 @@ -256,13 +261,11 @@ def __get_custom_tags_list(self, custom_dict): def postag_text( self, text, nouns, subjs, custom, custom_dict, selected_prominence_metric, prominence_score_min ): - self.custom_category_frequencies = {} + self.current_row_dataset_level = [] + self.story_collection.append(text) + self.current_row_dataset_level.append(self.story_collection.index(text)) - # print() - # print() - # print(custom_dict) - # print() - # print() + self.custom_category_frequencies = {} """POS-tags story text and returns HTML string which encodes the the tagged text, ready for rendering in the UI @@ -275,18 +278,9 @@ def postag_text( Returns: string: HTML string representation of POS tagged text """ - # print() - # print('text:') - # print(text) - # print() sentences = util.preprocess_text(text) - - # print('sentences:') - # print(sentences) - # print() - - self.__calculate_pretagging_metrics(sentences) + # self.__calculate_pretagging_metrics(sentences) # pos tags that the user wants to highlight pos_tags = [] @@ -310,12 +304,13 @@ def postag_text( html = "" # generate and store nlp tagged models for each sentence - if self.sentence_nlp_models is None or len(self.sentence_nlp_models) == 0: + need_to_compute_nlp_models = (self.sentence_nlp_models is None or sentences is None) or (len(self.sentence_nlp_models) == 0 or len(sentences) == 0) + if need_to_compute_nlp_models: for sentence in sentences: - tagged_sentence = self.nlp(sentence.replace("`", "").replace("'", "").replace("‘", "").replace("’", "")) + tagged_sentence = self.nlp(sentence) self.sentence_nlp_models.append(tagged_sentence) - self.__calculate_word_type_count(sentences, self.sentence_nlp_models) + # self.__calculate_word_type_count(sentences, self.sentence_nlp_models) # loop through model to filter out those words that need to be tagged (based on user selection and prominence score) for sentence, tagged_sentence in zip(sentences, self.sentence_nlp_models): diff --git a/orangecontrib/storynavigation/modules/util.py b/orangecontrib/storynavigation/modules/util.py index d84e323..c4ead0d 100644 --- a/orangecontrib/storynavigation/modules/util.py +++ b/orangecontrib/storynavigation/modules/util.py @@ -129,12 +129,12 @@ def preprocess_text(text): return [] # # remove quotes because it affects the accuracy of POS tagging - # cleaned_sents = [] - # for item in cleaned_sentences: - # item = item.replace("`", "").replace("'", "").replace("‘", "").replace("’", "") - # item = item.replace(" ", " ") - # cleaned_sents.append(item) - return cleaned_sentences + cleaned_sents = [] + for item in cleaned_sentences: + item = item.replace("`", "").replace("'", "").replace("‘", "").replace("’", "") + cleaned_sents.append(item) + + return cleaned_sents def remove_span_tags(html_string):