From 505a6931bb1675fe5ad6af963536c7b972ca91eb Mon Sep 17 00:00:00 2001
From: "kody.moodley@gmail.com" <kody.moodley@gmail.com>
Date: Wed, 10 Jan 2024 19:53:19 +0100
Subject: [PATCH 1/2] fixes bugs with tagging; updates word dictionaries; adds
 test code for Dutch sentiment transformer model

---
 .../storynavigation/modules/actionanalysis.py | 18 ++++++++---
 .../storynavigation/modules/actoranalysis.py  | 20 ++++++++++--
 .../resources/dutchstopwords.txt              | 31 -------------------
 .../resources/false_positive_verbs.txt        | 11 +++++++
 .../resources/past_tense_verbs_dutch.txt      |  1 +
 .../resources/present_tense_verbs_dutch.txt   |  4 +++
 .../widgets/OWSNActorAnalysis.py              |  2 +-
 .../widgets/OWSNNarrativeNetwork.py           | 31 ++++++++++++-------
 requirements.txt                              |  4 ++-
 requirements_dev.txt                          |  4 ++-
 10 files changed, 74 insertions(+), 52 deletions(-)

diff --git a/orangecontrib/storynavigation/modules/actionanalysis.py b/orangecontrib/storynavigation/modules/actionanalysis.py
index a31a3fb..86efa5e 100644
--- a/orangecontrib/storynavigation/modules/actionanalysis.py
+++ b/orangecontrib/storynavigation/modules/actionanalysis.py
@@ -2,6 +2,7 @@
 """
 
 import sys
+import os
 import pandas as pd
 from operator import itemgetter
 import storynavigation.modules.constants as constants
@@ -43,11 +44,18 @@ class ActionTagger:
     )
 
     def __init__(self, model):
-        self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8")
-        self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8")
-        self.past_tense_verbs = self.NL_PAST_TENSE_FILE.read_text(encoding="utf-8")
-        self.present_tense_verbs = self.NL_PRESENT_TENSE_FILE.read_text(encoding="utf-8")
-        self.false_positive_verbs = self.NL_FALSE_POSITIVE_VERB_FILE.read_text(encoding="utf-8")
+        self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8").split(os.linesep)
+        self.stopwords = [item for item in self.stopwords if len(item) > 0]
+        self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8").split(os.linesep)
+        self.pronouns = [item for item in self.pronouns if len(item) > 0]
+
+        self.past_tense_verbs = self.NL_PAST_TENSE_FILE.read_text(encoding="utf-8").split(os.linesep)
+        self.past_tense_verbs = [item for item in self.past_tense_verbs if len(item) > 0]
+        self.present_tense_verbs = self.NL_PRESENT_TENSE_FILE.read_text(encoding="utf-8").split(os.linesep)
+        self.present_tense_verbs = [item for item in self.present_tense_verbs if len(item) > 0]
+        self.false_positive_verbs = self.NL_FALSE_POSITIVE_VERB_FILE.read_text(encoding="utf-8").split(os.linesep)
+        self.false_positive_verbs = [item for item in self.false_positive_verbs if len(item) > 0]
+
         self.html_result = ""
 
         # Other counts initialisation
diff --git a/orangecontrib/storynavigation/modules/actoranalysis.py b/orangecontrib/storynavigation/modules/actoranalysis.py
index fa6b9af..7f27ee7 100644
--- a/orangecontrib/storynavigation/modules/actoranalysis.py
+++ b/orangecontrib/storynavigation/modules/actoranalysis.py
@@ -2,6 +2,7 @@
 """
 
 import sys
+import os
 import pandas as pd
 from operator import itemgetter
 import storynavigation.modules.constants as constants
@@ -37,8 +38,11 @@ class ActorTagger:
     )
 
     def __init__(self, model):
-        self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8")
-        self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8")
+        self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8").split(os.linesep)
+        self.stopwords = [item for item in self.stopwords if len(item) > 0]
+        self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8").split(os.linesep)
+        self.pronouns = [item for item in self.pronouns if len(item) > 0]
+
         self.html_result = ""
 
         # Other counts initialisation
@@ -333,8 +337,14 @@ def postag_text(
 
                 # identify and tag POS / NER tokens in the story text
                 for tag, span in zip(tags, spans):
+                    # print()
+                    # print('tag: ', tag)
+                    # print()
                     normalised_token, is_valid_token = self.__is_valid_token(tag)
                     if is_valid_token:
+                        # print()
+                        # print('tag: ', tag)
+                        # print()
                         is_subj, subj_type = self.__is_subject(tag)
                         if is_subj:
                             p_score_greater_than_min = self.__update_postagging_metrics(
@@ -426,6 +436,8 @@ def __is_valid_token(self, token):
         """
 
         word = util.get_normalized_token(token)
+
+        # return word, (word not in list(self.stopwords)) and len(word) > 1
         return word, (word not in self.stopwords) and len(word) > 1
 
     def __calculate_word_type_count(self, sents, sent_models):
@@ -445,6 +457,10 @@ def __calculate_word_type_count(self, sents, sent_models):
                 if is_valid_token:
                     is_subj, subj_type = self.__is_subject(tag)
                     if is_subj:
+                        if token.text.lower().strip() in ['dit', 'het', 'die']:
+                            print()
+                            print('wtf')
+                            print()
                         if token.text.lower().strip() in self.num_occurences_as_subject:
                             self.num_occurences_as_subject[
                                 token.text.lower().strip()
diff --git a/orangecontrib/storynavigation/resources/dutchstopwords.txt b/orangecontrib/storynavigation/resources/dutchstopwords.txt
index 5f2450c..d3419c1 100644
--- a/orangecontrib/storynavigation/resources/dutchstopwords.txt
+++ b/orangecontrib/storynavigation/resources/dutchstopwords.txt
@@ -64,9 +64,6 @@ de
 deden
 deed
 der
-derde
-derhalve
-dertig
 deze
 dhr
 die
@@ -74,12 +71,8 @@ dikwijls
 dit
 doch
 doe
-doen
-doet
 door
 doorgaand
-drie
-duizend
 dus
 echter
 een
@@ -88,8 +81,6 @@ eer
 eerdat
 eerder
 eerlang
-eerst
-eerste
 eigen
 eigenlijk
 elk
@@ -111,11 +102,9 @@ eveneens
 evenwel
 gauw
 ge
-gedurende
 geen
 gehad
 gekund
-geleden
 gelijk
 gemoeten
 gemogen
@@ -131,7 +120,6 @@ hebt
 hedden
 heeft
 heel
-hen
 het
 hetzelfde
 hier
@@ -142,7 +130,6 @@ hierna
 hierom
 hoe
 hoewel
-honderd
 ieder
 iedere
 iemand
@@ -168,11 +155,8 @@ later
 liever
 lijken
 lijkt
-maakte
-maakten
 maar
 mag
-me
 meer
 meest
 meestal
@@ -203,9 +187,7 @@ nadat
 nam
 namelijk
 nee
-neem
 negen
-nemen
 nergens
 net
 niemand
@@ -279,8 +261,6 @@ toenmalig
 tot
 totdat
 tussen
-twee
-tweede
 uit
 uitgezonderd
 vaak
@@ -292,17 +272,11 @@ vanuit
 vanwege
 veel
 veeleer
-veertig
 verder
 verscheidene
 verschillende
 vervolgens
 via
-vier
-vierde
-vijf
-vijfde
-vijftig
 vol
 volgend
 volgens
@@ -328,7 +302,6 @@ want
 waren
 was
 wat
-we
 wederom
 weer
 weg
@@ -357,8 +330,6 @@ zei
 zeker
 zelf
 zelfde
-zes
-zeven
 zo
 zoals
 zodat
@@ -377,5 +348,3 @@ the
 to
 and
 that
-gaan
-gaat
diff --git a/orangecontrib/storynavigation/resources/false_positive_verbs.txt b/orangecontrib/storynavigation/resources/false_positive_verbs.txt
index afc2a98..1044b05 100644
--- a/orangecontrib/storynavigation/resources/false_positive_verbs.txt
+++ b/orangecontrib/storynavigation/resources/false_positive_verbs.txt
@@ -9,3 +9,14 @@ de
 tijd
 met
 en
+dagen
+puberjaren
+liefkozend
+maatschappelijk
+te
+ze
+positie
+en
+ernstig
+geleidelijk
+corona
\ No newline at end of file
diff --git a/orangecontrib/storynavigation/resources/past_tense_verbs_dutch.txt b/orangecontrib/storynavigation/resources/past_tense_verbs_dutch.txt
index 661e6d7..77bad40 100644
--- a/orangecontrib/storynavigation/resources/past_tense_verbs_dutch.txt
+++ b/orangecontrib/storynavigation/resources/past_tense_verbs_dutch.txt
@@ -92,6 +92,7 @@ vloog binnen
 gedrongen
 uitgewezen
 verzocht
+stem
 steeg af
 sprong terug
 wees af
diff --git a/orangecontrib/storynavigation/resources/present_tense_verbs_dutch.txt b/orangecontrib/storynavigation/resources/present_tense_verbs_dutch.txt
index 0cea7a7..cae3cd3 100644
--- a/orangecontrib/storynavigation/resources/present_tense_verbs_dutch.txt
+++ b/orangecontrib/storynavigation/resources/present_tense_verbs_dutch.txt
@@ -1388,8 +1388,10 @@ dichtzitten
 opvatten
 sacraliseren
 vergallen
+bevriest
 argumenteren
 nippen
+vast
 vastpakken
 vastroesten
 woekeren
@@ -3884,6 +3886,7 @@ waterskiën
 opbergen
 opraken
 herontdekken
+absorbeert
 weeromkomen
 Iemand tot bedaren brengen
 terugwinnen
@@ -7583,6 +7586,7 @@ voortleven
 knielde
 ontlenen
 smashen
+aankeek
 aantikken
 op elkaar inwerken
 groeven
diff --git a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
index 4411283..626af39 100644
--- a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
+++ b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
@@ -861,7 +861,7 @@ def search_features_changed(self):
 
     def display_features_changed(self):
         self.display_features = self.__get_selected_rows(self.display_listbox)
-        self.show_docs()
+        # self.show_docs()
 
     def regenerate_docs(self) -> List[str]:
         self.Warning.no_feats_search.clear()
diff --git a/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py b/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py
index a0ebc8b..3444e8e 100644
--- a/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py
+++ b/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py
@@ -22,6 +22,10 @@
 import pandas as pd
 from textblob import TextBlob
 from textblob_nl import PatternTagger, PatternAnalyzer
+from transformers import pipeline
+
+# Load the sentiment analysis pipeline
+sentiment_analysis = pipeline("sentiment-analysis", model="DTAI-KULeuven/robbert-v2-dutch-sentiment")
 
 class OWSNNarrativeNetwork(OWWidget, ConcurrentWidgetMixin):
     name = '8) Narrative Network'
@@ -29,7 +33,7 @@ class OWSNNarrativeNetwork(OWWidget, ConcurrentWidgetMixin):
     icon = 'icons/narrative_network_icon.png'
     priority = 6430
 
-    NL_SPACY_MODEL = "nl_core_news_lg" 
+    NL_SPACY_MODEL = "nl_core_news_sm" 
 
     class Inputs:
         corpus = Input("Corpus", Corpus, replaces=["Data"])
@@ -265,7 +269,12 @@ def _generate_network(self, texts):
             txt = str(texts[i, 'content'])
             # print(len(str(txt)))
             sents = sent_tokenize(txt, language='dutch')
+            print()
+            print()
             for sent in sents:
+                result = sentiment_analysis(sent)
+                print(f"Sentence: '{sent}' | Sentiment: {result[0]['label']} | Score: {result[0]['score']:.4f}")
+
                 # blob = TextBlob(sent)
                 blob = TextBlob(sent, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
                 sentiment_scores = blob.sentiment
@@ -278,22 +287,23 @@ def _generate_network(self, texts):
                         sv_tuples, vo_tuples = self._get_tuples(tagged_sentence, token)
                         svo_tuples = self._merge_binary_tuplelsts_into_ternary_tuplelst(sv_tuples, vo_tuples)
                     elif ('N' in token.tag_.split('|')) or ('pron' in token.tag_.split('|')) or ('ik' in token.text.lower()):
-                        print('here!! ', token.text)
-                        print(token.tag_)
+                        # print('here!! ', token.text)
+                        # print(token.tag_)
                         nouns.append((token, token.idx))
                     else:
-                        print('sdasds:', token.text)
-                        print(token.tag_)
+                        print()
+                        # print('sdasds:', token.text)
+                        # print(token.tag_)
 
                 for item in svo_tuples:
                     tmp_data.append([text_id, "'" + sent + "'", item[0].lower().strip()+'_subj', item[1].lower().strip(), item[2].lower().strip()+'_obj'])
 
-                print()
-                print(nouns)
-                print()
+                # print()
+                # print(nouns)
+                # print()
                 nouns = self.sort_tuple(nouns)
-                print(nouns)
-                print()
+                # print(nouns)
+                # print()
                 if len(nouns) > 0:
                     sentiment_subject = nouns[0][0].text
                     sentiment_object = nouns[len(nouns)-1][0].text
@@ -305,7 +315,6 @@ def _generate_network(self, texts):
         # filter only for top 10 prominence scores for subjects
         # sentiment_network_tuples = self.filter_top_n_lists(sentiment_network_tuples, 7)
 
-
         print()
         print()
         print('dictionary!')
diff --git a/requirements.txt b/requirements.txt
index a0cbb36..68095c9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,6 @@ thefuzz
 beautifulsoup4
 coverage
 sphinx
-recommonmark
\ No newline at end of file
+recommonmark
+transformers
+torch
\ No newline at end of file
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 24667a0..6212eef 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -17,4 +17,6 @@ coverage
 pytest
 tox
 sphinx
-recommonmark
\ No newline at end of file
+recommonmark
+transformers
+torch
\ No newline at end of file

From acbec06866dd34fec1b023e01c0e7b7a09f62aaf Mon Sep 17 00:00:00 2001
From: "kody.moodley@gmail.com" <kody.moodley@gmail.com>
Date: Thu, 11 Jan 2024 07:33:13 +0100
Subject: [PATCH 2/2] attempts dataset-level computation; moves some
 preprocessing code to util package

---
 .../storynavigation/modules/actoranalysis.py  | 39 ++++++++-----------
 orangecontrib/storynavigation/modules/util.py | 12 +++---
 2 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/orangecontrib/storynavigation/modules/actoranalysis.py b/orangecontrib/storynavigation/modules/actoranalysis.py
index 7f27ee7..718207e 100644
--- a/orangecontrib/storynavigation/modules/actoranalysis.py
+++ b/orangecontrib/storynavigation/modules/actoranalysis.py
@@ -14,7 +14,6 @@
 from thefuzz import fuzz
 from statistics import median
 
-
 if sys.version_info < (3, 9):
     # importlib.resources either doesn't exist or lacks the files()
     # function, so use the PyPI version:
@@ -43,7 +42,13 @@ def __init__(self, model):
         self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8").split(os.linesep)
         self.pronouns = [item for item in self.pronouns if len(item) > 0]
 
-        self.html_result = ""
+        self.story_collection = []              # list of story texts that are processed in a session
+        self.dataset_level_df_header = []       # column names of dataset (story collection) level dataframe
+        self.dataset_level_df = pd.DataFrame()  # complete dataset (story collection) level dataframe
+        self.sentence_nlp_models = []           # nlp tagging results for each sentence
+        # self.sentences = []                     # sentences in a specific story
+
+        self.html_result = ""                   
 
         # Other counts initialisation
         self.word_count = 0
@@ -66,7 +71,7 @@ def __init__(self, model):
 
         # Index of word prominence scores for each word in story
         self.word_prominence_scores = {}
-        self.sentence_nlp_models = []
+        
 
         # POS counts initialisation
         self.noun_count = 0
@@ -256,13 +261,11 @@ def __get_custom_tags_list(self, custom_dict):
     def postag_text(
         self, text, nouns, subjs, custom, custom_dict, selected_prominence_metric, prominence_score_min
     ):
-        self.custom_category_frequencies = {}
+        self.current_row_dataset_level = []
+        self.story_collection.append(text)
+        self.current_row_dataset_level.append(self.story_collection.index(text))
 
-        # print()
-        # print()
-        # print(custom_dict)
-        # print()
-        # print()
+        self.custom_category_frequencies = {}
 
         """POS-tags story text and returns HTML string which encodes the the tagged text, ready for rendering in the UI
 
@@ -275,18 +278,9 @@ def postag_text(
         Returns:
             string: HTML string representation of POS tagged text
         """
-        # print()
-        # print('text:')
-        # print(text)
-        # print()
 
         sentences = util.preprocess_text(text)
-
-        # print('sentences:')
-        # print(sentences)
-        # print()
-
-        self.__calculate_pretagging_metrics(sentences)
+        # self.__calculate_pretagging_metrics(sentences)
 
         # pos tags that the user wants to highlight
         pos_tags = []
@@ -310,12 +304,13 @@ def postag_text(
         html = ""
 
         # generate and store nlp tagged models for each sentence
-        if self.sentence_nlp_models is None or len(self.sentence_nlp_models) == 0:
+        need_to_compute_nlp_models = (self.sentence_nlp_models is None or sentences is None) or (len(self.sentence_nlp_models) == 0 or len(sentences) == 0)
+        if need_to_compute_nlp_models:
             for sentence in sentences:	
-                tagged_sentence = self.nlp(sentence.replace("`", "").replace("'", "").replace("‘", "").replace("’", ""))
+                tagged_sentence = self.nlp(sentence)
                 self.sentence_nlp_models.append(tagged_sentence)
 
-            self.__calculate_word_type_count(sentences, self.sentence_nlp_models)
+            # self.__calculate_word_type_count(sentences, self.sentence_nlp_models)
 
         # loop through model to filter out those words that need to be tagged (based on user selection and prominence score)
         for sentence, tagged_sentence in zip(sentences, self.sentence_nlp_models):
diff --git a/orangecontrib/storynavigation/modules/util.py b/orangecontrib/storynavigation/modules/util.py
index d84e323..c4ead0d 100644
--- a/orangecontrib/storynavigation/modules/util.py
+++ b/orangecontrib/storynavigation/modules/util.py
@@ -129,12 +129,12 @@ def preprocess_text(text):
             return []
 
     # # remove quotes because it affects the accuracy of POS tagging
-    # cleaned_sents = []
-    # for item in cleaned_sentences:
-    #     item = item.replace("`", "").replace("'", "").replace("‘", "").replace("’", "")
-    #     item = item.replace("  ", " ")
-    #     cleaned_sents.append(item)
-    return cleaned_sentences
+    cleaned_sents = []
+    for item in cleaned_sentences:
+        item = item.replace("`", "").replace("'", "").replace("‘", "").replace("’", "")
+        cleaned_sents.append(item)
+
+    return cleaned_sents
 
 
 def remove_span_tags(html_string):