Skip to content

Commit

Permalink
Merge pull request #21 from navigating-stories/dataset-level
Browse files Browse the repository at this point in the history
Dataset level start
  • Loading branch information
kodymoodley committed Jan 12, 2024
2 parents 3153cf5 + acbec06 commit ae937d6
Show file tree
Hide file tree
Showing 11 changed files with 97 additions and 80 deletions.
18 changes: 13 additions & 5 deletions orangecontrib/storynavigation/modules/actionanalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""

import sys
import os
import pandas as pd
from operator import itemgetter
import storynavigation.modules.constants as constants
Expand Down Expand Up @@ -43,11 +44,18 @@ class ActionTagger:
)

def __init__(self, model):
self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8")
self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8")
self.past_tense_verbs = self.NL_PAST_TENSE_FILE.read_text(encoding="utf-8")
self.present_tense_verbs = self.NL_PRESENT_TENSE_FILE.read_text(encoding="utf-8")
self.false_positive_verbs = self.NL_FALSE_POSITIVE_VERB_FILE.read_text(encoding="utf-8")
self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8").split(os.linesep)
self.stopwords = [item for item in self.stopwords if len(item) > 0]
self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8").split(os.linesep)
self.pronouns = [item for item in self.pronouns if len(item) > 0]

self.past_tense_verbs = self.NL_PAST_TENSE_FILE.read_text(encoding="utf-8").split(os.linesep)
self.past_tense_verbs = [item for item in self.past_tense_verbs if len(item) > 0]
self.present_tense_verbs = self.NL_PRESENT_TENSE_FILE.read_text(encoding="utf-8").split(os.linesep)
self.present_tense_verbs = [item for item in self.present_tense_verbs if len(item) > 0]
self.false_positive_verbs = self.NL_FALSE_POSITIVE_VERB_FILE.read_text(encoding="utf-8").split(os.linesep)
self.false_positive_verbs = [item for item in self.false_positive_verbs if len(item) > 0]

self.html_result = ""

# Other counts initialisation
Expand Down
59 changes: 35 additions & 24 deletions orangecontrib/storynavigation/modules/actoranalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""

import sys
import os
import pandas as pd
from operator import itemgetter
import storynavigation.modules.constants as constants
Expand All @@ -13,7 +14,6 @@
from thefuzz import fuzz
from statistics import median


if sys.version_info < (3, 9):
# importlib.resources either doesn't exist or lacks the files()
# function, so use the PyPI version:
Expand All @@ -37,9 +37,18 @@ class ActorTagger:
)

def __init__(self, model):
self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8")
self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8")
self.html_result = ""
self.stopwords = self.NL_STOPWORDS_FILE.read_text(encoding="utf-8").split(os.linesep)
self.stopwords = [item for item in self.stopwords if len(item) > 0]
self.pronouns = self.NL_PRONOUNS_FILE.read_text(encoding="utf-8").split(os.linesep)
self.pronouns = [item for item in self.pronouns if len(item) > 0]

self.story_collection = [] # list of story texts that are processed in a session
self.dataset_level_df_header = [] # column names of dataset (story collection) level dataframe
self.dataset_level_df = pd.DataFrame() # complete dataset (story collection) level dataframe
self.sentence_nlp_models = [] # nlp tagging results for each sentence
# self.sentences = [] # sentences in a specific story

self.html_result = ""

# Other counts initialisation
self.word_count = 0
Expand All @@ -62,7 +71,7 @@ def __init__(self, model):

# Index of word prominence scores for each word in story
self.word_prominence_scores = {}
self.sentence_nlp_models = []


# POS counts initialisation
self.noun_count = 0
Expand Down Expand Up @@ -252,13 +261,11 @@ def __get_custom_tags_list(self, custom_dict):
def postag_text(
self, text, nouns, subjs, custom, custom_dict, selected_prominence_metric, prominence_score_min
):
self.custom_category_frequencies = {}
self.current_row_dataset_level = []
self.story_collection.append(text)
self.current_row_dataset_level.append(self.story_collection.index(text))

# print()
# print()
# print(custom_dict)
# print()
# print()
self.custom_category_frequencies = {}

"""POS-tags story text and returns HTML string which encodes the tagged text, ready for rendering in the UI
Expand All @@ -271,18 +278,9 @@ def postag_text(
Returns:
string: HTML string representation of POS tagged text
"""
# print()
# print('text:')
# print(text)
# print()

sentences = util.preprocess_text(text)

# print('sentences:')
# print(sentences)
# print()

self.__calculate_pretagging_metrics(sentences)
# self.__calculate_pretagging_metrics(sentences)

# pos tags that the user wants to highlight
pos_tags = []
Expand All @@ -306,12 +304,13 @@ def postag_text(
html = ""

# generate and store nlp tagged models for each sentence
if self.sentence_nlp_models is None or len(self.sentence_nlp_models) == 0:
need_to_compute_nlp_models = (self.sentence_nlp_models is None or sentences is None) or (len(self.sentence_nlp_models) == 0 or len(sentences) == 0)
if need_to_compute_nlp_models:
for sentence in sentences:
tagged_sentence = self.nlp(sentence.replace("`", "").replace("'", "").replace("‘", "").replace("’", ""))
tagged_sentence = self.nlp(sentence)
self.sentence_nlp_models.append(tagged_sentence)

self.__calculate_word_type_count(sentences, self.sentence_nlp_models)
# self.__calculate_word_type_count(sentences, self.sentence_nlp_models)

# loop through model to filter out those words that need to be tagged (based on user selection and prominence score)
for sentence, tagged_sentence in zip(sentences, self.sentence_nlp_models):
Expand All @@ -333,8 +332,14 @@ def postag_text(

# identify and tag POS / NER tokens in the story text
for tag, span in zip(tags, spans):
# print()
# print('tag: ', tag)
# print()
normalised_token, is_valid_token = self.__is_valid_token(tag)
if is_valid_token:
# print()
# print('tag: ', tag)
# print()
is_subj, subj_type = self.__is_subject(tag)
if is_subj:
p_score_greater_than_min = self.__update_postagging_metrics(
Expand Down Expand Up @@ -426,6 +431,8 @@ def __is_valid_token(self, token):
"""

word = util.get_normalized_token(token)

# return word, (word not in list(self.stopwords)) and len(word) > 1
return word, (word not in self.stopwords) and len(word) > 1

def __calculate_word_type_count(self, sents, sent_models):
Expand All @@ -445,6 +452,10 @@ def __calculate_word_type_count(self, sents, sent_models):
if is_valid_token:
is_subj, subj_type = self.__is_subject(tag)
if is_subj:
if token.text.lower().strip() in ['dit', 'het', 'die']:
print()
print('wtf')
print()
if token.text.lower().strip() in self.num_occurences_as_subject:
self.num_occurences_as_subject[
token.text.lower().strip()
Expand Down
12 changes: 6 additions & 6 deletions orangecontrib/storynavigation/modules/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,12 @@ def preprocess_text(text):
return []

# # remove quotes because it affects the accuracy of POS tagging
# cleaned_sents = []
# for item in cleaned_sentences:
# item = item.replace("`", "").replace("'", "").replace("‘", "").replace("’", "")
# item = item.replace(" ", " ")
# cleaned_sents.append(item)
return cleaned_sentences
cleaned_sents = []
for item in cleaned_sentences:
item = item.replace("`", "").replace("'", "").replace("‘", "").replace("’", "")
cleaned_sents.append(item)

return cleaned_sents


def remove_span_tags(html_string):
Expand Down
31 changes: 0 additions & 31 deletions orangecontrib/storynavigation/resources/dutchstopwords.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,22 +64,15 @@ de
deden
deed
der
derde
derhalve
dertig
deze
dhr
die
dikwijls
dit
doch
doe
doen
doet
door
doorgaand
drie
duizend
dus
echter
een
Expand All @@ -88,8 +81,6 @@ eer
eerdat
eerder
eerlang
eerst
eerste
eigen
eigenlijk
elk
Expand All @@ -111,11 +102,9 @@ eveneens
evenwel
gauw
ge
gedurende
geen
gehad
gekund
geleden
gelijk
gemoeten
gemogen
Expand All @@ -131,7 +120,6 @@ hebt
hedden
heeft
heel
hen
het
hetzelfde
hier
Expand All @@ -142,7 +130,6 @@ hierna
hierom
hoe
hoewel
honderd
ieder
iedere
iemand
Expand All @@ -168,11 +155,8 @@ later
liever
lijken
lijkt
maakte
maakten
maar
mag
me
meer
meest
meestal
Expand Down Expand Up @@ -203,9 +187,7 @@ nadat
nam
namelijk
nee
neem
negen
nemen
nergens
net
niemand
Expand Down Expand Up @@ -279,8 +261,6 @@ toenmalig
tot
totdat
tussen
twee
tweede
uit
uitgezonderd
vaak
Expand All @@ -292,17 +272,11 @@ vanuit
vanwege
veel
veeleer
veertig
verder
verscheidene
verschillende
vervolgens
via
vier
vierde
vijf
vijfde
vijftig
vol
volgend
volgens
Expand All @@ -328,7 +302,6 @@ want
waren
was
wat
we
wederom
weer
weg
Expand Down Expand Up @@ -357,8 +330,6 @@ zei
zeker
zelf
zelfde
zes
zeven
zo
zoals
zodat
Expand All @@ -377,5 +348,3 @@ the
to
and
that
gaan
gaat
11 changes: 11 additions & 0 deletions orangecontrib/storynavigation/resources/false_positive_verbs.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,14 @@ de
tijd
met
en
dagen
puberjaren
liefkozend
maatschappelijk
te
ze
positie
en
ernstig
geleidelijk
corona
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ vloog binnen
gedrongen
uitgewezen
verzocht
stem
steeg af
sprong terug
wees af
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1388,8 +1388,10 @@ dichtzitten
opvatten
sacraliseren
vergallen
bevriest
argumenteren
nippen
vast
vastpakken
vastroesten
woekeren
Expand Down Expand Up @@ -3884,6 +3886,7 @@ waterskiën
opbergen
opraken
herontdekken
absorbeert
weeromkomen
Iemand tot bedaren brengen
terugwinnen
Expand Down Expand Up @@ -7583,6 +7586,7 @@ voortleven
knielde
ontlenen
smashen
aankeek
aantikken
op elkaar inwerken
groeven
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,7 +861,7 @@ def search_features_changed(self):

def display_features_changed(self):
self.display_features = self.__get_selected_rows(self.display_listbox)
self.show_docs()
# self.show_docs()

def regenerate_docs(self) -> List[str]:
self.Warning.no_feats_search.clear()
Expand Down
Loading

0 comments on commit ae937d6

Please sign in to comment.