Skip to content

Commit

Permalink
Replaced the future tense extraction to speed the extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
ThijsVroegh committed Nov 13, 2024
1 parent faba9b9 commit 7e09455
Showing 1 changed file with 33 additions and 39 deletions.
72 changes: 33 additions & 39 deletions orangecontrib/storynavigation/modules/tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from Orange.data.pandas_compat import table_to_frames
import spacy

nlp = spacy.load("nl_core_news_sm")
#nlp = spacy.load("nl_core_news_sm")

class Tagger:
"""Class to perform NLP tagging of relevant actors and actions in textual stories
Expand Down Expand Up @@ -168,15 +168,15 @@ def __parse_tagged_story(self, storyid, sentences, tagged_sentences):
for tag, span in zip(tags, spans):
story_df_row = self.__process_tag(storyid, sentence, tag, span)
if story_df_row is not None:
story_df_rows.append(story_df_row)

story_df_rows.append(story_df_row)

for index in range(len(story_df_rows)):
story_df_rows[index].append(self.__process_dutch_future_verbs(story_df_rows, story_df_rows[index]))
print(";;;;;;")
print(self.__process_dutch_future_verbs(story_df_rows, story_df_rows[index]))

for index, row in enumerate(story_df_rows):
future_verb = self.__process_dutch_future_verbs(story_df_rows, row)
row.append(future_verb)

if future_verb == "FUTURE_VB":
row[5] = "FUTURE_VB"

story_df = pd.DataFrame(story_df_rows, columns=self.complete_data_columns)
return story_df

Expand Down Expand Up @@ -238,27 +238,12 @@ def __process_english_potential_action(self, tag):
else: # Not Verb
return "-"

def __process_dutch_potential_action(self, tag, sentence):
# Reprocess the sentence into a spaCy Doc object
#TODO: This is a temporary fix. The sentence is already processed in the main function.
# sentence_doc = nlp(sentence)

# # Extract the token object corresponding to the word in 'tag'
# token = None
# for tok in sentence_doc:
# if tok.text == tag[0]: # Find the token with the same text as 'tag[0]'
# token = tok
# break

# Now that we have the correct token, check its part-of-speech and other attributes
def __process_dutch_potential_action(self, tag):

token = tag[-1]

if token and token.pos_ in ["VERB", "AUX"] and token.tag_.split('|')[0] == "WW":

# Check for future tense with respect to specific token
# if self.__process_dutch_future_verbs(sentence_doc, token) == "FUTURE_VB":
# return "FUTURE_VB"


# Classify the verb as either past or present tense
if token.tag_.startswith('WW|pv|tgw|') or token.tag_.startswith('WW|pv|conj|') or token.tag_.startswith('WW|inf|'):
return "PRES_VB"
Expand All @@ -285,6 +270,8 @@ def __process_dutch_future_verbs(self, tags, tag):
pos_value = tag[6]
tag_value = tag[7]

# Auxiliary verbs in Dutch indicating future tense
#future_auxiliary_verbs = ["zal", "zult", "zullen","ga", "gaat", "gaan"]
if lemma_value in ["zullen", "gaan"] and pos_value == "AUX":
return "FUTURE_VB"

Expand All @@ -294,33 +281,40 @@ def __process_dutch_future_verbs(self, tags, tag):
# Loop through each token in the sentence
for tok in tags:
lemma_value = tok[10]
text_value = tok[5]
pos_value = tok[6]
tag_value = tok[7]
tag_value = tok[7]
dep_value = tok[8]

# Check if the token is an auxiliary 'zullen' or 'gaan' or a conjugation of it
if lemma_value in ["zullen", "gaan"] and pos_value == "AUX":
future_verb_triggered = True
continue # Move to the next token to check for infinitives

# Check if the specific token is an infinitive verb that follows 'zullen' or 'gaan'
if future_verb_triggered and tag_value.startswith("WW|inf|"):
# Classify the infinitive as FUTURE_VB if it's the target token
if tok == tag:
return "FUTURE_VB"

# Stop checking further tokens if we encounter a verb that is not part of a "zullen" construction
#elif tok.pos_ == "VERB" and tok.tag_.split('|')[0] != "AUX" and not tok.tag_.startswith('WW|inf|'):

# Reset the future_trigger_found if another main verb is encountered before token
#elif tok.pos_ == "VERB" and tok.tag_.split('|')[0] != "AUX" and tok != token:

continue # Keep marking subsequent infinitives as FUTURE_VB

# reset the future_verb_triggered flag whenever a clause boundary is detected
# if tok.is_punct or (tok.dep_ in ["cc", "punct"] and tok.text in [",", ";"]):
# future_verb_triggered = False
if dep_value == "punct" and text_value in [";", ",", ".", ":", "?", "!"]:
future_verb_triggered = False

# if a finite verb (that isn't part of a future tense construction) appears in the
# sentence after a conjugation of "zullen" or "gaan", it signals the end of the
# future verb construction. In this case, you don't want to continue marking
# subsequent verbs or infinitives as part of a future tense construction,
# so the flag is reset to False.
elif pos_value == "VERB" and not tag_value.startswith("WW|inf|"):
future_verb_triggered = False

return "-" # if no future tense verb is detected

def __process_potential_action(self, tag, sentence):
def __process_potential_action(self, tag):
if self.lang == constants.NL:
return self.__process_dutch_potential_action(tag, sentence)
return self.__process_dutch_potential_action(tag)
elif self.lang == constants.EN:
return self.__process_english_potential_action(tag)
else:
Expand All @@ -347,7 +341,7 @@ def __process_non_noun_tag(self, storyid, sentence, tag):
"""
row = None
if self.__is_valid_token(tag):
tense_value = self.__process_potential_action(tag, sentence)
tense_value = self.__process_potential_action(tag)
row = [storyid, sentence, tag[0], tag[-1].idx, tag[-1].idx + len(tag[0]), tense_value,
tag[1], tag[2], tag[3], tag[4], tag[-1].lemma_, tag[-1].head.text, tag[-1].head.idx,
False, False, False, '-']
Expand Down

0 comments on commit 7e09455

Please sign in to comment.