Skip to content

Commit

Permalink
fixes bug when optional custom tags dictionary is not specified
Browse files (browse the repository at this point in the history)
  • Loading branch information
kodymoodley committed Jan 17, 2024
1 parent ec8a4f0 commit b47da0e
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 152 deletions.
161 changes: 35 additions & 126 deletions orangecontrib/storynavigation/modules/actoranalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,164 +288,73 @@ def postag_text(
pos_tags.append("SP")
pos_tags.append("SNP")

print()
print()
print("pos_tags: ", pos_tags)
print()
print()
if len(pos_tags) == 0:
for sentence in sentences:
doc = {"text": sentence, "ents": []}
options = {"ents": pos_tags, "colors": constants.COLOR_MAP}
html += displacy.render(doc, style="ent", options=options, manual=True)
return html

# print('spacytags: ', story_elements_df['spacy_tag'].tolist())
# print()
# print('othertags: ', story_elements_df['story_navigator_tag'].tolist())
# print()
# story_elements_df['story_navigator_tag'] = story_elements_df['story_navigator_tag'].astype(str)
story_elements_df = story_elements_df.copy()
story_elements_df['story_navigator_tag'] = story_elements_df['story_navigator_tag'].astype(str)
story_elements_df['spacy_tag'] = story_elements_df['spacy_tag'].astype(str)

print()
print()
print("story_elements: ", story_elements_df)
print()
print()


matched_df = story_elements_df[story_elements_df['story_navigator_tag'].isin(pos_tags) | story_elements_df['spacy_tag'].isin(pos_tags)]
# print()
# print('matched_df: ', matched_df)
# print()
matched_df = matched_df.copy()
print()
print()
print("matched_df1: ", matched_df)
print()
print()

# matched_df = matched_df.copy()
matched_df['merged_tags'] = np.where(matched_df['story_navigator_tag'] == '-', matched_df['spacy_tag'], matched_df['story_navigator_tag'])
matched_df['token_start_idx'] = matched_df['token_start_idx'].astype(str)
matched_df['token_end_idx'] = matched_df['token_end_idx'].astype(str)
matched_df['displacy_tag_strings'] = matched_df['token_start_idx'] + ' | ' + matched_df['token_end_idx'] + ' | ' + matched_df['merged_tags']

order_mapping = {value: index for index, value in enumerate(sentences)}
print()
print()
print("matched_df2: ", matched_df)
print()
print()

# Create a custom sorting key function
# def custom_sort_key(value):
# return order_mapping.get(value, len(sentences))
order_mapping = {value: index for index, value in enumerate(sentences)}

for sentence in sentences:
matched_sent_df = matched_df[matched_df['sentence'] == sentence]
# Apply the custom sorting key to create a new column for sorting
matched_sent_df = matched_sent_df.copy()
matched_sent_df.loc[:, 'sorting_key'] = matched_sent_df['sentence'].map(lambda value: order_mapping.get(value, len(sentences)))

# matched_sent_df['sorting_key'] = matched_sent_df['sentence'].map(custom_sort_key)
# Sort the DataFrame based on the sorting key
matched_sent_df_sorted = matched_sent_df.sort_values(by='sorting_key').drop('sorting_key', axis=1)

ents = []
if len(matched_sent_df_sorted) > 0:
# filtered_df = story_elements_df[story_elements_df.isin(matched_sent_df.to_dict(orient='list')).all(axis=1)]
# matched_indices = filtered_df.index.tolist()
# if len(matched_indices) > 0:
displacy_tags_list = matched_sent_df_sorted['displacy_tag_strings'].tolist()
for displacy_tag in displacy_tags_list:
dtag = displacy_tag.split(' | ')
ents.append({"start": int(float(dtag[0])), "end": int(float(dtag[1])), "label": dtag[2]})

ents = util.remove_duplicate_tagged_entities(ents)
# if custom:
# if custom_dict is not None:
# custom_tag_labels = self.__get_custom_tags_list(custom_dict)
# pos_tags.extend(custom_tag_labels)
ents = util.remove_duplicate_tagged_entities(ents)

# output of this function

# print()
# print()
# print('ents: ', ents)
# print()
# print()

# loop through model to filter out those words that need to be tagged (based on user selection and prominence score)
# for sentence in sentences:
# if len(sentence.split()) > 0: # sentence has at least one word in it
# first_word_in_sent = sentence.split()[0].lower().strip()
# tags = []
# tokenizer = RegexpTokenizer(r"\w+|\$[\d\.]+|\S+")
# spans = list(tokenizer.span_tokenize(sentence))

# for token in tagged_sentence:
# tags.append((token.text, token.pos_, token.tag_, token.dep_, token))

# # identify and tag custom words in the story text
# ents = []
# if custom_dict is not None:
# custom_matched_tags = self.__find_custom_word_matches(custom_dict, sentence)
# for matched_tag in custom_matched_tags:
# ents.append(matched_tag)

# # identify and tag POS / NER tokens in the story text
# for tag, span in zip(tags, spans):
# # print()
# # print('tag: ', tag)
# # print()
# normalised_token, is_valid_token = self.__is_valid_token(tag)
# if is_valid_token:
# # print()
# # print('tag: ', tag)
# # print()
# is_subj, subj_type = self.__is_subject(tag)
# if is_subj:
# p_score_greater_than_min = self.__update_postagging_metrics(
# tag[0].lower().strip(),
# selected_prominence_metric,
# prominence_score_min,
# token,
# )
# if p_score_greater_than_min:
# if self.__is_pronoun(tag):
# ents.append(
# {"start": span[0], "end": span[1], "label": "SP"}
# )
# else:
# ents.append(
# {"start": span[0], "end": span[1], "label": "SNP"}
# )
# else:
# if self.__is_pronoun(tag):
# ents.append(
# {"start": span[0], "end": span[1], "label": "NSP"}
# )
# elif self.__is_noun_but_not_pronoun(tag):
# ents.append(
# {"start": span[0], "end": span[1], "label": "NSNP"}
# )

# if any(word == first_word_in_sent for word in self.pronouns):
# p_score_greater_than_min = self.__update_postagging_metrics(
# first_word_in_sent,
# selected_prominence_metric,
# prominence_score_min,
# token,
# )

# if p_score_greater_than_min:
# ents.append(
# {"start": 0, "end": len(first_word_in_sent), "label": "SP"}
# )

# if first_word_in_sent in self.passive_agency_scores:
# self.passive_agency_scores[first_word_in_sent] += 1
# else:
# self.passive_agency_scores[first_word_in_sent] = 1

# # if first_word_in_sent not in self.active_agency_scores:
# # self.active_agency_scores[first_word_in_sent] = 0

# # remove duplicate tags (sometimes one entity can fall under multiple tag categories.
# # to avoid duplication, only tag each entity using ONE tag category.


# specify sentences and filtered entities to tag / highlight


doc = {"text": sentence, "ents": ents}

# specify colors for highlighting each entity type
# colors = {}
# if nouns:
# colors["NSP"] = constants.NONSUBJECT_PRONOUN_HIGHLIGHT_COLOR
# colors["NSNP"] = constants.NONSUBJECT_NONPRONOUN_HIGHLIGHT_COLOR
# if subjs:
# colors["SP"] = constants.SUBJECT_PRONOUN_HIGHLIGHT_COLOR
# colors["SNP"] = constants.SUBJECT_NONPRONOUN_HIGHLIGHT_COLOR
# if custom:
# for custom_label in custom_tag_labels:
# colors[custom_label] = constants.CUSTOMTAG_HIGHLIGHT_COLOR

# self.agent_prominence_score_max = self.__get_max_prominence_score()
# collect the above config params together
options = {"ents": pos_tags, "colors": constants.COLOR_MAP}
# give all the params to displacy to generate HTML code of the text with highlighted tags
html += displacy.render(doc, style="ent", options=options, manual=True)

self.html_result = html
Expand Down
9 changes: 5 additions & 4 deletions orangecontrib/storynavigation/modules/tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,14 @@ def __init__(self, lang, text_tuples, custom_tags_and_word_column=None):
self.text_tuples = text_tuples
self.custom_tags = None
self.word_column = None
self.complete_data_columns = ['storyid', 'sentence', 'token_text', 'token_start_idx', 'token_end_idx', 'story_navigator_tag', 'spacy_tag', 'spacy_finegrained_tag', 'spacy_dependency', 'is_pronoun_boolean', 'is_sentence_subject_boolean', 'active_voice_subject_boolean', 'associated_action']

if custom_tags_and_word_column is not None:
self.word_column = custom_tags_and_word_column[1]
self.custom_tags = custom_tags_and_word_column[0]
self.customtag_column_names = self.__generate_customtag_column_names()
self.flattened_custom_tags_dictionary = self.__flatten_custom_tag_dictionary()
self.complete_data_columns.extend(self.customtag_column_names)

self.stopwords = None
self.pronouns = None
Expand All @@ -33,10 +37,7 @@ def __init__(self, lang, text_tuples, custom_tags_and_word_column=None):

self.nlp = util.load_spacy_pipeline(self.model)
self.n = 20 # top n scoring tokens for all metrics
self.customtag_column_names = self.__generate_customtag_column_names()
self.flattened_custom_tags_dictionary = self.__flatten_custom_tag_dictionary()
self.complete_data_columns = ['storyid', 'sentence', 'token_text', 'token_start_idx', 'token_end_idx', 'story_navigator_tag', 'spacy_tag', 'spacy_finegrained_tag', 'spacy_dependency', 'is_pronoun_boolean', 'is_sentence_subject_boolean', 'active_voice_subject_boolean', 'associated_action']
self.complete_data_columns.extend(self.customtag_column_names)

self.complete_data = self.__process_stories(self.nlp, self.text_tuples)

def __process_stories(self, nlp, text_tuples):
Expand Down
42 changes: 20 additions & 22 deletions orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,9 +559,29 @@ def set_stories(self, stories=None):
def set_tagging_data(self, story_elements=None):
if story_elements is not None:
self.story_elements = pd.concat(table_to_frames(story_elements), axis=1)
print()
print()
print('story-n: ', self.story_elements['story_navigator_tag'])
print()
print()
print()
print()
print('story-s: ', self.story_elements['spacy_tag'])
print()
print()


story_elements_grouped_by_story = self.story_elements.groupby('storyid')
for storyid, story_df in story_elements_grouped_by_story:
self.story_elements_dict[storyid] = story_df
print()
print()
print(storyid)
print('dataframe1: ', self.story_elements_dict[storyid]['story_navigator_tag'])
print('dataframe2: ', self.story_elements_dict[storyid]['spacy_tag'])
print()
print()


self.setup_controls()
# self.openContext(self.corpus)
Expand Down Expand Up @@ -685,43 +705,21 @@ def slider_callback(self):
def show_docs(self, slider_engaged=False):
"""Show the selected documents in the right area"""
if self.stories is None:
# print()
# print('why here joe???')
# print()
return

self.Warning.no_feats_display.clear()
if len(self.display_features) == 0:
# print()
# print('why here bob???')
# print()
self.Warning.no_feats_display()

parts = []

# if len(self.selected_documents) > 0:
# print()
# print('thats good...')
# else:
# print()
# print('boo boo!!!!!!')

for doc_count, c_index in enumerate(sorted(self.selected_documents)):
text = ""
for feature in self.display_features:
value = str(self.stories[c_index, feature.name])
self.original_text = str(value)
# print()
# print('d: ', doc_count)
# print()
# print('c: ', c_index)
# print()
if feature.name.lower() == "content" or feature.name.lower() == "text":
if len(self.story_elements_dict) > 0:
# print('value: ', value)
# print()
# print()
# print()
value = self.actortagger.postag_text(
value,
self.nouns,
Expand Down

0 comments on commit b47da0e

Please sign in to comment.