diff --git a/orangecontrib/storynavigation/modules/actoranalysis.py b/orangecontrib/storynavigation/modules/actoranalysis.py
index 149b48a..66ead3a 100644
--- a/orangecontrib/storynavigation/modules/actoranalysis.py
+++ b/orangecontrib/storynavigation/modules/actoranalysis.py
@@ -288,6 +288,11 @@ def postag_text(
             pos_tags.append("SP")
             pos_tags.append("SNP")
+        print()
+        print()
+        print("pos_tags: ", pos_tags)
+        print()
+        print()
 
         if len(pos_tags) == 0:
             for sentence in sentences:
                 doc = {"text": sentence, "ents": []}
@@ -295,157 +300,61 @@ def postag_text(
                 html += displacy.render(doc, style="ent", options=options, manual=True)
             return html
 
-        # print('spacytags: ', story_elements_df['spacy_tag'].tolist())
-        # print()
-        # print('othertags: ', story_elements_df['story_navigator_tag'].tolist())
-        # print()
-        # story_elements_df['story_navigator_tag'] = story_elements_df['story_navigator_tag'].astype(str)
+        story_elements_df = story_elements_df.copy()
+        story_elements_df['story_navigator_tag'] = story_elements_df['story_navigator_tag'].astype(str)
+        story_elements_df['spacy_tag'] = story_elements_df['spacy_tag'].astype(str)
+
+        print()
+        print()
+        print("story_elements: ", story_elements_df)
+        print()
+        print()
+
+        matched_df = story_elements_df[story_elements_df['story_navigator_tag'].isin(pos_tags) | story_elements_df['spacy_tag'].isin(pos_tags)]
 
-        # print()
-        # print('matched_df: ', matched_df)
-        # print()
-        matched_df = matched_df.copy()
+        print()
+        print()
+        print("matched_df1: ", matched_df)
+        print()
+        print()
+
+        # matched_df = matched_df.copy()
         matched_df['merged_tags'] = np.where(matched_df['story_navigator_tag'] == '-', matched_df['spacy_tag'], matched_df['story_navigator_tag'])
         matched_df['token_start_idx'] = matched_df['token_start_idx'].astype(str)
         matched_df['token_end_idx'] = matched_df['token_end_idx'].astype(str)
         matched_df['displacy_tag_strings'] = matched_df['token_start_idx'] + ' | ' + matched_df['token_end_idx'] + ' | ' + matched_df['merged_tags']
 
-        order_mapping = {value: index for index, value in enumerate(sentences)}
+        print()
+        print()
+        print("matched_df2: ", matched_df)
+        print()
+        print()
 
-        # Create a custom sorting key function
-        # def custom_sort_key(value):
-        #     return order_mapping.get(value, len(sentences))
+        order_mapping = {value: index for index, value in enumerate(sentences)}
 
         for sentence in sentences:
             matched_sent_df = matched_df[matched_df['sentence'] == sentence]
-            # Apply the custom sorting key to create a new column for sorting
             matched_sent_df = matched_sent_df.copy()
             matched_sent_df.loc[:, 'sorting_key'] = matched_sent_df['sentence'].map(lambda value: order_mapping.get(value, len(sentences)))
-
-            # matched_sent_df['sorting_key'] = matched_sent_df['sentence'].map(custom_sort_key)
-            # Sort the DataFrame based on the sorting key
             matched_sent_df_sorted = matched_sent_df.sort_values(by='sorting_key').drop('sorting_key', axis=1)
 
             ents = []
             if len(matched_sent_df_sorted) > 0:
-                # filtered_df = story_elements_df[story_elements_df.isin(matched_sent_df.to_dict(orient='list')).all(axis=1)]
-                # matched_indices = filtered_df.index.tolist()
-                # if len(matched_indices) > 0:
                 displacy_tags_list = matched_sent_df_sorted['displacy_tag_strings'].tolist()
                 for displacy_tag in displacy_tags_list:
                     dtag = displacy_tag.split(' | ')
                     ents.append({"start": int(float(dtag[0])), "end": int(float(dtag[1])), "label": dtag[2]})
-            ents = util.remove_duplicate_tagged_entities(ents)
-        # if custom:
-        #     if custom_dict is not None:
-        #         custom_tag_labels = self.__get_custom_tags_list(custom_dict)
-        #         pos_tags.extend(custom_tag_labels)
+                ents = util.remove_duplicate_tagged_entities(ents)
-        # output of this function
-
+            # print()
+            # print()
+            # print('ents: ', ents)
+            # print()
+            # print()
-        # loop through model to filter out those words that need to be tagged (based on user selection and prominence score)
-        # for sentence in sentences:
-        #     if len(sentence.split()) > 0: # sentence has at least one word in it
-        #         first_word_in_sent = sentence.split()[0].lower().strip()
-        #         tags = []
-        #         tokenizer = RegexpTokenizer(r"\w+|\$[\d\.]+|\S+")
-        #         spans = list(tokenizer.span_tokenize(sentence))
-
-        #         for token in tagged_sentence:
-        #             tags.append((token.text, token.pos_, token.tag_, token.dep_, token))
-
-        #         # identify and tag custom words in the story text
-        #         ents = []
-        #         if custom_dict is not None:
-        #             custom_matched_tags = self.__find_custom_word_matches(custom_dict, sentence)
-        #             for matched_tag in custom_matched_tags:
-        #                 ents.append(matched_tag)
-
-        #         # identify and tag POS / NER tokens in the story text
-        #         for tag, span in zip(tags, spans):
-        #             # print()
-        #             # print('tag: ', tag)
-        #             # print()
-        #             normalised_token, is_valid_token = self.__is_valid_token(tag)
-        #             if is_valid_token:
-        #                 # print()
-        #                 # print('tag: ', tag)
-        #                 # print()
-        #                 is_subj, subj_type = self.__is_subject(tag)
-        #                 if is_subj:
-        #                     p_score_greater_than_min = self.__update_postagging_metrics(
-        #                         tag[0].lower().strip(),
-        #                         selected_prominence_metric,
-        #                         prominence_score_min,
-        #                         token,
-        #                     )
-        #                     if p_score_greater_than_min:
-        #                         if self.__is_pronoun(tag):
-        #                             ents.append(
-        #                                 {"start": span[0], "end": span[1], "label": "SP"}
-        #                             )
-        #                         else:
-        #                             ents.append(
-        #                                 {"start": span[0], "end": span[1], "label": "SNP"}
-        #                             )
-        #                 else:
-        #                     if self.__is_pronoun(tag):
-        #                         ents.append(
-        #                             {"start": span[0], "end": span[1], "label": "NSP"}
-        #                         )
-        #                     elif self.__is_noun_but_not_pronoun(tag):
-        #                         ents.append(
-        #                             {"start": span[0], "end": span[1], "label": "NSNP"}
-        #                         )
-
-        #         if any(word == first_word_in_sent for word in self.pronouns):
-        #             p_score_greater_than_min = self.__update_postagging_metrics(
-        #                 first_word_in_sent,
-        #                 selected_prominence_metric,
-        #                 prominence_score_min,
-        #                 token,
-        #             )
-
-        #             if p_score_greater_than_min:
-        #                 ents.append(
-        #                     {"start": 0, "end": len(first_word_in_sent), "label": "SP"}
-        #                 )
-
-        #             if first_word_in_sent in self.passive_agency_scores:
-        #                 self.passive_agency_scores[first_word_in_sent] += 1
-        #             else:
-        #                 self.passive_agency_scores[first_word_in_sent] = 1
-
-        #             # if first_word_in_sent not in self.active_agency_scores:
-        #             #     self.active_agency_scores[first_word_in_sent] = 0
-
-        #         # remove duplicate tags (sometimes one entity can fall under multiple tag categories.
-        #         # to avoid duplication, only tag each entity using ONE tag category.
-
-
-            # specify sentences and filtered entities to tag / highlight
-
-
             doc = {"text": sentence, "ents": ents}
-
-            # specify colors for highlighting each entity type
-            # colors = {}
-            # if nouns:
-            #     colors["NSP"] = constants.NONSUBJECT_PRONOUN_HIGHLIGHT_COLOR
-            #     colors["NSNP"] = constants.NONSUBJECT_NONPRONOUN_HIGHLIGHT_COLOR
-            # if subjs:
-            #     colors["SP"] = constants.SUBJECT_PRONOUN_HIGHLIGHT_COLOR
-            #     colors["SNP"] = constants.SUBJECT_NONPRONOUN_HIGHLIGHT_COLOR
-            # if custom:
-            #     for custom_label in custom_tag_labels:
-            #         colors[custom_label] = constants.CUSTOMTAG_HIGHLIGHT_COLOR
-
-            # self.agent_prominence_score_max = self.__get_max_prominence_score()
-            # collect the above config params together
             options = {"ents": pos_tags, "colors": constants.COLOR_MAP}
-            # give all the params to displacy to generate HTML code of the text with highlighted tags
             html += displacy.render(doc, style="ent", options=options, manual=True)
 
         self.html_result = html
diff --git a/orangecontrib/storynavigation/modules/tagging.py b/orangecontrib/storynavigation/modules/tagging.py
index 0dc959e..7524517 100644
--- a/orangecontrib/storynavigation/modules/tagging.py
+++ b/orangecontrib/storynavigation/modules/tagging.py
@@ -18,10 +18,14 @@ def __init__(self, lang, text_tuples, custom_tags_and_word_column=None):
         self.text_tuples = text_tuples
         self.custom_tags = None
         self.word_column = None
+        self.complete_data_columns = ['storyid', 'sentence', 'token_text', 'token_start_idx', 'token_end_idx', 'story_navigator_tag', 'spacy_tag', 'spacy_finegrained_tag', 'spacy_dependency', 'is_pronoun_boolean', 'is_sentence_subject_boolean', 'active_voice_subject_boolean', 'associated_action']
         if custom_tags_and_word_column is not None:
             self.word_column = custom_tags_and_word_column[1]
             self.custom_tags = custom_tags_and_word_column[0]
+            self.customtag_column_names = self.__generate_customtag_column_names()
+            self.flattened_custom_tags_dictionary = self.__flatten_custom_tag_dictionary()
+            self.complete_data_columns.extend(self.customtag_column_names)
 
         self.stopwords = None
         self.pronouns = None
@@ -33,10 +37,7 @@ def __init__(self, lang, text_tuples, custom_tags_and_word_column=None):
         self.nlp = util.load_spacy_pipeline(self.model)
         self.n = 20 # top n scoring tokens for all metrics
 
-        self.customtag_column_names = self.__generate_customtag_column_names()
-        self.flattened_custom_tags_dictionary = self.__flatten_custom_tag_dictionary()
-        self.complete_data_columns = ['storyid', 'sentence', 'token_text', 'token_start_idx', 'token_end_idx', 'story_navigator_tag', 'spacy_tag', 'spacy_finegrained_tag', 'spacy_dependency', 'is_pronoun_boolean', 'is_sentence_subject_boolean', 'active_voice_subject_boolean', 'associated_action']
-        self.complete_data_columns.extend(self.customtag_column_names)
+
         self.complete_data = self.__process_stories(self.nlp, self.text_tuples)
 
     def __process_stories(self, nlp, text_tuples):
diff --git a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
index 7a17ccf..bd7cddc 100644
--- a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
+++ b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
@@ -559,9 +559,29 @@ def set_stories(self, stories=None):
 
     def set_tagging_data(self, story_elements=None):
         if story_elements is not None:
             self.story_elements = pd.concat(table_to_frames(story_elements), axis=1)
+            print()
+            print()
+            print('story-n: ', self.story_elements['story_navigator_tag'])
+            print()
+            print()
+            print()
+            print()
+            print('story-s: ', self.story_elements['spacy_tag'])
+            print()
+            print()
+
+
             story_elements_grouped_by_story = self.story_elements.groupby('storyid')
             for storyid, story_df in story_elements_grouped_by_story:
                 self.story_elements_dict[storyid] = story_df
+                print()
+                print()
+                print(storyid)
+                print('dataframe1: ', self.story_elements_dict[storyid]['story_navigator_tag'])
+                print('dataframe2: ', self.story_elements_dict[storyid]['spacy_tag'])
+                print()
+                print()
+
             self.setup_controls()
             # self.openContext(self.corpus)
@@ -685,43 +705,21 @@ def slider_callback(self):
     def show_docs(self, slider_engaged=False):
         """Show the selected documents in the right area"""
         if self.stories is None:
-            # print()
-            # print('why here joe???')
-            # print()
             return
 
         self.Warning.no_feats_display.clear()
         if len(self.display_features) == 0:
-            # print()
-            # print('why here bob???')
-            # print()
             self.Warning.no_feats_display()
 
         parts = []
 
-        # if len(self.selected_documents) > 0:
-        #     print()
-        #     print('thats good...')
-        # else:
-        #     print()
-        #     print('boo boo!!!!!!')
-
         for doc_count, c_index in enumerate(sorted(self.selected_documents)):
             text = ""
             for feature in self.display_features:
                 value = str(self.stories[c_index, feature.name])
                 self.original_text = str(value)
-                # print()
-                # print('d: ', doc_count)
-                # print()
-                # print('c: ', c_index)
-                # print()
                 if feature.name.lower() == "content" or feature.name.lower() == "text":
                     if len(self.story_elements_dict) > 0:
-                        # print('value: ', value)
-                        # print()
-                        # print()
-                        # print()
                         value = self.actortagger.postag_text(
                             value,
                             self.nouns,