Fixes a bug that occurs when the optional custom tags dictionary is not specified

navigating-stories · Jan 17, 2024 · b47da0e · b47da0e
1 parent ec8a4f0
commit b47da0e
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 152 deletions.
diff --git a/orangecontrib/storynavigation/modules/actoranalysis.py b/orangecontrib/storynavigation/modules/actoranalysis.py
@@ -288,164 +288,73 @@ def postag_text(
             pos_tags.append("SP")
             pos_tags.append("SNP")
 
+        print()
+        print()
+        print("pos_tags: ", pos_tags)
+        print()
+        print()
         if len(pos_tags) == 0:
             for sentence in sentences:
                 doc = {"text": sentence, "ents": []}
                 options = {"ents": pos_tags, "colors": constants.COLOR_MAP}
                 html += displacy.render(doc, style="ent", options=options, manual=True)
             return html
 
-        # print('spacytags: ', story_elements_df['spacy_tag'].tolist())
-        # print()
-        # print('othertags: ', story_elements_df['story_navigator_tag'].tolist())
-        # print()
-        # story_elements_df['story_navigator_tag'] = story_elements_df['story_navigator_tag'].astype(str)
+        story_elements_df = story_elements_df.copy()
+        story_elements_df['story_navigator_tag'] = story_elements_df['story_navigator_tag'].astype(str)
+        story_elements_df['spacy_tag'] = story_elements_df['spacy_tag'].astype(str)
+
+        print()
+        print()
+        print("story_elements: ", story_elements_df)
+        print()
+        print()        
+
+
         matched_df = story_elements_df[story_elements_df['story_navigator_tag'].isin(pos_tags) | story_elements_df['spacy_tag'].isin(pos_tags)]
-        # print()
-        # print('matched_df: ', matched_df)
-        # print()
-        matched_df = matched_df.copy()
+        print()
+        print()
+        print("matched_df1: ", matched_df)
+        print()
+        print()        
+
+        # matched_df = matched_df.copy()
         matched_df['merged_tags'] = np.where(matched_df['story_navigator_tag'] == '-', matched_df['spacy_tag'], matched_df['story_navigator_tag'])
         matched_df['token_start_idx'] = matched_df['token_start_idx'].astype(str)
         matched_df['token_end_idx'] = matched_df['token_end_idx'].astype(str)
         matched_df['displacy_tag_strings'] = matched_df['token_start_idx'] + ' | ' + matched_df['token_end_idx'] + ' | ' + matched_df['merged_tags']
 
-        order_mapping = {value: index for index, value in enumerate(sentences)}
+        print()
+        print()
+        print("matched_df2: ", matched_df)
+        print()
+        print()
 
-        # Create a custom sorting key function
-        # def custom_sort_key(value):
-        #     return order_mapping.get(value, len(sentences))
+        order_mapping = {value: index for index, value in enumerate(sentences)}
 
         for sentence in sentences:
             matched_sent_df = matched_df[matched_df['sentence'] == sentence]
-            # Apply the custom sorting key to create a new column for sorting
             matched_sent_df = matched_sent_df.copy()
             matched_sent_df.loc[:, 'sorting_key'] = matched_sent_df['sentence'].map(lambda value: order_mapping.get(value, len(sentences)))
-
-            # matched_sent_df['sorting_key'] = matched_sent_df['sentence'].map(custom_sort_key)
-            # Sort the DataFrame based on the sorting key
             matched_sent_df_sorted = matched_sent_df.sort_values(by='sorting_key').drop('sorting_key', axis=1)
 
             ents = []
             if len(matched_sent_df_sorted) > 0:
-            # filtered_df = story_elements_df[story_elements_df.isin(matched_sent_df.to_dict(orient='list')).all(axis=1)]
-            # matched_indices = filtered_df.index.tolist()
-            # if len(matched_indices) > 0:
                 displacy_tags_list = matched_sent_df_sorted['displacy_tag_strings'].tolist()
                 for displacy_tag in displacy_tags_list:
                     dtag = displacy_tag.split(' | ')
                     ents.append({"start": int(float(dtag[0])), "end": int(float(dtag[1])), "label": dtag[2]})
 
-                ents = util.remove_duplicate_tagged_entities(ents)
-        # if custom:
-        #     if custom_dict is not None:
-        #         custom_tag_labels = self.__get_custom_tags_list(custom_dict)
-        #         pos_tags.extend(custom_tag_labels)
+                ents = util.remove_duplicate_tagged_entities(ents)                
 
-        # output of this function
-
+            # print()
+            # print()
+            # print('ents: ', ents)
+            # print()
+            # print()
 
-        # loop through model to filter out those words that need to be tagged (based on user selection and prominence score)
-        # for sentence in sentences:
-        #     if len(sentence.split()) > 0: # sentence has at least one word in it
-        #         first_word_in_sent = sentence.split()[0].lower().strip()
-        #         tags = []
-        #         tokenizer = RegexpTokenizer(r"\w+|\$[\d\.]+|\S+")
-        #         spans = list(tokenizer.span_tokenize(sentence))
-
-        #         for token in tagged_sentence:
-        #             tags.append((token.text, token.pos_, token.tag_, token.dep_, token))
-
-        #         # identify and tag custom words in the story text
-        #         ents = []
-        #         if custom_dict is not None:
-        #             custom_matched_tags = self.__find_custom_word_matches(custom_dict, sentence)
-        #             for matched_tag in custom_matched_tags:
-        #                 ents.append(matched_tag)
-
-        #         # identify and tag POS / NER tokens in the story text
-        #         for tag, span in zip(tags, spans):
-        #             # print()
-        #             # print('tag: ', tag)
-        #             # print()
-        #             normalised_token, is_valid_token = self.__is_valid_token(tag)
-        #             if is_valid_token:
-        #                 # print()
-        #                 # print('tag: ', tag)
-        #                 # print()
-        #                 is_subj, subj_type = self.__is_subject(tag)
-        #                 if is_subj:
-        #                     p_score_greater_than_min = self.__update_postagging_metrics(
-        #                         tag[0].lower().strip(),
-        #                         selected_prominence_metric,
-        #                         prominence_score_min,
-        #                         token,
-        #                     )
-        #                     if p_score_greater_than_min:
-        #                         if self.__is_pronoun(tag):
-        #                             ents.append(
-        #                                 {"start": span[0], "end": span[1], "label": "SP"}
-        #                             )
-        #                         else:
-        #                             ents.append(
-        #                                 {"start": span[0], "end": span[1], "label": "SNP"}
-        #                             )
-        #                 else:
-        #                     if self.__is_pronoun(tag):
-        #                         ents.append(
-        #                             {"start": span[0], "end": span[1], "label": "NSP"}
-        #                         )
-        #                     elif self.__is_noun_but_not_pronoun(tag):
-        #                         ents.append(
-        #                             {"start": span[0], "end": span[1], "label": "NSNP"}
-        #                         )
-
-        #         if any(word == first_word_in_sent for word in self.pronouns):
-        #             p_score_greater_than_min = self.__update_postagging_metrics(
-        #                 first_word_in_sent,
-        #                 selected_prominence_metric,
-        #                 prominence_score_min,
-        #                 token,
-        #             )
-
-        #             if p_score_greater_than_min:
-        #                 ents.append(
-        #                     {"start": 0, "end": len(first_word_in_sent), "label": "SP"}
-        #                 )
-
-        #             if first_word_in_sent in self.passive_agency_scores:
-        #                 self.passive_agency_scores[first_word_in_sent] += 1
-        #             else:
-        #                 self.passive_agency_scores[first_word_in_sent] = 1
-
-        #             # if first_word_in_sent not in self.active_agency_scores:
-        #             #     self.active_agency_scores[first_word_in_sent] = 0
-
-        #         # remove duplicate tags (sometimes one entity can fall under multiple tag categories.
-        #         # to avoid duplication, only tag each entity using ONE tag category.
-
-
-            # specify sentences and filtered entities to tag / highlight
-
-
             doc = {"text": sentence, "ents": ents}
-
-            # specify colors for highlighting each entity type
-            # colors = {}
-            # if nouns:
-            #     colors["NSP"] = constants.NONSUBJECT_PRONOUN_HIGHLIGHT_COLOR
-            #     colors["NSNP"] = constants.NONSUBJECT_NONPRONOUN_HIGHLIGHT_COLOR
-            # if subjs:
-            #     colors["SP"] = constants.SUBJECT_PRONOUN_HIGHLIGHT_COLOR
-            #     colors["SNP"] = constants.SUBJECT_NONPRONOUN_HIGHLIGHT_COLOR
-            # if custom:
-            #     for custom_label in custom_tag_labels:
-            #         colors[custom_label] = constants.CUSTOMTAG_HIGHLIGHT_COLOR
-
-            # self.agent_prominence_score_max = self.__get_max_prominence_score()
-            # collect the above config params together
             options = {"ents": pos_tags, "colors": constants.COLOR_MAP}
-            # give all the params to displacy to generate HTML code of the text with highlighted tags
             html += displacy.render(doc, style="ent", options=options, manual=True)
 
         self.html_result = html

diff --git a/orangecontrib/storynavigation/modules/tagging.py b/orangecontrib/storynavigation/modules/tagging.py
@@ -18,10 +18,14 @@ def __init__(self, lang, text_tuples, custom_tags_and_word_column=None):
         self.text_tuples = text_tuples
         self.custom_tags = None
         self.word_column = None
+        self.complete_data_columns = ['storyid', 'sentence', 'token_text', 'token_start_idx', 'token_end_idx', 'story_navigator_tag', 'spacy_tag', 'spacy_finegrained_tag', 'spacy_dependency', 'is_pronoun_boolean', 'is_sentence_subject_boolean', 'active_voice_subject_boolean', 'associated_action']
 
         if custom_tags_and_word_column is not None:
             self.word_column = custom_tags_and_word_column[1]
             self.custom_tags = custom_tags_and_word_column[0]
+            self.customtag_column_names = self.__generate_customtag_column_names()
+            self.flattened_custom_tags_dictionary = self.__flatten_custom_tag_dictionary()
+            self.complete_data_columns.extend(self.customtag_column_names)
 
         self.stopwords = None
         self.pronouns = None
@@ -33,10 +37,7 @@ def __init__(self, lang, text_tuples, custom_tags_and_word_column=None):
 
         self.nlp = util.load_spacy_pipeline(self.model)
         self.n = 20 # top n scoring tokens for all metrics
-        self.customtag_column_names = self.__generate_customtag_column_names()
-        self.flattened_custom_tags_dictionary = self.__flatten_custom_tag_dictionary()
-        self.complete_data_columns = ['storyid', 'sentence', 'token_text', 'token_start_idx', 'token_end_idx', 'story_navigator_tag', 'spacy_tag', 'spacy_finegrained_tag', 'spacy_dependency', 'is_pronoun_boolean', 'is_sentence_subject_boolean', 'active_voice_subject_boolean', 'associated_action']
-        self.complete_data_columns.extend(self.customtag_column_names)
+
         self.complete_data = self.__process_stories(self.nlp, self.text_tuples)
 
     def __process_stories(self, nlp, text_tuples):

diff --git a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
@@ -559,9 +559,29 @@ def set_stories(self, stories=None):
     def set_tagging_data(self, story_elements=None):
         if story_elements is not None:
             self.story_elements = pd.concat(table_to_frames(story_elements), axis=1)
+            print()
+            print()
+            print('story-n: ', self.story_elements['story_navigator_tag'])
+            print()
+            print()
+            print()
+            print()
+            print('story-s: ', self.story_elements['spacy_tag'])
+            print()
+            print()
+
+
             story_elements_grouped_by_story = self.story_elements.groupby('storyid')
             for storyid, story_df in story_elements_grouped_by_story:
                 self.story_elements_dict[storyid] = story_df
+                print()
+                print()
+                print(storyid)
+                print('dataframe1: ', self.story_elements_dict[storyid]['story_navigator_tag'])
+                print('dataframe2: ', self.story_elements_dict[storyid]['spacy_tag'])
+                print()
+                print()
+
 
             self.setup_controls()
             # self.openContext(self.corpus)
@@ -685,43 +705,21 @@ def slider_callback(self):
     def show_docs(self, slider_engaged=False):
         """Show the selected documents in the right area"""
         if self.stories is None:
-            # print()
-            # print('why here joe???')
-            # print()
             return
 
         self.Warning.no_feats_display.clear()
         if len(self.display_features) == 0:
-            # print()
-            # print('why here bob???')
-            # print()
             self.Warning.no_feats_display()
 
         parts = []
 
-        # if len(self.selected_documents) > 0:
-        #     print()
-        #     print('thats good...')
-        # else:
-        #     print()
-        #     print('boo boo!!!!!!')
-
         for doc_count, c_index in enumerate(sorted(self.selected_documents)):
             text = ""
             for feature in self.display_features:
                 value = str(self.stories[c_index, feature.name])
                 self.original_text = str(value)
-                # print()
-                # print('d: ', doc_count)
-                # print()
-                # print('c: ', c_index)
-                # print()
                 if feature.name.lower() == "content" or feature.name.lower() == "text":
                     if len(self.story_elements_dict) > 0:
-                        # print('value: ', value)
-                        # print()
-                        # print()
-                        # print()
                         value = self.actortagger.postag_text(
                             value,
                             self.nouns,