refactored code
eriktks committed Nov 5, 2024
1 parent 26c12bc commit 7c3e63d
Showing 3 changed files with 66 additions and 54 deletions.
117 changes: 64 additions & 53 deletions orangecontrib/storynavigation/modules/meansanalysis.py
@@ -1,15 +1,10 @@
"""Modules required for MeansAnalysis widget in the Orange Story Navigator add-on.
"""

import pandas as pd
import storynavigation.modules.constants as constants
import storynavigation.modules.util as util


class MeansAnalyzer:
"""Class for extracting means from texts
For the storynavigator Orange3 add-on:
https://pypi.org/project/storynavigator/0.0.11/
Args:
language (str): ISO string of the language of the input text
@@ -55,26 +50,26 @@ def __compute_sentence_offsets(self, story_elements_df) -> pd.DataFrame:
return sentences_df[["storyid", "sentence_id", "char_offset"]].set_index(["storyid", "sentence_id"])


- def __convert_entities(self, entities, sentences_offsets) -> dict:
+ def __convert_entities(self, entities, sentence_offsets) -> dict:
entities_from_onsets = {}
for storyid, sentence_id, sentence_data in entities:
if storyid not in entities_from_onsets:
entities_from_onsets[storyid] = {}
for token_start_id in sentence_data:
- char_offset_sentence = sentences_offsets.loc[(storyid, sentence_id)]["char_offset"]
+ char_offset_sentence = sentence_offsets.loc[(storyid, sentence_id)]["char_offset"]
entities_from_onsets[storyid][token_start_id + char_offset_sentence] = sentence_data[token_start_id]
return entities_from_onsets


def __convert_stories_to_sentences(self, story_elements_df) -> pd.DataFrame:
# return story_elements_df.groupby(["storyid", "sentence_id"]).agg(lambda x: list(x)).reset_index()
return { index: group.to_dict(orient="index") for index, group in story_elements_df.groupby(["storyid", "sentence_id"])}


def __process_texts(self, story_elements_df, callback=None) -> list:
- sentences_dict = self.__convert_stories_to_sentences(story_elements_df)
+ sentence_dict = self.__convert_stories_to_sentences(story_elements_df)
entities = []
- for sentence_dict_index, row_sentence_dict in sentences_dict.items():
+ index = 0
+ for sentence_dict_index, row_sentence_dict in sentence_dict.items():
row_sentence_dict = { token["token_start_idx"]: token
for token_idx, token in row_sentence_dict.items() }
sentence_entities = self.__process_sentence(row_sentence_dict)
@@ -83,8 +78,12 @@ def __process_texts(self, story_elements_df, callback=None) -> list:
sentence_dict_index[0],
sentence_dict_index[1],
sentence_entities])
+ if callback:
+ index += 1
+ callback((100*index)/len(sentence_dict))
return entities
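
A note on the change above: __process_texts now reports progress after each processed sentence. Below is a minimal, runnable sketch of that callback pattern with a stand-in worker; the lambda handler is an assumption for illustration, the real caller is the Orange widget's progress bar.

def process(items, callback=None):
    entities = []
    index = 0
    for item in items:
        entities.append(item)  # stand-in for the per-sentence entity extraction
        if callback:
            index += 1
            callback((100 * index) / len(items))  # percentage of sentences done
    return entities

process(["s1", "s2", "s3", "s4"], callback=lambda pct: print(f"{pct:.0f}% done"))
# prints: 25% done, 50% done, 75% done, 100% done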


def __matching_dependencies(self, sentence_df, entity_start_id, head_start_id, head_of_head_start_id) -> bool:
if sentence_df[head_of_head_start_id]["spacy_tag"] not in ["VERB", "AUX"]:
return False
@@ -97,82 +96,94 @@ def __matching_dependencies(self, sentence_df, entity_start_id, head_start_id, head_of_head_start_id) -> bool:
(self.means_strategy == constants.MEANS_STRATEGY_SPACY_PREPS and
sentence_df[entity_start_id]["spacy_tag"] == "ADP"))

- def __expand_means_phrase(self, sentence_df, sentence_entities, char_offset, entity_start_id, head_start_id) -> None:
- child_entity_ids = self.__get_head_dependencies(sentence_df, char_offset, entity_start_id, head_start_id)
- processed_ids = []
- self.__prepend_tokens_to_means_phrase(sentence_df, sentence_entities, char_offset, head_start_id, child_entity_ids, processed_ids)
- self.__append_tokens_to_means_phrase(sentence_df, sentence_entities, char_offset, head_start_id, child_entity_ids, processed_ids)
- for child_entity_id in list(child_entity_ids) - processed_ids:
+ def __expand_means_phrase(self, sentence_df, sentence_entities, entity_start_id, head_start_id) -> None:
+ child_entity_ids = self.__get_head_dependencies(sentence_df, entity_start_id, head_start_id)
+ processed_ids = set()
+ head_start_id = self.__prepend_tokens_to_means_phrase(sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids)
+ self.__append_tokens_to_means_phrase(sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids)
+ for child_entity_id in list(set(child_entity_ids) - set(processed_ids)):
print(sentence_df[entity_start_id]["token_text"], sentence_df[head_start_id]["token_text"],
"skipping means word", sentence_df[child_entity_id]["sentence"])

- def __prepend_tokens_to_means_phrase(self, sentence_df, sentence_entities, char_offset, head_start_id, child_entity_ids, processed_ids) -> None:
+ def __prepend_tokens_to_means_phrase(self, sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids) -> None:
for child_entity_id in sorted(child_entity_ids, reverse=True):
child_entity_text = sentence_df[child_entity_id]["token_text"]
entity_gap_size = head_start_id - len(child_entity_text) - child_entity_id
if child_entity_id not in processed_ids and entity_gap_size in [1, 2]:
in_between_text = " " if entity_gap_size == 1 else ", "
- sentence_entities[child_entity_id + char_offset] = {
- "text": sentence_df[child_entity_id]["token_text"] + in_between_text + sentence_entities[head_start_id + char_offset]["text"],
- "label_": "MEANS" }
- del(sentence_entities[head_start_id + char_offset])
+ sentence_entities[child_entity_id] = {
+ "text": sentence_df[child_entity_id]["token_text"] + in_between_text + sentence_entities[head_start_id]["text"],
+ "sentence_id": sentence_df[child_entity_id]["sentence_id"],
+ "label_": "MEANS" }
+ del(sentence_entities[head_start_id])
head_start_id = child_entity_id
- processed_ids.append(child_entity_id)
+ processed_ids.add(child_entity_id)
+ return head_start_id


- def __extend_tokens_to_means_phrase(self, sentence_df, sentence_entities, char_offset, head_start_id, child_entity_ids, processed_ids) -> None:
+ def __append_tokens_to_means_phrase(self, sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids) -> None:
for child_entity_id in sorted(child_entity_ids):
- entity_gap_size = child_entity_id - head_start_id - len(sentence_entities[head_start_id + char_offset]["text"])
+ entity_gap_size = child_entity_id - head_start_id - len(sentence_entities[head_start_id]["text"])
if child_entity_id not in processed_ids and entity_gap_size in [1, 2]:
in_between_text = " " if entity_gap_size == 1 else ", "
- sentence_entities[head_start_id + char_offset]["text"] += in_between_text + sentence_df[child_entity_id]["token_text"]
- processed_ids.append(child_entity_id)
+ sentence_entities[head_start_id]["text"] += in_between_text + sentence_df[child_entity_id]["token_text"]
+ processed_ids.add(child_entity_id)

- def __process_sentence(self, sentence_dict, char_offset=0) -> dict:
+ def __process_sentence(self, sentence_dict) -> dict:
sentence_entities = {}
for entity_start_id, token_data in sorted(sentence_dict.items()):
try:
- head_start_id = token_data["spacy_head_idx"]
- head_of_head_start_id = sentence_dict[head_start_id]["spacy_head_idx"]
+ head_start_id = token_data.get("spacy_head_idx")
+ head_of_head_start_id = sentence_dict.get(head_start_id).get("spacy_head_idx")
# nl head relations: PREP -> MEANS -> VERB
# en head relations: MEANS -> PREP -> VERB
if self.language == constants.EN:
entity_start_id, head_start_id = head_start_id, entity_start_id
if self.__matching_dependencies(sentence_dict, entity_start_id, head_start_id, head_of_head_start_id):
- try:
- self.__add_sentence_entity(sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id, char_offset)
- except Exception as e:
- self.__log_key_error(e, token_data)
+ self.__add_sentence_entity(sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id)
except Exception as e:
self.__log_key_error(e, token_data)
return sentence_entities

- def __add_sentence_entity(self, sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id, char_offset) -> None:
- sentence_entities[entity_start_id + char_offset] = {
- "label_": "PREP", "text": sentence_dict[entity_start_id]["token_text"]}
- sentence_entities[head_start_id + char_offset] = {
- "label_": "MEANS", "text": sentence_dict[head_start_id]["token_text"]}
- sentence_entities[head_of_head_start_id + char_offset] = {
- "label_": "VERB", "text": sentence_dict[head_of_head_start_id]["token_text"]}
- self.__expand_means_phrase(sentence_dict, sentence_entities, char_offset, entity_start_id, head_start_id)
+ def __add_sentence_entity(self, sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id) -> None:
+ entity = sentence_dict[entity_start_id]
+ sentence_entities[entity_start_id] = {
+ "label_": "PREP",
+ "sentence_id": entity["sentence_id"],
+ "text": entity["token_text"]}
+ sentence_entities[head_start_id] = {
+ "label_": "MEANS",
+ "sentence_id": entity["sentence_id"],
+ "text": sentence_dict[head_start_id]["token_text"]}
+ sentence_entities[head_of_head_start_id] = {
+ "label_": "VERB",
+ "sentence_id": entity["sentence_id"],
+ "text": sentence_dict[head_of_head_start_id]["token_text"]}
+ self.__expand_means_phrase(sentence_dict, sentence_entities, entity_start_id, head_start_id)


def __log_key_error(self, e, token_data) -> None:
- print(f"key error: missing {str(e)} in", token_data["storyid"], token_data["token_text"], token_data["sentence"])
+ print(f"key error: missing {e} in {token_data['storyid']} {token_data['token_text']} {token_data['sentence']}")

- def __get_head_dependencies(self, sentence_df, char_offset, entity_start_id, head_start_id) -> list:
+ def __get_head_dependencies(self, sentence_df, entity_start_id, head_start_id) -> list:
entity_ids = []
for start_id, token in sorted(sentence_df.items()):
if token["spacy_head_idx"] == head_start_id and start_id not in (entity_start_id, head_start_id):
- child_entity_ids = self.__get_head_dependencies(sentence_df, char_offset, entity_start_id, start_id)
entity_ids.append(start_id)
- entity_ids.extend(child_entity_ids)
+ entity_ids.extend(self.__get_head_dependencies(sentence_df, entity_start_id, start_id))
return entity_ids


def __sort_and_filter_results(self, entities) -> pd.DataFrame:
- results = [
- (story_entity["text"], story_entity["label_"], storyid, character_id)
- for storyid, story_entities in entities.items()
- for character_id, story_entity in story_entities.items()]
- results_df = pd.DataFrame(results, columns=["text", "label", "storyid", "character_id"])
- results_df = results_df.sort_values(by=["storyid", "character_id"])
- results_df["text_id"] = results_df["storyid"].apply(lambda x: "ST" + str(x))
- return results_df[["text", "label", "text_id", "character_id"]].reset_index(drop=True)
+ results = [(story_entity["text"], story_entity["label_"], storyid, story_entity["sentence_id"], character_id)
+ for storyid, story_entities in entities.items()
+ for character_id, story_entity in story_entities.items()]
+ results_df = pd.DataFrame(results, columns=["text", "label", "storyid", "sentence_id", "character_id"])
+ results_df.sort_values(by=["storyid", "character_id"], inplace=True)
+ results_df["text_id"] = "ST" + results_df["storyid"].astype(str)
+ return results_df[["text", "label", "text_id", "sentence_id", "character_id"]].reset_index(drop=True)
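
For orientation, here is a minimal, self-contained sketch of what the reworked __sort_and_filter_results returns, run on invented toy data (the story ids, texts, and offsets below are illustrative only, not from the repository):

import pandas as pd

# Toy input in the shape the method consumes:
# {storyid: {character_id: {"text": ..., "label_": ..., "sentence_id": ...}}}
entities = {0: {17: {"text": "met een camera", "label_": "MEANS", "sentence_id": 2}},
            1: {3: {"text": "via de post", "label_": "MEANS", "sentence_id": 0}}}

results = [(e["text"], e["label_"], storyid, e["sentence_id"], character_id)
           for storyid, story_entities in entities.items()
           for character_id, e in story_entities.items()]
results_df = pd.DataFrame(results, columns=["text", "label", "storyid", "sentence_id", "character_id"])
results_df.sort_values(by=["storyid", "character_id"], inplace=True)
results_df["text_id"] = "ST" + results_df["storyid"].astype(str)  # vectorized, replaces the per-row apply
print(results_df[["text", "label", "text_id", "sentence_id", "character_id"]].reset_index(drop=True))

Each result row now also carries the sentence_id of the sentence the means phrase was found in, presumably so results can be traced back to their sentence.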
@@ -1,3 +1,4 @@
verb,preposition
bewaakt,met
doen,via
zijn,doordat
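Each row of this verb frames file pairs a verb with the preposition that may introduce a means phrase (bewaakt met ..., doen via ...). A rough, hypothetical sketch of the idea behind such frames follows; the real matching in MeansAnalyzer runs on spaCy dependency links, not on flat token lists:

verb_frames = [["bewaakt", "met"], ["doen", "via"], ["zijn", "doordat"]]
tokens = "het terrein wordt bewaakt met een camera".split()
for verb, prep in verb_frames:
    if verb in tokens and prep in tokens:
        print("candidate means phrase:", verb, "...", prep)  # bewaakt ... met
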
2 changes: 1 addition & 1 deletion orangecontrib/storynavigation/widgets/OWSNMeansAnalysis.py
@@ -177,7 +177,7 @@ def read_verb_frames_file(self, verb_frames_file_name):
try:
verb_frames_lines = pathlib.Path(verb_frames_file_name).read_text(encoding="utf-8").strip().split("\n")
for line in verb_frames_lines:
- self.verb_frames.append(line.strip().split(","))
+ self.verb_frames.append([token.strip() for token in line.strip().split(",")])
except Exception as e:
print("read_verb_frames_file", str(e))
if self.story_elements:
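
The one-line change in read_verb_frames_file makes the CSV parsing tolerant of stray spaces around the comma. A quick sketch of the difference, using a made-up input line:

line = "bewaakt, met"
old_tokens = line.strip().split(",")                               # ['bewaakt', ' met']
new_tokens = [token.strip() for token in line.strip().split(",")]  # ['bewaakt', 'met']
print(old_tokens, new_tokens)

With the old parsing, a frame written as "bewaakt, met" would keep the leading space on the preposition and never match a token like "met".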
