refactored and added test
eriktks committed Nov 7, 2024
1 parent 7c3e63d commit 6a8ea75
Showing 5 changed files with 63 additions and 33 deletions.
orangecontrib/storynavigation/modules/meansanalysis.py (34 additions, 32 deletions)
@@ -21,43 +21,40 @@ def __init__(self, language, story_elements, verb_frames, means_strategy, callback=None):
        self.means_strategy = means_strategy
        story_elements_df = util.convert_orangetable_to_dataframe(story_elements)
        self.__convert_str_columns_to_ints(story_elements_df)

        entities = self.__process_texts(story_elements_df, callback=callback)
        sentence_offsets = self.__compute_sentence_offsets(story_elements_df)
        entities_from_onsets = self.__convert_entities(entities, sentence_offsets)
        self.means_analysis = self.__sort_and_filter_results(entities_from_onsets)


    def __convert_str_columns_to_ints(self, story_elements_df) -> None:
-       story_elements_df["storyid"] = story_elements_df["storyid"].apply(lambda x: int(x))
-       story_elements_df["sentence_id"] = story_elements_df["sentence_id"].apply(lambda x: int(x))
-       story_elements_df["token_start_idx"] = story_elements_df["token_start_idx"].apply(lambda x: int(x))
-       story_elements_df["spacy_head_idx"] = story_elements_df["spacy_head_idx"].apply(lambda x: int(x))
+       columns_to_convert = ["storyid", "sentence_id", "token_start_idx", "spacy_head_idx"]
+       story_elements_df[columns_to_convert] = story_elements_df[columns_to_convert].astype(int)


    def __compute_sentence_offsets(self, story_elements_df) -> pd.DataFrame:
        sentences_df = story_elements_df.groupby(["storyid", "sentence_id"]).first().reset_index()[["storyid", "sentence_id", "sentence"]]
        char_offsets = []
        last_sentence = ""
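        # Illustrative example: with sentences "Hello." and "World.", the
        # computed offsets are 0 and len("Hello.") + 1 == 7, i.e. each
        # sentence is assumed to be separated from the previous one by a
        # single character in the reconstructed story text.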
-       for index,row in sentences_df.iterrows():
-           if row["sentence_id"] == 0:
+       for sentence_id, sentence in zip(sentences_df["sentence_id"],
+                                        sentences_df["sentence"]):
+           if sentence_id == sentences_df.iloc[0]["sentence_id"]:
                char_offset = 0
            else:
                char_offset += len(last_sentence) + 1
            char_offsets.append(char_offset)
-           last_sentence = row["sentence"]
+           last_sentence = sentence
        sentences_df["char_offset"] = char_offsets
        return sentences_df[["storyid", "sentence_id", "char_offset"]].set_index(["storyid", "sentence_id"])


    def __convert_entities(self, entities, sentence_offsets) -> dict:
        entities_from_onsets = {}
        for storyid, sentence_id, sentence_data in entities:
-           if storyid not in entities_from_onsets:
-               entities_from_onsets[storyid] = {}
-           for token_start_id in sentence_data:
-               char_offset_sentence = sentence_offsets.loc[(storyid, sentence_id)]["char_offset"]
-               entities_from_onsets[storyid][token_start_id + char_offset_sentence] = sentence_data[token_start_id]
+           story_entities = entities_from_onsets.setdefault(storyid, {})
+           char_offset_sentence = sentence_offsets.loc[(storyid, sentence_id)]["char_offset"]
+           for token_start_id, token_data in sentence_data.items():
+               story_entities[token_start_id + char_offset_sentence] = token_data
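                # Token onsets are sentence-local; adding the sentence's
                # char_offset converts them to story-level character onsets.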
        return entities_from_onsets


@@ -68,8 +65,7 @@ def __convert_stories_to_sentences(self, story_elements_df) -> pd.DataFrame:
    def __process_texts(self, story_elements_df, callback=None) -> list:
        sentence_dict = self.__convert_stories_to_sentences(story_elements_df)
        entities = []
-       index = 0
-       for sentence_dict_index, row_sentence_dict in sentence_dict.items():
+       for index, (sentence_dict_index, row_sentence_dict) in enumerate(sentence_dict.items()):
            row_sentence_dict = { token["token_start_idx"]: token
                                  for token_idx, token in row_sentence_dict.items() }
            sentence_entities = self.__process_sentence(row_sentence_dict)
@@ -79,13 +75,12 @@ def __process_texts(self, story_elements_df, callback=None) -> list:
                                 sentence_dict_index[1],
                                 sentence_entities])
            if callback:
-               index += 1
-               callback((100*index)/len(sentence_dict))
+               callback((100*(index + 1))/len(sentence_dict))
        return entities


    def __matching_dependencies(self, sentence_df, entity_start_id, head_start_id, head_of_head_start_id) -> bool:
-       if sentence_df[head_of_head_start_id]["spacy_tag"] not in ["VERB", "AUX"]:
+       if sentence_df[head_of_head_start_id]["spacy_tag"] not in {"VERB", "AUX"}:
            return False
        verb_frame_prepositions = [x[1] for x in self.verb_frames]
        return ((self.means_strategy == constants.MEANS_STRATEGY_VERB_FRAMES and
@@ -102,31 +97,35 @@ def __expand_means_phrase(self, sentence_df, sentence_entities, entity_start_id, head_start_id):
        processed_ids = set()
        head_start_id = self.__prepend_tokens_to_means_phrase(sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids)
        self.__append_tokens_to_means_phrase(sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids)
-       for child_entity_id in list(set(child_entity_ids) - set(processed_ids)):
+       for child_entity_id in set(child_entity_ids) - processed_ids:
            print(sentence_df[entity_start_id]["token_text"], sentence_df[head_start_id]["token_text"],
                  "skipping means word", sentence_df[child_entity_id]["sentence"])


    def __prepend_tokens_to_means_phrase(self, sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids) -> int:
        for child_entity_id in sorted(child_entity_ids, reverse=True):
+           if child_entity_id in processed_ids:
+               continue
            child_entity_text = sentence_df[child_entity_id]["token_text"]
            entity_gap_size = head_start_id - len(child_entity_text) - child_entity_id
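            # A gap of one character between the child token and the phrase
            # is assumed to be a space; a gap of two, a comma plus a space.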
-           if child_entity_id not in processed_ids and entity_gap_size in [1, 2]:
+           if entity_gap_size in [1, 2]:
                in_between_text = " " if entity_gap_size == 1 else ", "
                sentence_entities[child_entity_id] = {
-                   "text": sentence_df[child_entity_id]["token_text"] + in_between_text + sentence_entities[head_start_id]["text"],
+                   "text": child_entity_text + in_between_text + sentence_entities[head_start_id]["text"],
                    "sentence_id": sentence_df[child_entity_id]["sentence_id"],
                    "label_": "MEANS" }
-               del(sentence_entities[head_start_id])
+               del sentence_entities[head_start_id]
                head_start_id = child_entity_id
                processed_ids.add(child_entity_id)
        return head_start_id


    def __append_tokens_to_means_phrase(self, sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids) -> None:
        for child_entity_id in sorted(child_entity_ids):
+           if child_entity_id in processed_ids:
+               continue
            entity_gap_size = child_entity_id - head_start_id - len(sentence_entities[head_start_id]["text"])
-           if child_entity_id not in processed_ids and entity_gap_size in [1, 2]:
+           if entity_gap_size in [1, 2]:
                in_between_text = " " if entity_gap_size == 1 else ", "
                sentence_entities[head_start_id]["text"] += in_between_text + sentence_df[child_entity_id]["token_text"]
                processed_ids.add(child_entity_id)
@@ -137,31 +136,34 @@ def __process_sentence(self, sentence_dict) -> dict:
        for entity_start_id, token_data in sorted(sentence_dict.items()):
            try:
                head_start_id = token_data.get("spacy_head_idx")
-               head_of_head_start_id = sentence_dict.get(head_start_id).get("spacy_head_idx")
+               head_of_head_start_id = sentence_dict.get(head_start_id, {}).get("spacy_head_idx")
                # nl head relations: PREP -> MEANS -> VERB
                # en head relations: MEANS -> PREP -> VERB
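                # Illustrative parses: in Dutch "hij opent de deur met een
                # sleutel", "met" (PREP) attaches to "sleutel" (MEANS), whose
                # head is "opent" (VERB); in English "he opens the door with
                # a key", "key" (MEANS) attaches to "with" (PREP), whose head
                # is "opens" (VERB). The swap below normalizes the English
                # order to the Dutch one.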
                if self.language == constants.EN:
                    entity_start_id, head_start_id = head_start_id, entity_start_id
                if self.__matching_dependencies(sentence_dict, entity_start_id, head_start_id, head_of_head_start_id):
                    self.__add_sentence_entity(sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id)
-           except Exception as e:
+           except AttributeError as e:
                self.__log_key_error(e, token_data)
+           except KeyError as e:
+               self.__log_key_error(e, token_data)
        return sentence_entities


    def __add_sentence_entity(self, sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id) -> None:
        entity = sentence_dict[entity_start_id]
+       sentence_id = entity["sentence_id"]
        sentence_entities[entity_start_id] = {
            "label_": "PREP",
-           "sentence_id": entity["sentence_id"],
-           "text": entity["token_text"]}
+           "sentence_id": sentence_id,
+           "text": entity["token_text"]}
        sentence_entities[head_start_id] = {
            "label_": "MEANS",
-           "sentence_id": entity["sentence_id"],
+           "sentence_id": sentence_id,
            "text": sentence_dict[head_start_id]["token_text"]}
        sentence_entities[head_of_head_start_id] = {
            "label_": "VERB",
-           "sentence_id": entity["sentence_id"],
+           "sentence_id": sentence_id,
            "text": sentence_dict[head_of_head_start_id]["token_text"]}
        self.__expand_means_phrase(sentence_dict, sentence_entities, entity_start_id, head_start_id)

@@ -173,16 +175,16 @@ def __log_key_error(self, e, token_data) -> None:
    def __get_head_dependencies(self, sentence_df, entity_start_id, head_start_id) -> list:
        entity_ids = []
        for start_id, token in sorted(sentence_df.items()):
-           if token["spacy_head_idx"] == head_start_id and start_id not in (entity_start_id, head_start_id):
+           if token["spacy_head_idx"] == head_start_id and start_id not in {entity_start_id, head_start_id}:
                entity_ids.append(start_id)
                entity_ids.extend(self.__get_head_dependencies(sentence_df, entity_start_id, start_id))
        return entity_ids


    def __sort_and_filter_results(self, entities) -> pd.DataFrame:
-       results = [(story_entity["text"], story_entity["label_"], storyid, story_entity["sentence_id"], character_id)
+       results = [(entity["text"], entity["label_"], storyid, entity["sentence_id"], char_id)
                   for storyid, story_entities in entities.items()
-                  for character_id, story_entity in story_entities.items()]
+                  for char_id, entity in story_entities.items()]
        results_df = pd.DataFrame(results, columns=["text", "label", "storyid", "sentence_id", "character_id"])
        results_df.sort_values(by=["storyid", "character_id"], inplace=True)
        results_df["text_id"] = "ST" + results_df["storyid"].astype(str)
orangecontrib/storynavigation/modules/util.py (4 additions, 0 deletions)
@@ -225,6 +225,10 @@ def convert_orangetable_to_dataframe(table):
    Returns:
        df (pandas.DataFrame): a pandas dataframe with the same content (info) and structure contained in the Orange Data Table
    """

+   if table is None:
+       return pd.DataFrame([], columns=['storyid', 'sentence_id', 'token_start_idx', 'spacy_head_idx', 'sentence'])
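    # This guard lets callers pass no table at all (the new unit test
    # constructs a MeansAnalyzer with story_elements=None): a None input
    # yields an empty frame with the columns the analyzer expects.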

    # Extract attribute names, class variable name, and meta attribute names
    column_names = [var.name for var in table.domain.variables]
    meta_names = [meta.name for meta in table.domain.metas]
orangecontrib/storynavigation/resources/dutch_verb_frames.csv (2 additions, 0 deletions)
@@ -3,3 +3,5 @@ bewaakt,met
doen,via
zijn,doordat
gaan,doordat
+worden,omdat
+hebben,door
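Each row of this CSV pairs a verb with a preposition or conjunction that can introduce a means phrase for it. A minimal sketch of how these pairs appear to be consumed (hypothetical values; the list-of-pairs shape is an assumption based on the verb_frame_prepositions expression in meansanalysis.py above):

    # Assumed shape of self.verb_frames after the CSV is read: (verb, preposition) pairs.
    verb_frames = [("worden", "omdat"), ("hebben", "door")]
    # __matching_dependencies keeps only the second element of each pair:
    verb_frame_prepositions = [x[1] for x in verb_frames]
    assert verb_frame_prepositions == ["omdat", "door"]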
orangecontrib/storynavigation/widgets/OWSNMeansAnalysis.py (3 additions, 1 deletion)
@@ -152,8 +152,10 @@ def __make_document_viewer(self):
        self.doc_list.setModel(proxy_model)
        self.doc_list.selectionModel().selectionChanged.connect(self.selection_changed)
        self.doc_webview = gui.WebviewWidget(self.splitter, debug=False)
-       self.doc_webview.setHtml("")
+       self.doc_webview.setHtml("<div style=\"max-width:600px\" />")
        self.mainArea.layout().addWidget(self.splitter)
+       total_size = self.splitter.size().width()
+       self.splitter.setSizes([int(0.2 * total_size), int(0.8 * total_size)])
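        # i.e. give the document list 20% of the splitter width and the
        # viewer the remaining 80%.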


    def __update_stories_selected(self):
tests/test_meansanalysis.py (20 additions, 0 deletions)
@@ -0,0 +1,20 @@
+import pandas as pd
+from storynavigation.modules.meansanalysis import MeansAnalyzer
+
+def test_sort_and_filter_results():
+   sample_input = {1: {1: {"text": "", "label_": "", "sentence_id": 1},
+                       0: {"text": "", "label_": "", "sentence_id": 0}},
+                   0: {0: {"text": "", "label_": "", "sentence_id": 0}}}

+   expected_dict = [["", "", "ST0", 0, 0],
+                    ["", "", "ST1", 0, 0],
+                    ["", "", "ST1", 1, 1]]
+   expected_df = pd.DataFrame(expected_dict,
+                              columns=["text",
+                                       "label",
+                                       "text_id",
+                                       "sentence_id",
+                                       "character_id"]).reset_index(drop=True)
+   means_analyzer_object = MeansAnalyzer("", None, pd.DataFrame([], columns=["sentence"]), "")
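    # The call below reaches the private method via Python name mangling:
    # a method named __sort_and_filter_results inside class MeansAnalyzer
    # is stored as _MeansAnalyzer__sort_and_filter_results.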
+   results_df = means_analyzer_object._MeansAnalyzer__sort_and_filter_results(sample_input)
+   pd.testing.assert_frame_equal(results_df, expected_df)
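Assuming a standard pytest setup, the new test can be run with: pytest tests/test_meansanalysis.py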
