refactored and added test
eriktks committed Nov 7, 2024
1 parent 7c3e63d commit 6a8ea75
Showing 5 changed files with 63 additions and 33 deletions.
orangecontrib/storynavigation/modules/meansanalysis.py (34 additions, 32 deletions)
@@ -21,43 +21,40 @@ def __init__(self, language, story_elements, verb_frames, means_strategy, callback=None):
        self.means_strategy = means_strategy
        story_elements_df = util.convert_orangetable_to_dataframe(story_elements)
        self.__convert_str_columns_to_ints(story_elements_df)

        entities = self.__process_texts(story_elements_df, callback=callback)
        sentence_offsets = self.__compute_sentence_offsets(story_elements_df)
        entities_from_onsets = self.__convert_entities(entities, sentence_offsets)
        self.means_analysis = self.__sort_and_filter_results(entities_from_onsets)


    def __convert_str_columns_to_ints(self, story_elements_df) -> None:
-       story_elements_df["storyid"] = story_elements_df["storyid"].apply(lambda x: int(x))
-       story_elements_df["sentence_id"] = story_elements_df["sentence_id"].apply(lambda x: int(x))
-       story_elements_df["token_start_idx"] = story_elements_df["token_start_idx"].apply(lambda x: int(x))
-       story_elements_df["spacy_head_idx"] = story_elements_df["spacy_head_idx"].apply(lambda x: int(x))
+       columns_to_convert = ["storyid", "sentence_id", "token_start_idx", "spacy_head_idx"]
+       story_elements_df[columns_to_convert] = story_elements_df[columns_to_convert].astype(int)


    def __compute_sentence_offsets(self, story_elements_df) -> pd.DataFrame:
        sentences_df = story_elements_df.groupby(["storyid", "sentence_id"]).first().reset_index()[["storyid", "sentence_id", "sentence"]]
        char_offsets = []
        last_sentence = ""
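        # Illustrative example: with sentences "Hello." and "World.", the
        # computed offsets are 0 and len("Hello.") + 1 == 7, i.e. each
        # sentence is assumed to be separated from the previous one by a
        # single character in the reconstructed story text.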
-       for index,row in sentences_df.iterrows():
-           if row["sentence_id"] == 0:
+       for sentence_id, sentence in zip(sentences_df["sentence_id"],
+                                        sentences_df["sentence"]):
+           if sentence_id == sentences_df.iloc[0]["sentence_id"]:
                char_offset = 0
            else:
                char_offset += len(last_sentence) + 1
            char_offsets.append(char_offset)
-           last_sentence = row["sentence"]
+           last_sentence = sentence
        sentences_df["char_offset"] = char_offsets
        return sentences_df[["storyid", "sentence_id", "char_offset"]].set_index(["storyid", "sentence_id"])


    def __convert_entities(self, entities, sentence_offsets) -> dict:
        entities_from_onsets = {}
        for storyid, sentence_id, sentence_data in entities:
-           if storyid not in entities_from_onsets:
-               entities_from_onsets[storyid] = {}
-           for token_start_id in sentence_data:
-               char_offset_sentence = sentence_offsets.loc[(storyid, sentence_id)]["char_offset"]
-               entities_from_onsets[storyid][token_start_id + char_offset_sentence] = sentence_data[token_start_id]
+           story_entities = entities_from_onsets.setdefault(storyid, {})
+           char_offset_sentence = sentence_offsets.loc[(storyid, sentence_id)]["char_offset"]
+           for token_start_id, token_data in sentence_data.items():
+               story_entities[token_start_id + char_offset_sentence] = token_data
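                # Token onsets are sentence-local; adding the sentence's
                # char_offset converts them to story-level character onsets.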
        return entities_from_onsets


@@ -68,8 +65,7 @@ def __convert_stories_to_sentences(self, story_elements_df) -> pd.DataFrame:
    def __process_texts(self, story_elements_df, callback=None) -> list:
        sentence_dict = self.__convert_stories_to_sentences(story_elements_df)
        entities = []
-       index = 0
-       for sentence_dict_index, row_sentence_dict in sentence_dict.items():
+       for index, (sentence_dict_index, row_sentence_dict) in enumerate(sentence_dict.items()):
            row_sentence_dict = { token["token_start_idx"]: token
                                  for token_idx, token in row_sentence_dict.items() }
            sentence_entities = self.__process_sentence(row_sentence_dict)
@@ -79,13 +75,12 @@ def __process_texts(self, story_elements_df, callback=None) -> list:
                                 sentence_dict_index[1],
                                 sentence_entities])
            if callback:
-               index += 1
-               callback((100*index)/len(sentence_dict))
+               callback((100*(index + 1))/len(sentence_dict))
        return entities


    def __matching_dependencies(self, sentence_df, entity_start_id, head_start_id, head_of_head_start_id) -> bool:
-       if sentence_df[head_of_head_start_id]["spacy_tag"] not in ["VERB", "AUX"]:
+       if sentence_df[head_of_head_start_id]["spacy_tag"] not in {"VERB", "AUX"}:
            return False
        verb_frame_prepositions = [x[1] for x in self.verb_frames]
        return ((self.means_strategy == constants.MEANS_STRATEGY_VERB_FRAMES and
@@ -102,31 +97,35 @@ def __expand_means_phrase(self, sentence_df, sentence_entities, entity_start_id, head_start_id):
        processed_ids = set()
        head_start_id = self.__prepend_tokens_to_means_phrase(sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids)
        self.__append_tokens_to_means_phrase(sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids)
-       for child_entity_id in list(set(child_entity_ids) - set(processed_ids)):
+       for child_entity_id in set(child_entity_ids) - processed_ids:
            print(sentence_df[entity_start_id]["token_text"], sentence_df[head_start_id]["token_text"],
                  "skipping means word", sentence_df[child_entity_id]["sentence"])


    def __prepend_tokens_to_means_phrase(self, sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids) -> int:
        for child_entity_id in sorted(child_entity_ids, reverse=True):
+           if child_entity_id in processed_ids:
+               continue
            child_entity_text = sentence_df[child_entity_id]["token_text"]
            entity_gap_size = head_start_id - len(child_entity_text) - child_entity_id
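            # A gap of one character between the child token and the phrase
            # is assumed to be a space; a gap of two, a comma plus a space.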
-           if child_entity_id not in processed_ids and entity_gap_size in [1, 2]:
+           if entity_gap_size in [1, 2]:
                in_between_text = " " if entity_gap_size == 1 else ", "
                sentence_entities[child_entity_id] = {
-                   "text": sentence_df[child_entity_id]["token_text"] + in_between_text + sentence_entities[head_start_id]["text"],
+                   "text": child_entity_text + in_between_text + sentence_entities[head_start_id]["text"],
                    "sentence_id": sentence_df[child_entity_id]["sentence_id"],
                    "label_": "MEANS" }
-               del(sentence_entities[head_start_id])
+               del sentence_entities[head_start_id]
                head_start_id = child_entity_id
                processed_ids.add(child_entity_id)
        return head_start_id


    def __append_tokens_to_means_phrase(self, sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids) -> None:
        for child_entity_id in sorted(child_entity_ids):
+           if child_entity_id in processed_ids:
+               continue
            entity_gap_size = child_entity_id - head_start_id - len(sentence_entities[head_start_id]["text"])
-           if child_entity_id not in processed_ids and entity_gap_size in [1, 2]:
+           if entity_gap_size in [1, 2]:
                in_between_text = " " if entity_gap_size == 1 else ", "
                sentence_entities[head_start_id]["text"] += in_between_text + sentence_df[child_entity_id]["token_text"]
                processed_ids.add(child_entity_id)
@@ -137,31 +136,34 @@ def __process_sentence(self, sentence_dict) -> dict:
        for entity_start_id, token_data in sorted(sentence_dict.items()):
            try:
                head_start_id = token_data.get("spacy_head_idx")
-               head_of_head_start_id = sentence_dict.get(head_start_id).get("spacy_head_idx")
+               head_of_head_start_id = sentence_dict.get(head_start_id, {}).get("spacy_head_idx")
                # nl head relations: PREP -> MEANS -> VERB
                # en head relations: MEANS -> PREP -> VERB
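                # Illustrative parses: in Dutch "hij opent de deur met een
                # sleutel", "met" (PREP) attaches to "sleutel" (MEANS), whose
                # head is "opent" (VERB); in English "he opens the door with
                # a key", "key" (MEANS) attaches to "with" (PREP), whose head
                # is "opens" (VERB). The swap below normalizes the English
                # order to the Dutch one.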
                if self.language == constants.EN:
                    entity_start_id, head_start_id = head_start_id, entity_start_id
                if self.__matching_dependencies(sentence_dict, entity_start_id, head_start_id, head_of_head_start_id):
                    self.__add_sentence_entity(sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id)
-           except Exception as e:
+           except AttributeError as e:
                self.__log_key_error(e, token_data)
+           except KeyError as e:
+               self.__log_key_error(e, token_data)
        return sentence_entities


    def __add_sentence_entity(self, sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id) -> None:
        entity = sentence_dict[entity_start_id]
+       sentence_id = entity["sentence_id"]
        sentence_entities[entity_start_id] = {
            "label_": "PREP",
-           "sentence_id": entity["sentence_id"],
-           "text": entity["token_text"]}
+           "sentence_id": sentence_id,
+           "text": entity["token_text"]}
        sentence_entities[head_start_id] = {
            "label_": "MEANS",
-           "sentence_id": entity["sentence_id"],
+           "sentence_id": sentence_id,
            "text": sentence_dict[head_start_id]["token_text"]}
        sentence_entities[head_of_head_start_id] = {
            "label_": "VERB",
-           "sentence_id": entity["sentence_id"],
+           "sentence_id": sentence_id,
            "text": sentence_dict[head_of_head_start_id]["token_text"]}
        self.__expand_means_phrase(sentence_dict, sentence_entities, entity_start_id, head_start_id)

@@ -173,16 +175,16 @@ def __log_key_error(self, e, token_data) -> None:
    def __get_head_dependencies(self, sentence_df, entity_start_id, head_start_id) -> list:
        entity_ids = []
        for start_id, token in sorted(sentence_df.items()):
-           if token["spacy_head_idx"] == head_start_id and start_id not in (entity_start_id, head_start_id):
+           if token["spacy_head_idx"] == head_start_id and start_id not in {entity_start_id, head_start_id}:
                entity_ids.append(start_id)
                entity_ids.extend(self.__get_head_dependencies(sentence_df, entity_start_id, start_id))
        return entity_ids


    def __sort_and_filter_results(self, entities) -> pd.DataFrame:
-       results = [(story_entity["text"], story_entity["label_"], storyid, story_entity["sentence_id"], character_id)
+       results = [(entity["text"], entity["label_"], storyid, entity["sentence_id"], char_id)
                   for storyid, story_entities in entities.items()
-                  for character_id, story_entity in story_entities.items()]
+                  for char_id, entity in story_entities.items()]
        results_df = pd.DataFrame(results, columns=["text", "label", "storyid", "sentence_id", "character_id"])
        results_df.sort_values(by=["storyid", "character_id"], inplace=True)
        results_df["text_id"] = "ST" + results_df["storyid"].astype(str)
orangecontrib/storynavigation/modules/util.py (4 additions, 0 deletions)
@@ -225,6 +225,10 @@ def convert_orangetable_to_dataframe(table):
    Returns:
        df (pandas.DataFrame): a pandas dataframe with the same content (info) and structure contained in the Orange Data Table
    """

+   if table is None:
+       return pd.DataFrame([], columns=['storyid', 'sentence_id', 'token_start_idx', 'spacy_head_idx', 'sentence'])
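    # This guard lets callers pass no table at all (the new unit test
    # constructs a MeansAnalyzer with story_elements=None): a None input
    # yields an empty frame with the columns the analyzer expects.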

    # Extract attribute names, class variable name, and meta attribute names
    column_names = [var.name for var in table.domain.variables]
    meta_names = [meta.name for meta in table.domain.metas]
orangecontrib/storynavigation/resources/dutch_verb_frames.csv (2 additions, 0 deletions)
@@ -3,3 +3,5 @@ bewaakt,met
doen,via
zijn,doordat
gaan,doordat
+worden,omdat
+hebben,door
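Each row of this CSV pairs a verb with a preposition or conjunction that can introduce a means phrase for it. A minimal sketch of how these pairs appear to be consumed (hypothetical values; the list-of-pairs shape is an assumption based on the verb_frame_prepositions expression in meansanalysis.py above):

    # Assumed shape of self.verb_frames after the CSV is read: (verb, preposition) pairs.
    verb_frames = [("worden", "omdat"), ("hebben", "door")]
    # __matching_dependencies keeps only the second element of each pair:
    verb_frame_prepositions = [x[1] for x in verb_frames]
    assert verb_frame_prepositions == ["omdat", "door"]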
orangecontrib/storynavigation/widgets/OWSNMeansAnalysis.py (3 additions, 1 deletion)
@@ -152,8 +152,10 @@ def __make_document_viewer(self):
        self.doc_list.setModel(proxy_model)
        self.doc_list.selectionModel().selectionChanged.connect(self.selection_changed)
        self.doc_webview = gui.WebviewWidget(self.splitter, debug=False)
-       self.doc_webview.setHtml("")
+       self.doc_webview.setHtml("<div style=\"max-width:600px\" />")
        self.mainArea.layout().addWidget(self.splitter)
+       total_size = self.splitter.size().width()
+       self.splitter.setSizes([int(0.2 * total_size), int(0.8 * total_size)])
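        # i.e. give the document list 20% of the splitter width and the
        # viewer the remaining 80%.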


    def __update_stories_selected(self):
tests/test_meansanalysis.py (20 additions, 0 deletions)
@@ -0,0 +1,20 @@
+import pandas as pd
+from storynavigation.modules.meansanalysis import MeansAnalyzer
+
+def test_sort_and_filter_results():
+   sample_input = {1: {1: {"text": "", "label_": "", "sentence_id": 1},
+                       0: {"text": "", "label_": "", "sentence_id": 0}},
+                   0: {0: {"text": "", "label_": "", "sentence_id": 0}}}

+   expected_dict = [["", "", "ST0", 0, 0],
+                    ["", "", "ST1", 0, 0],
+                    ["", "", "ST1", 1, 1]]
+   expected_df = pd.DataFrame(expected_dict,
+                              columns=["text",
+                                       "label",
+                                       "text_id",
+                                       "sentence_id",
+                                       "character_id"]).reset_index(drop=True)
+   means_analyzer_object = MeansAnalyzer("", None, pd.DataFrame([], columns=["sentence"]), "")
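    # The call below reaches the private method via Python name mangling:
    # a method named __sort_and_filter_results inside class MeansAnalyzer
    # is stored as _MeansAnalyzer__sort_and_filter_results.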
+   results_df = means_analyzer_object._MeansAnalyzer__sort_and_filter_results(sample_input)
+   pd.testing.assert_frame_equal(results_df, expected_df)
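Assuming a standard pytest setup, the new test can be run with: pytest tests/test_meansanalysis.py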
