From 151dc9957a72e14c105656de4047f1b564255fc8 Mon Sep 17 00:00:00 2001 From: "kody.moodley@gmail.com" Date: Wed, 17 Jan 2024 17:00:01 +0100 Subject: [PATCH 1/2] fixes tagging of subjects and other potential actors. --- orangecontrib/storynavigation/modules/util.py | 24 ++++++++ .../widgets/OWSNActorAnalysis.py | 56 ++++++++----------- 2 files changed, 48 insertions(+), 32 deletions(-) diff --git a/orangecontrib/storynavigation/modules/util.py b/orangecontrib/storynavigation/modules/util.py index e3c9839..9e4c40d 100644 --- a/orangecontrib/storynavigation/modules/util.py +++ b/orangecontrib/storynavigation/modules/util.py @@ -5,6 +5,7 @@ import spacy import os import string +import pandas as pd import storynavigation.modules.constants as constants def entity_tag_already_exists(ents, start, end): @@ -135,6 +136,29 @@ def preprocess_text(text): return cleaned_sents +def convert_orangetable_to_dataframe(table): + """Converts an Orange Data Table object to a Pandas dataframe + + Args: + table (Orange.data.Table): an Orange Data Table instance + + Returns: + df (pandas.DataFrame): a pandas dataframe with the same content (info) and structure contained in the Orange Data Table + """ + # Extract attribute names, class variable name, and meta attribute names + column_names = [var.name for var in table.domain.variables] + meta_names = [meta.name for meta in table.domain.metas] + + # Combine attribute and meta names + all_column_names = column_names + meta_names + + # Create a list of lists representing the data + data = [[str(entry[var]) for var in table.domain.variables + table.domain.metas] for entry in table] + + # Convert to a pandas DataFrame + df = pd.DataFrame(data, columns=all_column_names) + + return df def remove_span_tags(html_string): """Removes span tags (including content) from an HTML string diff --git a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py index bd7cddc..b5504e4 100644 --- a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py +++ b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py @@ -5,6 +5,7 @@ from typing import Any, Iterable, List, Set import numpy as np import pandas as pd +from orangecontrib.storynavigation.modules import util # Imports from Qt from AnyQt.QtCore import ( @@ -526,28 +527,21 @@ def on_state_changed_pos(self, state): def copy_to_clipboard(self): text = self.doc_webview.selectedText() + print('selected text: ', text) QApplication.clipboard().setText(text) def pos_selection_changed(self): self.show_docs() self.commit.deferred() - def ner_selection_changed(self): - # self.show_docs() - self.commit.deferred() - - def rehighlight_entities(self): - # self.show_docs() - self.commit.deferred() - @Inputs.stories def set_stories(self, stories=None): self.stories = stories self.actortagger = ActorTagger(constants.NL_SPACY_MODEL) if stories is not None: self.setup_controls() - # self.openContext(self.corpus) - # self.doc_list.model().set_filter_string(self.regexp_filter) + # self.openContext(self.stories) + self.doc_list.model().set_filter_string(self.regexp_filter) # self.select_variables() self.list_docs() # self.update_info() @@ -558,7 +552,8 @@ def set_stories(self, stories=None): @Inputs.story_elements def set_tagging_data(self, story_elements=None): if story_elements is not None: - self.story_elements = pd.concat(table_to_frames(story_elements), axis=1) + # self.story_elements = pd.concat(table_to_frames(story_elements), axis=1) + self.story_elements = util.convert_orangetable_to_dataframe(story_elements) print() print() print('story-n: ', self.story_elements['story_navigator_tag']) @@ -582,15 +577,13 @@ def set_tagging_data(self, story_elements=None): print() print() - self.setup_controls() - # self.openContext(self.corpus) - # self.doc_list.model().set_filter_string(self.regexp_filter) + # self.openContext(self.stories) + self.doc_list.model().set_filter_string(self.regexp_filter) # self.select_variables() self.list_docs() # self.update_info() # self.set_selection() - # self.show_docs() self.show_docs() def reset_widget(self): @@ -600,7 +593,6 @@ def reset_widget(self): self.search_listbox.model().set_domain(None) self.display_listbox.model().set_domain(None) self.filter_input.clear() - self.update_info() # Models/vars self.doc_list_model.clear() # Warnings @@ -726,7 +718,7 @@ def show_docs(self, slider_engaged=False): self.subjs, self.agent_prominence_metric, self.agent_prominence_score_min, - self.story_elements_dict[c_index] + self.story_elements_dict[str(c_index)] ) self.Outputs.metrics_freq_table.send( table_from_frame( @@ -813,7 +805,7 @@ def search_features_changed(self): def display_features_changed(self): self.display_features = self.__get_selected_rows(self.display_listbox) - # self.show_docs() + self.show_docs() def regenerate_docs(self) -> List[str]: self.Warning.no_feats_search.clear() @@ -828,13 +820,13 @@ def refresh_search(self): # when currently selected items are filtered selection is empty # select first element in the view in that case self.doc_list.setCurrentIndex(self.doc_list.model().index(0, 0)) - self.update_info() + # self.update_info() self.start( _count_matches, self.doc_list_model.get_filter_content(), self.regexp_filter, ) - # self.show_docs() + self.show_docs() self.commit.deferred() def on_done(self, res: int): @@ -844,18 +836,18 @@ def on_done(self, res: int): def on_exception(self, ex): raise ex - def update_info(self): - # self.pos_checkboxes = [self.sc, self.nc] - if self.stories is not None: - has_tokens = self.stories.has_tokens() - self.n_matching = f"{self.doc_list.model().rowCount()}/{len(self.stories)}" - self.n_tokens = sum(map(len, self.stories.tokens)) if has_tokens else "n/a" - self.n_types = len(self.stories.dictionary) if has_tokens else "n/a" - else: - self.n_matching = "n/a" - self.n_matches = "n/a" - self.n_tokens = "n/a" - self.n_types = "n/a" + # def update_info(self): + # # self.pos_checkboxes = [self.sc, self.nc] + # if self.stories is not None: + # has_tokens = self.stories.has_tokens() + # self.n_matching = f"{self.doc_list.model().rowCount()}/{len(self.stories)}" + # self.n_tokens = sum(map(len, self.stories.tokens)) if has_tokens else "n/a" + # self.n_types = len(self.stories.dictionary) if has_tokens else "n/a" + # else: + # self.n_matching = "n/a" + # self.n_matches = "n/a" + # self.n_tokens = "n/a" + # self.n_types = "n/a" @gui.deferred def commit(self): From cff5c24e213a3e98bb6fcb03139d35a232f5a3e3 Mon Sep 17 00:00:00 2001 From: "kody.moodley@gmail.com" Date: Thu, 18 Jan 2024 06:59:07 +0100 Subject: [PATCH 2/2] fixes highlighting of search text --- .../storynavigation/modules/actoranalysis.py | 36 +--------------- .../widgets/OWSNActorAnalysis.py | 41 ++++++++----------- 2 files changed, 19 insertions(+), 58 deletions(-) diff --git a/orangecontrib/storynavigation/modules/actoranalysis.py b/orangecontrib/storynavigation/modules/actoranalysis.py index 66ead3a..10e24d1 100644 --- a/orangecontrib/storynavigation/modules/actoranalysis.py +++ b/orangecontrib/storynavigation/modules/actoranalysis.py @@ -288,11 +288,6 @@ def postag_text( pos_tags.append("SP") pos_tags.append("SNP") - print() - print() - print("pos_tags: ", pos_tags) - print() - print() if len(pos_tags) == 0: for sentence in sentences: doc = {"text": sentence, "ents": []} @@ -302,34 +297,13 @@ def postag_text( story_elements_df = story_elements_df.copy() story_elements_df['story_navigator_tag'] = story_elements_df['story_navigator_tag'].astype(str) - story_elements_df['spacy_tag'] = story_elements_df['spacy_tag'].astype(str) - - print() - print() - print("story_elements: ", story_elements_df) - print() - print() - - - matched_df = story_elements_df[story_elements_df['story_navigator_tag'].isin(pos_tags) | story_elements_df['spacy_tag'].isin(pos_tags)] - print() - print() - print("matched_df1: ", matched_df) - print() - print() + story_elements_df['spacy_tag'] = story_elements_df['spacy_tag'].astype(str) + matched_df = story_elements_df[story_elements_df['story_navigator_tag'].isin(pos_tags) | story_elements_df['spacy_tag'].isin(pos_tags)] - # matched_df = matched_df.copy() matched_df['merged_tags'] = np.where(matched_df['story_navigator_tag'] == '-', matched_df['spacy_tag'], matched_df['story_navigator_tag']) matched_df['token_start_idx'] = matched_df['token_start_idx'].astype(str) matched_df['token_end_idx'] = matched_df['token_end_idx'].astype(str) matched_df['displacy_tag_strings'] = matched_df['token_start_idx'] + ' | ' + matched_df['token_end_idx'] + ' | ' + matched_df['merged_tags'] - - print() - print() - print("matched_df2: ", matched_df) - print() - print() - order_mapping = {value: index for index, value in enumerate(sentences)} for sentence in sentences: @@ -347,12 +321,6 @@ def postag_text( ents = util.remove_duplicate_tagged_entities(ents) - # print() - # print() - # print('ents: ', ents) - # print() - # print() - doc = {"text": sentence, "ents": ents} options = {"ents": pos_tags, "colors": constants.COLOR_MAP} html += displacy.render(doc, style="ent", options=options, manual=True) diff --git a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py index b5504e4..e121fba 100644 --- a/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py +++ b/orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py @@ -429,7 +429,6 @@ def __init__(self): ) self.custom_tags.setEnabled(False) - self.allc.stateChanged.connect(self.on_state_changed_pos) self.pos_checkboxes = [self.sc, self.nc, self.custom_tags] self.controlArea.layout().addWidget(self.postags_box) @@ -518,12 +517,26 @@ def __init__(self): self.doc_list.selectionModel().selectionChanged.connect(self.selection_changed) # Document contents self.doc_webview = gui.WebviewWidget(self.splitter, debug=False) - # self.doc_webview.setStyleSheet("QWidget {background-color: #0ff}") + self.doc_webview.setStyleSheet("QWidget {background-color: #0ff}") self.mainArea.layout().addWidget(self.splitter) - def on_state_changed_pos(self, state): + def __uncheckAll(self): + for checkBox in self.pos_checkboxes: + checkBox.setCheckState(False) + + def __checkAll(self): + for checkBox in self.pos_checkboxes: + checkBox.setCheckState(True) + + def on_state_changed_pos(self, checked): for checkBox in self.pos_checkboxes: - checkBox.setCheckState(state) + if checkBox == self.allc: + if checkBox.isChecked() and not checked: + self.__uncheckAll() + elif (not checkBox.isChecked()) and checked: + self.__checkAll() + + checkBox.setCheckState(checked) def copy_to_clipboard(self): text = self.doc_webview.selectedText() @@ -552,30 +565,10 @@ def set_stories(self, stories=None): @Inputs.story_elements def set_tagging_data(self, story_elements=None): if story_elements is not None: - # self.story_elements = pd.concat(table_to_frames(story_elements), axis=1) self.story_elements = util.convert_orangetable_to_dataframe(story_elements) - print() - print() - print('story-n: ', self.story_elements['story_navigator_tag']) - print() - print() - print() - print() - print('story-s: ', self.story_elements['spacy_tag']) - print() - print() - - story_elements_grouped_by_story = self.story_elements.groupby('storyid') for storyid, story_df in story_elements_grouped_by_story: self.story_elements_dict[storyid] = story_df - print() - print() - print(storyid) - print('dataframe1: ', self.story_elements_dict[storyid]['story_navigator_tag']) - print('dataframe2: ', self.story_elements_dict[storyid]['spacy_tag']) - print() - print() self.setup_controls() # self.openContext(self.stories)