From b45fad4348a1c7a74dd80a1ccd0f92b016745f1e Mon Sep 17 00:00:00 2001 From: Ajda Date: Thu, 5 Dec 2024 14:18:47 +0100 Subject: [PATCH 1/2] Import Documents: add ID options for CoNLL-U --- orangecontrib/text/import_documents.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index e18a9e754..038d7d24a 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -335,7 +335,7 @@ def make_text_data(self): class ImportDocuments: META_DATA_FILE_KEY = "Text file" # this is what we will merge meta data on, change to user-set variable - CONLLU_META_DATA = "ID" + CONLLU_META_DATA = ["ID", "Text_ID"] def __init__( self, @@ -513,13 +513,17 @@ def _add_metadata(self, corpus: Corpus) -> Corpus: or self._meta_data is None or ( self.META_DATA_FILE_KEY not in self._meta_data.columns - and self.CONLLU_META_DATA not in self._meta_data.columns + and not any(i in self._meta_data.columns for i in + self.CONLLU_META_DATA) ) ): return corpus if self.is_conllu: - df = self._meta_data.set_index(self.CONLLU_META_DATA) + # find the first matching column + match_id = next((idx for idx in self.CONLLU_META_DATA if idx in + self._meta_data.columns)) + df = self._meta_data.set_index(match_id) path_column = corpus.get_column("utterance") else: df = self._meta_data.set_index( From f056910ee2a7adf48845de994e243868a23ad947 Mon Sep 17 00:00:00 2001 From: Ajda Date: Thu, 5 Dec 2024 14:19:05 +0100 Subject: [PATCH 2/2] Import Documents: conllu tests --- orangecontrib/text/tests/test_import_documents.py | 2 ++ .../text/widgets/tests/data/conllu/ParlaMint01-meta.tsv | 3 +++ .../text/widgets/tests/data/conllu/ParlaMint02-meta.tsv | 4 ++++ 3 files changed, 9 insertions(+) create mode 100755 orangecontrib/text/widgets/tests/data/conllu/ParlaMint01-meta.tsv create mode 100755 orangecontrib/text/widgets/tests/data/conllu/ParlaMint02-meta.tsv diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py index 3032fe331..16f68c2b5 100644 --- a/orangecontrib/text/tests/test_import_documents.py +++ b/orangecontrib/text/tests/test_import_documents.py @@ -247,6 +247,8 @@ def test_conllu_reader(self): self.assertEqual(len(corpus), len(lemma)) self.assertEqual(len(corpus), len(pos)) self.assertEqual(len(corpus), len(ner)) + self.assertTrue(np.any(~np.isnan(corpus.get_column( + "Speaker_birth")))) @patch(SF_LIST, return_value=SPECIAL_CHAR_FILES) @patch(PATCH_METHOD, side_effect=ConnectTimeout("test message", request="")) diff --git a/orangecontrib/text/widgets/tests/data/conllu/ParlaMint01-meta.tsv b/orangecontrib/text/widgets/tests/data/conllu/ParlaMint01-meta.tsv new file mode 100755 index 000000000..21f82112b --- /dev/null +++ b/orangecontrib/text/widgets/tests/data/conllu/ParlaMint01-meta.tsv @@ -0,0 +1,3 @@ +ID Title From To House Term Session Meeting Sitting Agenda Subcorpus Speaker_role Speaker_type Speaker_party Speaker_party_name Party_status Speaker_name Speaker_gender Speaker_birth +ParlaMint-SI_2014-08-01-SDZ7-Redna-01.u1 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Regular Session 1, 1.8.2014 2014-08-01 2014-08-01 Lower house 7 1 Reference Chairperson MP DeSUS Demokratična stranka upokojencev Slovenije Kotnik Poropat, Marjana F 1944 +ParlaMint-SI_2014-08-01-SDZ7-Redna-01.u2 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Regular Session 1, 1.8.2014 2014-08-01 2014-08-01 Lower house 7 1 Reference Regular MP SD Socialni demokrati Veber, Janko M 1960 diff --git a/orangecontrib/text/widgets/tests/data/conllu/ParlaMint02-meta.tsv b/orangecontrib/text/widgets/tests/data/conllu/ParlaMint02-meta.tsv new file mode 100755 index 000000000..982b2e0b1 --- /dev/null +++ b/orangecontrib/text/widgets/tests/data/conllu/ParlaMint02-meta.tsv @@ -0,0 +1,4 @@ +ID Title From To House Term Session Meeting Sitting Agenda Subcorpus Speaker_role Speaker_type Speaker_party Speaker_party_name Party_status Speaker_name Speaker_gender Speaker_birth +ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u1 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Chairperson MP SMC Stranka Mira Cerarja Brglez, Milan M 1967 +ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u2 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Chairperson MP SMC Stranka Mira Cerarja Brglez, Milan M 1967 +ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u3 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Regular MP SD Socialni demokrati Židan, Dejan M 1967