Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import Documents: add options for meta matching in CoNLL-U #1087

Merged
merged 2 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions orangecontrib/text/import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ def make_text_data(self):
class ImportDocuments:
META_DATA_FILE_KEY = "Text file"
# Candidate metadata column names to merge CoNLL-U metadata on (first match is used). TODO: make this user-configurable.
CONLLU_META_DATA = "ID"
CONLLU_META_DATA = ["ID", "Text_ID"]

def __init__(
self,
Expand Down Expand Up @@ -513,13 +513,17 @@ def _add_metadata(self, corpus: Corpus) -> Corpus:
or self._meta_data is None
or (
self.META_DATA_FILE_KEY not in self._meta_data.columns
and self.CONLLU_META_DATA not in self._meta_data.columns
and not any(i in self._meta_data.columns for i in
self.CONLLU_META_DATA)
)
):
return corpus

if self.is_conllu:
df = self._meta_data.set_index(self.CONLLU_META_DATA)
# find the first matching column
match_id = next((idx for idx in self.CONLLU_META_DATA if idx in
self._meta_data.columns))
df = self._meta_data.set_index(match_id)
path_column = corpus.get_column("utterance")
else:
df = self._meta_data.set_index(
Expand Down
2 changes: 2 additions & 0 deletions orangecontrib/text/tests/test_import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ def test_conllu_reader(self):
self.assertEqual(len(corpus), len(lemma))
self.assertEqual(len(corpus), len(pos))
self.assertEqual(len(corpus), len(ner))
self.assertTrue(np.any(~np.isnan(corpus.get_column(
"Speaker_birth"))))

@patch(SF_LIST, return_value=SPECIAL_CHAR_FILES)
@patch(PATCH_METHOD, side_effect=ConnectTimeout("test message", request=""))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ID Title From To House Term Session Meeting Sitting Agenda Subcorpus Speaker_role Speaker_type Speaker_party Speaker_party_name Party_status Speaker_name Speaker_gender Speaker_birth
ParlaMint-SI_2014-08-01-SDZ7-Redna-01.u1 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Regular Session 1, 1.8.2014 2014-08-01 2014-08-01 Lower house 7 1 Reference Chairperson MP DeSUS Demokratična stranka upokojencev Slovenije Kotnik Poropat, Marjana F 1944
ParlaMint-SI_2014-08-01-SDZ7-Redna-01.u2 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Regular Session 1, 1.8.2014 2014-08-01 2014-08-01 Lower house 7 1 Reference Regular MP SD Socialni demokrati Veber, Janko M 1960
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ID Title From To House Term Session Meeting Sitting Agenda Subcorpus Speaker_role Speaker_type Speaker_party Speaker_party_name Party_status Speaker_name Speaker_gender Speaker_birth
ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u1 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Chairperson MP SMC Stranka Mira Cerarja Brglez, Milan M 1967
ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u2 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Chairperson MP SMC Stranka Mira Cerarja Brglez, Milan M 1967
ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u3 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Regular MP SD Socialni demokrati Židan, Dejan M 1967
Loading