-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_cuis.py
99 lines (77 loc) · 3.9 KB
/
get_cuis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import argparse
import pandas as pd
import numpy as np
from pathlib import Path
from quickumls import QuickUMLS
def process_batch(filepath, id):
    """
    Collapse an entity-per-row BERN annotation table into one row per record.

    The pickled dataframe at `filepath` holds one row per detected entity, so
    a record appears on several rows. Entity texts are lower-cased, then the
    distinct texts of type 'disease' and of type 'drug' are collected per
    record.

    Parameters
    ----------
    filepath: str
        path to pkl file with a dataframe of BERN annotations (entity per
        row); the columns `entity_text` and `entity_type` are read
    id: str
        name of the column containing the unique record identifiers, by which
        the entities should be grouped

    Returns
    -------
    processed_batch: pd.DataFrame
        dataframe indexed by the unique values of the `id` column, with the
        distinct disease entity texts of each record in `ent_text_disease`
        and the distinct drug entity texts in `ent_text_drug`; a record with
        no entity of a given type gets a missing value in that column
    """
    # NOTE: read_pickle unpickles arbitrary objects; only use on trusted files.
    annotations = pd.read_pickle(filepath)
    annotations['entity_text'] = annotations['entity_text'].str.lower()

    def _unique_texts(entity_type):
        # Distinct (lower-cased) entity texts per record for one entity type.
        of_type = annotations.loc[annotations.entity_type == entity_type]
        return of_type.groupby(id).entity_text.unique()

    # Start from every record id so records lacking a type are still kept.
    per_record = pd.DataFrame(index=annotations[id].unique())
    per_record = per_record.join(_unique_texts('disease'))
    # Both series are named 'entity_text'; the suffix avoids the name clash.
    per_record = per_record.join(_unique_texts('drug'), rsuffix='_drug')
    per_record.columns = ['ent_text_disease', 'ent_text_drug']
    return per_record
def apply_QuickUMLS(txt_array, matcher):
    """
    Apply QuickUMLS on the text in `txt_array` and return the set of identified UMLS CUIs.

    Parameters
    ----------
    txt_array: {str, np.ndarray}
        the text to process with QuickUMLS; a missing value (None or any
        float NaN) is accepted and short-circuits the matching
    matcher: QuickUMLS object
        object exposing `match(text, best_match=..., ignore_syntax=...)`

    Returns
    -------
    cuis: set
        set of cuis identified by QuickUMLS (the 'cui' of the first candidate
        of every match group), or `np.nan` when `txt_array` is missing
    """
    # Detect missing values by value, not by identity: `x is np.nan` is only
    # true for the one np.nan object, so float('nan'), a computed NaN or None
    # would otherwise be sent to the matcher as if they were text.
    if txt_array is None or (isinstance(txt_array, float) and np.isnan(txt_array)):
        return np.nan

    # Kept from the original implementation so the matcher receives the same
    # object type for plain strings as it always has.
    if isinstance(txt_array, str):
        txt_array = np.array(txt_array)

    cuis = set()
    for match in matcher.match(txt_array, best_match=True, ignore_syntax=False):
        if not match:
            # Defensive: an empty candidate group has no CUI to contribute.
            continue
        # Only the first candidate of each group is used.
        cuis.add(match[0]['cui'])
    return cuis
if __name__ == '__main__':
    # Process pkl files containing dataframes with detected BERN entities (of
    # types 'drug' and 'disease') per CTgov/PubMed record (directory is
    # specified by the '--path_in' parameter).
    # Write pkl files of the per-record dataframes with two added columns
    # containing the CUIs identified by QuickUMLS (directory is specified by
    # the '--path_out' parameter).

    ############## PARAMETERS ##############
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--path_in', default='../data/bern_df/')
    argparser.add_argument('--path_out', default='../data/cuis/')
    # Previously hard-coded values, now overridable; defaults are unchanged.
    argparser.add_argument('--quickumls_path', default='../data/quickUMLS_eng',
                           help='directory of the QuickUMLS installation data')
    argparser.add_argument('--id_col', default='idx',
                           help='column holding the unique record identifiers')
    args = argparser.parse_args()

    ############## INSTANTIATE QuickUMLS ##############
    # UMLS semantic type identifiers the matcher is restricted to: the union
    # of the disease-related list and the drug-related list below.
    sem_diseases = ['T020', 'T190', 'T049', 'T019', 'T047', 'T050', 'T033', 'T037', 'T048', 'T191', 'T046', 'T184']
    sem_drugs = ['T116', 'T195', 'T123', 'T122', 'T103', 'T120', 'T104', 'T200', 'T196', 'T126', 'T131', 'T125', 'T129', 'T130', 'T197', 'T114', 'T109', 'T121', 'T192', 'T127']
    sem_dis_drug = sem_diseases + sem_drugs
    data_dir = args.quickumls_path
    matcher = QuickUMLS(quickumls_fp=data_dir, accepted_semtypes=sem_dis_drug)

    ############## PROCESS ##############
    path_in = Path(args.path_in)
    path_out = Path(args.path_out)
    path_out.mkdir(exist_ok=True, parents=True)

    # Sorted so batches are processed (and logged) in a reproducible order.
    for file in sorted(path_in.glob('*.pkl')):
        batch = process_batch(file, args.id_col)
        batch_name = file.stem

        # The same matcher (accepting both semantic-type groups) is applied to
        # the disease texts and to the drug texts of every record.
        batch['disease_cuis'] = batch['ent_text_disease'].apply(apply_QuickUMLS, args=(matcher,))
        batch['drug_cuis'] = batch['ent_text_drug'].apply(apply_QuickUMLS, args=(matcher,))

        # Output file keeps the input batch's name inside the output directory.
        batch.to_pickle(path_out / f"{batch_name}.pkl")
        print(f"{batch_name} is processed and saved.")