-
Notifications
You must be signed in to change notification settings - Fork 15
/
get_pmid.py
121 lines (109 loc) · 3.9 KB
/
get_pmid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import csv
import sys
import json
import collections
from Bio import Entrez
from Bio import Medline
from nltk.metrics import edit_distance
from pubmedpy import rank_by_edit_distance
# Module configuration: command-line paths, the Entrez contact address, the
# running row counter, and the column-name / error-message lookup tables.
INPATH = sys.argv[1]   # first CLI argument; opened for reading by main()
OUTPATH = sys.argv[2]  # second CLI argument; opened for writing by main()

# NOTE(review): this literal looks like an obfuscation placeholder rather than
# a real mailbox -- confirm the intended contact address.
Entrez.email = "[email protected]"

# Progress counter: main() prints it and then increments it once per row.
rowCnt = 1

# Logical field name -> column header. main() uses these both to read the
# input rows and to name the output columns.
headers = {
    "Error": "error_code",
    "PMID": "pmid",
    "CitationId": "citation_id",
    "Title": "title",
    "Prediction": "predicted p of being relevant",
    "Hard": "'hard' screening prediction*",
}

# Error code -> message written to the error column of the output.
errors = {
    1: "Pubmed id AND title are absent from the row.",
    2: "No 'Count' key found in PubMed response.",
    3: "rank_by_edit_distance returned None.",
    4: "Title search returned 0 records.",
}
def main():
    """Copy the rows of INPATH to OUTPATH, filling in missing PubMed ids.

    The input is read as a tab-delimited file with a header row. For every
    row that has no pmid, the title is looked up with
    _find_best_pmid_by_title(); a row with neither pmid nor title is written
    with an empty pmid and the errors[1] message. Each output row carries the
    error, pmid, citation id, title, prediction and 'hard' columns named in
    ``headers``. The module-level ``rowCnt`` is printed and incremented after
    each row.
    """
    # rowCnt is rebound below, so it has to be declared global.
    global rowCnt

    fieldnames = [
        headers['Error'],
        headers['PMID'],
        headers['CitationId'],
        headers['Title'],
        headers['Prediction'],
        headers['Hard'],
    ]

    # NOTE(review): 'rU' is the Python 2 universal-newline read mode; it is
    # kept because this script targets Python 2 -- revisit when porting.
    with open(INPATH, 'rU') as infile, open(OUTPATH, 'w') as outfile:
        reader = csv.DictReader(infile, dialect='excel', delimiter='\t')
        writer = csv.DictWriter(outfile, delimiter='\t',
                                fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            pmid = row.get(headers['PMID'])
            # .get() rather than [] so that an input without a title column
            # reaches the "title is missing" branch below instead of raising
            # KeyError.
            title = row.get(headers['Title'])
            error = ""

            if not pmid:
                # Without a pmid the only way to identify the article is a
                # title search.
                if title:
                    lookup = _find_best_pmid_by_title(title)
                    pmid = lookup.pmid
                    error = lookup.error
                else:
                    print("Can't process this row. Pubmed ")
                    print("id AND title is missing.")
                    pmid = ""
                    error = errors[1]

            writer.writerow({
                headers['Error']: error,
                headers['PMID']: pmid,
                headers['CitationId']: row.get(headers['CitationId']),
                headers['Title']: title,
                headers['Prediction']: row.get(headers['Prediction']),
                headers['Hard']: row.get(headers['Hard']),
            })

            # Progress report, then advance the counter.
            print("On row: " + str(rowCnt))
            rowCnt = rowCnt + 1
# Result type of _find_best_pmid_by_title(); built once instead of on every
# call.
_ReturnTuple = collections.namedtuple('ReturnTuple', ['pmid', 'error'])


def _find_best_pmid_by_title(title):
    """Retrieve a pmid from PubMed based on a title string.

    The quoted title is searched in PubMed's title field ("ti"). A single
    hit is returned as-is; when there are several hits the id list is handed
    to rank_by_edit_distance() together with the stripped, lower-cased title
    and the first id of the first ranked entry is returned.

    :title: title string to search for
    :returns: ReturnTuple(pmid, error). ``pmid`` is the matching id, or ""
        when none could be determined. ``error`` is "" on success;
        otherwise it is one of the ``errors`` messages or the exception
        object raised during the search, read or ranking.
    """
    try:
        handle = Entrez.esearch(db="pubmed",
                                term='"{0}"'.format(title),
                                field="ti")
        try:
            # Reading can fail just like the search itself, so it lives in
            # the same try block and is reported as a per-title error.
            records = Entrez.read(handle)
        finally:
            handle.close()
    except Exception as e:
        return _ReturnTuple("", e)

    record_count = records.get("Count")
    if not record_count:
        return _ReturnTuple("", errors[2])
    if record_count == "0":
        return _ReturnTuple("", errors[4])

    id_list = records.get("IdList")
    try:
        if record_count == "1":
            return _ReturnTuple(id_list[0], "")

        # Several candidates: let the edit-distance ranking pick the best.
        sorted_pmids = rank_by_edit_distance(title.strip().lower(), id_list)
        if not sorted_pmids:
            return _ReturnTuple("", errors[3])
        return _ReturnTuple(sorted_pmids[0][0], "")
    except Exception as e:
        # A missing/empty IdList or a failure inside the ranking is reported
        # for this title only rather than aborting the caller.
        return _ReturnTuple("", e)
# Run the conversion only when the file is executed as a script.
if __name__ == "__main__":
    main()