KeywordAnalysis.py
from nltk import word_tokenize, pos_tag, RegexpParser, Tree
import requests
from collections import Counter
import itertools
import re
def text_grab(pmc):
    # Fetch a MEDLINE-formatted record for the given PMC ID from NCBI efetch
    abstract_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id='
    search_abstract = abstract_url + pmc + '&retmode=report_type&rettype=medline&api_key=9893ad891eedcd3802a273ea252798721e08'
    r2 = requests.get(search_abstract).text
    # Slice out the abstract: skip the 6-character "AB  - " field tag and stop
    # at the first-author ("FAU") field that follows it
    abstract = r2[r2.find('AB') + 6:r2.find('FAU')]
    return abstract
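# A hedged alternative sketch, not part of the original file: MEDLINE-style records
# put the abstract under the "AB  - " field tag, so it can also be pulled out with a
# regular expression (covered by the `re` import above) rather than fixed string
# offsets. The helper name extract_abstract_field is introduced here for illustration only.
def extract_abstract_field(medline_text):
    # Capture everything after "AB  - " up to the next field tag at the start of a
    # line (e.g. "FAU - ") or the end of the record.
    match = re.search(r'^AB\s+-\s+(.*?)(?=^[A-Z]{2,4}\s*-\s|\Z)', medline_text,
                      flags=re.S | re.M)
    return match.group(1).strip() if match else ''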
def get_continuous_chunks(article_text, query):
    # Extract candidate keyword phrases (adjective + noun chunks) from article text,
    # filtering out generic methods/statistics vocabulary and the query term itself
    stopwords_s = ['et', 'al.', 'deviation', 'windowFigure', 'windowFig', 'difference',
                   'additional', 'data', 'file', 'distribution', 'significant',
                   'clinical', 'kg', 'adverse', 'sample', 'studies', 'significance',
                   'window', 't-test', 'supplementary', 'important', 'experimental',
                   'study', 'subject', 'condition', 'experiment', 'control', 'panel',
                   'outcome', 'response', 'standardized', 'publisher', 'abstract',
                   'model', 'event', 'aversive', 'stimulus', 'training', 'risk',
                   'impact', 'article', 'patient', 'adult', 'themes', 'concentration',
                   'participant', 'dose', 'vs', 'normalized', 'mean', 'finding',
                   'incidental', 'typical', 'other', 'current', 'different', 'minimum',
                   'initial', 'systematic', 'quality', 'related', 'preclinical', 'technical',
                   'pubic', 'stereotyped', query.lower()]
    # Include plural forms of every stopword as well
    stopwords_p = [i + 's' for i in stopwords_s]
    stopwords = stopwords_s + stopwords_p
    token_words = word_tokenize(article_text)
    words = [w.strip() for w in token_words]
    filtered_text = [w for w in words if (w.lower() not in stopwords) and (len(w) > 1)]
    processed_text = pos_tag(filtered_text)
    # Chunk grammar: an adjective followed by one or more nouns
    # (NLTK tags: JJ/JJR/JJS = adjectives, NN/NNS/NNP = nouns)
    chunk_gram = r"Chunk: {<JJ.?><NN.?>+}"
    chunk_parser = RegexpParser(chunk_gram)
    chunked = chunk_parser.parse(processed_text)
    keywords = [i.leaves() for i in chunked if type(i) == Tree]
    # NLTK returns each chunk as a list of (word, pos_tag) tuples,
    # e.g. [('novel', 'JJ'), ('biomarker', 'NN')]; join the words back into phrases
    newkeys = []
    for chunk in keywords:
        if len(chunk) < 2:
            newkeys.append(chunk[0][0])
        else:
            temp = []
            for pair in chunk:
                temp.append(pair[0])
            newkeys.append(" ".join(temp))
    # Keep the 12 most frequent candidate phrases
    counted = Counter(newkeys).most_common(12)
    # Discard singular/plural variants of the query term itself
    keywords = [i[0] for i in counted if (i[0] + 's' != query)
                and (i[0] != query + 's')]
    # Drop near-duplicate keywords where one phrase is contained in the other;
    # check membership first so a phrase is not removed twice
    for a, b in itertools.combinations(list(keywords), 2):
        if b in keywords and ((a.lower() in b.lower()) or (b.lower() in a.lower())):
            keywords.remove(b)
    return keywords
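# A minimal usage sketch, not part of the original file: it assumes the API key
# embedded in text_grab is valid, and the PMC ID and query term below are
# hypothetical placeholders used purely for illustration.
if __name__ == "__main__":
    example_pmc = "PMC0000000"   # hypothetical PMC ID
    example_query = "melatonin"  # hypothetical query term
    abstract_text = text_grab(example_pmc)
    print(get_continuous_chunks(abstract_text, example_query))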