-
Notifications
You must be signed in to change notification settings - Fork 0
/
jobimtext_handler.py
122 lines (104 loc) · 5.24 KB
/
jobimtext_handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import requests
import logging
import time
# added this part for #PRP tag experiment
# pronouns = {'i', 'me', 'myself', 'we', 'us', 'ourselves', 'he', 'him', 'himself', 'she', 'her', 'herself',
# 'it', 'itself', 'they', 'them', 'themselves', 'you', 'yourself'}
# for a query mention and cluster type, returns a response from jobimtext api
# -- http://ltmaggie.informatik.uni-hamburg.de/jobimviz/#
# http://ltmaggie.informatik.uni-hamburg.de/jobimtext/jobimviz-web-demo/api-and-demo-documentation/#API
# -- term operations
# JoBimText API endpoint for sense clusters -- see
# http://ltmaggie.informatik.uni-hamburg.de/jobimviz/#
# http://ltmaggie.informatik.uni-hamburg.de/jobimtext/jobimviz-web-demo/api-and-demo-documentation/#API
_JOBIMTEXT_SENSES_URL = 'http://ltmaggie.informatik.uni-hamburg.de/jobimviz/ws/api/ccDepNEMWE/jo/senses/'


def _build_sense_url(mention, cluster_type):
    """Build the JoBimText senses URL for *mention*.

    Multi-word mentions are tagged as MWE, single words as NN
    (%23 is the url-encoded '#' separating term and POS tag).
    A %23PRP variant for pronouns was tried in an earlier experiment
    and removed.
    """
    tag = '%23MWE' if len(mention.split()) > 1 else '%23NN'
    return _JOBIMTEXT_SENSES_URL + mention + tag + '?sensetype=CW(' + cluster_type + ')&format=json'


def get_senses_from_jobimtext(mention, cluster_type):
    """Query the JoBimText API for the sense clusters of a query mention.

    Args:
        mention: the mention string to look up.
        cluster_type: one of the 3 clustering options offered by JoBimText.

    Returns:
        The 'result' list from the API response, or [] when no usable
        response could be obtained.
    """
    # there are 3 clustering options in JoBimText
    assert cluster_type in ["200,200", "200,50", "50,50"]
    r_url = _build_sense_url(mention, cluster_type)
    response = requests.get(r_url)
    try:
        return response.json()['result']
    except (ValueError, KeyError):
        # ValueError: body was not JSON; KeyError: JSON without a 'result' key
        logging.info('No result from API call of %s --- response status code: %s',
                     mention, response.status_code)
        if response.status_code == 200:
            try:
                logging.info('response json: %s', response.json())
            except ValueError:
                # the body was not JSON at all -- re-parsing it here used to
                # raise out of the handler
                logging.info('response body was not valid json')
        # from logging, I recognized sometimes 502, although everything seems ok.
        # for them, retry up to 10 times until a response is obtained.
        if response.status_code != 200:
            for attempt in range(1, 11):
                logging.info('%d. try for status code: %s', attempt, response.status_code)
                time.sleep(3)
                response = requests.get(r_url)
                try:
                    return response.json()['result']
                except (ValueError, KeyError):
                    logging.info('did not work once more: %s', response.status_code)
        # known failing mentions (dev): "EEC/EC interior and justice ministers",
        # "Catholics"; (test): "the Richard Rodgers/Stephen Sondheim"
        return []
# TODO: the same function as preprocess in preprocess.py -- write them in common utils in one function.
# singularize the label
def postprocess(label, inflect_engine, nlp):
    """Singularize *label* if it is a plural noun.

    Args:
        label: the (possibly multi-word) label string.
        inflect_engine: inflect engine; ``singular_noun`` returns the singular
            form, or a falsy value when the word is already singular.
        nlp: stanza pipeline used for morphological features -- TODO confirm
            it is a stanza Pipeline; only ``.sentences[0].words[-1].feats``
            of the last word is read.

    Returns:
        The singularized label, or the original label unchanged.
    """
    if not label:
        # guard: label[-1] / label.endswith below would be meaningless on ''
        return label
    singular_label = inflect_engine.singular_noun(label)
    if not singular_label:
        # inflect considers the label already singular
        return label
    if label.endswith(('s', 'S')):
        # for the cases like "actress" or "a focus", cross-check with
        # stanza's feats of 'Number=Plur' before trusting inflect;
        # the (expensive) nlp call only happens when its result is used
        feats = nlp(label).sentences[0].words[-1].feats
        if feats and 'Number=Plur' in feats.split('|'):
            return singular_label
        return label
    return singular_label
# since 'thing' is the most common, we thought it is noisy.
def process_jobimtext_result(result, noisy_isas, number_of_cluster_terms=None, number_of_isas=None,
                             apply_postprocess=False, nlp=None, inflect_engine=None, types=None):
    """Extract sense cluster terms and isa labels from one JoBimText result.

    Args:
        result: dict with 'senses' (list of 'term#TAG' strings) and 'isas'
            (list of 'label:frequency' strings, may be empty/None).
        noisy_isas: isa labels to discard (e.g. 'thing' -- since 'thing' is
            the most common isa, we consider it noisy).
        number_of_cluster_terms: keep at most this many cluster terms (falsy = all).
        number_of_isas: keep at most this many isa labels (falsy = all).
        apply_postprocess: if True, singularize / underscore / lowercase each
            isa label (requires nlp and inflect_engine).
        nlp: stanza pipeline, forwarded to postprocess.
        inflect_engine: inflect engine, forwarded to postprocess.
        types: optional whitelist set; when given, only postprocessed labels
            contained in it are kept.

    Returns:
        (senses, isas) -- two lists of strings.
    """
    # take only terms (drop the '#TAG' info), skip terms starting with '^',
    # and deduplicate while preserving order (dict.fromkeys is O(n); the
    # previous 'if i not in pruned' loop was O(n^2))
    senses = list(dict.fromkeys(
        s.split('#')[0] for s in result['senses'] if not s.startswith('^')))
    # limiting the term number
    if number_of_cluster_terms:
        senses = senses[:number_of_cluster_terms]
    isas = result['isas']
    if isas:
        # take only isas words (drop ':frequency'), filter noisy labels
        isas = [i.split(':')[0] for i in isas if i.split(':')[0] not in noisy_isas]
        if apply_postprocess and isas:
            isas_new = []
            seen = set()  # O(1) membership instead of scanning isas_new
            for label in isas:
                # 1-step: singularize
                singular = postprocess(label=label, inflect_engine=inflect_engine, nlp=nlp)
                # 2-step: if the label is multi token, put underscore
                # 3-step: lowerize the label
                normalized = singular.replace(' ', '_').lower()
                if normalized in seen or normalized in noisy_isas:
                    continue
                # 4-step: apply the optional type whitelist here, since the
                # number of isas is limited below
                if types and normalized not in types:
                    continue
                seen.add(normalized)
                isas_new.append(normalized)
            isas = isas_new
    # limiting the isas number (guard: result['isas'] may be None, and
    # None[:n] would raise TypeError)
    if number_of_isas and isas:
        isas = isas[:number_of_isas]
    return senses, isas