-
Notifications
You must be signed in to change notification settings - Fork 2
/
qsahelper.py
134 lines (124 loc) · 5.57 KB
/
qsahelper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import settings
import voz
import logging
import util
from quotedbase import *
import entitymanager,verbmanager
import networkcachemanager
import stanfordhelper
from bs4 import BeautifulSoup,element
logger = logging.getLogger(__name__)
class DummyDocument(object):
    """Minimal stand-in for a voz.Document: exposes only the ``id``
    attribute (fixed to -1) required by Quote objects created before a
    real document exists."""
    id = -1
class QsaFile(object):
    """Loads a QSA-annotated XML story file and converts its annotations
    (PERSON/ORGANIZATION mentions, QUOTE spans, verb tokens) into voz
    objects via :meth:`tokenize`."""
    def __init__(self, path=''):
        """
        :param path: str, filesystem path to the QSA XML file to parse
        """
        logger.info('Processing '+path)
        # FIX: close the file handle deterministically instead of relying
        # on garbage collection of the anonymous file object.
        with open(path, 'rb') as f:
            self.d = BeautifulSoup(f.read(), 'xml')
        self.path = path
        self.story_id = -1
        self.sentences = [] # :type list[voz.Sentence]
        self.tokens = [] # NOTE(review): original annotation said list[voz.Sentence]; presumably a token list — confirm
    def to_document(self,properties=None):
        """
        Build a voz.Document for this file's original text; it is stored in
        ``self.document`` and also returned.
        :param properties: optional dict of extra document properties,
            merged over the defaults (caller values win)
        :return: voz.Document
        """
        # FIX: mutable default argument replaced by a None sentinel.
        properties = dict({'source':'create_document_from_sty_file'}, **(properties or {}))
        # NOTE(review): get_original_text is not defined on this class in this
        # file — presumably provided elsewhere or vestigial; confirm before use.
        str_input = self.get_original_text()
        sentences = []
        self.document = voz.Document(str_input,sentences,properties) #type: voz.Document
        self.document.id = int(self.story_id) if util.is_numeric_int(self.story_id) else properties.get('story_id',-1)
        # FIX: the docstring promised a voz.Document but nothing was returned.
        return self.document
    def tokenize(self):
        """Walk the XML DOM and produce voz annotation objects.

        :return: tuple (output, quotes, mentions, verbs) where ``output`` is
            the in-order stream of Mention/Quote/Verb/Punctuation objects and
            the other three are per-kind lists of the same objects.
        """
        quotes = [] #type: list[Quote]
        mentions = [] #type: list[entitymanager.Mention]
        verbs = [] #type: list[verbmanager.Verb]
        output = []
        mention_id_to_mention = {}
        def consume(tokens,text):
            # Split `tokens` into the prefix covering `text` (by character
            # count after whitespace normalization) and the remainder.
            car = []
            cdr = list(tokens)
            to_consume = len(normalize_string_spacing(text))
            consumed = 0
            while consumed < to_consume:
                token = cdr.pop(0)
                consumed += len(normalize_string_spacing(token.text.encode('utf-8')))
                car.append(token)
            return car,cdr
        def child_as_text(child):
            # Uniform text extraction for both bare strings and tags.
            if isinstance(child, element.NavigableString):
                return unicode(child)
            else:
                return unicode(child.getText())
        def create_mention(child):
            # Register a character mention keyed by its XML `id` attribute.
            m = entitymanager.Mention(-1, [], is_independent=True)
            id_ = child.attrs.get('id', None)
            mention_id_to_mention[id_] = m
            m.annotations.character = True
            m.annotations.coref = child.attrs.get('entity', None)
            m.add_tag(entitymanager.TaggableContainer.TAG_CHARACTER_SYMBOL, m.annotations.coref)
            if not m.get_most_likely_symbol():
                logger.error("No symbol for " + m.annotations.coref)
            m.type = child.attrs.get('gender', None)
            m.role = None
            m.split_ignore = False
        for child in self.d.select('PERSON'):
            create_mention(child)
        for child in self.d.select('ORGANIZATION'):
            create_mention(child)
        for p in self.d.select('DOC')[0].select('PARAGRAPH'):
            # getText() on the paragraph would drop separators in cases like
            # <PERSON>the bride</PERSON>-people, so join children manually.
            text = ' '.join([child_as_text(child) for child in p.children])
            tokens = stanfordhelper.tokenized_string(unicode(text))
            for child in p.children:
                # Advance through the Stanford token stream in lockstep with
                # the DOM children so each annotation covers its own tokens.
                car, tokens = consume(tokens, child_as_text(child))
                if child.name in ['PERSON','ORGANIZATION']:
                    id_ = child.attrs.get('id', None)
                    m = mention_id_to_mention[id_]
                    mentions.append(m)
                    output.append(m)
                elif child.name =='QUOTE':
                    # Note, wrong annotations:
                    # in austen_emma_1.xml: <PARAGRAPH parnum="13"><QUOTE id="0">"Poor
                    q = Quote(-1,-1,DummyDocument())
                    q._text = child_as_text(child).encode('ascii','ignore')
                    speaker_symbol = child.attrs.get('speaker', None)
                    if speaker_symbol and speaker_symbol != 'none':
                        # Resolve the speaker mention id to its coref symbol.
                        speaker_symbol = mention_id_to_mention[speaker_symbol].annotations.coref
                    else:
                        speaker_symbol = None
                    q.annotations = voz.SentenceLevelQuotedAnnotations(-1, -1, 'd', speaker_symbol)
                    # NOTE(review): endp is assigned the quote text with the
                    # surrounding double quotes stripped, not an end position —
                    # looks suspicious; confirm against Quote's contract.
                    q.endp = q._text.strip('"')
                    quotes.append(q)
                    output.append(q)
                else:
                    # Plain narration: surface verbs and sentence-ending
                    # punctuation from the consumed tokens.
                    for token in car:
                        if token.pos in stanfordhelper.VERB_POS:
                            verb = verbmanager.Verb(-1,-1,-1,token,None,{})
                            verbs.append(verb)
                            output.append(verb)
                        if token.pos[-1] in ['.', ':']:
                            output.append(Punctuation(token.pos[-1]))
        return output, quotes, mentions, verbs
def tokenized_string_from_qsa_file(qsa_file):
    """Parse *qsa_file* and return its (output, quotes, mentions, verbs) tuple."""
    parsed = QsaFile(qsa_file)
    return parsed.tokenize()
def main():
logging.basicConfig(level=logging.DEBUG)
file_path = settings.QSA_FILE_PATH
story_file = settings.QSA_FILES[0]
story_file = 'chekhov_lady.xml'
#story_file = 'doyle_boscombe.xml'
t = tokenized_string_from_qsa_file(file_path+story_file) #type: voz.Document
output, quotes, mentions, verbs = t
print tokenized_string_to_string(output,2)
if __name__ == '__main__':
main()