# tool_print_stats_on_corpus.py
import settings
import vozbase
import voz
import styhelper
import logging
import csv
import re
import quotedspeechhelper
import os
import networkcachemanager
import collections
# Module-level logger, named after this module, used by the functions below.
logger = logging.getLogger(__name__)
def main():
    """Script entry point: print the aggregated document statistics."""
    # The annotation-file consistency check is available but currently disabled:
    #check_tsv_csv_annotation_files()
    get_docs_stats()
# Sentence type codes accepted in column 2 of a sentences.tsv.csv row.
# A set is used so that only exact one-character codes match; a plain
# `in 'cdnqsp'` substring test also accepts '' and runs such as 'cd'.
_SENTENCE_TYPES = frozenset('cdnqsp')


def _read_tsv_rows(path):
    """Return the lines of `path` split on tabs, closing the file afterwards.

    Lines are split on '\\t' only, so the last column of every row keeps its
    trailing newline.
    """
    with open(path) as tsv_file:
        return [line.split('\t') for line in tsv_file]


def _check_speakers_and_listeners(sentence, sty_file_id, entities_dict, stats):
    """Check that the speakers (column 3) and listeners (column 4) are known entities."""
    if not sentence[3].strip():
        logger.warning("no entity/" + str(sentence))
        stats['E_no_entity'] += 1
    # As in the original, an empty speaker column is also looked up (as '')
    # and therefore additionally counted as a bad speaker.
    for speaker in sentence[3].split(','):
        key = (sty_file_id, speaker.strip())
        if key not in entities_dict:
            logger.warning("speaker not found/" + str(key))
            stats['E_bad_speaker'] += 1
    for listener in sentence[4].split(','):
        if not listener:
            continue
        key = (sty_file_id, listener.strip())
        if key not in entities_dict:
            logger.warning("listener not found/" + str(key))
            stats['E_bad_listener'] += 1


def _neighbour_breaks_attachment(neighbour, edge):
    """Tell whether `neighbour` is an invalid target for an attachment.

    `edge` is the index of the character of the neighbour's text that has to
    be a double quote: -1 for the preceding row, 0 for the following row.
    """
    if not neighbour:
        # No row on that side (start or end of the story).
        return True
    if len(neighbour) < 7:
        # Truncated rows are reported separately as E_sentences_short.
        return False
    # The type code lives in column 2; column 3 holds the speakers, so the
    # original comparison of column 3 against 'd' could practically never match.
    # NOTE(review): rows of other types are not flagged, as before - confirm.
    if neighbour[2] != 'd':
        return False
    text = neighbour[6].strip()
    return not text or text[edge] != '"'


def _check_attachments(sentences, sentence_i, stats):
    """Validate the attachment column (column 5) of row `sentence_i`."""
    sentence = sentences[sentence_i]
    attachments = sentence[5]
    # '<' refers to the preceding row, '>' to the following one; both may be
    # present, so every referenced side is checked.
    sides = []
    if '<' in attachments:
        sides.append((sentence_i - 1, -1))
    if '>' in attachments:
        sides.append((sentence_i + 1, 0))
    for neighbour_i, edge in sides:
        # Bounds are tested explicitly: index -1 would silently wrap around to
        # the last row and len(sentences) would raise IndexError.
        if 0 <= neighbour_i < len(sentences):
            neighbour = sentences[neighbour_i]
        else:
            neighbour = None
        if _neighbour_breaks_attachment(neighbour, edge):
            logger.warning("bad dialog attachemt/" + str(sentence) + '/' + str(neighbour))
            stats['E_bad_attachment'] += 1
    for attachment in attachments.split(';'):
        if not attachment.strip():
            continue
        if '<' in attachment:
            delim = attachment.index('<')
        elif '>' in attachment:
            delim = attachment.index('>')
        else:
            # No direction marker: report it instead of raising ValueError.
            logger.warning("bad dialog attachemt/" + str(sentence) + '/' + str(attachment))
            stats['E_bad_attachment'] += 1
            continue
        for participant in attachment[(delim + 1):].split('/'):
            # Every participant has to occur literally in the row's column 6.
            if participant not in sentence[6]:
                logger.warning("participant not found/" + str(participant) + '/' + str(sentence))
                stats['E_bad_participant'] += 1


def check_tsv_csv_annotation_files():
    """Check the corpus annotation TSV files and print a table of counters.

    Reads `all_coreferenced-entities.tsv.csv` under settings.STY_FILE_PATH and,
    for every entry of settings.STY_FILES, its `sentences.tsv.csv`. Problems
    are logged as warnings and counted under keys starting with `E_`. The
    entity lookup table and the sorted counters are printed to stdout; nothing
    is returned.
    """
    logging.basicConfig(level=logging.ERROR)
    file_path = settings.STY_FILE_PATH
    corpus_annotation_stats = collections.defaultdict(int)

    entities = _read_tsv_rows(file_path + '/all_coreferenced-entities.tsv.csv')
    entities_wrong = [row for row in entities if len(row) < 6]
    if entities_wrong:
        corpus_annotation_stats['E_entities_short'] = len(entities_wrong)
    corpus_annotation_stats['coref groups'] = len(entities)
    # Entities are keyed by (column 0, column 3); rows too short to have a
    # column 3 are skipped here rather than raising IndexError.
    entities_dict = dict(
        ((row[0], row[3]), row) for row in entities if len(row) > 3 and row[3])
    corpus_annotation_stats['entities'] = len(entities_dict)
    # Single-argument print(...) behaves identically as a Python 2 statement.
    print(entities_dict)
    #return
    for sty_file in settings.STY_FILES:
        sty_file_path = sty_file.split()[0]
        sty_file_id = sty_file_path.lstrip('0')
        corpus_annotation_stats['stories'] += 1
        logger.info("Processing %s" % sty_file)
        sentences = _read_tsv_rows(file_path + sty_file_path + "/sentences.tsv.csv")
        for sentence_i, sentence in enumerate(sentences):
            corpus_annotation_stats['sentences'] += 1
            if len(sentence) < 7:
                logger.warning("too short/" + sty_file_id + "/" + str(sentence))
                corpus_annotation_stats['E_sentences_short'] += 1
                continue
            if sentence[2] not in _SENTENCE_TYPES:
                logger.warning("wrong sentence/" + sty_file_id + "/" + str(sentence))
                corpus_annotation_stats['E_sentences_wrong'] += 1
                continue
            corpus_annotation_stats['sentences_%s' % sentence[2]] += 1
            # NOTE(review): the indentation of the original was lost. Speaker
            # and listener checks are assumed to apply to 'd' rows only and
            # attachment checks to every row with a non-empty column 5 -
            # confirm against the annotation scheme.
            if sentence[2] == 'd':
                _check_speakers_and_listeners(
                    sentence, sty_file_id, entities_dict, corpus_annotation_stats)
            if sentence[5]:
                _check_attachments(sentences, sentence_i, corpus_annotation_stats)
        #break
    for k, v in sorted(corpus_annotation_stats.items()):
        print(k.ljust(20) + "\t" + str(v).rjust(5))
def _accumulate_stats(totals, doc_stats):
    """Add `doc_stats` element-wise onto `totals`, in place."""
    for i in range(len(totals)):
        totals[i] += doc_stats[i]


def get_docs_stats():
    """Print summed and per-document average statistics for the corpus.

    Every entry of settings.STY_FILES is loaded with
    styhelper.create_document_from_sty_file; its statistics are taken before
    and after quotedspeechhelper annotates the sentences and cleans the quoted
    speech. Only the "after" figures are printed: first the sums ("Counts"),
    then the sums divided by the number of documents ("Averages"). Nothing is
    returned.
    """
    # Order kept from the original: the root level is set first and
    # basicConfig is called afterwards with a different level.
    logging.root.setLevel(logging.ERROR)
    acumm_stats_before = [0 for _ in voz.Document.get_stats_labels()]
    acumm_stats_after = list(acumm_stats_before)
    acumm_count = 0
    logging.basicConfig(level=logging.WARNING)
    file_path = settings.STY_FILE_PATH
    for sty_file in settings.STY_FILES:
        acumm_count += 1
        logger.info("Processing %s" % sty_file)
        doc = styhelper.create_document_from_sty_file(file_path + sty_file)
        # The "before" totals only feed the disabled prints below.
        _accumulate_stats(acumm_stats_before, doc.get_stats())
        quoted_speech_file = sty_file.split()[0] + "/sentences.tsv.csv"
        quotedspeechhelper.annotate_sentences(doc, file_path + quoted_speech_file)
        quotedspeechhelper.clean_quoted_speech_from_document(doc)
        _accumulate_stats(acumm_stats_after, doc.get_stats())
        #break
    # Single-argument print(...) behaves identically as a Python 2 statement.
    print("Counts")
    #print voz.Document.format_stats(acumm_stats_before)
    print(voz.Document.format_stats(acumm_stats_after))
    print("Averages")
    if acumm_count:
        averages = [1.0 * total / acumm_count for total in acumm_stats_after]
    else:
        # An empty corpus has no meaningful average; report zeros instead of
        # failing with ZeroDivisionError.
        averages = [0.0 for _ in acumm_stats_after]
    #print voz.Document.format_stats(acumm_stats_before)
    print(voz.Document.format_stats(averages))
# Run the report only when the file is executed as a script, not on import.
if __name__=='__main__':
    main()