forked from robert-jm/twit-ranker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyzer.py
127 lines (114 loc) · 3.09 KB
/
analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import dictionary_reader as dr
import sentiwordnet as senti
import numpy
import operator
from collections import defaultdict
class Analyzer:
    """Sentiment and emotion analysis over preprocessed, tokenized tweets.

    Combines three lexical resources (all loaded in __init__):
      - dr.BigDictionary: a polarity cue dictionary keyed by (token, tag)
      - senti.SentiWordNetCorpusReader: SentiWordNet scores ('swn3.txt')
      - dr.AffectDictionary: token -> emotion lookup

    Class-level counters pos_cnt / neg_cnt accumulate token/cue
    co-occurrence counts across *all* instances; populate them with
    count() over the whole corpus before calling pmi().
    """

    # Token -> number of co-occurrences with positive / negative cues.
    # Shared across instances on purpose (corpus-wide statistics).
    pos_cnt = defaultdict(int)
    neg_cnt = defaultdict(int)

    def __init__(self):
        self.big = dr.BigDictionary()
        self.swn = senti.SentiWordNetCorpusReader('swn3.txt')
        self.affect = dr.AffectDictionary()

    def get_polarity_score(self, tweet, pos):
        """Aggregate SentiWordNet polarity over the tweet.

        input:
            tweet: the preprocessed tweet text (list of tokens)
            pos:   the POS tag for each token (parallel to tweet)
        output:
            an aggregate polarity score (sum of signed sense scores);
            positive values lean positive, negative lean negative
        """
        score = 0
        for idx, token in enumerate(tweet):
            # Skip tokens whose Twitter tag has no WordNet equivalent.
            if pos[idx] not in dr.twitter2wordnet_tbl:
                continue
            wn_tag = dr.twitter2wordnet_tbl[pos[idx]]
            synsets = self.swn.senti_synsets(token, wn_tag)
            if len(synsets) == 0:
                continue
            # Use only the first (most frequent) sense.
            sense = synsets[0]
            # Mostly-objective senses carry no useful polarity signal.
            if sense.obj_score >= 0.5:
                continue
            if sense.pos_score > sense.neg_score:
                score += sense.pos_score
            else:
                score += -1 * sense.neg_score
        return score

    def count_cue(self, tweet, pos, threshold=0):
        """Count positive and negative sentiment cues in the tweet.

        input:
            tweet: the preprocessed tweet text (list of tokens)
            pos:   the POS tag for each token (parallel to tweet)
            threshold: kept for interface compatibility; currently
                unused (the thresholded +1/-1/0 decision was disabled
                in favor of returning the raw counts)
        output:
            (posi, neg) tuple of positive and negative cue counts;
            a cue scoring 2 ("both") increments both counters
        """
        posi, neg = 0, 0
        for idx, token in enumerate(tweet):
            if pos[idx] not in dr.twitter2mpqa_tbl:
                tag = 'E'  # assume it's emoticon
            else:
                tag = dr.twitter2mpqa_tbl[pos[idx]]
            sent = self.big.lookup(token, tag)
            # lookup convention: <0 negative, 1 positive, >1 both.
            if sent < 0:
                neg += 1
            elif sent == 1:
                posi += 1
            elif sent > 1:
                posi += 1
                neg += 1
        return (posi, neg)

    def get_emotion(self, tweet):
        """Histogram of emotions evoked by the tweet's tokens.

        input:
            tweet: the preprocessed tweet text (list of tokens)
        output:
            defaultdict mapping emotion label (e.g. joy, disgust,
            anger, fear, sadness, surprise) -> occurrence count
        """
        emo = defaultdict(int)
        for token in tweet:
            t = self.affect.lookup(token)
            # NOTE(review): branching assumes lookup() returns either a
            # length-1 list of (word, emotion) pairs or a bare
            # (word, emotion) tuple (len 2) — confirm against
            # dr.AffectDictionary; an empty result would raise here.
            if len(t) == 1:
                for e in t:
                    emo[e[1]] += 1
            else:
                emo[t[1]] += 1
        return emo

    def pmi(self, tweet):
        """Compute the average semantic orientation (SO) of the tweet.

        Requires count() to have been run over the corpus first.
        Returns 0.0 for an empty tweet instead of dividing by zero.
        """
        if not tweet:
            return 0.0
        so = 0
        for token in tweet:
            # 0.01 smoothing avoids log(0) / division by zero for
            # tokens never seen with a cue.
            so += numpy.log(
                (0.01 + Analyzer.pos_cnt[token]) / (0.01 + Analyzer.neg_cnt[token]))
        return so / len(tweet)

    def count(self, tweet, pos):
        """Accumulate token/cue co-occurrence counts into the class
        counters.

        Counts, for each token, how many tokens within a NEAR window
        are positive / negative cues. Results go into
        Analyzer.pos_cnt / Analyzer.neg_cnt; run this on all tweets
        before using pmi().
        """
        for idx, token in enumerate(tweet):
            nbr = self._near(tweet, idx)
            # NOTE(review): neighbors are looked up with the POS tag of
            # the *center* token (pos[idx]), not their own tag — verify
            # this is intentional.
            Analyzer.pos_cnt[token] += len(
                [x for x in nbr if self.big.lookup(x, pos[idx]) == 1])
            Analyzer.neg_cnt[token] += len(
                [x for x in nbr if self.big.lookup(x, pos[idx]) == -1])
            # Cues marked 2 ("both") count toward both polarities.
            both = len([x for x in nbr if self.big.lookup(x, pos[idx]) == 2])
            Analyzer.pos_cnt[token] += both
            Analyzer.neg_cnt[token] += both

    def reset_cnt(self):
        """Clear the corpus-wide co-occurrence counters.

        Fixed: the original assigned to *local* variables named
        pos_cnt/neg_cnt, leaving the class attributes untouched
        (a silent no-op). Rebind the class attributes instead.
        """
        Analyzer.pos_cnt = defaultdict(int)
        Analyzer.neg_cnt = defaultdict(int)

    def _near(self, a, tok_idx, n=10):
        """The NEAR operator: up to n tokens on each side of a[tok_idx]
        (the token itself excluded). `a` is a list of tokens.
        """
        before = a[:tok_idx][-n:]
        after = a[tok_idx:][1:n + 1]
        return before + after