forked from robert-jm/twit-ranker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiwordnet.py
114 lines (95 loc) · 3.67 KB
/
sentiwordnet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python
"""
Interface to SentiWordNet using the NLTK WordNet classes.
---Chris Potts
"""
import re
import os
import sys
import codecs
try:
from nltk.corpus import wordnet as wn
except ImportError:
sys.stderr.write("Couldn't find an NLTK installation. To get it: http://www.nltk.org/.\n")
sys.exit(2)
######################################################################
class SentiWordNetCorpusReader:
    """Load a SentiWordNet database file and look up sentiment scores.

    The file is parsed once, at construction time, into ``self.db``: a dict
    mapping ``(pos, offset)`` -- a part-of-speech tag and an integer synset
    offset -- to a ``(pos_score, neg_score)`` pair of floats.
    """

    # A line whose first non-blank character is '#' is a comment.
    _COMMENT_RE = re.compile(r"^\s*#")
    # Fields are separated by one or more tab characters.
    _FIELD_SEP_RE = re.compile(r"\t+")

    def __init__(self, filename):
        """
        Argument:
        filename -- the name of the text file containing the
        SentiWordNet database
        """
        self.filename = filename
        self.db = {}
        self.parse_src_file()

    @staticmethod
    def _value_of(attr):
        """Return ``attr()`` if it is callable, otherwise ``attr`` itself.

        Synset ``name``/``pos``/``offset`` are plain attributes in NLTK 2
        but methods in NLTK 3+; this lets the reader work with either.
        """
        return attr() if callable(attr) else attr

    def parse_src_file(self):
        """Populate ``self.db`` from the file named by ``self.filename``.

        Comment lines are skipped. Every other line must hold six
        tab-separated fields (pos, offset, pos_score, neg_score,
        synset_terms, gloss); a line that does not is reported on stderr,
        with its 1-based line number in the file, and skipped. Lines whose
        pos or offset field is empty are ignored silently.
        """
        # The context manager guarantees the handle is closed even if
        # decoding fails part-way through the read.
        with codecs.open(self.filename, "r", "utf8") as src:
            lines = src.read().splitlines()
        for lineno, line in enumerate(lines, 1):
            if self._COMMENT_RE.search(line):
                continue
            fields = [field.strip() for field in self._FIELD_SEP_RE.split(line)]
            try:
                pos, offset, pos_score, neg_score, _terms, _gloss = fields
            except ValueError:
                sys.stderr.write("Line %s formatted incorrectly: %s\n" % (lineno, line))
                # Skip the record: without this the code below would run
                # with values left over from an earlier line (or with
                # unbound names if this is the first data line).
                continue
            if pos and offset:
                self.db[(pos, int(offset))] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """Return the SentiSynset for a synset, or None if it has no scores.

        Arguments are either a ``(pos, offset)`` pair, given as two
        positional arguments, or a single WordNet synset name that is
        resolved through ``wn.synset``.
        """
        key = tuple(vals)
        if key in self.db:
            pos_score, neg_score = self.db[key]
            pos, offset = key
            synset = wn._synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        if len(key) == 2:
            # An unknown (pos, offset) pair is simply "not found"; passing
            # the bare POS tag on to wn.synset() would only raise.
            return None
        synset = wn.synset(vals[0])
        pos = self._value_of(synset.pos)
        offset = self._value_of(synset.offset)
        candidates = [(pos, offset)]
        if pos == "s":
            # WordNet tags satellite adjectives 's'. The NLTK reference
            # reader maps 's' to 'a' before the lookup, so fall back to
            # that key.
            candidates.append(("a", offset))
        for candidate in candidates:
            if candidate in self.db:
                pos_score, neg_score = self.db[candidate]
                return SentiSynset(pos_score, neg_score, synset)
        return None

    def senti_synsets(self, string, pos=None):
        """Return a list of SentiSynsets for every synset of ``string``.

        ``pos`` optionally restricts the WordNet lookup to one part of
        speech. Synsets without an entry in the database are omitted.
        """
        found = []
        for synset in wn.synsets(string, pos):
            senti = self.senti_synset(self._value_of(synset.name))
            if senti:
                found.append(senti)
        return found

    def all_senti_synsets(self):
        """Yield a SentiSynset for every entry in the database."""
        for (pos, offset), (pos_score, neg_score) in self.db.items():
            synset = wn._synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
######################################################################
class SentiSynset:
    """A WordNet synset together with its SentiWordNet scores.

    Attributes:
    pos_score -- the positive score passed to the constructor
    neg_score -- the negative score passed to the constructor
    obj_score -- 1.0 - (pos_score + neg_score)
    synset    -- the wrapped synset object
    """

    def __init__(self, pos_score, neg_score, synset):
        """
        Arguments:
        pos_score -- positive score (a number)
        neg_score -- negative score (a number)
        synset    -- the synset object the scores belong to
        """
        self.pos_score = pos_score
        self.neg_score = neg_score
        self.obj_score = 1.0 - (self.pos_score + self.neg_score)
        self.synset = synset

    def _synset_name(self):
        """Return the synset's name as a string.

        ``Synset.name`` is a plain attribute in NLTK 2 but a method in
        NLTK 3+; accept either so string formatting never sees a method.
        """
        name = self.synset.name
        return name() if callable(name) else name

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        return "%s\tPosScore: %s\tNegScore: %s" % (
            self._synset_name(),
            self.pos_score,
            self.neg_score,
        )

    def __repr__(self):
        """Return the synset's repr prefixed with "Senti"."""
        return "Senti" + repr(self.synset)
######################################################################
if __name__ == "__main__":
    # If run as
    #     python sentiwordnet.py
    # and the file is in this directory, send all of the SentiSynSet
    # name, pos_score, neg_score trios to standard output.
    SWN_FILENAME = "swn3.txt"
    if os.path.exists(SWN_FILENAME):
        swn = SentiWordNetCorpusReader(SWN_FILENAME)
        for senti_synset in swn.all_senti_synsets():
            name = senti_synset.synset.name
            if callable(name):
                # Synset.name is a method in NLTK 3+, an attribute before.
                name = name()
            # A single pre-formatted argument prints the same text under
            # the Python 2 print statement and the Python 3 print function.
            print("%s %s %s" % (name, senti_synset.pos_score, senti_synset.neg_score))