preprocessor.py
import subprocess
import sys
import copy
from nltk.stem.wordnet import WordNetLemmatizer
import dictionary_reader as dr
import cPickle as pickle
import spell_checker as s
import fnmatch
import os
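# Path to the ARK TweetNLP tagger distribution (runTagger.sh lives here) and the
# directory intended for preprocessed output (PREPROCESSED_PATH is not used in this script).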
POSTAGGER_PATH = './ark-tweet-nlp-0.3.2'
PREPROCESSED_PATH = './preprocessed'
lmtzr = WordNetLemmatizer()
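# Pipeline (driven from __main__): clean each raw tweet dump into <file>.clean,
# POS-tag it with the ARK tagger, spell-check the tokens, lemmatize/normalize,
# and pickle the resulting [tokens, pos] pairs to <file>.pkl.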
def recursive_glob(rootdir='.', pattern='*'):
    """Return all files under rootdir whose names match the glob pattern."""
    return [os.path.join(dirpath, filename)
            for dirpath, dirnames, filenames in os.walk(rootdir)
            for filename in filenames
            if fnmatch.fnmatch(filename, pattern)]
def clean_bin(filename):
    """Write the raw tweet text of a dump file to <filename>.clean, one tweet per line."""
    out = open(filename+'.clean', 'w+')
    print 'cleaning ', filename
    with open(filename, 'r') as f:
        for line in f:
            l = line.rsplit(':')
            # take the field after the first ':' and drop its 12-character prefix;
            # append a newline so the POS tagger sees one tweet per line
            out.write(l[1].strip()[12:] + '\n')
    out.close()
    print 'done cleaning ', filename
def pos_tag(filename):
    """
    Return a list of pos-tagged tweets in the format of [tokens, pos]
    """
    print 'pos tagging ', filename
    l = []
    # run the ARK TweetNLP tagger over the cleaned file and capture its stdout
    output = subprocess.Popen([POSTAGGER_PATH+'/runTagger.sh', filename], stdout=subprocess.PIPE).communicate()[0]
    #output = subprocess.check_output(POSTAGGER_PATH+'/runTagger.sh '+filename,
    #                                 shell=True)
    for tweet in output.split('\n'):
        if not tweet.strip():
            continue
        # each tagger output line is tab-separated: tokens, then the matching POS tags
        elements = tweet.split('\t')
        if len(elements) < 2:
            print 'skipping malformed tagger output:', elements
            continue
        tokens = elements[0].split()
        pos = elements[1].split()
        l.append([tokens, pos])
    print 'done pos tagging ', filename
    return l
def spell_check(l):
    """
    Return a list of spell-checked, pos-tagged tweets in the format of [tokens, pos]
    """
    print 'spellchecking'
    ret = []
    for tweet in l:
        # correct each token with the spell checker; the POS tags are kept as-is
        tokens = []
        for t in tweet[0]:
            tokens.append(s.correct(t))
        ret.append([tokens, tweet[1]])
    print 'done spellchecking'
    return ret
def reduce_form(l):
    """
    Strip '#'/'@' prefixes, lowercase, and lemmatize tokens that map to a WordNet POS
    """
    print 'reducing'
    ret = copy.deepcopy(l)
    for tok, pos in ret:
        for idx, val in enumerate(tok):
            # drop a leading '@' (handle) or '#' (hashtag) but keep the word itself
            if (val[0] == '@' or val[0] == '#') and len(val) > 1:
                val = val[1:]
            tok[idx] = val.lower()
            # lemmatize only when the Twitter POS tag maps to a WordNet POS
            tag = pos[idx]
            if tag in dr.twitter2wordnet_tbl:
                tag = dr.twitter2wordnet_tbl[tag]
            else:
                continue
            tok[idx] = lmtzr.lemmatize(val, tag).lower()
    print 'done reducing'
    return ret
if __name__ == "__main__":
    if len(sys.argv) == 1:
        print 'usage: python preprocessor.py folder1 [folder2 ...]'
        sys.exit(0)
    args = sys.argv[1:]
    # process every tweet file in the given folders
    for fi in args:
        # keep only extension-less raw files, skipping generated .clean/.pkl output
        files = [x for x in recursive_glob(fi) if '.' not in os.path.basename(x)]
        for filename in files:
            clean_bin(filename)
            l = pos_tag(filename+'.clean')
            l = spell_check(l)
            l = reduce_form(l)
            pickle.dump(l, open(filename+'.pkl', 'wb'))
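# Example invocation (the folder name is illustrative):
#   python preprocessor.py tweets
# For each extension-less file found under the folder, this writes
# <file>.clean and <file>.pkl alongside the original dump.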