#!/usr/bin/env python3
"""Simple text preprocessing utilities: cleaning, tokenizing, stopword removal, and lemmatization."""
import re

import nltk


class TokenProcessor:
    def __init__(self):
        # delimiter characters used by reduce_tokens_to_space()
        self.stopwords_fname = "data/stopwords.txt"
        self.splitter = [',', "’", '.', '/', '?', ' ', '(', ')', '"', '*', ':', '-']
        self.load_stopwords(self.stopwords_fname)
    def load_stopwords(self, filename):
        with open(filename, 'r') as f:
            # splitlines() avoids the trailing empty string that split('\n') leaves behind
            self.stopwords = set(f.read().splitlines())
    def process_text(self, text):
        # normalize curly apostrophes so contractions survive the ASCII filter below
        text = text.replace('’', "'")
        text = text.encode('ascii', errors='ignore').decode()
        text = text.lower()
        text = re.sub(r'http\S+', ' ', text)        # strip URLs
        text = re.sub(r'#+', ' ', text)             # strip hash marks
        text = re.sub(r'@[A-Za-z0-9]+', ' ', text)  # strip @mentions
        # expand common contractions before punctuation is stripped
        text = re.sub(r"([a-z]+)'s", r"\1 is", text)
        text = re.sub(r"'ve", " have ", text)
        text = re.sub(r"won't", "will not ", text)
        text = re.sub(r"isn't", "is not ", text)
        text = re.sub(r"can't", "can not ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"'re", " are ", text)
        text = re.sub(r"'d", " would ", text)
        text = re.sub(r"'ll", " will ", text)
        text = re.sub(r'\W', ' ', text)   # replace remaining non-word characters
        text = re.sub(r'\d+', ' ', text)  # drop digits
        text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
        return text.strip()
    def tokenize(self, text):
        """Split the incoming text into tokens (terms)."""
        text = self.process_text(text)
        return re.split(r"\s+", text)
    def remove_stopwords(self, tokens):
        return [token for token in tokens if token not in self.stopwords]

    def remove_shortwords(self, tokens):
        # keep only tokens longer than three characters
        return [token for token in tokens if len(token) > 3]
    def reduce_tokens_to_space(self, text):
        # escape the delimiters so regex metacharacters in the list are treated literally
        regex = "[{}]+".format(re.escape(''.join(self.splitter)))
        return re.sub(regex, " ", text)
    def lemmatize(self, tokens):
        # requires the WordNet corpus: run nltk.download('wordnet') once beforehand
        lemmatizer = nltk.stem.WordNetLemmatizer()
        lemma_list = []
        for token in tokens:
            # try the verb lemma first; if unchanged, fall back to the default
            # (noun) lemma, e.g. "running" -> "run", "cats" -> "cat"
            lemma = lemmatizer.lemmatize(token, 'v')
            if lemma == token:
                lemma = lemmatizer.lemmatize(token)
            lemma_list.append(lemma)
        return lemma_list
    def display(self):
        print("splitter : {}".format(self.splitter))


def main():
    token_processor = TokenProcessor()
    tokens = token_processor.tokenize("hello i am paradox. be awesome stay awesome.")
    tokens = token_processor.remove_stopwords(tokens)
    # tokens = token_processor.remove_shortwords(tokens)
    print(tokens)


if __name__ == "__main__":
    main()
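
# A minimal sketch of the full pipeline, including the lemmatization step that
# main() does not exercise. It assumes data/stopwords.txt exists and the WordNet
# corpus has been fetched with nltk.download('wordnet'); the exact output depends
# on the contents of the stopword list.
#
#   tp = TokenProcessor()
#   tokens = tp.tokenize("The cats aren't running; they're sleeping!")
#   tokens = tp.remove_stopwords(tokens)
#   tokens = tp.lemmatize(tokens)
#   print(tokens)  # e.g. ['cat', 'run', 'sleep'] if pronouns/auxiliaries are stopwords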