-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
75 lines (64 loc) · 2.17 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import nltk
# Import-time side effect: fetch the NLTK 'stopwords' corpus, which
# remove_stop_words() reads through stopwords.words("english").
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
def stem_words(text):
    """Stem every whitespace-separated word of ``text``.

    Each word is passed through NLTK's English ``SnowballStemmer`` and the
    stemmed words are re-joined with single spaces.

    Args:
        text: String whose words should be stemmed.

    Returns:
        A string of the stemmed words separated by single spaces.
    """
    english_stemmer = SnowballStemmer('english')
    return " ".join([english_stemmer.stem(token) for token in text.split()])
def make_lower_case(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered
def remove_stop_words(text):
    """Remove English stop words from ``text``.

    The text is split on whitespace and every token found in NLTK's English
    stop-word list is discarded. The comparison is exact, so callers that
    want case-insensitive filtering must lower-case the text first.

    Args:
        text: String to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    english_stops = set(stopwords.words("english"))
    tokens = text.split()
    kept = filter(lambda token: token not in english_stops, tokens)
    return " ".join(kept)
def decontractions(text):
    """Expand common English contractions in ``text``.

    The substitutions are applied one after another in the order listed:
    the irregular forms come first, followed by the generic suffix rules.
    Matching is case-sensitive.

    Args:
        text: String that may contain contractions such as ``"won't"``.

    Returns:
        The string with every matched contraction expanded.
    """
    expansion_rules = (
        # specific, irregular forms
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"couldn\'t", "could not"),
        (r"shouldn\'t", "should not"),
        (r"wouldn\'t", "would not"),
        # general suffix rules
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for contraction, expansion in expansion_rules:
        text = re.sub(contraction, expansion, text)
    return text
def rem_numbers(text):
    """Delete every digit character from ``text``.

    Only characters matched by the regex ``\\d`` are removed; all other
    characters, including letters and symbols next to the digits, are kept.

    Args:
        text: String to strip digits from.

    Returns:
        The string without any digit characters.
    """
    digit_pattern = re.compile(r'\d')
    return digit_pattern.sub('', text)
def fullstops(text):
    """Insert a space in front of every full stop in ``text``.

    This detaches full stops from the preceding word so that a later
    whitespace split yields ``"."`` as its own token.

    Args:
        text: String to process.

    Returns:
        The string with each ``"."`` replaced by ``" ."``.
    """
    # A literal replacement needs no regex; this also avoids the invalid
    # escape sequence that '\.' forms inside a non-raw string literal.
    return text.replace('.', ' .')
def remove_punctuation(text):
    """Drop space-separated tokens that consist solely of punctuation.

    Only free-standing tokens are removed; punctuation attached to a word
    (for example ``"it's"`` or ``"fine,"``) is left untouched. The full stop
    is not treated as punctuation. Empty tokens produced by consecutive
    spaces are dropped as well.

    Args:
        text: String to clean; it is split on single spaces.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Raw string: the backslash is itself one of the characters to drop, and
    # a non-raw '\,' would be an invalid escape sequence.
    punctuations = frozenset(r'''!()-[]{};:'"\,<>/?@#$%^&*~''')  # full stop is not removed
    kept = [
        word
        for word in text.split(' ')
        # Check each character individually so that tokens such as "!?" are
        # recognised whatever order the marks appear in; all() is also true
        # for the empty token, which is therefore dropped.
        if not all(char in punctuations for char in word)
    ]
    return ' '.join(kept)
def preprocess_message(message):
    """Run the full text-cleaning pipeline on a single message.

    The steps are applied in this fixed order: lower-casing, stop-word
    removal, removal of stand-alone punctuation tokens, contraction
    expansion, full-stop spacing and finally stemming.

    Args:
        message: Raw message string.

    Returns:
        The preprocessed message string.
    """
    pipeline = (
        make_lower_case,
        remove_stop_words,
        remove_punctuation,
        decontractions,
        fullstops,
        stem_words,
    )
    for step in pipeline:
        message = step(message)
    return message
def preprocess_columns(dataframe, column_name, preprocess_funcs):
    """Apply a sequence of element-wise functions to one dataframe column.

    Each function is applied to the column through ``apply`` and the result
    is written back before the next function runs, so the functions are
    chained. The dataframe is modified in place.

    Args:
        dataframe: Object supporting ``dataframe[column_name]`` item
            access and assignment, with a column offering ``apply``.
        column_name: Key of the column to transform.
        preprocess_funcs: Iterable of callables, applied in order.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    for transform in preprocess_funcs:
        transformed = dataframe[column_name].apply(func=transform)
        dataframe[column_name] = transformed
    return dataframe