# File: text_preprocessing_transformer.py (162 lines, 145 loc, 7.16 KB)
# Repository forked from h2oai/driverlessai-recipes.
"""Preprocess the text column by stemming, lemmatization and stop word removal"""
import datatable as dt
import numpy as np
import shutil
import os
from zipfile import ZipFile
import filelock
from h2oaicore.transformer_utils import CustomTransformer
from h2oaicore.systemutils import config, remove, user_dir
from h2oaicore.systemutils_more import download
class TextPreprocessingTransformer(CustomTransformer):
    """Transformer to preprocess the text.

    Each value of the single input text column is passed through up to three
    steps, in this order, each controlled by an instance flag set in
    ``__init__``: Porter stemming (``do_stemming``), WordNet lemmatization
    driven by POS tags (``do_lemmatization``) and English stop-word removal
    (``remove_stopwords``).
    """

    _unsupervised = True
    _numeric_output = False
    _is_reproducible = True
    _modules_needed_by_name = ["nltk==3.4.3"]
    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail

    def __init__(self, **kwargs):
        """Set the preprocessing flags and make sure the NLTK resources are usable.

        The NLTK packages are first requested through ``nltk.download`` while
        holding a file lock in the NLTK data directory. Each enabled step is
        then smoke-tested; if NLTK raises ``LookupError`` the corresponding
        zip archives are fetched directly and the step is initialised again.

        Args:
            **kwargs: forwarded unchanged to ``CustomTransformer.__init__``.
        """
        super().__init__(**kwargs)
        self.do_stemming = True  # turn off as needed
        self.do_lemmatization = True  # turn off as needed
        self.remove_stopwords = True  # turn off as needed

        # Function-scope import, as in the original recipe (nltk is listed in
        # _modules_needed_by_name rather than imported at module level).
        import nltk

        nltk_data_path = os.path.join(user_dir(), config.contrib_env_relative_directory, "nltk_data")
        nltk_temp_path = os.path.join(user_dir(), "nltk_data")
        nltk.data.path.append(nltk_data_path)
        os.makedirs(nltk_data_path, exist_ok=True)
        nltk_download_lock_file = os.path.join(nltk_data_path, "nltk.lock")
        # NOTE(review): the scraped source lost its indentation, so the exact
        # extent of the locked region is an assumption - confirm against upstream.
        with filelock.FileLock(nltk_download_lock_file):
            for package in ('stopwords',
                            'punkt',
                            'averaged_perceptron_tagger',
                            'maxent_treebank_pos_tagger',
                            'wordnet',
                            'sonoritysequencing'):
                nltk.download(package, download_dir=nltk_data_path)

        tokenizer_path = os.path.join(nltk_data_path, "tokenizers")
        tagger_path = os.path.join(nltk_data_path, "taggers")
        corpora_path = os.path.join(nltk_data_path, "corpora")

        # download resources for stemming if needed
        if self.do_stemming:
            try:
                self.stemmer = nltk.stem.porter.PorterStemmer()
                self.stemmer.stem("test")
            except LookupError:
                self._fetch_nltk_package(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip",
                    nltk_temp_path, tokenizer_path)
                self.stemmer = nltk.stem.porter.PorterStemmer()
                self.stemmer.stem("test")

        # download resources for lemmatization if needed
        if self.do_lemmatization:
            try:
                from nltk.corpus import wordnet
                self.lemmatizer = nltk.stem.WordNetLemmatizer()
                self.pos_tagger = nltk.pos_tag
                self.lemmatizer.lemmatize("test", wordnet.NOUN)
                # pos_tag expects a list of tokens, same as the call in preprocess()
                self.pos_tagger(["test"])
            except LookupError:
                self._fetch_nltk_package(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
                    nltk_temp_path, tagger_path)
                self._fetch_nltk_package(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
                    nltk_temp_path, tagger_path)
                self._fetch_nltk_package(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip",
                    nltk_temp_path, corpora_path)
                from nltk.corpus import wordnet
                self.lemmatizer = nltk.stem.WordNetLemmatizer()
                self.pos_tagger = nltk.pos_tag
                self.lemmatizer.lemmatize("test", wordnet.NOUN)
                self.pos_tagger(["test"])
            # First letter of the POS tag -> WordNet POS constant; "O" is the
            # fallback entry used by preprocess() for every other tag.
            self.wordnet_map = {"N": wordnet.NOUN,
                                "V": wordnet.VERB,
                                "J": wordnet.ADJ,
                                "R": wordnet.ADV,
                                "O": wordnet.NOUN}

        # download resources for stopwords if needed
        if self.remove_stopwords:
            try:
                self.stopwords = set(nltk.corpus.stopwords.words('english'))
            except LookupError:
                self._fetch_nltk_package(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip",
                    nltk_temp_path, corpora_path)
                self.stopwords = set(nltk.corpus.stopwords.words('english'))

    def _fetch_nltk_package(self, url, temp_dir, dest_dir):
        """Download the zip at ``url`` into ``temp_dir``, extract it into ``dest_dir`` and copy the archive there."""
        os.makedirs(temp_dir, exist_ok=True)
        os.makedirs(dest_dir, exist_ok=True)
        archive = download(url, dest_path=temp_dir)
        self.unzip_file(archive, dest_dir)
        self.atomic_copy(archive, dest_dir)

    def unzip_file(self, src, dst_dir):
        """Extract every member of the zip archive ``src`` into ``dst_dir``."""
        with ZipFile(src, 'r') as zip_ref:
            zip_ref.extractall(dst_dir)

    def atomic_move(self, src, dst):
        """Move ``src`` to ``dst`` with ``shutil.move``, then call ``remove(src)``.

        A ``shutil.Error`` from the move is deliberately ignored (best effort);
        the trailing ``remove`` cleans up ``src`` if the move left it behind.
        """
        try:
            shutil.move(src, dst)
        except shutil.Error:
            pass
        remove(src)

    def atomic_copy(self, src=None, dst=None):
        """Copy ``src`` to a uniquely named sibling file and move that copy to ``dst``.

        Args:
            src: path of the file to copy.
            dst: destination path handed to ``atomic_move``.
        """
        import uuid
        my_uuid = uuid.uuid4()
        src_tmp = src + str(my_uuid)
        shutil.copy(src, src_tmp)
        parent_dir = os.path.dirname(dst)
        if parent_dir:
            # os.makedirs("") raises, so only create a parent when dst has one
            os.makedirs(parent_dir, exist_ok=True)
        self.atomic_move(src_tmp, dst)
        remove(src_tmp)

    @staticmethod
    def get_default_properties():
        """Return the recipe properties: exactly one text column, relative importance 1."""
        return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1)

    @property
    def display_name(self):
        """Name under which this transformer is displayed."""
        return "PreprocessedText"

    def preprocess(self, text):
        """Apply the enabled preprocessing steps to one whitespace-tokenised string.

        Args:
            text: the raw text value (``transform`` always passes a ``str``).

        Returns:
            The processed text with tokens re-joined by single spaces, or
            ``text`` unchanged when every step is disabled.
        """
        if self.do_stemming:
            text = " ".join(self.stemmer.stem(word) for word in text.split())
        if self.do_lemmatization:
            pos_tagged_text = self.pos_tagger(text.split())
            fallback_pos = self.wordnet_map["O"]
            # pos[:1] instead of pos[0]: an empty tag falls back instead of raising
            text = " ".join(self.lemmatizer.lemmatize(word, self.wordnet_map.get(pos[:1], fallback_pos))
                            for word, pos in pos_tagged_text)
        if self.remove_stopwords:
            text = " ".join(word for word in str(text).split()
                            if word.lower() not in self.stopwords)
        return text

    def fit_transform(self, X: dt.Frame, y: np.array = None):
        """Nothing is fitted; delegates to ``transform`` (``y`` is ignored)."""
        return self.transform(X)

    def transform(self, X: dt.Frame):
        """Preprocess the first column of ``X`` and return it as a pandas Series.

        Missing values are replaced by ``"NA"`` *before* the cast to ``str``;
        casting first would turn them into ``"None"``/``"nan"`` and the fill
        would never apply.
        """
        text_column = X.to_pandas().iloc[:, 0]
        return text_column.fillna("NA").astype(str).apply(self.preprocess)