-
Notifications
You must be signed in to change notification settings - Fork 4
/
preprocess_gigaword.py
executable file
·120 lines (100 loc) · 3.72 KB
/
preprocess_gigaword.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import re
import glob
import codecs
import string
from optparse import OptionParser
# Preprocess a gigaword corpus to extract the text of each article, remove punctuation, and write it all
# to one file with one article per line
# Compiled pattern matching any single character from string.punctuation.
_punctuation_class = '[{}]'.format(re.escape(string.punctuation))
replace = re.compile(_punctuation_class)
def main():
    """Convert a directory of Gigaword NYT files into one article-per-line file.

    Command line: ``input_dir output_file``.

    Every file in ``input_dir`` whose name matches ``nyt_eng_200*`` is read as
    UTF-8, in sorted order. For each document, the lines found between
    ``<TEXT>`` and ``</TEXT>`` (ignoring ``<P>`` / ``</P>`` markers) are joined
    with spaces and passed through ``clean_text``; if anything is left, the
    result is written as one line of ``output_file``. ``output_file`` is
    truncated up front and then appended to after each input file, so the
    articles of fully processed files are already on disk if a later file
    fails.

    Exits with a usage error if fewer than two positional arguments are given.
    """
    usage = "%prog input_dir output_file"
    parser = OptionParser(usage=usage)
    (options, args) = parser.parse_args()

    if len(args) < 2:
        # Report a usage error instead of failing with an IndexError below.
        parser.error("expected two arguments: input_dir output_file")

    input_dir = args[0]
    output_file = args[1]

    # Start from an empty output file; articles are appended per input file.
    with codecs.open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.write('')

    paths = sorted(glob.glob(os.path.join(input_dir, 'nyt_eng_200*')))

    ignored_tags = ('<P>', '</P>')
    unexpected_tags = ('<HEADLINE>', '</HEADLINE>', '<DATELINE>', '</DATELINE>')

    count = 0
    for path in paths:
        print(path)
        articles = []   # cleaned articles found in this input file
        parts = []      # raw text lines of the document being read
        reading = False  # True while between <TEXT> and </TEXT>

        with codecs.open(path, 'r', encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                if line.startswith('<DOC id'):
                    # start a new document
                    parts = []
                    if count % 1000 == 0 and count > 0:
                        print(count)
                elif line in ignored_tags:
                    pass
                elif line == '<TEXT>':
                    # start reading
                    reading = True
                elif line == '</TEXT>':
                    # stop reading and save the document
                    reading = False
                    text = clean_text(' '.join(parts))
                    # a few documents only have headlines and no text
                    if text:
                        articles.append(text)
                        count += 1
                    parts = []
                elif reading:
                    if line in unexpected_tags:
                        # Only a warning: the line is still added to the text.
                        print("Unexpectedly encountered headline/dateline tag")
                    parts.append(line)

        print("Adding articles to file")
        with codecs.open(output_file, 'a', encoding='utf-8') as outfile:
            outfile.writelines(article + '\n' for article in articles)

    print("%d documents" % count)
# Punctuation that clean_text blanks out: everything in string.punctuation
# except the underscore, which clean_text uses to join contractions.
_PUNCT_EXCEPT_UNDERSCORE = re.compile(
    '[%s]' % re.escape(string.punctuation.replace('_', '')))


def clean_text(text, strip_html=False, lower=True, keep_emails=False, keep_at_mentions=False):
    """Normalize a piece of text into space-separated, punctuation-free tokens.

    Steps, in order:
      1. Remove ``<...>`` tags if ``strip_html`` is set; otherwise turn angle
         brackets into parentheses (which are later removed as punctuation).
      2. Lower-case the text if ``lower`` is set.
      3. Drop e-mail-like tokens unless ``keep_emails`` is set.
      4. Drop ``@mention`` tokens unless ``keep_at_mentions`` is set.
      5. Replace underscores with spaces, then join contractions by turning
         an apostrophe between two word characters into an underscore
         (``don't`` -> ``don_t``).
      6. Delete periods and replace all remaining punctuation (including
         quotes that are not inside a word) with spaces.
      7. Collapse every run of whitespace into one space and strip the ends.

    :param text: the string to clean
    :param strip_html: remove ``<...>`` tags instead of converting the brackets
    :param lower: lower-case the text
    :param keep_emails: do not remove tokens containing ``@`` between
        non-space characters
    :param keep_at_mentions: do not remove tokens starting with ``@``
    :return: the cleaned string (possibly empty)
    """
    if strip_html:
        # remove html tags
        text = re.sub(r'<[^>]+>', '', text)
    else:
        # replace angle brackets
        text = text.replace('<', '(').replace('>', ')')

    if lower:
        text = text.lower()

    # eliminate email addresses
    if not keep_emails:
        text = re.sub(r'\S+@\S+', '', text)

    # eliminate @mentions, including one at the very start of the text
    if not keep_at_mentions:
        text = re.sub(r'(?:^|\s)@\S+', ' ', text)

    # Underscores already in the text become spaces, so the only underscores
    # left afterwards are the ones that mark contractions.
    text = text.replace('_', ' ')

    # Join contractions into one token. Only apostrophes between two word
    # characters qualify; quotes at word edges are dropped as punctuation.
    text = re.sub(r"(?<=\w)'(?=\w)", '_', text)

    # remove periods (so abbreviations like "u.s." stay one token)
    text = text.replace('.', '')

    # Replace all other punctuation with spaces. The underscore is excluded
    # here so this pass does not split the contractions joined above.
    text = _PUNCT_EXCEPT_UNDERSCORE.sub(' ', text)

    # collapse each run of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)

    # strip off spaces on either end
    return text.strip()
# Run the preprocessing only when this file is executed as a script.
if __name__ == "__main__":
    main()