-
Notifications
You must be signed in to change notification settings - Fork 33
/
text_processing.py
114 lines (96 loc) · 3.19 KB
/
text_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def check_mrs(content, i):
    """Tell whether the character at index ``i`` directly follows a title.

    The recognised titles are "Mr", "Ms" and "Mrs", compared
    case-insensitively.  The title must either begin at index 0 of
    ``content`` or be immediately preceded by a single space character.

    Args:
        content: The text being scanned.
        i: Index of the character (a period, in the visible caller) that
            comes right after the candidate title.

    Returns:
        True if one of the titles ends exactly at ``i``, False otherwise.
    """
    def _title_ends_here(width, titles):
        # The candidate title occupies content[begin:i].
        begin = i - width
        if begin < 0:
            return False
        if content[begin:i].lower() not in titles:
            return False
        # Accept only at the start of the text or right after a space, so
        # that a longer word merely ending in "mr"/"ms"/"mrs" is rejected.
        return begin == 0 or content[begin - 1] == ' '

    return _title_ends_here(2, ('mr', 'ms')) or _title_ends_here(3, ('mrs',))
def check_ABB_mid(content, i):
    """Tell whether index ``i`` sits between two uppercase characters.

    The visible caller uses this to spot a period inside an abbreviation,
    such as the first period of "U.S.A.".

    Args:
        content: The text being scanned.
        i: Index of the character under test.

    Returns:
        True if ``i`` has a neighbour on both sides and both neighbours
        are uppercase; False otherwise (including when ``i`` is the first
        or last index).
    """
    final_index = len(content) - 1
    # A "middle" position needs a character on each side.
    if not 0 < i < final_index:
        return False
    return content[i - 1].isupper() and content[i + 1].isupper()
def check_ABB_end(content, i):
    """Tell whether the character just before index ``i`` is uppercase.

    The visible caller uses this to spot the closing period of an
    abbreviation, such as the last period of "U.S.A.".

    Args:
        content: The text being scanned.
        i: Index of the character under test.

    Returns:
        True if ``i`` is positive and ``content[i - 1]`` is uppercase;
        False otherwise.
    """
    # Index 0 has no predecessor, so it can never close an abbreviation.
    return i > 0 and content[i - 1].isupper()
# Entities that are already escaped in the input.  An '&' that starts one of
# these is left alone so the text is not escaped twice.
_ESCAPED_ENTITIES = ('&amp;', '&quot;', '&apos;', '&#93;', '&#91;')

# Replacement text paired with every single character that maps to it.
# NOTE(review): the fillers '\u3164' and '\u1160' map to '&quot;' because of
# the positional pairing of the original parallel lists -- confirm intended.
_REPLACEMENT_GROUPS = (
    ('?', ('\uFE16',)),
    ('!', ('\uFE15',)),
    ('&apos;', ('\u0027', '\u2018', '\u2019')),
    ('&quot;', ('“', '”', '\u3164', '\u1160',
                '\u0022', '\u201c', '\u201d', '"')),
    ('&#91;', ('[', '\ufe47', '(', '\u208d')),
    ('&#93;', (']', '\ufe48', ')', '\u208e')),
    ('-', ('—', '_', '–')),
    ('&amp;', ('&',)),
)
_CHAR_REPLACEMENTS = {
    char: replacement
    for replacement, chars in _REPLACEMENT_GROUPS
    for char in chars
}

# Tokens that get surrounded by spaces, tried in this order at each
# position.  The multi-character entities come before '&', '#' and ';' so
# that an entity is matched as a whole rather than split into pieces.
_SPACED_TOKENS = (
    ',', '?', '!', '&apos;', '&amp;', '&quot;', '&#91;', '&#93;',
    '-', '/', '%', ':', '$', '#', '&', '*', ';', '=', '+', '@', '~',
    '>', '<',
)


def _escape_special_chars(contents):
    """Map special characters to their normalised or escaped form."""
    pieces = []
    for i, char in enumerate(contents):
        if char == '&' and contents.startswith(_ESCAPED_ENTITIES, i):
            # Start of an entity that is already escaped: keep it verbatim.
            pieces.append(char)
        else:
            pieces.append(_CHAR_REPLACEMENTS.get(char, char))
    return ''.join(pieces)


def _pad_tokens(contents):
    """Insert spaces around every token listed in ``_SPACED_TOKENS``."""
    pieces = []
    i = 0
    while i < len(contents):
        for token in _SPACED_TOKENS:
            if contents.startswith(token, i):
                pieces.append(' ' + token)
                # '&apos;' gets a leading space only, so it stays attached
                # to whatever follows it.
                if token != '&apos;':
                    pieces.append(' ')
                i += len(token)
                break
        else:
            pieces.append(contents[i])
            i += 1
    return ''.join(pieces)


def _pad_periods(contents):
    """Space out periods, keeping titles and abbreviations intact."""
    pieces = []
    for i, char in enumerate(contents):
        if char != '.':
            pieces.append(char)
        elif check_mrs(contents, i):
            # Case 1: "Mr." / "Mrs." / "Ms." -- period stays on the title.
            pieces.append('. ')
        elif check_ABB_mid(contents, i):
            # Case 2: inner period of an abbreviation, e.g. U[.]S.A.
            pieces.append('.')
        elif check_ABB_end(contents, i):
            # Case 3: closing period of an abbreviation, e.g. U.S.A[.]
            pieces.append('. ')
        else:
            # Any other period becomes a stand-alone token.
            pieces.append(' . ')
    return ''.join(pieces)


def _collapse_spaces(contents):
    """Squeeze every run of consecutive space characters into one space."""
    pieces = []
    for char in contents:
        if char == ' ' and pieces and pieces[-1] == ' ':
            continue
        pieces.append(char)
    return ''.join(pieces)


def fix_contents(contents):
    """Normalise punctuation in ``contents`` and separate it with spaces.

    The text goes through four passes:

    1. Special characters are replaced: quotes become ``&apos;`` or
       ``&quot;``, brackets and parentheses become ``&#91;`` / ``&#93;``,
       dashes and underscores become ``-``, and a bare ``&`` becomes
       ``&amp;``.  An ``&`` that already starts one of those entities is
       kept as it is.
    2. Punctuation tokens and the entities above are surrounded by spaces
       (``&apos;`` only receives a leading space).
    3. Periods are spaced out, except after "Mr"/"Mrs"/"Ms" and inside
       uppercase abbreviations such as "U.S.A.".
    4. Runs of spaces are collapsed and the result is stripped.

    Args:
        contents: The text to process.

    Returns:
        The processed text with surrounding whitespace removed.

    Example:
        >>> fix_contents("R&D, don't!")
        'R &amp; D , don &apos;t !'
    """
    contents = _escape_special_chars(contents)
    contents = _pad_tokens(contents)
    contents = _pad_periods(contents)
    contents = _collapse_spaces(contents)
    return contents.strip()