text_separator.py

import re

from nltk.tokenize import sent_tokenize


class TextSeparator(object):
    """Split text into sentences; the strategy depends on the mode value.

    If mode is 'by_sense', the bot tries to build full sentences even when the
    source lines do not end with a dot; otherwise the text is split only on
    newline characters.
    """

    def __init__(self, in_text='', mode=''):
        """Store the input text and split it according to the chosen mode."""
        self._input_text = in_text
        self._split_text_to_sentences(mode=mode)

    def get_sentences(self):
        return self._output_sentences

    def _split_text_to_sentences(self, mode):
        text = self._input_text
        # a word character, a run of line breaks, then a digit or an
        # uppercase Latin/Cyrillic letter
        spec_regex = r'\w[\n\r\f\v]+[0-9A-ZА-Я]'
        # a dot glued to the next word, e.g. Russian text such as
        # 'отстраняются от работы.Важнейший принцип работы мозга'
        dot_without_space_regex = r'\w\.\w\w'
        if mode == 'by_sense':
            # turn a line break between a word character and a capital letter
            # (or digit) into '. '
            text = re.sub(spec_regex, self._new_strings_to_space, text,
                          flags=re.M)
            # insert a space after a dot that is glued to the following word
            text = re.sub(dot_without_space_regex, self._add_space_to_dot,
                          text, flags=re.M)
            # collapse everything into one long string, then let the tokenizer
            # find the sentence boundaries
            text = re.sub(r'\s+', ' ', text, flags=re.M)
            self._output_sentences = sent_tokenize(text, 'russian')
            # todo: auto detect lang
            # todo: limit max sentence size
        else:
            self._output_sentences = text.split(sep='\n')

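    # Illustrative sketch (not part of the original file) of what 'by_sense'
    # mode does to a fragment such as
    #   'отстраняются от работы.Важнейший принцип\nРабота мозга':
    # the line break before 'Работа' becomes '. ', the dot glued to
    # 'Важнейший' gains a trailing space, whitespace is collapsed, and
    # sent_tokenize then yields roughly
    #   ['отстраняются от работы.', 'Важнейший принцип.', 'Работа мозга'].
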
    def _new_strings_to_space(self, matchobj):
        # replace the run of line breaks inside the match with '. '
        text_piece = matchobj.group(0)
        return re.sub(r'[\n\r\f\v]+', '. ', text_piece, flags=re.M)

    def _add_space_to_dot(self, matchobj):
        # replace the '.' inside the match with '. '
        text_piece = matchobj.group(0)
        return re.sub(r'\.', '. ', text_piece, flags=re.M)

    def _print(self):
        for sent in self._output_sentences:
            print(sent)
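

# Minimal usage sketch added for illustration (not part of the original class).
# 'by_sense' mode assumes the NLTK sentence-tokenizer data is available,
# e.g. after nltk.download('punkt').
if __name__ == '__main__':
    sample = 'отстраняются от работы.Важнейший принцип работы мозга\nНовая строка'
    # glue broken lines back into sentences and tokenize them
    for sent in TextSeparator(in_text=sample, mode='by_sense').get_sentences():
        print(sent)
    # default mode: split on newline characters only
    for sent in TextSeparator(in_text=sample, mode='').get_sentences():
        print(sent)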