noun_chunk_extractor.py
# -*- encoding: utf-8 -*-
import re

import enchant
import spacy
from nltk.stem import WordNetLemmatizer

from chunk import Chunk  # local module providing the Chunk container


class Extractor:
    # Shared pyenchant dictionary used to spell-check phrase pieces.
    dic = enchant.Dict('en_US')

    def __init__(self):
        # POS-tag pattern: either a hyphenated compound (<X><HYPH><X>...) or an
        # optional run of adjectives/participles followed by one or more nouns.
        self.noun_chunk_pattern = r'<[A-Z]*>(<HYPH><[A-Z]*>)+|(<JJ>|<VBG>|<VBN>)*(<NN[A-Z]*>)+'
        self.lemmatizer = WordNetLemmatizer()
        # spaCy 2.x-style model name; only the tagger is needed here.
        self.__spacy_nlp = spacy.load('en', disable=['parser', 'ner', 'textcat'])
        self.__adj_stopwords = {'able', 'available', 'brief', 'certain', 'different', 'due', 'enough', 'especially',
                                'few', 'fifth', 'former', 'his', 'howbeit', 'immediate', 'important', 'inc', 'its',
                                'last', 'latter', 'least', 'less', 'likely', 'little', 'many', 'ml', 'more', 'most',
                                'much', 'my', 'necessary', 'new', 'next', 'non', 'old', 'other', 'our', 'ours', 'own',
                                'particular', 'past', 'possible', 'present', 'proud', 'recent', 'same', 'several',
                                'significant', 'similar', 'such', 'sup', 'sure', 'none', 'a', 'an', 'part', 'various'}

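    # Pipeline overview (added commentary): tag the text with spaCy, match
    # POS-tag patterns to locate candidate chunks, drop adjective stopwords,
    # lemmatize the remaining tokens, join them with underscores, re-clean
    # multiword chunks, and prefix each result with "NP_" so chunks stand out
    # in the rebuilt sentence.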
    def chunk(self, one_doc):
        text = one_doc.replace("\n", " ").strip()
        doc = self.__spacy_nlp(text)
        terms_index = self.tag_regex_matches(doc)
        noun_chunk = []
        content = []
        current_index = 0
        for start, end in terms_index:
            _noun_list = str(doc[start:end]).split(" ")
            _noun_list = list(filter(lambda x: x.strip().lower() not in self.__adj_stopwords, _noun_list))
            _noun_list = [self.lemmatizer.lemmatize(str(word)) for word in _noun_list]
            _noun = "_".join(_noun_list)
            # Clean multiword chunks a second time.
            if len(_noun_list) > 1:
                _noun = self.clean_again(_noun)  # fixme
                if _noun is None:
                    continue
            _noun = "NP_" + _noun
            noun_chunk.append(_noun)
            # word_rest (the text between chunks) is not lemmatized.
            word_rest = str(doc[current_index:start]).split(" ")
            content += word_rest + [_noun]
            current_index = end
        word_rest = [self.lemmatizer.lemmatize(_) for _ in str(doc[current_index:]).split(" ")]
        content += word_rest
        content = [_ for _ in content if _.strip()]
        # Package the output: slice off the literal "NP_" prefix rather than
        # using str.strip('NP_'), which strips any of the characters N, P, _
        # from both ends and would eat leading N/P letters of the chunk itself.
        chunk_index = {k: Chunk(chunk=k[len("NP_"):].lower(),
                                chunk_root=k[len("NP_"):].split('_')[-1].lower())
                       for k in noun_chunk}
        sentence = " ".join(content)
        return [sentence], chunk_index

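    # Matching mini-language (added commentary): '<TAG>' placeholders are
    # rewritten to '( TAG)' groups and matched against a space-prefixed tag
    # string, so counting the spaces before a match start/end recovers token
    # indices. For tags ' DT JJ NN', the pattern '( JJ)*( NN)+' matches
    # ' JJ NN' at character offset 3, giving the token span (1, 3).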
    def tag_regex_matches(self, doc, pattern=None, debug=False):
        pattern = pattern or self.noun_chunk_pattern
        pattern = re.sub(r'\s', '', pattern)
        # Rewrite '<TAG>' placeholders as '( TAG)' regex groups.
        pattern = re.sub(r'<([^>]+)>', r'( \1)', pattern)
        # One leading space per token, so space counts map back to token indices.
        tags = ' ' + ' '.join(tok.tag_ for tok in doc)
        for m in re.finditer(pattern, tags):
            yield tags[:m.start()].count(' '), tags[:m.end()].count(' ')

    @classmethod
    def clean_again(cls, phrase):
        phrases = [phrase]
        _, phrase_true, _, _ = cls.phrase_classification(phrases)
        _, _, _, phrase_correct, _, _ = cls.phrase_deeper(phrase_true)
        if phrase_correct:
            return phrase_correct[0]
        return None

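    # Classification buckets (added commentary): all-numeric pieces land in
    # phrase_digit, fully dictionary-checked phrases in phrase_true, phrases
    # whose remaining pieces are all-uppercase acronyms in phrase_upper, and
    # everything else in phrase_false. Only phrase_true survives into
    # phrase_deeper above.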
    @classmethod
    def phrase_classification(cls, phrases):
        phrase_upper, phrase_true, phrase_false, phrase_digit = [], [], [], []
        for word in phrases:
            try:
                word_list = re.split(r'-|_|/', word)
                if all(cls.is_float(item) or item.isdigit() for item in word_list if item.strip()):
                    phrase_digit.append(word)
                elif all(cls.dic.check(item) for item in word_list if item.strip()):
                    phrase_true.append(word)
                elif all(cls.dic.check(item) or item.isupper() for item in word_list if item.strip()):
                    phrase_upper.append(word)
                else:
                    phrase_false.append(word)
            except Exception:
                pass
        return phrase_upper, phrase_true, phrase_false, phrase_digit

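    # is_float heuristic (added commentary): requiring at least two all-digit
    # parts around '.' flags strings like '3.14' or '1.2.3' as numeric while
    # leaving tokens like 'v2' or 'U.S' alone.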
    @staticmethod
    def is_float(s):
        return sum(n.isdigit() for n in s.strip().split('.')) >= 2

    @classmethod
    def phrase_deeper(cls, phrase_true):
        phrase_diagonal, phrase_point, phrase_low_length, phrase_correct, phrase_number, phrase_slash = \
            [], [], [], [], [], []
        for word in phrase_true:
            word_list = re.split(r'_|-|/', word)
            if word.startswith('-') or word.endswith('-'):
                phrase_diagonal.append(word)
            elif '.' in word and '/' not in word:
                phrase_point.append(word)
            elif '/' in word:
                phrase_slash.append(word)
            elif any(len(item) <= 2 for item in word_list):
                phrase_low_length.append(word)
            elif any(number in word for number in '1234567890'):
                phrase_number.append(word)
            else:
                phrase_correct.append(word)
        return phrase_diagonal, phrase_point, phrase_low_length, phrase_correct, phrase_number, phrase_slash


if __name__ == '__main__':
    ex = Extractor()
    sentences, chunks = ex.chunk(
        "His announcement took observers by surprise and sent markets into a tailspin, but Mr Lighthizer and "
        "Mr Mnuchin’s comments showed the impending tariffs as less an impulsive move by the President, and more "
        "fuelled by frustrations arising from the deeper disagreements between Beijing and Washington. They also "
        "lessened doubts that the President had been bluffing.")