nlplearn.py (forked from yk/nlpfs14)
from nlpio import *
import re
import numpy as np
import logging
import random
import nltk.data
from sklearn.base import BaseEstimator, TransformerMixin

class RougeScorer(object):
    #variant can be any rouge variant available in the output
    #metric should be one of P,R,F
    def __init__(self, variant='ROUGE-1', metric='F'):
        self.variant = variant
        self.metric = metric

    def __call__(self, estimator, documents, predictions=None):
        if predictions is None:
            predictions = estimator.predict(documents)
        results = evaluateRouge(documents, predictions)
        return results[self.variant][self.metric][0]
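
# Note (not in the original file): RougeScorer mirrors sklearn's scorer
# calling convention (estimator, X, y), so an instance can be passed wherever
# a scoring callable is expected. A minimal usage sketch, assuming `documents`
# was loaded via nlpio's routines:
#
#   scorer = RougeScorer(variant='ROUGE-2', metric='R')
#   score = scorer(HeadlineEstimator().fit(documents), documents)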

class HeadlineEstimator(BaseEstimator):
    '''Estimates, for a given document, its headline'''

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        #the headlines are generated directly from the documents, so no "y" is needed
        return self

    def predict(self, documents):
        #for now this just predicts the first sentence
        #(requires SentenceSplitter to have filled doc.ext['sentences'])
        return [doc.ext['sentences'][0] for doc in documents]

class SimpleTextCleaner(BaseEstimator, TransformerMixin):
    #TODO: make better

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        for doc in documents:
            #strip backquotes and quote characters
            doc.text = re.sub(r"`|'|\"", "", doc.text)
            #drop the period after month abbreviations ("Jan." -> "Jan") so
            #the sentence splitter does not mistake it for a sentence boundary
            doc.text = re.sub(r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.", r"\1", doc.text)
        return documents

#the punkt model is loaded once and cached at module level
trained_splitter = None

class SentenceSplitter(BaseEstimator, TransformerMixin):
    #TODO: make better

    def __init__(self):
        global trained_splitter
        if trained_splitter is None:
            trained_splitter = nltk.data.load('tokenizers/punkt/english.pickle')

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        for doc in documents:
            if 'sentences' not in doc.ext:
                doc.ext['sentences'] = trained_splitter.tokenize(doc.text.strip())
        return documents
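
# For illustration (not in the original file): punkt splits raw text into a
# list of sentence strings, which is what doc.ext['sentences'] ends up
# holding. Requires the punkt model, e.g. via nltk.download('punkt'):
#
#   trained_splitter.tokenize("First sentence. And a second one.")
#   -> ['First sentence.', 'And a second one.']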

class PeerEstimator(BaseEstimator, TransformerMixin):
    '''Predicts, for each document, its stored peer with the given index'''

    def __init__(self, peer=0):
        self.peer = peer

    def fit(self, documents, y=None):
        return self

    def predict(self, documents):
        return [doc.peers[self.peer] for doc in documents]
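
# A minimal end-to-end sketch, not part of the original file: it assumes a
# `documents` list as produced by nlpio's loading routines (each document
# exposing .text, .ext and .peers as used above) and chains the transformers
# and the headline estimator with sklearn's Pipeline.
if __name__ == '__main__':
    from sklearn.pipeline import Pipeline

    documents = []  # hypothetical: obtain via the nlpio loading routines

    pipeline = Pipeline([
        ('clean', SimpleTextCleaner()),    # normalize quotes and month dots
        ('split', SentenceSplitter()),     # fill doc.ext['sentences']
        ('headline', HeadlineEstimator()), # predict the first sentence
    ])
    pipeline.fit(documents)
    headlines = pipeline.predict(documents)

    # scoring goes through ROUGE via evaluateRouge from nlpio; uncomment once
    # documents are actually loaded:
    # score = RougeScorer(variant='ROUGE-1', metric='F')(pipeline, documents)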