embedding.py
"""
A toolkit you may find useful for mapping sentences to embeddings.
Download and unzip the standard GloVe embeddings to use this.
Skip-thoughts use unigram/bigram information from the Children's Book dataset.
"""
from __future__ import print_function
import numpy as np


class Embedder(object):
    """ Generic embedding interface.

    Required:
      * w: dict mapping tokens to indices
      * g: matrix with one row per token index
      * N: embedding dimensionality
    """

    def map_tokens(self, tokens, ndim=2):
        """ For the given list of tokens, return a list of GloVe embeddings
        (one row per token), or a single plain bag-of-words average embedding
        if ndim=1.

        Unseen words (which are rare in practice) are skipped; if no token
        is found in the vocabulary, a zero vector is returned instead. """
        gtokens = [self.g[self.w[t]] for t in tokens if t in self.w]
        if not gtokens:
            return np.zeros((1, self.N)) if ndim == 2 else np.zeros(self.N)
        gtokens = np.array(gtokens)
        if ndim == 2:
            return gtokens
        else:
            return gtokens.mean(axis=0)

    def map_set(self, ss, ndim=2):
        """ apply map_tokens on a whole set of sentences """
        return [self.map_tokens(s, ndim=ndim) for s in ss]

    def map_jset(self, sj):
        """ for a set of sentence embedding indices, get per-token embeddings """
        return self.g[sj]

    def pad_set(self, ss, spad, N=None):
        """ Given a set of sentences transformed to per-word embeddings
        (using glove.map_set()), convert them to a 3D matrix with fixed
        sentence sizes - padded or trimmed to spad embeddings per sentence.

        Output is a tensor of shape (len(ss), spad, N).

        To determine spad, use something like
            np.sort([np.shape(s) for s in s0], axis=0)[-1000]
        so that typically everything fits, but you don't go to absurd lengths
        to accommodate outliers.  (A small helper sketching this heuristic is
        included below this class.)
        """
        ss2 = []
        if N is None:
            N = self.N
        for s in ss:
            if spad > s.shape[0]:
                if s.ndim == 2:
                    s = np.vstack((s, np.zeros((spad - s.shape[0], N))))
                else:  # pad non-embeddings (e.g. toklabels) too
                    s = np.hstack((s, np.zeros(spad - s.shape[0])))
            elif spad < s.shape[0]:
                s = s[:spad]
            ss2.append(s)
        return np.array(ss2)
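

# The helper below is not part of the original interface; it is a minimal
# sketch of the spad heuristic described in Embedder.pad_set() above (pick a
# length that covers all but roughly the longest n_outliers sentences, so
# padding stays reasonable).  The name and default value are illustrative only.
def example_spad(ss, n_outliers=1000):
    """ Suggest a pad length that fits all but roughly the longest
    n_outliers sentences in ss (a list of per-token embedding arrays). """
    lengths = np.sort([np.shape(s)[0] for s in ss])
    # same idea as lengths[-n_outliers] in the docstring above, but guarded
    # against sets smaller than n_outliers
    return int(lengths[max(0, len(lengths) - 1 - n_outliers)])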


class GloVe(Embedder):
    """ A GloVe dictionary and the associated N-dimensional vector space """

    def __init__(self, N=50, glovepath='../glove.6B/glove.6B.%dd.txt'):
        """ Load GloVe dictionary from the standard distributed text file.

        Glovepath should contain %d, which is substituted for the embedding
        dimension N. """
        self.N = N
        self.w = dict()
        self.g = []
        self.glovepath = glovepath % (N,)

        # [0] must be a zero vector
        self.g.append(np.zeros(self.N))

        with open(self.glovepath, 'r', encoding='utf-8') as f:
            for line in f:
                l = line.split()
                word = l[0]
                self.w[word] = len(self.g)
                self.g.append(np.array(l[1:]).astype(float))

        # reserve an index for unknown tokens; its embedding is also a zero vector
        self.w['UKNOW'] = len(self.g)
        self.g.append(np.zeros(self.N))

        self.g = np.array(self.g, dtype='float32')
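

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original file), assuming the
    # GloVe 6B vectors have been unzipped to ../glove.6B/ as the module
    # docstring suggests.
    glove = GloVe(N=50)
    sentences = [['a', 'cat', 'sat', 'on', 'the', 'mat'],
                 ['dogs', 'chase', 'cats']]
    # Per-token embeddings: one (n_tokens, 50) array per sentence...
    emb = glove.map_set(sentences)
    # ...padded/trimmed into a single (2, 6, 50) tensor:
    print(glove.pad_set(emb, spad=6).shape)
    # Bag-of-words average: a single 50-dimensional vector per sentence:
    print(glove.map_tokens(sentences[0], ndim=1).shape)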