-
Notifications
You must be signed in to change notification settings - Fork 1
/
bpe.go
93 lines (83 loc) · 2.68 KB
/
bpe.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
// Package bpe provides a byte-pair encoding for text.
package bpe
// PreBPE returns a lookup table that maps every byte value (as a rune in
// 0-255) to the rune that stands in for it. This function is used for handling
// large text corpuses, and it is derived from OpenAI's GPT-2.
// The original code may be found here: https://github.com/openai/gpt-2/blob/master/src/encoder.py
//
// Byte values in 33-126, 161-172 and 174-255 map to themselves. Every other
// byte value is assigned, in ascending order, the next unused code point
// starting at 256 (0 -> 256, 1 -> 257, ..., 32 -> 288, 127 -> 289, ...).
// The returned map always has exactly 256 entries and is freshly allocated on
// each call.
//
// The original comments from GPT-2 clarify:
//
//	The reversible bpe codes work on unicode strings.
//	This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
//	When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
//	This is a significant percentage of your normal, say, 32K bpe vocab.
//	To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
//	And avoids mapping to whitespace/control characters the bpe code barfs on.
//
// It is unclear what utility this provides now, given that the design of the BPE package has gone in a slightly
// different direction - this package deals with runes, instead of messing around with strings and bytes.
// We sacrifice memory for readability and understandability.
func PreBPE() map[rune]rune {
	const byteValues = 256

	bytemap := make(map[rune]rune, byteValues)
	// next is the code point handed to the next byte value that cannot
	// represent itself; it starts just past the byte range.
	next := rune(byteValues)
	for b := rune(0); b < byteValues; b++ {
		switch {
		case b >= 33 && b <= 126, // '!' through '~'
			b >= 161 && b <= 172, // U+00A1 through U+00AC
			b >= 174 && b <= 255: // U+00AE through U+00FF
			bytemap[b] = b
		default:
			// Everything else is shifted out of the byte range, in
			// ascending order of byte value.
			bytemap[b] = next
			next++
		}
	}
	return bytemap
}
// Pairs returns the Pairs of runes found in a word (as string).
//
// Each FuncOpt in opts is applied, in order, to a fresh funcMod. If the
// funcMod's buffer is still nil afterwards and word is non-empty, a buffer with
// capacity (rune count of word - 1) is allocated. The word and buffer are then
// handed to pairs, whose result is returned unchanged.
func Pairs(word string, opts ...FuncOpt) []Pair {
	var mod funcMod
	for i := range opts {
		opts[i](&mod)
	}

	// Only allocate when no option supplied a buffer; an empty word is passed
	// through with whatever buffer (possibly nil) is already set.
	if needsBuf := mod.buf == nil && word != ""; needsBuf {
		runeCount := len([]rune(word))
		mod.buf = make([]Pair, 0, runeCount-1)
	}
	return pairs(word, mod.buf)
}
// PairsRunes returns the Pairs of runes found in a word (as []rune).
//
// Each FuncOpt in opts is applied, in order, to a fresh funcMod. If the
// funcMod's buffer is still nil afterwards and word is non-empty, a buffer with
// capacity len(word)-1 is allocated. The word and buffer are then handed to
// pairs2, whose result is returned unchanged.
func PairsRunes(word []rune, opts ...FuncOpt) []Pair {
	var m funcMod
	for _, opt := range opts {
		opt(&m)
	}
	if m.buf == nil && len(word) > 0 {
		// word is already a []rune, so its length is the rune count; no
		// conversion is needed to size the buffer.
		m.buf = make([]Pair, 0, len(word)-1)
	}
	return pairs2(word, m.buf)
}
// PairsWithReuse is the Pairs function, but with a buffer passed in specifically.
//
// It takes no options and allocates nothing itself: word and buf are handed
// straight to pairs, and whatever pairs returns is returned unchanged.
// NOTE(review): how pairs uses buf (appends to it, truncates it first, ...) is
// not visible from here - confirm against pairs before relying on the returned
// slice sharing or not sharing storage with buf.
func PairsWithReuse(word string, buf []Pair) []Pair {
return pairs(word, buf)
}
// PairsRunesWithReuse is the PairsRunes function, but with a buffer passed in specifically.
//
// It takes no options and allocates nothing itself: word and buf are handed
// straight to pairs2, and whatever pairs2 returns is returned unchanged.
// NOTE(review): how pairs2 uses buf (appends to it, truncates it first, ...) is
// not visible from here - confirm against pairs2 before relying on the returned
// slice sharing or not sharing storage with buf.
func PairsRunesWithReuse(word []rune, buf []Pair) []Pair {
return pairs2(word, buf)
}
// UTIL
// inRange reports whether r is equal to at least one element of rs.
// A nil or empty rs always yields false.
func inRange(r rune, rs []rune) bool {
	found := false
	// Stop scanning as soon as a match has been seen.
	for i := 0; i < len(rs) && !found; i++ {
		found = rs[i] == r
	}
	return found
}