-
Notifications
You must be signed in to change notification settings - Fork 55
/
tokenizer.ts
76 lines (65 loc) · 2.08 KB
/
tokenizer.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import {
BaseTokenizer,
Encoding,
PaddingConfiguration,
TruncationConfiguration,
TruncationOptions
} from "tokenizers";
import { ModelType } from "../models";
/**
* Base set of tokenizer options. `FullTokenizerOptions` combines it with the
* options specific to a given tokenizer.
*/
export interface TokenizerBaseOptions {
/**
* NOTE(review): presumably controls whether input text is lowercased before
* tokenization; confirm against the concrete tokenizer implementations.
* @default true
*/
lowercase?: boolean;
/**
* Name of the merges file (if applicable to the tokenizer)
* @default "merges.txt"
*/
mergesFile?: string;
/**
* Directory under which the files needed by the tokenizer are located.
* Must be an absolute path.
*/
filesDir: string;
/**
* Model type, as declared by `ModelType` in `../models`.
* NOTE(review): presumably selects which tokenizer flavour to build; confirm
* against the callers.
*/
modelType: ModelType;
/**
* Name of the vocab file (if applicable to the tokenizer)
* @default "vocab.txt" | "vocab.json"
*/
vocabFile?: string;
}
/**
* Complete option set accepted for a tokenizer: every member of
* `TokenizerBaseOptions`, intersected with the tokenizer-specific options,
* all of which are made optional through `Partial`.
* @typeParam TokSpecificOptions Options specific to one kind of tokenizer
*/
export type FullTokenizerOptions<TokSpecificOptions> = TokenizerBaseOptions &
Partial<TokSpecificOptions>;
/**
 * Abstract wrapper around a `tokenizers` tokenizer instance.
 *
 * Encoding, padding and truncation calls are forwarded to the wrapped
 * tokenizer; subclasses describe where the question and the context sit
 * inside an encoding by implementing the two abstract methods.
 *
 * @typeParam T Concrete type of the wrapped tokenizer
 */
export abstract class Tokenizer<T extends BaseTokenizer<object> = BaseTokenizer<object>> {
  /** Wrapped tokenizer that every forwarding method delegates to. */
  protected tokenizer: T;

  /**
   * @param tokenizer Tokenizer instance to wrap
   */
  constructor(tokenizer: T) {
    this.tokenizer = tokenizer;
  }

  /**
   * Get the length of the question part of an encoding
   * @param encoding Encoding for which to return the question length
   */
  abstract getQuestionLength(encoding: Encoding): number;

  /**
   * Get the first index of the context of an encoding
   * @param encoding Encoding for which to return first context index
   */
  abstract getContextStartIndex(encoding: Encoding): number;

  /**
   * Get the last index of the context of an encoding.
   *
   * The context length is what remains of the encoding length once the sum of
   * the special-tokens mask and the question length have been subtracted; the
   * result is the (inclusive) index of the last of those context positions,
   * counted from the context start index.
   * @param encoding Encoding for which to return last context index
   * @virtual
   */
  getContextEndIndex(encoding: Encoding): number {
    // Summing the mask counts the added tokens, assuming entries are 1 for
    // special tokens and 0 otherwise (TODO confirm against `tokenizers`).
    let specialTokenCount = 0;
    for (const maskValue of encoding.specialTokensMask) {
      specialTokenCount += maskValue;
    }

    const questionAndContextLength = encoding.length - specialTokenCount;
    const questionLength = this.getQuestionLength(encoding);
    const contextStart = this.getContextStartIndex(encoding);

    // "- 1" turns the one-past-the-end position into an inclusive index.
    return contextStart + (questionAndContextLength - questionLength) - 1;
  }

  /**
   * Encode a sequence, or a pair of sequences, with the wrapped tokenizer.
   * @param sequence First sequence to encode
   * @param pair Optional second sequence, forwarded as-is
   * @param addSpecialTokens Forwarded as the `addSpecialTokens` encode option
   * @returns The promise returned by the wrapped tokenizer
   */
  encode(sequence: string, pair?: string, addSpecialTokens = true): Promise<Encoding> {
    const encodeOptions = { addSpecialTokens };
    return this.tokenizer.encode(sequence, pair, encodeOptions);
  }

  /**
   * Enable/change padding on the wrapped tokenizer
   * @param maxLength Padding length
   * @returns The padding configuration reported by the wrapped tokenizer
   * @virtual
   */
  setPadding(maxLength: number): Readonly<PaddingConfiguration> {
    const paddingOptions = { maxLength };
    return this.tokenizer.setPadding(paddingOptions);
  }

  /**
   * Enable/change truncation on the wrapped tokenizer
   * @param maxLength Truncation length
   * @param options Additional truncation options, forwarded as-is
   * @returns The truncation configuration reported by the wrapped tokenizer
   */
  setTruncation(
    maxLength: number,
    options?: TruncationOptions
  ): Readonly<TruncationConfiguration> {
    const { tokenizer } = this;
    return tokenizer.setTruncation(maxLength, options);
  }
}