bertInput.py
from nltk.tokenize import word_tokenize
from pytorch_pretrained_bert import BertTokenizer
import json

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# ~400K entries -> ~4 MB. Obtained from https://raw.githubusercontent.com/dwyl/english-words/master/words_dictionary.json
with open("myLargeDict.json", "r") as f:
    myDict = json.load(f)

affixes = ['\'d', '\'s', 'n\'t']


def tok_in_dict(tok):
    # Check BERT's vocabulary first, then the larger dictionary, and finally
    # whether the token is a number (BERT's vocabulary contains numbers, but only integers).
    return (tok in affixes) or (tok in tokenizer.vocab) \
        or (tok.lower() in myDict) or (tok.replace('.', '').isdigit())


def format_str(text, debug=True):
    # e.g. text = 'I want to buy the car becaeuse it is cheap.' (the misspelled 'becaeuse' gets masked)
    tokenized_text = word_tokenize(text)
    if debug:
        print('Tokenized text:', tokenized_text)
    unk_words = []
    for i in range(len(tokenized_text)):
        tok = tokenized_text[i]  # .lower()
        if not tok_in_dict(tok):
            unk_words.append(tok)
            tokenized_text[i] = '[MASK]'
    # Join the (possibly masked) tokens back into a whitespace-separated string.
    masked_text = ' '.join(tokenized_text)
    bert_text = '[CLS] %s [SEP]' % masked_text
    if debug:
        print('BERT input:', bert_text)
        print('Unk words:', unk_words)
    return bert_text, unk_words
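

# Minimal usage sketch (not part of the original file): the example sentence and the
# misspelled word 'becaeuse' are hypothetical, chosen so that one token is missing from
# both vocabularies and is therefore replaced by [MASK]. Running this assumes the NLTK
# 'punkt' tokenizer data and myLargeDict.json are available locally.
if __name__ == '__main__':
    sample = 'I want to buy the car becaeuse it is cheap.'
    bert_text, unk_words = format_str(sample, debug=False)
    print(bert_text)   # expected: '[CLS] I want to buy the car [MASK] it is cheap . [SEP]'
    print(unk_words)   # expected: ['becaeuse']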