# T5.py
import torch
from transformers import T5Tokenizer, T5EncoderModel, AutoTokenizer, AutoModelForSeq2SeqLM

from configs import T5_CONFIGS

MAX_LENGTH = 256  # maximum number of tokens per input sequence
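
# Note: T5_CONFIGS lives in configs.py and is not shown here. From its use
# below, it is assumed to map a model name to a tuple whose first element is
# the model source: 't5' (encoder-only) or 'auto' (full seq2seq). A minimal
# sketch, with hypothetical entries:
#
#     T5_CONFIGS = {
#         't5-small': ('t5',),
#         'google/flan-t5-small': ('auto',),
#     }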
class t5encoder:
    """Wraps a pretrained T5 encoder (or a generic seq2seq model) for text encoding."""

    def __init__(self, model_name):
        self.model_name = model_name
        if self.model_name not in T5_CONFIGS:
            raise KeyError(f'model name {model_name!r} not found in T5_CONFIGS')
        self.config = T5_CONFIGS[self.model_name]
        if self.config[0] == 't5':
            t5_class, tokenizer_class = T5EncoderModel, T5Tokenizer
        elif self.config[0] == 'auto':
            t5_class, tokenizer_class = AutoModelForSeq2SeqLM, AutoTokenizer
        else:
            raise ValueError(f'unknown source {self.config[0]}')
        self.t5_model = t5_class.from_pretrained(model_name)
        self.tokenizer = tokenizer_class.from_pretrained(model_name)
        if torch.cuda.is_available():
            self.t5_model = self.t5_model.cuda()
        self.device = next(self.t5_model.parameters()).device
        print("the device is", self.device)
    def get_encoded_text(self, texts, save_to_file=True):
        """Encode a list of strings; returns (embeddings, boolean attention mask)."""
        encoded = self.tokenizer(texts,
                                 return_tensors="pt",
                                 padding='longest',
                                 max_length=MAX_LENGTH,
                                 truncation=True)
        input_ids = encoded.input_ids.to(self.device)
        attn_mask = encoded.attention_mask.to(self.device)
        self.t5_model.eval()
        with torch.no_grad():
            if self.config[0] == 't5':
                # Encoder-only model: the output is the encoder's last hidden state.
                output = self.t5_model(input_ids=input_ids, attention_mask=attn_mask)
                encoded_text = output.last_hidden_state
            else:  # 'auto': full seq2seq model, so a dummy one-token decoder input is needed
                output = self.t5_model(input_ids=input_ids, attention_mask=attn_mask,
                                       decoder_input_ids=input_ids[:, :1])
                encoded_text = output.encoder_last_hidden_state
        if save_to_file:
            # Detach so the tensors carry no autograd history when serialized.
            encoded_text = encoded_text.detach()
            attn_mask = attn_mask.detach()
        return encoded_text, attn_mask.bool()
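
    # Despite its name, save_to_file only detaches the tensors; writing them
    # to disk is left to the caller, e.g. (hypothetical path):
    #
    #     emb, mask = encoder.get_encoded_text(texts)
    #     torch.save({'emb': emb.cpu(), 'mask': mask.cpu()}, 'encoded.pt')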
    def test(self):
        print("1 sentence test")
        texts = ["I love rock and roll"]
        encoded_text, attention_masks = self.get_encoded_text(texts, save_to_file=True)
        print(encoded_text.size())
        print(attention_masks.size())

        print("multi sentence test")
        texts = ["I love rock and roll", 'especially the kind of stuff that rocks']
        encoded_text, attention_masks = self.get_encoded_text(texts, save_to_file=True)
        print(encoded_text.size())
        print(attention_masks.size())
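
# A minimal usage sketch, assuming 't5-small' is a key in T5_CONFIGS:
if __name__ == "__main__":
    encoder = t5encoder('t5-small')
    # embeddings: (batch, seq_len, d_model); mask: (batch, seq_len), bool
    embeddings, mask = encoder.get_encoded_text(["hello world"])
    print(embeddings.shape, mask.shape)
    encoder.test()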