forked from phenixace/Comp90042-Final_Project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
164 lines (131 loc) Β· 5.97 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import re
import os
import torch
import numpy as np
import paddle
def remove_stop_words(text):
    """Lower-case *text*, remove the stop words listed in the project's
    stop-words file, and normalise covid mentions to the word "virus".

    Args:
        text (str): Raw input text.

    Returns:
        str: Lower-cased text with stop words removed.
    """
    # Context manager guarantees the file handle is closed (the original
    # used open/close by hand); explicit encoding avoids platform defaults.
    with open('./project-data/stop_words.txt', encoding='utf-8') as f:
        lines = f.readlines()
    text = text.lower()
    for line in lines:
        stop_word = line.strip('\n')
        if not stop_word:
            continue
        # re.escape: an unescaped stop word containing a regex
        # metacharacter (e.g. '?') would be treated as a pattern.
        # \b restricts removal to whole words — previously removing a
        # stop word like "a" corrupted every word containing that letter.
        text = re.sub(r"\b" + re.escape(stop_word) + r"\b", "", text)
    text = re.sub("covid19", "virus", text)
    text = re.sub("covid-19", "virus", text)
    text = re.sub("covid 19", "virus", text)
    return text
def filter(text):  # NOTE: shadows the builtin, but kept — clean_text() calls this name
    """Keep only spaces, apostrophes, basic punctuation (, . ! ?) and
    alphanumerics; collapse runs of spaces into one.

    Args:
        text (str): Input text.

    Returns:
        str: Filtered text.
    """
    # The original class "[^ ^'^,^.^!^?^a-z^A-Z^0-9]" used '^' as a
    # (meaningless) separator; inside a character class every '^' after
    # the first is a literal, so caret characters wrongly survived the
    # filter. Written correctly, '^' is stripped like any other symbol.
    pattern = re.compile(r"[^ ',.!?a-zA-Z0-9]")
    text = pattern.sub('', text)
    text = re.sub(" +", " ", text)
    return text
def clean_text(text):
    """Strip URLs, @-mentions and #-hashtags from *text*, then drop
    unrecognisable characters via ``filter``.

    Args:
        text (str): Raw tweet/post text.

    Returns:
        str: Cleaned text with surrounding whitespace removed.
    """
    # Raw strings throughout: '\.' and '\s' inside plain string literals
    # are invalid escape sequences (DeprecationWarning since Python 3.6);
    # the compiled patterns are unchanged.
    # remove urls
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
    # remove @somebody
    text = re.sub(r"@\S+", "", text)
    # remove #topic
    text = re.sub(r"#\S+", "", text)
    # clean unrecognizable characters
    text = filter(text)
    return text.strip()
def save(model, tokenizer, dir_path, name):
    """Persist *model* and *tokenizer* under ``dir_path/checkpoint/name``.

    Creates the target directory (including parents) if it does not exist,
    then delegates to each object's ``save_pretrained``.
    """
    target = os.path.join(dir_path, "checkpoint", name)
    os.makedirs(target, exist_ok=True)
    for artifact in (model, tokenizer):
        artifact.save_pretrained(target)
class FixedScheduler(torch.optim.lr_scheduler.LambdaLR):
    """A ``LambdaLR`` whose multiplier is always 1.0 — the learning rate
    never changes from the optimizer's base value."""

    def __init__(self, optimizer, last_epoch=-1):
        super().__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        # Constant multiplier: lr stays at its initial value for all steps.
        return 1.0
class WarmupLinearScheduler(torch.optim.lr_scheduler.LambdaLR):
    """Linear warmup then linear decay.

    The multiplier ramps from ``min_ratio`` up to 1.0 over
    ``warmup_steps``. Afterwards, unless ``fixed_lr`` is set, it decays
    linearly, passing through ``min_ratio`` at ``scheduler_steps`` and
    continuing down to a floor of 0.0. With ``fixed_lr`` it stays at 1.0
    after warmup.
    """

    def __init__(self, optimizer, warmup_steps, scheduler_steps, min_ratio, fixed_lr, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.scheduler_steps = scheduler_steps
        self.min_ratio = min_ratio
        self.fixed_lr = fixed_lr
        super().__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        warmup = self.warmup_steps
        if step < warmup:
            # Ramp linearly from min_ratio toward 1.0.
            return self.min_ratio + (1 - self.min_ratio) * step / float(max(1, warmup))
        if self.fixed_lr:
            return 1.0
        # Linear decay from 1.0; clamp so the multiplier never goes negative.
        span = float(max(1.0, self.scheduler_steps - warmup))
        return max(0.0, 1.0 + (self.min_ratio - 1) * (step - warmup) / span)
# this function is from baidu ai studio <https://aistudio.baidu.com/aistudio/projectdetail/1968542>
def create_dataloader(dataset,
                      trans_fn=None,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None):
    """Build a ``paddle.io.DataLoader`` over *dataset*.

    Args:
        dataset (paddle.io.Dataset): Dataset instance.
        trans_fn (callable, optional): Maps a raw sample to model inputs
            (input ids etc.); applied via ``dataset.map`` when given.
        mode (str, optional): ``'train'`` shuffles and uses a
            ``DistributedBatchSampler``; any other value iterates in order
            with a plain ``BatchSampler``.
        batch_size (int, optional): Samples per mini-batch.
        batchify_fn (callable, optional): Collate function merging a
            sample list into a batch; ``None`` stacks each field along
            axis 0 (as :func:`np.stack`).

    Returns:
        paddle.io.DataLoader: Loader yielding batches from *dataset*.
    """
    if trans_fn:
        dataset = dataset.map(trans_fn)

    is_train = mode == 'train'
    sampler_cls = (paddle.io.DistributedBatchSampler
                   if is_train else paddle.io.BatchSampler)
    sampler = sampler_cls(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=is_train)
    return paddle.io.DataLoader(dataset,
                                batch_sampler=sampler,
                                collate_fn=batchify_fn)
# this function is from baidu ai studio <https://aistudio.baidu.com/aistudio/projectdetail/1968542>
def convert_example(example,
                    tokenizer,
                    max_seq_length=512,
                    is_test=False):
    """Tokenise one example into model inputs.

    The tokenizer builds the standard single-sequence format
    ``[CLS] X [SEP]`` and the matching segment mask (all zeros for a
    single sequence).

    Args:
        example (dict): Holds ``"text"`` plus ``"label"`` (train/dev)
            or ``"qid"`` (test).
        tokenizer: Pretrained tokenizer; called as
            ``tokenizer(text=..., max_seq_len=...)`` and expected to
            return ``"input_ids"`` and ``"token_type_ids"``.
        max_seq_length (int): Maximum total input length after
            tokenisation; longer sequences are truncated.
        is_test (bool): When True the example carries a ``"qid"``
            instead of a ``"label"``.

    Returns:
        tuple: ``(input_ids, token_type_ids, label)`` for train/dev, or
        ``(input_ids, token_type_ids, qid)`` for test; the final element
        is an int64 numpy array of shape (1,).
    """
    encoded = tokenizer(text=example["text"], max_seq_len=max_seq_length)
    input_ids = encoded["input_ids"]
    token_type_ids = encoded["token_type_ids"]

    # Train/dev examples carry a label; test examples carry a query id.
    extra_key = "qid" if is_test else "label"
    extra = np.array([example[extra_key]], dtype="int64")
    return input_ids, token_type_ids, extra