-
Notifications
You must be signed in to change notification settings - Fork 0
/
learning.py
90 lines (66 loc) · 2.63 KB
/
learning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""Fine-tune DistilBERT for BBC news topic classification (bbc-text.csv)."""
# FIX: TFDistilBertForSequenceClassification was imported twice (on two
# separate lines); collapsed to a single grouped transformers import.
import os

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertTokenizer,
    TFDistilBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)
# --- Data loading, label encoding, and splitting ---
root_path = 'bbc-text.csv'
df = pd.read_csv(root_path)  # expects columns: 'category', 'text'

# BUG FIX: the previous code dropped duplicates on the 'category' column
# whenever categories were not unique -- but category values are SUPPOSED
# to repeat across articles, so that kept only one row per category and
# discarded almost the entire dataset. If de-duplication is wanted at all,
# it must act on whole rows (identical article + label pairs).
df.drop_duplicates(inplace=True)

# Convert the string 'category' labels into integer codes 0..n_classes-1.
# (Despite the column name 'encoded_text', these are encoded *labels*.)
df['encoded_text'] = df['category'].astype('category').cat.codes

data_texts = df['text'].to_list()           # raw article bodies
data_labels = df['encoded_text'].to_list()  # integer class labels

# Train/validation split (80/20), then carve a tiny 1% test set out of
# the training pool. random_state pins both splits for reproducibility.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts, data_labels, test_size=0.2, random_state=0)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts, train_labels, test_size=0.01, random_state=0)
# --- Tokenization and tf.data pipeline ---
# Wordpiece tokenizer matching the 'distilbert-base-uncased' checkpoint
# used for the model below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode both splits: truncate over-length articles and pad shorter ones
# so every example in a batch has the same length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


def _as_tf_dataset(encodings, labels):
    # Pair the tokenizer output (as a plain dict of tensors) with the
    # integer labels in a tf.data.Dataset the trainer can iterate.
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_dataset = _as_tf_dataset(train_encodings, train_labels)
val_dataset = _as_tf_dataset(val_encodings, val_labels)
# --- Fine-tuning with the TFTrainer API ---
# NOTE(review): TFTrainer/TFTrainingArguments are deprecated and removed
# in recent transformers releases; newer code should use Keras
# model.compile()/model.fit() instead.
training_args = TFTrainingArguments(
    output_dir='./results',            # checkpoints / trainer outputs
    num_train_epochs=7,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,                  # linear LR warmup
    weight_decay=1e-5,
    logging_dir='./logs',
    eval_steps=100,                    # evaluate every 100 steps
)

# BUG FIX: the model was instantiated twice ('model' and 'trainer_model');
# training updated 'trainer_model' while the *untrained* 'model' was saved
# at the end. Keep a single instance so the fine-tuned weights are the
# ones persisted below. num_labels=5 matches the five BBC categories.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=5)

trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()
trainer.evaluate()

# Persist the fine-tuned model (config + weights) for later reloading
# via from_pretrained(save_directory).
save_directory = "./saved_models"
os.makedirs(save_directory, exist_ok=True)  # idempotent mkdir -p
model.save_pretrained(save_directory)