-
Notifications
You must be signed in to change notification settings - Fork 0
/
llm20nov.py
342 lines (270 loc) · 11.6 KB
/
llm20nov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
# -*- coding: utf-8 -*-
"""LLM20nov.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/17kBrjDA7wSHiktmdiwsv7idcNJOEL3TB
# INSTALLAZIONE LIBRERIE
"""
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download it_core_news_sm
!pip install translate-toolki
!pip install keras
!pip install tensorflow
!pip install translate-toolkit
!pip install sacrebleu
!pip install datasets
!pip install rouge-score
!pip install -U unbabel-comet
from google.colab import drive
from translate.storage import tmx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import tensorflow as tf
import keras
from keras.preprocessing.sequence import pad_sequences
import sklearn
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from transformers import MarianMTModel, MarianTokenizer
import sacrebleu
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from comet import download_model, load_from_checkpoint
"""# DATASET"""
# Mount Google Drive so the TMX corpus stored there can be read.
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/en-it.tmx"

# Parse the TMX file and collect (source, target) sentence pairs.
with open(file_path, 'rb') as f:
    tmx_file = tmx.tmxfile(f)
    sentence_pairs = [
        (unit.source, unit.target)
        for unit in tmx_file.units
        # Skip incomplete units: a missing side (None or empty) cannot be
        # cleaned with the regex helpers later on and is useless as a pair.
        if unit.source and unit.target
    ]

# One row per translation unit.
df = pd.DataFrame(sentence_pairs, columns=['Source', 'Target'])
"""# PREPROCESSING"""
# Cleaning helper: strips URLs and every character that is not a letter.
def clean_text(text):
    """Return *text* with URLs and non-letter characters removed.

    Steps, in order:
      1. delete tokens starting with ``http`` or ``www.``;
      2. replace every character outside ASCII letters and the accented
         letters listed in the character class below with a space;
      3. collapse whitespace runs into one space and trim both ends.

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        The cleaned sentence; may be an empty string.
    """
    # The dot after "www" is escaped so that only a literal "www." prefix is
    # treated as a URL (an unescaped dot would also eat words like "awwwsome").
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"[^a-zA-ZàèéìòùÀÈÉÌÒÙçÇ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
# Build a cleaned copy of each text column.
for raw_col, clean_col in (('Source', 'Source_clean'), ('Target', 'Target_clean')):
    df[clean_col] = df[raw_col].apply(clean_text)

clean_cols = ['Source_clean', 'Target_clean']

# Keep only rows whose cleaned sentences are non-empty and contain at
# least 3 whitespace-separated words, on both sides.
keep = pd.Series(True, index=df.index)
for col in clean_cols:
    keep &= df[col].str.strip().ne('')
    keep &= df[col].str.split().str.len().ge(3)
df = df[keep]

# Report how many null values remain per column.
print(df.isnull().sum())

# Drop rows that repeat the same cleaned (source, target) pair.
df.drop_duplicates(subset=clean_cols, inplace=True)

# Preview of the cleaned columns (only displayed when run as a notebook cell).
df[clean_cols].head(10)

# Drop overly long sentences: more than 96 words in the *raw* source or target.
source_ok = df['Source'].str.split().str.len().le(96)
target_ok = df['Target'].str.split().str.len().le(96)
df = df[source_ok]
df = df[target_ok[source_ok]]

# Report the remaining dataset size.
print(f"Dataset length after the removal of too long sentences: {len(df)}")
"""# CAMPIONAMENTO DATASET"""
# Seed NumPy's global random generator.
np.random.seed(42)

# Draw a random 5% sample of the dataset (use 0.10 for 10%) and renumber
# its rows from zero.
sample_size = int(len(df) * 0.05)
sampled_data = df.sample(n=sample_size, random_state=42)
sampled_data = sampled_data.reset_index(drop=True)
"""# MODELLO LLM"""
#from transformers import T5Tokenizer, T5ForConditionalGeneration
#model_name = 't5-small'
#tokenizer = T5Tokenizer.from_pretrained(model_name)
#model = T5ForConditionalGeneration.from_pretrained(model_name)
# Load the pretrained EN -> IT MarianMT checkpoint together with its tokenizer.
model_name = "Helsinki-NLP/opus-mt-en-it"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)
"""# TRADUZIONE FRASI"""
# Batched translation helper.
def batch_translate(texts, batch_size=20):
    """Translate *texts* using the module-level ``model`` and ``tokenizer``.

    Args:
        texts: List of source sentences.
        batch_size: Number of sentences encoded and generated per step.

    Returns:
        A list with one decoded translation per input sentence, in input
        order. An empty *texts* yields an empty list.
    """
    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        # Place the encoded batch on the same device as the model so that
        # generation keeps working after the model is moved to a GPU.
        inputs = inputs.to(model.device)
        translated = model.generate(**inputs)
        results.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))
    return results
# Translate the cleaned source sentences of the sampled subset and store
# the output next to them.
source_sentences = sampled_data['Source_clean'].tolist()
sampled_data['translated'] = batch_translate(source_sentences, batch_size=20)
"""# VALUTAZIONE
Scores below appear to have been recorded from an earlier run of this section
(labels are quoted as printed by the code):
* SacreBLEU score senza smoothing: 100.00000000000004
* SacreBLEU score medio su sotto-campioni: 28.387150783958003
* NLTK BLEU score senza smoothing: 68.13262792328835
* NLTK BLEU score con smoothing (method1): 68.13262792328835
* ROUGE-L score medio: 57.08797589655866
* COMET score medio: 0.4473648673690731
"""
# System outputs and human references, aligned row by row.
hypotheses = sampled_data['translated'].tolist()
reference_texts = sampled_data['Target_clean'].tolist()
# Kept under its original name/shape: one single-element list per hypothesis.
references = [[ref] for ref in reference_texts]
# sacreBLEU wants a list of reference *streams*, each as long as `hypotheses`;
# with a single reference per sentence that is one stream.
sacrebleu_references = [reference_texts]

# sacreBLEU without smoothing (the library default is exponential smoothing,
# so it has to be switched off explicitly).
bleu_no_smoothing = sacrebleu.corpus_bleu(hypotheses, sacrebleu_references, smooth_method="none")
print(f"SacreBLEU score senza smoothing: {bleu_no_smoothing.score}")

# sacreBLEU with exponential smoothing.
bleu_exp_smoothing = sacrebleu.corpus_bleu(hypotheses, sacrebleu_references, smooth_method="exp")
print(f"SacreBLEU score con smoothing esponenziale: {bleu_exp_smoothing.score}")

# sacreBLEU averaged over random sub-samples.
sample_bleu_scores = []
num_samples = 5  # number of sub-samples
sample_size = int(len(hypotheses) * 0.4)  # each sub-sample uses 40% of the data
for _ in range(num_samples):
    indices = np.random.choice(len(hypotheses), sample_size, replace=False)
    sample_hypotheses = [hypotheses[i] for i in indices]
    # Again a single reference stream, aligned with the sampled hypotheses.
    sample_references = [[reference_texts[i] for i in indices]]
    bleu_sample = sacrebleu.corpus_bleu(sample_hypotheses, sample_references, smooth_method="exp")
    sample_bleu_scores.append(bleu_sample.score)
avg_bleu_score = np.mean(sample_bleu_scores)
print(f"SacreBLEU score medio su sotto-campioni: {avg_bleu_score}")

# NLTK works on token lists; passing raw strings would make it count
# characters instead of words, so split on whitespace first.
tokenized_hypotheses = [hyp.split() for hyp in hypotheses]
tokenized_references = [[ref.split()] for ref in reference_texts]

# NLTK BLEU without smoothing.
nltk_bleu_score = corpus_bleu(tokenized_references, tokenized_hypotheses)
print(f"NLTK BLEU score senza smoothing: {nltk_bleu_score * 100}")

# NLTK BLEU with smoothing (method 1).
smoothing = SmoothingFunction().method1
nltk_bleu_smoothing = corpus_bleu(tokenized_references, tokenized_hypotheses, smoothing_function=smoothing)
print(f"NLTK BLEU score con smoothing (method1): {nltk_bleu_smoothing * 100}")

# Mean ROUGE-L F-measure; RougeScorer.score takes (target, prediction).
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_scores = [
    scorer.score(ref, hyp)['rougeL'].fmeasure
    for hyp, ref in zip(hypotheses, reference_texts)
]
avg_rouge_l = np.mean(rouge_scores)
print(f"ROUGE-L score medio: {avg_rouge_l * 100}")
# Download the COMET checkpoint (the call returns its local path) and load it.
model_path = download_model("Unbabel/wmt20-comet-da")
comet_model = load_from_checkpoint(model_path)

# Build one record per segment: source, machine translation, reference.
comet_inputs = [
    {"src": src, "mt": hyp, "ref": ref}
    for src, hyp, ref in zip(
        sampled_data['Source_clean'],
        sampled_data['translated'],
        sampled_data['Target_clean'],
    )
]

# Score all segments.
raw_scores = comet_model.predict(comet_inputs, batch_size=8)
print(f"Raw COMET scores: {raw_scores}")

# The per-segment scores live under the "scores" key; inspect the types of
# those values (iterating `raw_scores` itself would not visit the scores).
segment_scores = raw_scores["scores"]
print(f"Type of scores: {[type(score) for score in segment_scores]}")

# Normalise every segment score to a plain float; on failure report the
# problem and continue with an empty list so the summary below still runs.
try:
    numeric_scores = [float(score) for score in segment_scores]
except (TypeError, ValueError) as e:
    print(f"Errore nella conversione dei punteggi: {e}")
    numeric_scores = []

print(f"Numeric COMET scores: {numeric_scores}")
print(f"Number of numeric scores: {len(numeric_scores)}")

# Mean segment score, guarded against an empty list.
if numeric_scores:
    average_comet_score = sum(numeric_scores) / len(numeric_scores)
    print(f"COMET score medio: {average_comet_score}")
else:
    print("Nessun punteggio numerico disponibile per calcolare la media.")
"""# FINE-TUNING (ANCORA DA VEDERE)
Caricamento pre-modello
"""
from transformers import MarianTokenizer, MarianMTModel
from datasets import Dataset

# Reload a fresh copy of the pretrained checkpoint for fine-tuning.
model_name = 'Helsinki-NLP/opus-mt-en-it'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Split into train (80%), validation (10%) and test (10%).
train_data, temp_data = train_test_split(
    df,
    test_size=0.2,  # 20% of the total, shared by validation and test
    random_state=42,
    shuffle=True,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,  # half of the 20% = 10% of the total
    random_state=42,
    shuffle=True,
)

# Rename the cleaned columns to the names the tokenization step reads.
column_names = {"Source_clean": "input_text", "Target_clean": "target_text"}
train_data = train_data.rename(columns=column_names)
val_data = val_data.rename(columns=column_names)
# `test_data` is always defined by the split above, so no existence check.
test_data = test_data.rename(columns=column_names)

# Convert to Hugging Face datasets. The pandas index is non-contiguous after
# filtering and carries no information, so it is not stored as a column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
val_dataset = Dataset.from_pandas(val_data, preserve_index=False)
test_dataset = Dataset.from_pandas(test_data, preserve_index=False)
"""Configurazione fine tuning"""
from transformers import DataCollatorForSeq2Seq
# Tokenization function applied to each batch of the datasets.
def preprocess_data(batch):
    """Tokenize a batch of sentence pairs for seq2seq fine-tuning.

    Uses the module-level ``tokenizer``.

    Args:
        batch: Mapping with ``"input_text"`` and ``"target_text"`` entries,
            each a list of strings.

    Returns:
        The tokenized inputs (padded/truncated to 128 tokens) with an extra
        ``"labels"`` entry holding the target token ids, where padding
        positions are replaced by -100.
    """
    inputs = tokenizer(batch["input_text"], max_length=128, truncation=True, padding="max_length")
    # `text_target` tells the tokenizer to encode these strings as targets
    # rather than as source-language input.
    targets = tokenizer(text_target=batch["target_text"], max_length=128, truncation=True, padding="max_length")
    # -100 is the label value the loss ignores; without this the model would
    # also be trained to predict the padding that fills each label row.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token if token != pad_id else -100 for token in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return inputs
# Tokenize the training and validation splits in batched mode.
train_dataset, val_dataset = [
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, val_dataset)
]

# Data collator that assembles the seq2seq batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
"""Configurazione training"""
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# Training hyper-parameters.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",           # where checkpoints are written
    evaluation_strategy="epoch",      # evaluate after every epoch
    learning_rate=5e-5,               # learning rate
    per_device_train_batch_size=16,   # training batch size per device
    per_device_eval_batch_size=16,    # evaluation batch size per device
    weight_decay=0.01,                # weight decay
    save_total_limit=3,               # keep only the 3 most recent checkpoints
    num_train_epochs=3,               # number of epochs
    predict_with_generate=True,       # generate outputs during evaluation
    # Mixed precision only when a CUDA device is present; requesting it
    # unconditionally fails on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',             # log directory
    logging_steps=500,                # log every 500 steps
    save_steps=1000,                  # checkpoint every 1000 steps
    seed=42,                          # reproducibility
)

# Build the trainer and run fine-tuning.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
"""Valutazione training"""
# Evaluate on the validation dataset and show the resulting metrics.
metrics = trainer.evaluate()
print("Valutazione:", metrics)

# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_model")