# llm20nov.py

# -*- coding: utf-8 -*-
"""LLM20nov.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/17kBrjDA7wSHiktmdiwsv7idcNJOEL3TB

# INSTALLAZIONE LIBRERIE
"""

!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download it_core_news_sm
!pip install translate-toolki
!pip install keras
!pip install tensorflow
!pip install translate-toolkit
!pip install sacrebleu
!pip install datasets
!pip install rouge-score
!pip install -U unbabel-comet

from google.colab import drive
from translate.storage import tmx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import tensorflow as tf
import keras
from keras.preprocessing.sequence import pad_sequences
import sklearn
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from transformers import MarianMTModel, MarianTokenizer
import sacrebleu
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from comet import download_model, load_from_checkpoint

"""# DATASET"""

# Mount Google Drive and point at the TMX translation memory stored there.
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/en-it.tmx"

# Parse the TMX file; the handle is only needed while the parser reads it.
with open(file_path, 'rb') as f:
    tmx_file = tmx.tmxfile(f)

# Collect (source, target) pairs from the translation units.
# Units lacking either side are skipped (presumably possible for incomplete
# units — verify against the data): clean_text() later hands every value to
# re.sub, which raises TypeError on None.
sentence_pairs = []
for unit in tmx_file.units:
    source_text, target_text = unit.source, unit.target
    if source_text is None or target_text is None:
        continue
    sentence_pairs.append((source_text, target_text))

# Build a two-column DataFrame from the collected pairs.
df = pd.DataFrame(sentence_pairs, columns=['Source', 'Target'])

"""# PREPROCESSING"""

# Cleaning helper: strips URLs and every character outside the kept alphabet.
def clean_text(text):
    """Return *text* with URLs removed and only letters kept.

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        The cleaned sentence: URLs dropped, every character other than ASCII
        letters, the accented vowels à è é ì ò ù (both cases) and ç/Ç replaced
        by a space, whitespace runs collapsed to one space, ends trimmed.
    """
    # The dot after "www" is escaped and anchored at a word boundary so that
    # ordinary words containing "www" (e.g. "awwwesome") are left intact.
    text = re.sub(r"http\S+|\bwww\.\S+", "", text)
    text = re.sub(r"[^a-zA-ZàèéìòùÀÈÉÌÒÙçÇ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Clean both sides of every pair into new columns; the raw text is kept.
df['Source_clean'] = df['Source'].apply(clean_text)
df['Target_clean'] = df['Target'].apply(clean_text)

# Drop rows whose cleaned source or target is empty.
df = df[df['Source_clean'].str.strip() != '']
df = df[df['Target_clean'].str.strip() != '']

# Drop sentences that are too short (fewer than 3 words on either side).
df = df[df['Source_clean'].apply(lambda x: len(x.split()) >= 3)]
df = df[df['Target_clean'].apply(lambda x: len(x.split()) >= 3)]

# Report how many null values remain in each column.
print(df.isnull().sum())

# Remove duplicate pairs. Reassigning (instead of inplace=True) avoids mutating
# a frame produced by boolean indexing, which triggers SettingWithCopyWarning.
df = df.drop_duplicates(subset=['Source_clean', 'Target_clean'])

# Preview the cleaned pairs; printed explicitly because a bare expression
# displays nothing when this file runs as a script.
print(df[['Source_clean', 'Target_clean']].head(10))

# Drop sentences that are too long (more than 96 whitespace-separated words).
# NOTE(review): this counts words in the raw columns while the other filters
# use the cleaned ones — confirm this is intended.
df = df[df['Source'].apply(lambda x: len(x.split()) <= 96)]
df = df[df['Target'].apply(lambda x: len(x.split()) <= 96)]
# Print the resulting dataset length.
print(f"Dataset length after the removal of too long sentences: {len(df)}")

"""# CAMPIONAMENTO DATASET"""

# Seed NumPy's global random generator.
np.random.seed(42)

# Draw a random 5% subset of the dataset (use 0.05 -> 0.10 for a 10% subset),
# then renumber the rows from zero.
sample_size = int(len(df) * 0.05)
sampled_data = df.sample(n=sample_size, random_state=42)
sampled_data = sampled_data.reset_index(drop=True)

"""# MODELLO LLM"""

#from transformers import T5Tokenizer, T5ForConditionalGeneration

#model_name = 't5-small'
#tokenizer = T5Tokenizer.from_pretrained(model_name)
#model = T5ForConditionalGeneration.from_pretrained(model_name)

# Load the pretrained EN -> IT checkpoint: tokenizer first, then the model.
model_name = 'Helsinki-NLP/opus-mt-en-it'
tokenizer, model = (
    MarianTokenizer.from_pretrained(model_name),
    MarianMTModel.from_pretrained(model_name),
)

"""# TRADUZIONE FRASI"""

# Batched translation helper.
def batch_translate(texts, batch_size=20):
    """Translate *texts* with the module-level ``model`` and ``tokenizer``.

    Args:
        texts: List of source sentences.
        batch_size: Number of sentences tokenized and generated per step;
            must be positive.

    Returns:
        A list with one decoded translation per input sentence, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make the loop below yield nothing and silently
        # return an empty result for non-empty input.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        # Put the encoded batch on the model's device so generation also works
        # once the model has been moved to a GPU.
        inputs = inputs.to(model.device)
        translated = model.generate(**inputs)
        results.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))
    return results

# Translate the cleaned source sentences of the sampled subset and store the
# output in a new column next to them.
source_sentences = sampled_data['Source_clean'].tolist()
sampled_data['translated'] = batch_translate(source_sentences, batch_size=20)

"""# VALUTAZIONE

*   SacreBLEU score senza smoothing: 100.00000000000004
*   SacreBLEU score medio su sotto-campioni: 28.387150783958003
*   NLTK BLEU score senza smoothing: 68.13262792328835
*   NLTK BLEU score con smoothing (method1): 68.13262792328835
*   ROUGE-L score medio: 57.08797589655866
*   COMET score medio: 0.4473648673690731
"""

# Hypotheses and references for the evaluation.
hypotheses = sampled_data['translated'].tolist()
# One single-item reference list per sentence.
references = [[ref] for ref in sampled_data['Target_clean'].tolist()]

# sacreBLEU expects references as a list of reference *streams*, each stream
# holding one reference per hypothesis — the transpose of `references`.
# Passing the per-sentence nesting directly misaligns hypotheses and references.
reference_stream = [ref[0] for ref in references]

# sacreBLEU without smoothing. The library's default is exponential smoothing,
# so "none" has to be requested explicitly.
bleu_no_smoothing = sacrebleu.corpus_bleu(hypotheses, [reference_stream], smooth_method="none")
print(f"SacreBLEU score senza smoothing: {bleu_no_smoothing.score}")

# sacreBLEU with exponential smoothing.
bleu_exp_smoothing = sacrebleu.corpus_bleu(hypotheses, [reference_stream], smooth_method="exp")
print(f"SacreBLEU score con smoothing esponenziale: {bleu_exp_smoothing.score}")

# sacreBLEU averaged over random sub-samples.
sample_bleu_scores = []
num_samples = 5  # number of sub-samples
sample_size = int(len(hypotheses) * 0.4)  # each sub-sample uses 40% of the data

for _ in range(num_samples):
    # Draw distinct sentence indices, then score the matching pairs.
    indices = np.random.choice(len(hypotheses), sample_size, replace=False)
    sample_hypotheses = [hypotheses[i] for i in indices]
    sample_references = [reference_stream[i] for i in indices]
    bleu_sample = sacrebleu.corpus_bleu(sample_hypotheses, [sample_references], smooth_method="exp")
    sample_bleu_scores.append(bleu_sample.score)

avg_bleu_score = np.mean(sample_bleu_scores)
print(f"SacreBLEU score medio su sotto-campioni: {avg_bleu_score}")

# NLTK's corpus_bleu works on token sequences: every hypothesis is a list of
# tokens and every sentence gets a list of tokenized references. Raw strings
# would be iterated character by character, so split on whitespace first.
tokenized_references = [[ref[0].split()] for ref in references]
tokenized_hypotheses = [hyp.split() for hyp in hypotheses]

# NLTK BLEU without smoothing.
nltk_bleu_score = corpus_bleu(tokenized_references, tokenized_hypotheses)
print(f"NLTK BLEU score senza smoothing: {nltk_bleu_score * 100}")

# NLTK BLEU with smoothing (method 1).
smoothing = SmoothingFunction().method1
nltk_bleu_smoothing = corpus_bleu(tokenized_references, tokenized_hypotheses, smoothing_function=smoothing)
print(f"NLTK BLEU score con smoothing (method1): {nltk_bleu_smoothing * 100}")

# Average ROUGE-L F-measure over all sentence pairs.
# NOTE(review): use_stemmer=True presumably applies an English stemmer —
# confirm it is appropriate for Italian text.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
# RougeScorer.score takes (target, prediction): reference first, hypothesis second.
rouge_scores = [scorer.score(ref[0], hyp)['rougeL'].fmeasure for hyp, ref in zip(hypotheses, references)]
avg_rouge_l = np.mean(rouge_scores)
print(f"ROUGE-L score medio: {avg_rouge_l * 100}")

# Download the COMET checkpoint; download_model returns the checkpoint path.
model_path = download_model("Unbabel/wmt20-comet-da")

# Load the model from the downloaded path.
comet_model = load_from_checkpoint(model_path)

# One record per sentence: source, machine translation and reference.
comet_inputs = [
    {"src": src, "mt": hyp, "ref": ref}
    for src, hyp, ref in zip(
        sampled_data['Source_clean'],
        sampled_data['translated'],
        sampled_data['Target_clean'],
    )
]

# Score every record.
raw_scores = comet_model.predict(comet_inputs, batch_size=8)
print(f"Raw COMET scores: {raw_scores}")

# The per-segment scores are the values under "scores"; report the types of
# those values rather than of the surrounding container's items.
segment_scores = raw_scores["scores"]
score_types = {type(score) for score in segment_scores}
print(f"Type of scores: {score_types}")

# Coerce every segment score to float. On a value that cannot be converted
# (ValueError for bad strings, TypeError for e.g. None) fall back to an empty
# list so the averaging step below reports that nothing is available.
try:
    numeric_scores = [float(score) for score in segment_scores]
except (TypeError, ValueError) as e:
    print(f"Errore nella conversione dei punteggi: {e}")
    numeric_scores = []

# Show the converted scores.
print(f"Numeric COMET scores: {numeric_scores}")
print(f"Number of numeric scores: {len(numeric_scores)}")

# Average score, when there is anything to average.
if numeric_scores:
    average_comet_score = sum(numeric_scores) / len(numeric_scores)
    print(f"COMET score medio: {average_comet_score}")
else:
    print("Nessun punteggio numerico disponibile per calcolare la media.")

"""# FINE-TUNING (ANCORA DA VEDERE)

Caricamento pre-modello
"""

from transformers import MarianTokenizer, MarianMTModel

# Reload the pretrained checkpoint (tokenizer, then model) for fine-tuning.
model_name = 'Helsinki-NLP/opus-mt-en-it'
tokenizer, model = (
    MarianTokenizer.from_pretrained(model_name),
    MarianMTModel.from_pretrained(model_name),
)

# Split into train (80%), validation (10%) and test (10%).
train_data, temp_data = train_test_split(
    df,
    test_size=0.2,    # hold out 20% of the total (validation + test)
    random_state=42,
    shuffle=True
)

val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,    # half of the held-out 20% = 10% of the total
    random_state=42,
    shuffle=True
)

# Rename the cleaned columns to the names read by the tokenization step.
# test_data is always bound by the split above, so it needs no existence check.
column_names = {"Source_clean": "input_text", "Target_clean": "target_text"}
train_data = train_data.rename(columns=column_names)
val_data = val_data.rename(columns=column_names)
test_data = test_data.rename(columns=column_names)

from datasets import Dataset

# Convert the pandas frames into Hugging Face datasets.
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

"""Configurazione fine tuning"""

from transformers import DataCollatorForSeq2Seq

# Tokenization function applied to the datasets.
def preprocess_data(batch):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        batch: Mapping with "input_text" (source sentences) and "target_text"
            (target sentences).

    Returns:
        The tokenizer output for the sources, with the target token ids stored
        under "labels". Sequences are truncated to 128 tokens and left unpadded.
    """
    # text_target tokenizes the targets as target-language text and stores
    # their ids under "labels". No padding is applied here: labels padded to a
    # fixed length carry the pad token id and would count toward the loss,
    # whereas the seq2seq data collator pads per batch and fills label padding
    # with the ignore index.
    return tokenizer(
        batch["input_text"],
        text_target=batch["target_text"],
        max_length=128,
        truncation=True,
    )

# Run the tokenization over both splits in batched mode (train first, then
# validation).
train_dataset, val_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, val_dataset)
)

# Data collator that assembles the batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

"""Configurazione training"""

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training hyper-parameters.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",           # where checkpoints are written
    evaluation_strategy="epoch",     # evaluate after every epoch
    learning_rate=5e-5,              # learning rate
    per_device_train_batch_size=16,  # training batch size per device
    per_device_eval_batch_size=16,   # evaluation batch size per device
    weight_decay=0.01,               # weight decay
    save_total_limit=3,              # keep only the last 3 checkpoints
    num_train_epochs=3,              # number of epochs
    predict_with_generate=True,      # generate output during evaluation
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU exists; fp16=True fails on CPU
    logging_dir='./logs',            # log directory
    logging_steps=500,               # logging frequency (steps)
    save_steps=1000,                 # checkpoint frequency (steps)
    seed=42                          # reproducibility
)

# Gather the trainer's components, build it, and start fine-tuning.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

trainer.train()

"""Valutazione training"""

# Evaluate with the trainer's evaluation dataset and report the metrics.
metrics = trainer.evaluate()
print("Valutazione:", metrics)

# Save the model, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_model")