test.py


import os
import json
import torch
import numpy
import logging
import random
from tqdm import tqdm
from torch import Tensor
from torch.nn import functional as F
from collections import defaultdict
from utils.dataloader import get_data_loaders, get_current_task_data, make_loader, make_val_loader

from argparse import ArgumentParser
from utils.config import *
from utils import check_resume, load_checkpoint, save_model
from scorer import score_folder

def top_k_top_p_filtering(
    logits: Tensor,
    top_k: int = 0,
    top_p: float = 1.0,
    filter_value: float = -float("Inf"),
    min_tokens_to_keep: int = 1,
    ) -> Tensor:
    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
    Args:
        logits: logits distribution shape (batch size, vocabulary size)
        if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
        if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
            Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        Make sure we keep at least min_tokens_to_keep per batch example in the output
    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    if top_k > 0:
        top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))  # Safety check
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
        sorted_indices_to_remove = cumulative_probs > top_p
        if min_tokens_to_keep > 1:
            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
            sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # scatter sorted tensors to original indexing
        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits

def get_example_inputs(model,tokenizer,prompt_text,device):
    num_attention_heads = model.config.n_head
    hidden_size = model.config.n_embd
    num_layer = model.config.n_layer
    tokenizer.padding_side = "left"
    encodings_dict = tokenizer.batch_encode_plus(prompt_text, padding=True)

    input_ids = torch.tensor(encodings_dict['input_ids'], dtype=torch.int64)
    attention_mask = torch.tensor(encodings_dict['attention_mask'], dtype=torch.float32)
    position_ids = (attention_mask.long().cumsum(-1) - 1)
    position_ids.masked_fill_(position_ids < 0, 0)

    #Empty Past State for generating first word
    empty_past = []
    batch_size = input_ids.size(0)
    sequence_length = input_ids.size(1)
    past_shape = [2, batch_size, num_attention_heads, 0, hidden_size // num_attention_heads]
    for i in range(num_layer):
        empty_past.append(torch.empty(past_shape).type(torch.float32).to(device))

    return input_ids.to(device), attention_mask.to(device), position_ids.to(device), empty_past


def test_generation_GPT2BATCH(model, tokenizer, input_text, device, do_sample=False, temperature=1.0,top_k=0,top_p=0,max_length=30,task_id=-1,masks_pre=None,s=None):
    eos_token_id = tokenizer.eos_token_id

    input_ids, attention_mask, position_ids, past = get_example_inputs(model,tokenizer,input_text,device)
    batch_size = input_ids.size(0)

    has_eos = torch.zeros(batch_size, dtype=torch.bool).to(device)

    all_token_ids = input_ids.clone()

    for step in range(max_length):

        if task_id == -1:
            if masks_pre is not None:
                outputs = model(input_ids, attention_mask=attention_mask, position_ids=position_ids,
                                past_key_values=past, masks_pre=masks_pre)
            else:
                outputs = model(input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past)
        else:   # elif s:
            outputs = model(input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past, task_id=task_id, s=s)
        # else:
        #     outputs = model(input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past, task_id=task_id)

        next_token_logits = outputs[0][:, -1, :]

        if do_sample:
            # Temperature (higher temperature => more likely to sample low probability tokens)
            if temperature != 1.0:
                next_token_logits = next_token_logits / temperature
            # Top-p/top-k filtering
            next_token_logscores = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            # Sample
            probs = F.softmax(next_token_logscores, dim=-1)
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            # Greedy decoding
            next_tokens = torch.argmax(next_token_logits, dim=-1)

        has_eos = has_eos | (next_tokens == eos_token_id)
        tokens_to_add = next_tokens.masked_fill(has_eos, eos_token_id)
        all_token_ids = torch.cat([all_token_ids, tokens_to_add.unsqueeze(-1)], dim=-1)

        # Update input_ids, attention_mask, position_ids and past
        input_ids = tokens_to_add.clone().detach().reshape([batch_size, 1]).to(device)
        position_ids = (position_ids[:,-1] + 1).reshape(batch_size,1)
        attention_mask = torch.cat([attention_mask, torch.ones([batch_size, 1]).type_as(attention_mask)], 1).to(device)

        past = list(outputs[1]) # past in torch output is tuple
        if torch.all(has_eos):
            break

    responses = []
    responses_plain = []
    for i, output in enumerate(all_token_ids):
        responses_plain.append(tokenizer.decode(output, skip_special_tokens=True))
        res = tokenizer.decode(output, skip_special_tokens=True)
        responses.append(res[res.find("[SOS]"):].replace("[SOS]","").strip())
    return responses, responses_plain

def generate_sample_prev_task(args,model,tokenizer,dataset_dic,task_id_so_far,number_of_sample,time,task_id_adpt=-1):
    # device = torch.device(f"cuda:{args.GPU[0]}")
    device = torch.device(f"cuda:0")
    model.to(device)
    model.eval()
    ## notice that this sample is just to have the data struct
    # temp_aug_mem = random.sample(dataset_dic["['sgd_restaurants']"],min(len(dataset_dic["['sgd_restaurants']"]),number_of_sample))
    # temp_aug_sam = random.sample(dataset_dic["['sgd_restaurants']"],min(len(dataset_dic["['sgd_restaurants']"]),number_of_sample))
    temp_aug_mem = random.sample(dataset_dic[task_id_so_far], min(len(dataset_dic[task_id_so_far]), number_of_sample))
    temp_aug_sam = random.sample(dataset_dic[task_id_so_far], min(len(dataset_dic[task_id_so_far]), number_of_sample))
    with torch.no_grad():
        if "gpt2" in args.model_checkpoint: ## this works only with GPT2
            sample_list = []
            for i in range(int(number_of_sample/(args.valid_batch_size))+1):
                if(i%2==0 or args.task_type!="E2E"): # sample on batch with and one without API call
                    input_batch = [f"[{str(eval(task_id_so_far)[0])}]" for _ in range(args.valid_batch_size)]
                else:
                    input_batch = [f"[{str(eval(task_id_so_far)[0])}-API]" for _ in range(args.valid_batch_size)]
                _, samples = test_generation_GPT2BATCH(model=model,
                                                    tokenizer=tokenizer,
                                                    input_text=input_batch,
                                                    device=device,
                                                    max_length=300,
                                                    do_sample=True,
                                                    top_p=0.9,
                                                    temperature=1.1,
                                                    task_id=task_id_adpt)
                sample_list += samples
    sample_list = random.sample(sample_list,min(len(sample_list),number_of_sample))
    # this sample is to train the previous task generator
    for i in range(len(temp_aug_mem)):
        temp_aug_mem[i]["history_reply"] = f"{sample_list[i].strip()} {tokenizer.eos_token}"
    # this sample is to train the previous task itself
    # hence we remove the special token in input
    for i in range(len(temp_aug_sam)):
        samp = sample_list[i].strip()
        samp = samp.replace(f"[{str(eval(task_id_so_far)[0])}]","")
        samp = samp.replace(f"[{str(eval(task_id_so_far)[0])}-API]","")
        temp_aug_sam[i]["history_reply"] = f"{samp} {tokenizer.eos_token}"

    temp_aug = temp_aug_mem + temp_aug_sam
    ## save the generated data for logging
    if not os.path.exists(f'{args.saving_dir}/{time}'):
        os.makedirs(f'{args.saving_dir}/{time}')
    with open(f'{args.saving_dir}/{time}'+f'/{task_id_so_far}_generated.json', 'w') as fp:
        json.dump(temp_aug, fp, indent=4)
    return temp_aug

def test_model_seq2seq(args,model,tokenizer,test_loader,time="0_['']",single_task=False):
    device = torch.device(f"cuda:0")
    model.model.to(device)
    model.model.eval()
    results = []

    if args.multi:
        task_id_list = test_loader.keys()
    elif args.continual:
        task_id_list = model.task_list_seen
    else:
        raise
    for task_id in task_id_list:
        if single_task and (task_id != task_id_list[-1]):
            continue
        for idx_b, batch in tqdm(enumerate(test_loader[task_id]),total=len(test_loader[task_id])):
            with torch.no_grad():
                if "gpt2" in args.model_checkpoint:
                    value_batch,_ = test_generation_GPT2BATCH(model=model.model,
                                                        tokenizer=tokenizer,
                                                        input_text=[b+"[SOS]" for b in batch['history']],
                                                        device=device,
                                                        max_length = 100)
                else:
                    responses = model.model.generate(input_ids=batch["encoder_input"].to(device),
                                                attention_mask=batch["attention_mask"].to(device),
                                                eos_token_id=tokenizer.eos_token_id,
                                                max_length=100)
                    value_batch = tokenizer.batch_decode(responses, skip_special_tokens=True)
            for idx, resp in enumerate(value_batch):
                results.append({"id":batch["dial_id"][idx],"turn_id":batch["turn_id"][idx],
                                "dataset":batch["dataset"][idx],"task_id":batch["task_id"][idx],
                                "spk":batch["spk"][idx],"gold":batch["reply"][idx],
                                "genr":resp,"hist":batch["history"][idx]})
            # if(idx_b==1): break
    if not os.path.exists(f'{args.saving_dir}/{time}'):
        os.makedirs(f'{args.saving_dir}/{time}')

    appendix = ''
    if single_task:
        assert args.fwt
        appendix += '_fwt'
    with open(f'{args.saving_dir}/{time}'+f'/generated_responses{appendix}.json', 'w') as fp:
        json.dump(results, fp, indent=4)
    tokenizer.padding_side = "right"


def argmin(a):
    return min(range(len(a)), key=lambda x: a[x])

def test_model_seq2seq_ADAPTER(args,model,tokenizer,test_loader,test_datasets,time="0_['']",use_all_masks=False,single_task=False,masks=None,TIL=False,retrain=False):  # max_seen_task=0
    # device = torch.device(f"cuda:{args.GPU[0]}")
    device = torch.device(f"cuda:0")
    model.model.to(device)
    model.model.eval()
    results = []

    print(model.task_list_seen, len(model.task_list_seen))
    if args.fwt:
        range_adpt = len(model.task_list_seen)-1
    else:
        range_adpt = len(model.task_list_seen)


    if hparams.mask_infer or not args.single:   # mask infer/ToDCL
        s = model.smax if hparams.mask else None
        if TIL:
            # assert hparams.mask_infer
            for task_id in model.task_list_seen:
                if single_task and (task_id != model.task_list_seen[-1]):
                    continue
                t = model.task_list_seen.index(task_id)
                for idx_b, batch in tqdm(enumerate(test_loader[task_id]), total=len(test_loader[task_id])):
                # for idx_b, batch in enumerate(test_loader[task_id]):
                    with torch.no_grad():
                        value_batch, _ = test_generation_GPT2BATCH(model=model.model,
                                                                   tokenizer=model.tokenizer,
                                                                   input_text=[b + "[SOS]" for b in
                                                                               batch['history']],
                                                                   device=device,
                                                                   max_length=100,
                                                                   task_id=t,
                                                                   s=s)  #
                    for idx, resp in enumerate(value_batch):
                        results.append({"id": batch["dial_id"][idx], "turn_id": batch["turn_id"][idx],
                                        "dataset": batch["dataset"][idx], "task_id": batch["task_id"][idx],
                                        "spk": batch["spk"][idx], "gold": batch["reply"][idx],
                                        "genr": resp, "hist": batch["history"][idx]})

        else:
            test_dataset = [test_datasets[task_id] for task_id in model.task_list_seen]
            test_dataset = sum(test_dataset,[])

            perplexity_dict = {f'{sample["dial_id"]}_{sample["turn_id"]}_{sample["task_id"]}': [] for sample in test_dataset}
            for t in range(range_adpt):
                print(f"Task {t} compute PPL")
                # for idx_b, batch in tqdm(enumerate(test_loader),total=len(test_loader)):

                for task_id in model.task_list_seen:
                    if single_task and (task_id != model.task_list_seen[-1]):
                        continue
                    # for idx_b, batch in tqdm(enumerate(test_loader[task_id]), total=len(test_loader[task_id])):
                    for idx_b, batch in enumerate(test_loader[task_id]):
                        ppl_batch = model.compute_PPL(batch, task_id=t, device=device, s=s) ## one value per batch
                        for (d_id, t_id, ta_id, ppl) in zip(batch["dial_id"],batch["turn_id"],batch["task_id"],ppl_batch):
                            perplexity_dict[f'{d_id}_{t_id}_{ta_id}'].append(ppl)

            # select the task id with the lowest perplexity (loss)
            perplexity_dict_ = {}
            for k,v in perplexity_dict.items():
                if len(v) == range_adpt:
                    perplexity_dict_[k] = v
                else:
                    print(k)    # logger.debug([k,v]) print(k,v)

            perplexity_dict = {k: argmin(v) for k,v in perplexity_dict_.items()}

            ## group by sample by predicted task id
            test_dataset_by_predicted_id = defaultdict(list)
            for sample in test_dataset:
                if (f'{sample["dial_id"]}_{sample["turn_id"]}_{sample["task_id"]}' in perplexity_dict):
                    test_dataset_by_predicted_id[perplexity_dict[f'{sample["dial_id"]}_{sample["turn_id"]}_{sample["task_id"]}']].append(sample)

            for k,v in test_dataset_by_predicted_id.items():
                print(f"Task {k}: {len(v)}")

            ## create a dataloader for batch each of this
            # test_dataset_by_predicted_id = {k: make_loader(args,v,model.tokenizer) for k,v in test_dataset_by_predicted_id.items()}
            test_dataset_by_predicted_id = {k: make_val_loader(args, v, model.tokenizer) for k, v in
                                            test_dataset_by_predicted_id.items()}

            # for pred_task_id, task_loader in tqdm(test_dataset_by_predicted_id.items(),total=len(test_dataset_by_predicted_id)):
            for pred_task_id, task_loader in test_dataset_by_predicted_id.items():
                # print(f"Task Id: {task_id}")
                for idx_b, batch in tqdm(enumerate(task_loader), total=len(task_loader)):
                # for idx_b, batch in enumerate(task_loader):
                    with torch.no_grad():
                        value_batch,_ = test_generation_GPT2BATCH(model=model.model,
                                                            tokenizer=model.tokenizer,
                                                            input_text=[b+"[SOS]" for b in batch['history']],
                                                            device=device,
                                                            max_length=100,
                                                            task_id=pred_task_id,
                                                            s=s)
                    for idx, resp in enumerate(value_batch):
                        results.append({"id":batch["dial_id"][idx],"turn_id":batch["turn_id"][idx],
                                        "dataset":batch["dataset"][idx],"task_id":batch["task_id"][idx],
                                        "spk":batch["spk"][idx],"gold":batch["reply"][idx],
                                        "genr":resp,"hist":batch["history"][idx],"pred_task_id":pred_task_id})

                    # if(idx_b==1): break

    elif args.single:
        # for idx_b, batch in tqdm(enumerate(test_loader), total=len(test_loader)):
        for task_id in model.task_list_seen:
            if args.mask and (use_all_masks is False) and (task_id != model.task_list_seen[-1]):    # 只测试当前mask在当前任务表现
                continue
            if single_task and (task_id != model.task_list_seen[-1]):
                continue
            for idx_b, batch in tqdm(enumerate(test_loader[task_id]), total=len(test_loader[task_id])):
                with torch.no_grad():
                    if masks is not None:
                        value_batch, _ = test_generation_GPT2BATCH(model=model.model,
                                                                   tokenizer=model.tokenizer,
                                                                   input_text=[b + "[SOS]" for b in batch['history']],
                                                                   device=device,
                                                                   max_length=100,
                                                                   masks_pre=masks)  #
                    else:
                        value_batch, _ = test_generation_GPT2BATCH(model=model.model,
                                                                   tokenizer=model.tokenizer,
                                                                   input_text=[b + "[SOS]" for b in batch['history']],
                                                                   device=device,
                                                                   max_length=100)  # 未输入labels，则不返回loss，只返回output
                for idx, resp in enumerate(value_batch):
                    results.append({"id": batch["dial_id"][idx], "turn_id": batch["turn_id"][idx],
                                    "dataset": batch["dataset"][idx], "task_id": batch["task_id"][idx],
                                    "spk": batch["spk"][idx], "gold": batch["reply"][idx],
                                    "genr": resp, "hist": batch["history"][idx]})

    if not os.path.exists(f'{args.saving_dir}/{time}'):
        os.makedirs(f'{args.saving_dir}/{time}')
    appendix=''

    if single_task:
        assert args.fwt
        appendix += '_fwt'
    else:
        if use_all_masks:
            appendix += '_all_masks'
        if retrain:
            appendix += '_retrain'
        if TIL:
            appendix += '_TIL'
    with open(f'{args.saving_dir}/{time}'+f'/generated_responses{appendix}.json', 'w') as fp:
        json.dump(results, fp, indent=4)
    print(f'save to {time}/generated_responses{appendix}.json')
    tokenizer.padding_side = "right"



# def test_model(args,model,tokenizer,test_loader,time=0):
    # args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # model.to(args.device)
    # model.eval()
    # results = []
    # for task_id, task_loader in tqdm(test_loader.items(),total=len(test_loader)):
    #     # print(f"Task Id: {task_id}")
    #     for idx_b, batch in enumerate(task_loader):
    #         input_ids, _, token_type_ids  = tuple(torch.tensor([batch[input_name]]).to(args.device) for input_name in MODEL_INPUTS)
    #         with torch.no_grad():
    #             response = generate(args,model,tokenizer,input_ids,token_type_ids)
    #         results.append({"id":batch["dial_id"],"turn_id":batch["turn_id"],
    #                         "dataset":batch["dataset"],"task_id":task_id,
    #                         "spk":batch["spk"],"gold":batch["row_reply"],
    #                         "genr":response,"hist":batch["plain_history"]})

    # if not os.path.exists(f'{args.saving_dir}/{time}'):
    #     os.makedirs(f'{args.saving_dir}/{time}')
    # with open(f'{args.saving_dir}/{time}'+'/generated_responses.json', 'w') as fp:
    #     json.dump(results, fp, indent=4)


def test(hparams):
    if (hparams.CL == "MULTI"):
        hparams.multi = True
        hparams.continual = False
    else:
        hparams.multi = False
        hparams.continual = True

    from single_CL_learner import Seq2SeqToD

    set_seeds(hparams.seed)
    model = Seq2SeqToD(hparams) # , hparams.saving_dir

    # hparams.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    _, _, dev_val_loader, (_, _, test_datasets) = get_data_loaders(hparams, model.tokenizer)
    # _, _, test_loader, (_, _) = get_data_loaders(hparams, tokenizer, test=True)
    # train_loader, val_loader, dev_val_loader, (train_datasets, test_datasets) = get_data_loaders(args, model.tokenizer)

    resume_folder, resume_task_num = check_resume(hparams, resume_task_num=hparams.resume_task_num)
    if resume_folder:
        load_checkpoint(model, resume_folder, hparams, resume_task_num, backbone=True)

    time = resume_folder.split('/')[-1]
    print(f"Loading Model From: {time}")

    if hparams.mask:
        if hparams.retrain:
            test_model_seq2seq_ADAPTER(hparams, model, model.tokenizer, dev_val_loader, test_datasets,
                                       time=time, retrain=True)
            # else:
            #     test_model_seq2seq_ADAPTER(hparams, model, model.tokenizer, dev_val_loader, test_datasets,
            #                                time=f"{task_num}_{task_id}", use_all_masks=True,
            #                                masks=model.mask_pre)
            if hparams.mask_infer:
                test_model_seq2seq_ADAPTER(hparams, model, model.tokenizer, dev_val_loader, test_datasets,
                                           time=time, TIL=True, retrain=True)


if __name__ == "__main__":
    test(hparams)
    score_folder(hparams)