From c7aace96423113f63708553fd5714c7291e75345 Mon Sep 17 00:00:00 2001
From: Luigi Favaro
Date: Mon, 27 Mar 2023 15:52:25 +0200
Subject: [PATCH] working low data preproc

---
 params/calo_inn.yaml    |  6 ++++--
 src/loaders/calo_inn.py | 47 ++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/params/calo_inn.yaml b/params/calo_inn.yaml
index cc80e83..8a528b0 100644
--- a/params/calo_inn.yaml
+++ b/params/calo_inn.yaml
@@ -3,11 +3,13 @@ run_name: calo_inn
 #Dataset
 loader_module: calo_inn
 loader_params:
-    geant_file: path_to_file.hdf5
-    generated_file: path_to_file.hdf5
+    geant_file: /remote/gpu06/favaro/calo_inn/datasets/cls_data/train_cls_piplus.hdf5
+    generated_file: /remote/gpu06/favaro/calo_inn/datasets/train_piplus.hdf5
     add_log_energy: True
     add_log_layer_ens: True
     add_logit_step: False
+    train_split: 0.6
+    test_split: 0.2
 
 # Model
 activation: leaky_relu
diff --git a/src/loaders/calo_inn.py b/src/loaders/calo_inn.py
index cbca0e4..0802a0e 100644
--- a/src/loaders/calo_inn.py
+++ b/src/loaders/calo_inn.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import h5py
 from types import SimpleNamespace
 
 from ..dataset import DiscriminatorData
@@ -24,16 +25,16 @@ def load(params: dict) -> list[DiscriminatorData]:
     datasets_list = [
         {'level': 'low', 'normalize': True, 'label': 'Norm.', 'suffix': 'norm'},
         {'level': 'low', 'normalize': False, 'label': 'Unnorm.', 'suffix': 'unnorm'},
-        {'level': 'high', 'normalize': False, 'label': 'High', 'suffix': 'high'},
+        #{'level': 'high', 'normalize': False, 'label': 'High', 'suffix': 'high'},
     ]
 
     for dataset in datasets_list:
         if dataset['level'] == 'low':
-            geant_sample = create_data(params['geant_file'], dataset)
-            gen_sample = create_data(params['generated_file'], dataset)
+            geant_sample = create_data(params['geant_file'], dataset, **preproc_kwargs)
+            gen_sample = create_data(params['generated_file'], dataset, **preproc_kwargs)
         elif dataset['level'] == 'high':
-            geant_sample = create_data_high(params['geant_file'], dataset)
-            gen_sample = create_data_high(params['generated_file'], dataset)
+            geant_sample = create_data_high(params['geant_file'], dataset, **preproc_kwargs)
+            gen_sample = create_data_high(params['generated_file'], dataset, **preproc_kwargs)
         else:
             raise ValueError('Classifier preprocessing running at unknown level.')
 
@@ -43,7 +44,7 @@ def load(params: dict) -> list[DiscriminatorData]:
             params["test_split"]
         )
         train_fake, test_fake, val_fake = split_data(
-            generated_sample,
+            gen_sample,
             params["train_split"],
             params["test_split"]
        )
@@ -63,8 +64,40 @@ def load(params: dict) -> list[DiscriminatorData]:
         )
     return datasets
 
-#def create_data(data_path):
+def create_data(data_path, dataset_list, **kwargs):
+    with h5py.File(data_path, "r") as f:
+        en_test = f.get('energy')[:] / 1e2
+        lay_0 = f.get('layer_0')[:] / 1e5
+        lay_1 = f.get('layer_1')[:] / 1e5
+        lay_2 = f.get('layer_2')[:] / 1e5
+
+    data = np.concatenate((lay_0.reshape(-1, 288), lay_1.reshape(-1, 144), lay_2.reshape(-1, 72)), axis=1)
+
+    en0_t = np.sum(data[:, :288], axis=1, keepdims=True)
+    en1_t = np.sum(data[:, 288:432], axis=1, keepdims=True)
+    en2_t = np.sum(data[:, 432:], axis=1, keepdims=True)
+
+    if dataset_list['normalize']:
+        data[:, :288] /= en0_t + 1e-16
+        data[:, 288:432] /= en1_t + 1e-16
+        data[:, 432:] /= en2_t + 1e-16
+
+    if kwargs['add_log_energy']:
+        data = np.concatenate((data, np.log10(en_test*10).reshape(-1, 1)), axis=1)
+    data = np.nan_to_num(data, posinf=0, neginf=0)
+
+    en0_t = np.log10(en0_t + 1e-8) + 2.
+    en1_t = np.log10(en1_t + 1e-8) + 2.
+    en2_t = np.log10(en2_t + 1e-8) + 2.
+    if kwargs['add_log_layer_ens']:
+        data = np.concatenate((data, en0_t, en1_t, en2_t), axis=1)
+    if kwargs['add_logit_step']:
+        raise ValueError('Not implemented yet')
+    return data
+
+def create_data_high(data_path, dataset_list, **kwargs):
+    pass
+
 def split_data(
     data: np.ndarray,
     train_split: float,
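Note on the hunks above: load() now passes **preproc_kwargs into create_data() and create_data_high(), but the hunk that assembles preproc_kwargs is not part of this diff. Below is a minimal sketch of how it could be built from the loader_params block in calo_inn.yaml, together with a sanity check on the feature count create_data() produces; build_preproc_kwargs and expected_feature_width are hypothetical names introduced here for illustration, not part of the patch.

    # Sketch under assumptions: preproc_kwargs mirrors the three boolean
    # flags set in params/calo_inn.yaml; the real definition lives in an
    # unshown part of load().
    def build_preproc_kwargs(params: dict) -> dict:
        return {
            "add_log_energy": params.get("add_log_energy", False),
            "add_log_layer_ens": params.get("add_log_layer_ens", False),
            "add_logit_step": params.get("add_logit_step", False),
        }

    def expected_feature_width(kwargs: dict) -> int:
        # Voxel columns per calorimeter layer, as flattened in create_data():
        # layer_0 -> 288, layer_1 -> 144, layer_2 -> 72.
        width = 288 + 144 + 72
        if kwargs["add_log_energy"]:
            width += 1  # one column: log10 of the incident energy
        if kwargs["add_log_layer_ens"]:
            width += 3  # three columns: log10 of the per-layer energy sums
        return width

    if __name__ == "__main__":
        kw = build_preproc_kwargs(
            {"add_log_energy": True, "add_log_layer_ens": True, "add_logit_step": False}
        )
        assert expected_feature_width(kw) == 508  # 504 voxels + 1 + 3 energy features

With train_split: 0.6 and test_split: 0.2 from the config, the remaining 0.2 of each sample is presumably held out for validation, matching the three-way train/test/val return of split_data().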