Merge branch 'main' of github.com:heidelberg-hepml/discriminator-metric
theoheimel committed Mar 29, 2023
2 parents c64dc23 + 8445854 commit 4c446d0
Showing 3 changed files with 375 additions and 0 deletions.
36 changes: 36 additions & 0 deletions params/calo_inn.yaml
@@ -0,0 +1,36 @@
run_name: calo_inn

# Dataset
loader_module: calo_inn
loader_params:
geant_file: /remote/gpu06/favaro/calo_inn/datasets/cls_data/train_cls_piplus.hdf5
generated_file: /remote/gpu06/favaro/calo_inn/datasets/train_piplus.hdf5
add_log_energy: True
add_log_layer_ens: True
add_logit_step: False
train_split: 0.6
test_split: 0.2

# Model
activation: leaky_relu
negative_slope: 0.1
dropout: 0.1
layers: 5
hidden_size: 256

# Training
bayesian: False
lr: 1.e-3
betas: [0.9, 0.99]
weight_decay: 0.0
epochs: 50
batch_size: 1024
lr_scheduler: reduce_on_plateau
lr_decay_factor: 0.1
lr_patience: 5
checkpoint_interval: 5

# Evaluation
#bayesian_samples: 2
#lower_cluster_thresholds: [0.01, 0.1]
#upper_cluster_thresholds: [0.9, 0.99]
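
As a rough sketch of how a run card like this might be consumed (the repository's entry point is not part of this commit, so the dispatch below is an assumption), the YAML is read into a dict and the loader named in loader_module receives the loader_params block:

# Hedged sketch only, not the repository's actual training script.
# The package path "src.loaders" and passing the loader_params block directly
# are assumptions based on src/loaders/calo_inn.py shown below.
import importlib
import yaml

with open("params/calo_inn.yaml") as f:
    params = yaml.safe_load(f)

loader = importlib.import_module(f"src.loaders.{params['loader_module']}")
datasets = loader.load(params["loader_params"])  # -> list[DiscriminatorData]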
162 changes: 162 additions & 0 deletions src/calo_plotting_helper.py
@@ -0,0 +1,162 @@
# pylint: disable=invalid-name
""" Helper functions that compute the high-level observables of calorimeter showers.
Used for
"CaloFlow: Fast and Accurate Generation of Calorimeter Showers with Normalizing Flows"
by Claudius Krause and David Shih
arxiv:2106.05285
"CaloFlow II: Even Faster and Still Accurate Generation of Calorimeter Showers with
Normalizing Flows"
by Claudius Krause and David Shih
arxiv:2110.11377
Functions inspired by
"CaloGAN: Simulating 3D High Energy Particle Showers in Multi-LayerElectromagnetic
Calorimeters with Generative Adversarial Networks"
by Michela Paganini, Luke de Oliveira, and Benjamin Nachman
arxiv:1712.10321
https://github.com/hep-lbdl/CaloGAN
"""

import numpy as np

# number of voxels per layer in the given direction, see fig. 2 of 1712.10321
PHI_CELLS = {0: 3, 1: 12, 2: 12}
ETA_CELLS = {0: 96, 1: 12, 2: 6}

def to_np_thres(tensor, threshold):
""" moves tensor to CPU, then to numpy, then applies threshold """
ret = tensor.clamp_(0., 1e5).to('cpu').numpy()
ret = np.where(ret < threshold, np.zeros_like(ret), ret)
return ret

def layer_split(data):
""" splits data into the 3 layers """
return np.split(data, [288, 432], axis=-1)

def layer_std(layer1, layer2, total):
""" helper function for standard deviation of layer depth"""
term1 = (layer1 + 4.*layer2) / total
term2 = ((layer1 + 2.*layer2)/total)**2
return np.sqrt(term1-term2)

def energy_sum(data, normalization=1e3):
""" Returns the energy sum of all energy depositions.
If len(data.shape) is 3, the sum is taken over the last 2 axes (summing voxels per layer).
If it is 2, the sum is taken over the last axis (summing voxels of the entire event).
normalization of 1e3 accounts for MeV to GeV unit conversion.
"""
if len(data.shape) == 3:
ret = data.sum(axis=(1, 2))
else:
ret = data.sum(axis=-1)
return ret / normalization

def energy_ratio(data, layer):
""" Returns the sum of energy of the given layer divided by the total energy """
ret = energy_sum(layer_split(data)[layer]) / energy_sum(data)
return ret

def layer_sparsity(data, threshold, layer=None):
""" Returns the sparsity (=fraction of voxel above threshold) of the given layer
Supports either sparsity in given data array (of size 3), assuming it's a layer
or needs a layer_nr if data has size 2 (assuming it's a full shower).
"""
if len(data.shape) == 3:
sparsity = (data > threshold).mean((1, 2))
else:
if layer is not None:
data = layer_split(data)[layer]
sparsity = (data > threshold).mean(-1)
return sparsity

def n_brightest_voxel(data, num_brightest, ratio=True):
""" Returns the ratio of the "num_brightest" voxel to the energy deposited in the layer. """
if len(data.shape) == 3:
data = data.reshape(len(data), -1)
top_n = np.sort(data, axis=1)[:, -max(num_brightest):]
energies = data.sum(axis=-1).reshape(-1, 1)
ret = top_n[:, [-num for num in num_brightest]]
if ratio:
# note: an earlier version filtered out events with zero deposited energy:
#ret = (ret/ (energies + 1e-16))[np.tile(energies > 0, len(num_brightest))].reshape(
# -1, len(num_brightest))
ret = (ret/ (energies + 1e-16)).reshape(-1, len(num_brightest))
return ret

def ratio_two_brightest(data):
""" Returns the ratio of the difference of the 2 brightest voxels to their sum. """
top = np.sort(data, axis=1)[:, -2:]
#ret = ((top[:, 1] - top[:, 0]) / (top[:, 0] + top[:, 1] + 1e-16))[top[:, 1] > 0]
ret = ((top[:, 1] - top[:, 0]) / (top[:, 0] + top[:, 1] + 1e-16))
return ret

def maxdepth_nr(data):
""" Returns the layer that has the last energy deposition """
_, layer_1, layer_2 = layer_split(data)

maxdepth = 2* (energy_sum(layer_2) != 0)
maxdepth[maxdepth == 0] = 1* (energy_sum(layer_1[maxdepth == 0]) != 0)
return maxdepth

def depth_weighted_energy(data):
""" Returns the depth-weighted total energy deposit in all 3 layers for a given batch. """
_, layer_1, layer_2 = layer_split(data)
ret = energy_sum(layer_1, normalization=1.) + 2.* energy_sum(layer_2, normalization=1.)
return ret

def depth_weighted_energy_normed_std(data):
""" Returns the standard deviation of the depth-weighted total energy deposit in all 3 layers
normalized by the total deposited energy for a given batch
"""
_, layer_1, layer_2 = layer_split(data)
layer1 = energy_sum(layer_1, normalization=1.)
layer2 = energy_sum(layer_2, normalization=1.)
energies = energy_sum(data, normalization=1.)
return layer_std(layer1, layer2, energies)

def center_of_energy(data, layer, direction):
""" Returns the center of energy in the direction 'direction' for layer 'layer'. """
if direction == 'eta':
bins = np.linspace(-240, 240, ETA_CELLS[layer] + 1)
elif direction == 'phi':
bins = np.linspace(-240, 240, PHI_CELLS[layer] + 1)
else:
raise ValueError("direction={} not in ['eta', 'phi']".format(direction))
bin_centers = (bins[1:] + bins[:-1]) / 2.

if direction == 'phi':
data = data.reshape(len(data), PHI_CELLS[layer], -1)
ret = (data * bin_centers.reshape(-1, 1)).sum(axis=(1, 2))
else:
data = data.reshape(len(data), -1, ETA_CELLS[layer])
ret = (data * bin_centers.reshape(1, -1)).sum(axis=(1, 2))
energies = energy_sum(data, normalization=1.)
ret = ret / (energies + 1e-8)
return ret

def center_of_energy_std(data, layer, direction):
""" Returns the standard deviation of center of energy in the direction
'direction' for layer 'layer'.
"""
if direction == 'eta':
bins = np.linspace(-240, 240, ETA_CELLS[layer] + 1)
elif direction == 'phi':
bins = np.linspace(-240, 240, PHI_CELLS[layer] + 1)
else:
raise ValueError("direction={} not in ['eta', 'phi']".format(direction))
bin_centers = (bins[1:] + bins[:-1]) / 2.

if direction == 'phi':
data = data.reshape(len(data), PHI_CELLS[layer], -1)
ret = (data * bin_centers.reshape(-1, 1)).sum(axis=(1, 2))
ret2 = (data * bin_centers.reshape(-1, 1)**2).sum(axis=(1, 2))
else:
data = data.reshape(len(data), -1, ETA_CELLS[layer])
ret = (data * bin_centers.reshape(1, -1)).sum(axis=(1, 2))
ret2 = (data * bin_centers.reshape(1, -1)**2).sum(axis=(1, 2))
energies = energy_sum(data, normalization=1.)
ret = np.sqrt((ret2 / (energies+1e-8)) - (ret / (energies+1e-8)) ** 2)
return ret
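
The helpers above act on flattened showers of 288 + 144 + 72 = 504 voxels per event. A short, self-contained toy example (random input, purely illustrative; the import path assumes the repository root is on sys.path):

import numpy as np

from src.calo_plotting_helper import (
    center_of_energy, energy_ratio, energy_sum, layer_sparsity, layer_split
)

rng = np.random.default_rng(0)
showers = rng.exponential(scale=10.0, size=(1000, 504))  # fake deposits in MeV

e_tot   = energy_sum(showers)                       # total energy per event, in GeV
f_1     = energy_ratio(showers, 1)                  # energy fraction in layer 1
spars_0 = layer_sparsity(showers, threshold=1e-2, layer=0)
phi_0   = center_of_energy(layer_split(showers)[0], layer=0, direction='phi')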
177 changes: 177 additions & 0 deletions src/loaders/calo_inn.py
@@ -0,0 +1,177 @@
import pandas as pd
import numpy as np
import torch
import h5py
from types import SimpleNamespace

from ..dataset import DiscriminatorData
from ..observable import Observable
from ..calo_plotting_helper import *

def load(params: dict) -> list[DiscriminatorData]:
"""
dataloader for calo data
parameters:
args:
return:
"""
datasets = []
preproc_kwargs = {
"add_log_energy": params.get("add_log_energy", False),
"add_log_layer_ens": params.get("add_log_layer_ens", False),
"add_logit_step": params.get("add_logit_step", False),
"add_cut": params.get("add_cut", 0.0),
}
datasets_list = [
{'level': 'low', 'normalize': True, 'label': 'Norm.', 'suffix': 'norm'},
{'level': 'low', 'normalize': False, 'label': 'Unnorm.', 'suffix': 'unnorm'},
{'level': 'high', 'normalize': False, 'label': 'High', 'suffix': 'high'},
]

for dataset in datasets_list:
if dataset['level'] == 'low':
geant_sample = create_data(params['geant_file'], dataset, **preproc_kwargs)
gen_sample = create_data(params['generated_file'], dataset, **preproc_kwargs)
elif dataset['level'] == 'high':
geant_sample = create_data_high(params['geant_file'], dataset, **preproc_kwargs)
gen_sample = create_data_high(params['generated_file'], dataset, **preproc_kwargs)
else:
raise ValueError('Classifier preprocessing running at unknown level.')

train_true, test_true, val_true = split_data(
geant_sample,
params["train_split"],
params["test_split"]
)
train_fake, test_fake, val_fake = split_data(
gen_sample,
params["train_split"],
params["test_split"]
)

datasets.append(DiscriminatorData(
label = dataset['label'],
suffix = dataset['suffix'],
dim = geant_sample.shape[-1],
train_true = train_true,
train_fake = train_fake,
test_true = test_true,
test_fake = test_fake,
val_true = val_true,
val_fake = val_fake,
observables = [],
)
)
return datasets

def create_data(data_path, dataset_list, **kwargs):
""" Reads the voxel data from the hdf5 file and applies the low-level preprocessing flags. """
with h5py.File(data_path, "r") as f:
en_test = f.get('energy')[:] / 1e2
lay_0 = f.get('layer_0')[:] / 1e5
lay_1 = f.get('layer_1')[:] / 1e5
lay_2 = f.get('layer_2')[:] / 1e5
data = np.concatenate((lay_0.reshape(-1, 288), lay_1.reshape(-1, 144), lay_2.reshape(-1, 72)), axis=1)

en0_t = np.sum(data[:, :288], axis=1, keepdims=True)
en1_t = np.sum(data[:, 288:432], axis=1, keepdims=True)
en2_t = np.sum(data[:, 432:], axis=1, keepdims=True)

if dataset_list['normalize']:
data[:, :288] /= en0_t + 1e-16
data[:, 288:432] /= en1_t + 1e-16
data[:, 432:] /= en2_t + 1e-16

if kwargs['add_log_energy']:
data = np.concatenate((data, np.log10(en_test*10).reshape(-1, 1)), axis=1)
data = np.nan_to_num(data, posinf=0, neginf=0)

en0_t = np.log10(en0_t + 1e-8) + 2.
en1_t = np.log10(en1_t + 1e-8) + 2.
en2_t = np.log10(en2_t + 1e-8) + 2.

if kwargs['add_log_layer_ens']:
data = np.concatenate((data, en0_t, en1_t, en2_t), axis=1)
if kwargs['add_logit_step']:
raise ValueError('Not implemented yet')
return data
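
# Editorial accounting sketch, not part of the original file: with the flags in
# params/calo_inn.yaml above (add_log_energy and add_log_layer_ens True,
# add_logit_step False), create_data concatenates
#   288 + 144 + 72 voxel energies, 1 log10(total energy), 3 log10(layer energies),
# so the low-level feature dimension per event is
n_low_level_features = 288 + 144 + 72 + 1 + 3   # = 508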

def create_data_high(data_path, dataset_list, **kwargs):
""" Computes the high-level shower observables (layer energies, energy fractions, shower depth,
brightest-voxel ratios, sparsities, centers of energy) using the helper functions above. """
cut = kwargs['add_cut']
with h5py.File(data_path, "r") as f:
en_test = f.get('energy')[:] / 1e2
lay_0 = f.get('layer_0')[:] / 1e5
lay_1 = f.get('layer_1')[:] / 1e5
lay_2 = f.get('layer_2')[:] / 1e5

incident_energy = torch.log10(torch.tensor(en_test)*10.)
# scale them back to MeV
layer0 = torch.tensor(lay_0) * 1e5
layer1 = torch.tensor(lay_1) * 1e5
layer2 = torch.tensor(lay_2) * 1e5
layer0 = to_np_thres(layer0.view(layer0.shape[0], -1), cut)
layer1 = to_np_thres(layer1.view(layer1.shape[0], -1), cut)
layer2 = to_np_thres(layer2.view(layer2.shape[0], -1), cut)
# detour to numpy in order to use same functions as plotting script
full_shower = np.concatenate((layer0, layer1, layer2), 1)
E_0 = energy_sum(layer0)
E_1 = energy_sum(layer1)
E_2 = energy_sum(layer2)
E_tot = E_0 + E_1 + E_2
f_0 = E_0 / E_tot
f_1 = E_1 / E_tot
f_2 = E_2 / E_tot
l_d = depth_weighted_energy(full_shower)
s_d = l_d / (E_tot * 1e3)
sigma_sd = depth_weighted_energy_normed_std(full_shower)
E_1b0, E_2b0, E_3b0, E_4b0, E_5b0 = n_brightest_voxel(layer0, [1, 2, 3, 4, 5]).T
E_1b1, E_2b1, E_3b1, E_4b1, E_5b1 = n_brightest_voxel(layer1, [1, 2, 3, 4, 5]).T
E_1b2, E_2b2, E_3b2, E_4b2, E_5b2 = n_brightest_voxel(layer2, [1, 2, 3, 4, 5]).T
ratio_0 = ratio_two_brightest(layer0)
ratio_1 = ratio_two_brightest(layer1)
ratio_2 = ratio_two_brightest(layer2)
sparsity_0 = layer_sparsity(layer0, cut)
sparsity_1 = layer_sparsity(layer1, cut)
sparsity_2 = layer_sparsity(layer2, cut)
phi_0 = center_of_energy(layer0, 0, 'phi')
phi_1 = center_of_energy(layer1, 1, 'phi')
phi_2 = center_of_energy(layer2, 2, 'phi')
eta_0 = center_of_energy(layer0, 0, 'eta')
eta_1 = center_of_energy(layer1, 1, 'eta')
eta_2 = center_of_energy(layer2, 2, 'eta')
sigma_0 = center_of_energy_std(layer0, 0, 'phi')
sigma_1 = center_of_energy_std(layer1, 1, 'phi')
sigma_2 = center_of_energy_std(layer2, 2, 'phi')

# to be log10-processed:
ret1 = np.vstack([E_0+1e-8, E_1+1e-8, E_2+1e-8, E_tot,
f_0+1e-8, f_1+1e-8, f_2+1e-8, l_d+1e-8,
sigma_0+1e-8, sigma_1+1e-8, sigma_2+1e-8]).T
ret1 = np.log10(ret1)
# without log10 processing:
ret2 = np.vstack([s_d, sigma_sd,
1e1*E_1b0, 1e1*E_2b0, 1e1*E_3b0, 1e1*E_4b0, 1e1*E_5b0,
1e1*E_1b1, 1e1*E_2b1, 1e1*E_3b1, 1e1*E_4b1, 1e1*E_5b1,
1e1*E_1b2, 1e1*E_2b2, 1e1*E_3b2, 1e1*E_4b2, 1e1*E_5b2,
ratio_0, ratio_1, ratio_2, sparsity_0, sparsity_1, sparsity_2,
phi_0/1e2, phi_1/1e2, phi_2/1e2, eta_0/1e2, eta_1/1e2, eta_2/1e2]).T
ret = torch.from_numpy(np.hstack([ret1, ret2]))

ret = torch.cat((ret, incident_energy), 1)
return ret.numpy()

def split_data(
data: np.ndarray,
train_split: float,
test_split: float
) -> tuple[np.ndarray, ...]:
""" Splits the data into train, test and validation parts using the given fractions. """
n_train = int(train_split * len(data))
n_test = int(test_split * len(data))
train_data = data[:n_train]
test_data = data[-n_test:]
val_data = data[n_train:-n_test]
return train_data, test_data, val_data
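
A minimal sketch of calling this loader directly (the paths are placeholders and invoking it outside the repository's training pipeline is an assumption):

params = {
    "geant_file": "/path/to/train_cls_piplus.hdf5",     # placeholder path
    "generated_file": "/path/to/train_piplus.hdf5",     # placeholder path
    "add_log_energy": True,
    "add_log_layer_ens": True,
    "add_logit_step": False,
    "train_split": 0.6,
    "test_split": 0.2,
}
datasets = load(params)  # one DiscriminatorData per entry in datasets_list
for d in datasets:
    print(d.suffix, d.dim, d.train_true.shape, d.train_fake.shape)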

