Skip to content

Commit

Permalink
Merge pull request #2046 from NNPDF/fix_closure_commondata_loader
Browse files Browse the repository at this point in the history
Correct bug in commondata loading for closure tests
  • Loading branch information
scarlehoff authored Apr 15, 2024
2 parents 7fb6b69 + 97592ec commit 7bb760d
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 10 deletions.
11 changes: 3 additions & 8 deletions validphys2/src/validphys/closuretest/multiclosure_pseudodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Estimators here can only be calculated on data used in the fit.
"""

import numpy as np
import pandas as pd

Expand All @@ -27,14 +28,8 @@ def fits_dataset_cvs(fits_dataset):
"""
fits_cv = []
for ds in fits_dataset:
# using the official loader is really slow, open the CSV
# and then cut the central values manually.
# TODO: Save central values in nice table like pseudodata
# but this should be done beyond NNPDF4.0
cd_df = pd.read_csv(ds.commondata.datafile, sep=r'\s+', skiprows=1, header=None)
# based on columns from python cd reader:
# ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat']
fits_cv.append(cd_df.iloc[cut_mask(ds.cuts), 5].to_numpy())
cd_df = ds.commondata.metadata.load_data_central()
fits_cv.append(cd_df.iloc[cut_mask(ds.cuts)].to_numpy())
return fits_cv


Expand Down
4 changes: 2 additions & 2 deletions validphys2/src/validphys/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,9 +219,9 @@ def _use_fit_commondata_old_format_to_new_format(setname, file_path):
new_unc_stream = tempfile.NamedTemporaryFile(
delete=False, prefix=f"filter_{setname}_uncertainties", suffix=".yaml", mode="w"
)
commondata.export_uncertainties(new_data_stream)
commondata.export_uncertainties(new_unc_stream)
new_unc_stream.close()
unc_path = pathlib.Path(new_data_stream.name)
unc_path = pathlib.Path(new_unc_stream.name)
return data_path, unc_path


Expand Down
29 changes: 29 additions & 0 deletions validphys2/src/validphys/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Pytest fixtures.
"""

import contextlib
import pathlib
import sys
Expand Down Expand Up @@ -109,6 +110,13 @@ def data_internal_cuts_new_theory_config(data_internal_cuts_config):
return config


@pytest.fixture(scope='module')
def data_fromfit_cuts_config(data_internal_cuts_new_theory_config):
    """Copy of ``data_internal_cuts_new_theory_config`` with the cuts
    taken from the fit instead of computed internally."""
    return {**data_internal_cuts_new_theory_config, "use_cuts": "fromfit"}


@pytest.fixture(scope='module')
def single_data_internal_cuts_config(data_internal_cuts_config):
"""Like data_internal_cuts_config but for a single dataset"""
Expand Down Expand Up @@ -172,6 +180,27 @@ def weighted_data_witht0_internal_cuts_config(data_witht0_internal_cuts_config):
return config_dict


@pytest.fixture(scope='module')
def fromfit_closure_config():
    """A configuration useful for closure test where everything is
    read from the fit"""

    def taken_from(source):
        # Fresh mapping per entry so no two keys share a dict object
        return {"from_": source}

    return {
        "dataset_inputs": taken_from("fit"),
        "datacuts": taken_from("fit"),
        "use_cuts": "fromfit",
        "fakepdf": taken_from("closuretest"),
        "theory": taken_from("fit"),
        "theoryid": taken_from("theory"),
        "pdf": taken_from("fit"),
        "closuretest": taken_from("fit"),
        "filterseed": taken_from("closuretest"),
        "use_fitcommondata": True,
        "use_t0": True,
        "t0pdfset": taken_from("datacuts"),
    }


def pytest_runtest_setup(item):
ALL = {"darwin", "linux"}
supported_platforms = ALL.intersection(mark.name for mark in item.iter_markers())
Expand Down
47 changes: 47 additions & 0 deletions validphys2/src/validphys/tests/test_multiclosure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Tests for multiclosure utilities
"""

import numpy as np
import pytest

from reportengine.resourcebuilder import ResourceError
from validphys.api import API

# These fits contain _only_ data
MULTICLOSURE_FITS = ["240412-test-multiclosure-001", "240412-test-multiclosure-002"]


def test_multiclosure_data_fits_cv(fromfit_closure_config, data_fromfit_cuts_config):
    """Test that we can generate level 1 data and then read it using the
    utilities of ``multiclosure_pseudodata``.
    This works as both a test of the ``data_fits_cv`` pipeline
    and as canary for possible regression regarding already generated data"""
    base_fit = MULTICLOSURE_FITS[0]

    # Get the data generated by the fit
    full_config = {"fits": MULTICLOSURE_FITS, "fit": base_fit, **fromfit_closure_config}
    get_fit_cv = API.data_fits_cv(**full_config)

    # Now get the "real" data
    fit = API.fit(fit=base_fit)
    dataset_inputs = fit.as_input()["dataset_inputs"]

    # Now assert that the data is different for each of the closure tests
    # and all of them are different from the central data!
    for dinput, fit_data in zip(dataset_inputs, get_fit_cv):
        ds = API.dataset(dataset_input=dinput, **data_fromfit_cuts_config, fit=base_fit)
        # Load with cuts and get the central value
        cv = ds.load_commondata().central_values
        fit_data.append(cv.values.reshape(-1, 1))

        # Check inside the loop so that EVERY dataset is verified (the
        # previous post-loop check only saw the last ``fit_data``): each
        # closure member and the central data must be mutually distinct,
        # i.e. every unique row appears exactly once.
        _, counts = np.unique(fit_data, axis=0, return_counts=True)
        if not np.allclose(counts, 1.0):
            raise ValueError("Multiclosure reading failed, data is the same when it shouldn't!")

    # Ensure that the use_fit_commondata key is still controlling the load
    with pytest.raises(ResourceError):
        full_config["use_fitcommondata"] = False
        _ = API.data_fits_cv(**full_config)

0 comments on commit 7bb760d

Please sign in to comment.