Skip to content

Commit

Permalink
Merge pull request #2046 from NNPDF/fix_closure_commondata_loader
Browse files Browse the repository at this point in the history
Correct bug in commondata loading for closure tests
  • Loading branch information
scarlehoff authored Apr 15, 2024
2 parents 7fb6b69 + 97592ec commit 7bb760d
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 10 deletions.
11 changes: 3 additions & 8 deletions validphys2/src/validphys/closuretest/multiclosure_pseudodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Estimators here can only be calculated on data used in the fit.
"""

import numpy as np
import pandas as pd

Expand All @@ -27,14 +28,8 @@ def fits_dataset_cvs(fits_dataset):
"""
fits_cv = []
for ds in fits_dataset:
# using the official loader is really slow, open the CSV
# and then cut the central values manually.
# TODO: Save central values in nice table like pseudodata
# but this should be done beyond NNPDF4.0
cd_df = pd.read_csv(ds.commondata.datafile, sep=r'\s+', skiprows=1, header=None)
# based on columns from python cd reader:
# ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat']
fits_cv.append(cd_df.iloc[cut_mask(ds.cuts), 5].to_numpy())
cd_df = ds.commondata.metadata.load_data_central()
fits_cv.append(cd_df.iloc[cut_mask(ds.cuts)].to_numpy())
return fits_cv


Expand Down
4 changes: 2 additions & 2 deletions validphys2/src/validphys/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,9 +219,9 @@ def _use_fit_commondata_old_format_to_new_format(setname, file_path):
new_unc_stream = tempfile.NamedTemporaryFile(
delete=False, prefix=f"filter_{setname}_uncertainties", suffix=".yaml", mode="w"
)
commondata.export_uncertainties(new_data_stream)
commondata.export_uncertainties(new_unc_stream)
new_unc_stream.close()
unc_path = pathlib.Path(new_data_stream.name)
unc_path = pathlib.Path(new_unc_stream.name)
return data_path, unc_path


Expand Down
29 changes: 29 additions & 0 deletions validphys2/src/validphys/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Pytest fixtures.
"""

import contextlib
import pathlib
import sys
Expand Down Expand Up @@ -109,6 +110,13 @@ def data_internal_cuts_new_theory_config(data_internal_cuts_config):
return config


@pytest.fixture(scope='module')
def data_fromfit_cuts_config(data_internal_cuts_new_theory_config):
    """Copy of ``data_internal_cuts_new_theory_config`` with the cuts
    taken from the fit instead of computed internally."""
    return {**data_internal_cuts_new_theory_config, "use_cuts": "fromfit"}


@pytest.fixture(scope='module')
def single_data_internal_cuts_config(data_internal_cuts_config):
"""Like data_internal_cuts_config but for a single dataset"""
Expand Down Expand Up @@ -172,6 +180,27 @@ def weighted_data_witht0_internal_cuts_config(data_witht0_internal_cuts_config):
return config_dict


@pytest.fixture(scope='module')
def fromfit_closure_config():
    """A configuration useful for closure test where everything is
    read from the fit"""

    def taken_from(source):
        # Fresh mapping per entry so no two keys share a dict object
        return {"from_": source}

    return {
        "dataset_inputs": taken_from("fit"),
        "datacuts": taken_from("fit"),
        "use_cuts": "fromfit",
        "fakepdf": taken_from("closuretest"),
        "theory": taken_from("fit"),
        "theoryid": taken_from("theory"),
        "pdf": taken_from("fit"),
        "closuretest": taken_from("fit"),
        "filterseed": taken_from("closuretest"),
        "use_fitcommondata": True,
        "use_t0": True,
        "t0pdfset": taken_from("datacuts"),
    }


def pytest_runtest_setup(item):
ALL = {"darwin", "linux"}
supported_platforms = ALL.intersection(mark.name for mark in item.iter_markers())
Expand Down
47 changes: 47 additions & 0 deletions validphys2/src/validphys/tests/test_multiclosure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Tests for multiclosure utilities
"""

import numpy as np
import pytest

from reportengine.resourcebuilder import ResourceError
from validphys.api import API

# These fits contain _only_ data
MULTICLOSURE_FITS = ["240412-test-multiclosure-001", "240412-test-multiclosure-002"]


def test_multiclosure_data_fits_cv(fromfit_closure_config, data_fromfit_cuts_config):
    """Test that we can generate level 1 data and then read it using the
    utilities of ``multiclosure_pseudodata``.
    This works as both a test of the ``data_fits_cv`` pipeline
    and as canary for possible regression regarding already generated data"""
    base_fit = MULTICLOSURE_FITS[0]

    # Get the data generated by the fit
    full_config = {"fits": MULTICLOSURE_FITS, "fit": base_fit, **fromfit_closure_config}
    get_fit_cv = API.data_fits_cv(**full_config)

    # Now get the "real" data
    fit = API.fit(fit=base_fit)
    dataset_inputs = fit.as_input()["dataset_inputs"]

    # Now assert that the data is different for each of the closure tests
    # and all of them are different from the central data!
    for dinput, fit_data in zip(dataset_inputs, get_fit_cv):
        ds = API.dataset(dataset_input=dinput, **data_fromfit_cuts_config, fit=base_fit)
        # Load with cuts and get the central value
        cv = ds.load_commondata().central_values
        fit_data.append(cv.values.reshape(-1, 1))

        # Check inside the loop so that EVERY dataset is verified (the
        # previous post-loop check only saw the last ``fit_data``): each
        # closure member and the central data must be mutually distinct,
        # i.e. every unique row appears exactly once.
        _, counts = np.unique(fit_data, axis=0, return_counts=True)
        if not np.allclose(counts, 1.0):
            raise ValueError("Multiclosure reading failed, data is the same when it shouldn't!")

    # Ensure that the use_fit_commondata key is still controlling the load
    with pytest.raises(ResourceError):
        full_config["use_fitcommondata"] = False
        _ = API.data_fits_cv(**full_config)

0 comments on commit 7bb760d

Please sign in to comment.