From 8a719ef69c4c49c835bfb707ab012b60963ff585 Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Thu, 26 Sep 2024 10:59:34 -0400
Subject: [PATCH] Translate: print only ~10 errors and dump all data in files under `.translate-errors`

---
 ndsl/stencils/testing/test_translate.py |  19 ++-
 ndsl/testing/comparison.py              | 195 +++++++++++++-----
 2 files changed, 125 insertions(+), 89 deletions(-)

diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py
index 55ee100..db8e604 100644
--- a/ndsl/stencils/testing/test_translate.py
+++ b/ndsl/stencils/testing/test_translate.py
@@ -210,13 +210,19 @@ def test_sequential_savepoint(
             near_zero=case.testobj.near_zero,
         )
         if not metric.check:
+            os.makedirs(OUTDIR, exist_ok=True)
+            log_filename = os.path.join(
+                OUTDIR,
+                f"details-{case.savepoint_name}-{varname}-rank{case.rank}.log",
+            )
+            metric.report(log_filename)
             pytest.fail(str(metric), pytrace=False)
         passing_names.append(failing_names.pop())
         ref_data_out[varname] = [ref_data]
     if len(failing_names) > 0:
         get_thresholds(case.testobj, input_data=original_input_data)
         os.makedirs(OUTDIR, exist_ok=True)
-        out_filename = os.path.join(OUTDIR, f"translate-{case.savepoint_name}.nc")
+        nc_filename = os.path.join(OUTDIR, f"translate-{case.savepoint_name}.nc")
         input_data_on_host = {}
         for key, _input in input_data.items():
             input_data_on_host[key] = gt_utils.asarray(_input)
@@ -226,7 +232,7 @@ def test_sequential_savepoint(
             [output],
             ref_data_out,
             failing_names,
-            out_filename,
+            nc_filename,
         )
     if failing_names != []:
         pytest.fail(
@@ -353,11 +359,16 @@ def test_parallel_savepoint(
             near_zero=case.testobj.near_zero,
         )
         if not metric.check:
+            os.makedirs(OUTDIR, exist_ok=True)
+            log_filename = os.path.join(
+                OUTDIR, f"details-{case.savepoint_name}-{varname}.log"
+            )
+            metric.report(log_filename)
             pytest.fail(str(metric), pytrace=False)
         passing_names.append(failing_names.pop())
     if len(failing_names) > 0:
         os.makedirs(OUTDIR, exist_ok=True)
-        out_filename = os.path.join(
+        nct_filename = os.path.join(
             OUTDIR, f"translate-{case.savepoint_name}-{case.grid.rank}.nc"
         )
         try:
@@ -370,7 +381,7 @@ def test_parallel_savepoint(
                 [output],
                 ref_data,
                 failing_names,
-                out_filename,
+                nct_filename,
             )
         except Exception as error:
             print(f"TestParallel SaveNetCDF Error: {error}")
diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py
index d3ac0d6..9e2d1d5 100644
--- a/ndsl/testing/comparison.py
+++ b/ndsl/testing/comparison.py
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import List, Optional, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -20,6 +20,9 @@ def __str__(self) -> str:
     def __repr__(self) -> str:
         ...
 
+    def report(self, file_path: Optional[str] = None) -> List[str]:
+        ...
+
 
 class LegacyMetric(BaseMetric):
     """Legacy (AI2) metric used for original FV3 port.
@@ -88,67 +91,78 @@ def _compute_errors(
         )
         return success
 
-    def __str__(self) -> str:
-        return self.__repr__()
-
-    def __repr__(self) -> str:
+    def report(self, file_path: Optional[str] = None) -> List[str]:
+        report = []
         if self.check:
-            return "✅ No numerical differences"
+            report.append("✅ No numerical differences")
+        else:
+            report.append("❌ Numerical failures")
+
+            found_indices = np.logical_not(self.success).nonzero()
+            computed_failures = self.computed[found_indices]
+            reference_failures = self.references[found_indices]
+
+            # List all errors
+            bad_indices_count = len(found_indices[0])
+            # Determine worst result
+            worst_metric_err = 0.0
+            abs_errs = []
+            details = [
+                "All failures:",
+                "Index Computed Reference Absolute E Metric E",
+            ]
+            for b in range(bad_indices_count):
+                full_index = tuple([f[b] for f in found_indices])
+
+                metric_err = self._calculated_metric[full_index]
+
+                absolute_distance = abs(computed_failures[b] - reference_failures[b])
+                abs_errs.append(absolute_distance)
+
+                details.append(
+                    f"{full_index} {computed_failures[b]} "
+                    f"{reference_failures[b]} {abs_errs[-1]:.3e} {metric_err:.3e}"
+                )
 
-        report = []
-        report.append("❌ Numerical failures")
-
-        found_indices = np.logical_not(self.success).nonzero()
-        computed_failures = self.computed[found_indices]
-        reference_failures = self.references[found_indices]
-
-        # List all errors
-        bad_indices_count = len(found_indices[0])
-        # Determine worst result
-        worst_metric_err = 0.0
-        abs_errs = []
-        details = [
-            "All failures:",
-            "Index Computed Reference Absloute E Metric E",
-        ]
-        for b in range(bad_indices_count):
-            full_index = tuple([f[b] for f in found_indices])
-
-            metric_err = self._calculated_metric[full_index]
-
-            absolute_distance = abs(computed_failures[b] - reference_failures[b])
-            abs_errs.append(absolute_distance)
-
-            details.append(
-                f"{full_index} {computed_failures[b]} "
-                f"{reference_failures[b]} {abs_errs[-1]:.3e} {metric_err:.3e}"
+                if np.isnan(metric_err) or (abs(metric_err) > abs(worst_metric_err)):
+                    worst_metric_err = metric_err
+                    worst_full_idx = full_index
+                    worst_abs_err = abs_errs[-1]
+                    computed_worst = computed_failures[b]
+                    reference_worst = reference_failures[b]
+            # Try to quantify noisy errors
+            unique_errors = len(np.unique(np.array(abs_errs)))
+            # Summary and worst result
+            fullcount = len(self.references.flatten())
+            report.append(
+                f"Failed count: {bad_indices_count}/{fullcount} "
+                f"({round(100.0 * (bad_indices_count / fullcount), 2)}%),\n"
+                f"Worst failed index {worst_full_idx}\n"
+                f" Computed:{computed_worst}\n"
+                f" Reference: {reference_worst}\n"
+                f" Absolute diff: {worst_abs_err:.3e}\n"
+                f" Metric diff: {worst_metric_err:.3e}\n"
+                f" Metric threshold: {self.eps}\n"
+                f" Noise quantification:\n"
+                f" Reference dtype: {type(reference_worst)}\n"
+                f" Unique errors: {unique_errors}/{bad_indices_count}"
             )
+            report.extend(details)
 
-            if np.isnan(metric_err) or (abs(metric_err) > abs(worst_metric_err)):
-                worst_metric_err = metric_err
-                worst_full_idx = full_index
-                worst_abs_err = abs_errs[-1]
-                computed_worst = computed_failures[b]
-                reference_worst = reference_failures[b]
-        # Try to quantify noisy errors
-        unique_errors = len(np.unique(np.array(abs_errs)))
-        # Summary and worst result
-        fullcount = len(self.references.flatten())
-        report.append(
-            f"Failed count: {bad_indices_count}/{fullcount} "
-            f"({round(100.0 * (bad_indices_count / fullcount), 2)}%),\n"
-            f"Worst failed index {worst_full_idx}\n"
-            f" Computed:{computed_worst}\n"
-            f" Reference: {reference_worst}\n"
-            f" Absolute diff: {worst_abs_err:.3e}\n"
-            f" Metric diff: {worst_metric_err:.3e}\n"
-            f" Metric threshold: {self.eps}\n"
-            f" Noise quantification:\n"
-            f" Reference dtype: {type(reference_worst)}\n"
-            f" Unique errors: {unique_errors}/{bad_indices_count}"
-        )
-        report.extend(details)
-
+        if file_path:
+            with open(file_path, "w") as fd:
+                fd.write("\n".join(report))
+
+        return report
+
+    def __str__(self) -> str:
+        return self.__repr__()
+
+    def __repr__(self) -> str:
+        report = self.report()
+        if len(report) > 30:
+            report = report[:30]  # ~10 first errors
+            report.append("...")
         return "\n".join(report)
 
 
@@ -231,36 +245,47 @@ def _compute_all_metrics(
                 f"recieved data with unexpected dtype {self.references.dtype}"
             )
 
+    def report(self, file_path: Optional[str] = None) -> List[str]:
+        report = []
+        if self.check:
+            report.append("✅ No numerical differences")
+        else:
+            report.append("❌ Numerical failures")
+
+            found_indices = np.logical_not(self.success).nonzero()
+            # List all errors to terminal and file
+            bad_indices_count = len(found_indices[0])
+            full_count = len(self.references.flatten())
+            failures_pct = round(100.0 * (bad_indices_count / full_count), 2)
+            report = [
+                f"All failures ({bad_indices_count}/{full_count}) ({failures_pct}%),\n",
+                f"Index Computed Reference "
+                f"Absolute E(<{self.absolute_eps:.2e}) "
+                f"Relative E(<{self.relative_fraction * 100:.2e}%) "
+                f"ULP E(<{self.ulp_threshold})",
+            ]
+            # Summary and worst result
+            for iBad in range(bad_indices_count):
+                fi = tuple([f[iBad] for f in found_indices])
+                report.append(
+                    f"{str(fi)} {self.computed[fi]:.16e} {self.references[fi]:.16e} "
+                    f"{self.absolute_distance[fi]:.2e} {'✅' if self.absolute_distance_metric[fi] else '❌'} "
+                    f"{self.relative_distance[fi] * 100:.2e} {'✅' if self.relative_distance_metric[fi] else '❌'} "
+                    f"{int(self.ulp_distance[fi]):02} {'✅' if self.ulp_distance_metric[fi] else '❌'} "
+                )
+
+        if file_path:
+            with open(file_path, "w") as fd:
+                fd.write("\n".join(report))
+
+        return report
+
     def __str__(self) -> str:
         return self.__repr__()
 
     def __repr__(self) -> str:
-        if self.check:
-            return "✅ No numerical differences"
-
-        report = []
-        report.append("❌ Numerical failures")
-
-        found_indices = np.logical_not(self.success).nonzero()
-        # List all errors
-        bad_indices_count = len(found_indices[0])
-        full_count = len(self.references.flatten())
-        failures_pct = round(100.0 * (bad_indices_count / full_count), 2)
-        report = [
-            f"All failures ({bad_indices_count}/{full_count}) ({failures_pct}%),\n",
-            f"Index Computed Reference "
-            f"Absolute E(<{self.absolute_eps:.2e}) "
-            f"Relative E(<{self.relative_fraction * 100:.2e}%) "
-            f"ULP E(<{self.ulp_threshold})",
-        ]
-        # Summary and worst result
-        for iBad in range(bad_indices_count):
-            fi = tuple([f[iBad] for f in found_indices])
-            report.append(
-                f"({fi[0]:02}, {fi[1]:02}, {fi[2]:02}) {self.computed[fi]:.16e} {self.references[fi]:.16e} "
-                f"{self.absolute_distance[fi]:.2e} {'✅' if self.absolute_distance_metric[fi] else '❌'} "
-                f"{self.relative_distance[fi] * 100:.2e} {'✅' if self.relative_distance_metric[fi] else '❌'} "
-                f"{int(self.ulp_distance[fi]):02} {'✅' if self.ulp_distance_metric[fi] else '❌'} "
-            )
-
+        report = self.report()
+        if len(report) > 12:
+            report = report[:12]  # ~10 first errors
+            report.append("...")
         return "\n".join(report)