
Commit

Merge pull request #163 from NOAA-OWP/tabulation_performance
Address tabulation performance
fernando-aristizabal authored Oct 12, 2023
2 parents ddd0ff4 + 0054ea3 commit 224e333
Showing 28 changed files with 782 additions and 793 deletions.
2 changes: 1 addition & 1 deletion README.MD
@@ -15,7 +15,7 @@ for comparison are included. The comparisons are based on scoring
philosophies for three statistical data types including categorical,
continuous, and probabilistic.

See the full documentation [here](noaa-owp.github.io/gval/).
See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:

2 changes: 1 addition & 1 deletion docs/markdown/01_INTRO.MD
@@ -7,7 +7,7 @@ GVAL (pronounced "g-val") is a high-level Python framework to evaluate the skill

GVAL is intended to work on raster and vector files as xarray and geopandas objects, respectively. Abilities to prepare or homogenize maps for comparison are included. The comparisons are based on scoring philosophies for three statistical data types including categorical, continuous, and probabilistic.

See the full documentation [here](noaa-owp.github.io/gval/).
See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:
- Our current public API and output formats are likely to change in the future.
2 changes: 1 addition & 1 deletion docs/sphinx/PYPI_README.MD
@@ -15,7 +15,7 @@ for comparison are included. The comparisons are based on scoring
philosophies for three statistical data types including categorical,
continuous, and probabilistic.

See the full documentation [here](noaa-owp.github.io/gval/).
See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:

2 changes: 2 additions & 0 deletions docs/sphinx/SPHINX_README.MD
@@ -15,6 +15,8 @@ for comparison are included. The comparisons are based on scoring
philosophies for three statistical data types including categorical,
continuous, and probabilistic.

See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:

- Our current public API and output formats are likely to change in the
22 changes: 12 additions & 10 deletions docs/sphinx/SphinxContinuousTutorial.ipynb

Large diffs are not rendered by default.

44 changes: 19 additions & 25 deletions docs/sphinx/SphinxMulticatTutorial.ipynb

Large diffs are not rendered by default.

74 changes: 39 additions & 35 deletions docs/sphinx/SphinxTutorial.ipynb

Large diffs are not rendered by default.

22 changes: 12 additions & 10 deletions notebooks/Continuous Comparison Tutorial.ipynb

Large diffs are not rendered by default.

44 changes: 19 additions & 25 deletions notebooks/Multi-Class Categorical Statistics.ipynb

Large diffs are not rendered by default.

76 changes: 40 additions & 36 deletions notebooks/Tutorial.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ authors = [
requires-python = ">=3.8"
keywords = ["geospatial", "evaluations"]
license = {text = "MIT"}
version = "0.2.2"
version = "0.2.3"
dynamic = ["readme", "dependencies"]


105 changes: 37 additions & 68 deletions src/gval/accessors/gval_xarray.py
@@ -21,6 +21,7 @@
from gval.comparison.compute_categorical_metrics import _compute_categorical_metrics
from gval.comparison.compute_continuous_metrics import _compute_continuous_metrics
from gval.attributes.attributes import _attribute_tracking_xarray
from gval.utils.loading_datasets import _parse_string_attributes
from gval.utils.schemas import Crosstab_df, Metrics_df, AttributeTrackingDf
from gval.utils.visualize import _map_plot
from gval.comparison.pairing_functions import difference
@@ -40,7 +41,7 @@ class GVALXarray:
"""

def __init__(self, xarray_obj):
self._obj = xarray_obj
self._obj = _parse_string_attributes(xarray_obj)
self.data_type = type(xarray_obj)
self.agreement_map_format = "raster"

@@ -232,12 +233,7 @@ def categorical_compare(
)

crosstab_df = candidate.gval.compute_crosstab(
benchmark_map=benchmark,
allow_candidate_values=allow_candidate_values,
allow_benchmark_values=allow_benchmark_values,
exclude_value=exclude_value,
comparison_function=comparison_function,
subsampling_df=subsampling_df,
agreement_map=agreement_map, subsampling_df=subsampling_df
)

metrics_df = _compute_categorical_metrics(
@@ -250,6 +246,7 @@
sampling_average=subsampling_average,
)

vector_agreement = self.agreement_map_format == "vector"
if attribute_tracking:
results = self.__handle_attribute_tracking(
candidate_map=candidate,
@@ -263,11 +260,17 @@
else:
attributes_df = results

del candidate, benchmark
agreement_map = (
agreement_map.gval.vectorize_data()
if vector_agreement
else agreement_map
)

return agreement_map, crosstab_df, metrics_df, attributes_df

del candidate, benchmark
agreement_map = (
agreement_map.gval.vectorize_data() if vector_agreement else agreement_map
)

return agreement_map, crosstab_df, metrics_df

@@ -378,9 +381,7 @@ def continuous_compare(

# If sampling_df return type gives three values assign all vars results, otherwise only agreement map results
agreement_map, candidate_map, benchmark_map = (
results
if subsampling_df is not None
else (results, self._obj, benchmark_map)
results if subsampling_df is not None else (results, candidate, benchmark)
)

metrics_df = _compute_continuous_metrics(
@@ -405,8 +406,12 @@
else:
attributes_df = results

del candidate_map, benchmark_map

return agreement_map, metrics_df, attributes_df

del candidate_map, benchmark_map

return agreement_map, metrics_df

def homogenize(
@@ -549,6 +554,7 @@ def compute_agreement_map(
allow_benchmark_values=allow_benchmark_values,
nodata=nodata,
encode_nodata=encode_nodata,
continuous=continuous,
)

# Preserve sampled maps for continuous statistics, otherwise delete
@@ -558,9 +564,6 @@
else:
del candidate_copy, benchmark_copy

if self.agreement_map_format == "vector":
agreement_map = agreement_map.gval.vectorize_data()

agreement_maps.append(agreement_map)

if subsampling_df is not None:
@@ -575,40 +578,21 @@

return agreement_maps[0]

@Comparison.comparison_function_from_string
def compute_crosstab(
self,
benchmark_map: Union[xr.Dataset, xr.DataArray],
allow_candidate_values: Optional[Iterable[Number]] = None,
allow_benchmark_values: Optional[Iterable[Number]] = None,
exclude_value: Optional[Number] = None,
comparison_function: Optional[
Union[Callable, nb.np.ufunc.dufunc.DUFunc, np.ufunc, np.vectorize, str]
] = "szudzik",
pairing_dict: Optional[Dict[Tuple[Number, Number], Number]] = None,
agreement_map: Optional[
Union[xr.DataArray, xr.Dataset, Iterable[Union[xr.DataArray, xr.Dataset]]]
] = None,
subsampling_df: Optional[gpd.GeoDataFrame] = None,
) -> DataFrame[Crosstab_df]:
"""
Crosstab 2 or 3-dimensional xarray DataArray to produce Crosstab DataFrame.
Parameters
----------
benchmark_map : Union[xr.Dataset, xr.DataArray]
agreement_map : Union[xr.Dataset, xr.DataArray], default = None
Benchmark map, {dimension}-dimensional.
allow_candidate_values : Optional[Iterable[Union[int,float]]], default = None
Sequence of values in candidate to include in crosstab. Remaining values are excluded.
allow_benchmark_values : Optional[Iterable[Union[int,float]]], default = None
Sequence of values in benchmark to include in crosstab. Remaining values are excluded.
exclude_value : Optional[Number], default = None
Value to exclude from crosstab. This could be used to denote a no data value if masking wasn't used. By default, NaNs are not cross-tabulated.
comparison_function : Optional[Union[Callable, nb.np.ufunc.dufunc.DUFunc, np.ufunc, np.vectorize, str]], default = "szudzik"
Function to compute agreement values. If None, then no agreement values are computed.
pairing_dict: Optional[Dict[Tuple[Number, Number], Number]], default = None
When "pairing_dict" is used for the comparison_function argument, a pairing dictionary can be passed by user. A pairing dictionary is structured as `{(c, b) : a}` where `(c, b)` is a tuple of the candidate and benchmark value pairing, respectively, and `a` is the value for the agreement array to be used for this pairing.
If None is passed for pairing_dict, the allow_candidate_values and allow_benchmark_values arguments are required. For this case, the pairings in these two iterables will be paired in the order provided and an agreement value will be assigned to each pairing starting with 0 and ending with the number of possible pairings.
A pairing dictionary can be used by the user to note which values to allow and which to ignore for comparisons. It can also be used to decide how nans are handled for cases where either the candidate and benchmark maps have nans or both.
subsampling_df: Optional[gpd.GeoDataFrame], default = None
DataFrame with spatial geometries and method types to subsample
@@ -618,38 +602,23 @@ def compute_crosstab(
DataFrame[Crosstab_df]
Crosstab DataFrame
"""
self.check_same_type(benchmark_map)

results = (
subsample(
candidate=self._obj,
benchmark=benchmark_map,
subsampling_df=subsampling_df,
# Use self if agreement_map argument is not provided otherwise use agreement_map parameter
if agreement_map is not None:
agreement_map = (
agreement_map if isinstance(agreement_map, list) else [agreement_map]
)
if subsampling_df is not None
else [[self._obj, benchmark_map]]
)
else:
agreement_map = [self._obj]

# Create cross-tabulation table for each agreement map and concatenate them
crosstabs = []
for idx, (candidate, benchmark) in enumerate(results):
if isinstance(self._obj, xr.Dataset):
crosstab = _crosstab_Datasets(
candidate,
benchmark,
allow_candidate_values,
allow_benchmark_values,
exclude_value,
comparison_function,
)
else:
crosstab = _crosstab_DataArrays(
candidate,
benchmark,
allow_candidate_values,
allow_benchmark_values,
exclude_value,
comparison_function,
)
for idx, agreement in enumerate(agreement_map):
crosstab = (
_crosstab_Datasets(agreement)
if isinstance(self._obj, xr.Dataset)
else _crosstab_DataArrays(agreement)
)

if subsampling_df is not None:
crosstab.insert(
@@ -728,7 +697,7 @@ def cat_plot(
legend_labels: list = None,
plot_bands: Union[str, list] = "all",
colorbar_label: Union[str, list] = "",
basemap: xyzservices.lib.TileProvider = cx.providers.Stamen.Terrain,
basemap: xyzservices.lib.TileProvider = cx.providers.OpenStreetMap.Mapnik,
):
"""
Plots categorical Map for xarray object
@@ -775,7 +744,7 @@ def cont_plot(
figsize: Tuple[int, int] = None,
plot_bands: Union[str, list] = "all",
colorbar_label: Union[str, list] = "",
basemap: xyzservices.lib.TileProvider = cx.providers.Stamen.Terrain,
basemap: xyzservices.lib.TileProvider = cx.providers.OpenStreetMap.Mapnik,
):
"""
Plots categorical Map for xarray object
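The net effect of the gval_xarray.py changes above is that cross-tabulation is now driven by the agreement map itself: `compute_crosstab` accepts an `agreement_map` (or a list of them) plus an optional `subsampling_df`, and no longer re-pairs the candidate and benchmark rasters, while the agreement map's attributes carry the `pairing_dictionary` described in the docstring. The sketch below is illustrative only — the values and the flattened toy array are hypothetical and it bypasses gval's accessors entirely — but it shows why counting agreement values plus a reverse pairing dictionary is enough to rebuild the candidate/benchmark contingency table.

```python
import numpy as np

# Hypothetical pairing dictionary, structured as described in the docstring above:
# {(candidate_value, benchmark_value): agreement_value}
pairing_dictionary = {
    (0, 0): 0,  # true negative
    (0, 2): 1,  # false negative
    (2, 0): 2,  # false positive
    (2, 2): 3,  # true positive
}

# Toy agreement "raster", flattened. With the pairing dictionary stored on the
# agreement map, the crosstab only needs counts of each agreement value,
# mapped back to its (candidate, benchmark) pair.
agreement_values = np.array([0, 3, 3, 1, 2, 3, 0, 0])
reverse = {a: cb for cb, a in pairing_dictionary.items()}
counts = {
    reverse[a]: int(c)
    for a, c in zip(*np.unique(agreement_values, return_counts=True))
}
print(counts)  # {(0, 0): 3, (0, 2): 1, (2, 0): 1, (2, 2): 3}
```

Tabulating a single precomputed array avoids a second pass over both input maps, which is presumably the performance gain the PR title refers to.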
53 changes: 51 additions & 2 deletions src/gval/comparison/agreement.py
@@ -16,16 +16,17 @@

from typing import Iterable, Optional, Union, Tuple, Callable, Dict
from numbers import Number
from itertools import product

import numpy as np
import xarray as xr
import numba as nb

import dask

from gval.comparison.pairing_functions import (
_make_pairing_dict_fn,
)
from gval.utils.loading_datasets import _handle_xarray_memory
from gval.utils.loading_datasets import _handle_xarray_memory, _check_dask_array


def _compute_agreement_map(
@@ -39,6 +40,7 @@ def _compute_agreement_map(
allow_benchmark_values: Optional[Iterable[Number]] = None,
nodata: Optional[Number] = None,
encode_nodata: Optional[bool] = False,
continuous: Optional[bool] = False,
) -> Union[xr.DataArray, xr.Dataset]:
"""
Computes agreement map as xarray from candidate and benchmark xarray's.
@@ -179,6 +181,53 @@ def _manage_information_loss(agreement_map, crs, nodata, encode_nodata, dtype):
comparison_function, *ufunc_args, **apply_ufunc_kwargs
)

is_dask = _check_dask_array(candidate_map)

def get_unique_values(candidate, benchmark):
unique_c = (
dask.array.unique(candidate.data).compute()
if is_dask
else np.unique(candidate)
)
unique_b = (
dask.array.unique(benchmark.data).compute()
if is_dask
else np.unique(benchmark)
)

return unique_c, unique_b

# Add pairing dictionary and reverse pairing dictionary to agreement map attributes
if pairing_dict is not None and not continuous:
agreement_map.attrs["pairing_dictionary"] = pairing_dict

if pairing_dict is None and not continuous:
if allow_candidate_values is None or allow_benchmark_values is None:
if isinstance(candidate_map, xr.Dataset):
for idx, var in enumerate(candidate_map.data_vars):
agreement_map[var].attrs["pairing_dictionary"] = {
(x, y): comparison_function(x, y)
for x, y in product(
*get_unique_values(candidate_map[var], benchmark_map[var])
)
}

if idx == 0:
agreement_map.attrs

else:
agreement_map.attrs["pairing_dictionary"] = {
(x, y): comparison_function(x, y)
for x, y in product(
*get_unique_values(candidate_map, benchmark_map)
)
}
else:
agreement_map.attrs["pairing_dictionary"] = {
(x, y): comparison_function(x, y)
for x, y in product(allow_candidate_values, allow_benchmark_values)
}

if isinstance(candidate_map, xr.DataArray):
agreement_map = _manage_information_loss(
agreement_map=agreement_map,
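In agreement.py, `_compute_agreement_map` now attaches a `pairing_dictionary` attribute built by applying the comparison function to every combination of unique candidate and benchmark values (`itertools.product`), using `dask.array.unique` when the arrays are dask-backed. Below is a minimal sketch of that idea using the standard Szudzik "elegant" pairing; the `szudzik` helper and the value lists are hypothetical stand-ins, not the implementation in `gval.comparison.pairing_functions`.

```python
from itertools import product

def szudzik(a, b):
    # Standard Szudzik "elegant" pairing for non-negative integers:
    # maps each (a, b) pair to a unique single value.
    a, b = int(a), int(b)
    return a * a + a + b if a >= b else a + b * b

# Hypothetical unique values found in a candidate and a benchmark map.
unique_candidate = [0, 1, 2]
unique_benchmark = [0, 2]

pairing_dictionary = {
    (c, b): szudzik(c, b) for c, b in product(unique_candidate, unique_benchmark)
}
print(pairing_dictionary)
# {(0, 0): 0, (0, 2): 4, (1, 0): 2, (1, 2): 5, (2, 0): 6, (2, 2): 8}
```

Because the pairing is one-to-one for non-negative integer inputs, each agreement value can later be mapped back to exactly one (candidate, benchmark) pair when tabulating.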
8 changes: 7 additions & 1 deletion src/gval/comparison/compute_categorical_metrics.py
@@ -51,7 +51,13 @@ def _handle_positive_negative_categories(

# finds the unique values in the sample's candidate and benchmark values
unique_values = set(
crosstab_df.loc[:, ["candidate_values", "benchmark_values"]].to_numpy().ravel()
[
item
for item in crosstab_df.loc[:, ["candidate_values", "benchmark_values"]]
.to_numpy()
.ravel()
if not isinstance(item, list)
]
)

# this checks that user passed positive or negative categories exist in sample df
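The change to `_handle_positive_negative_categories` skips list-valued cells when collecting unique candidate/benchmark values from the crosstab DataFrame. Lists are unhashable, so without the filter the `set(...)` call would raise a TypeError. A toy, self-contained example of the filtering follows; the DataFrame contents are hypothetical, and the list entry is assumed to come from aggregated or subsampled results.

```python
import pandas as pd

# Hypothetical crosstab rows: one cell holds a list, which should not be
# treated as a single unique category (and cannot be added to a set).
crosstab_df = pd.DataFrame(
    {
        "candidate_values": [1, 2, [1, 2]],
        "benchmark_values": [1, 2, 2],
    },
    dtype=object,
)

unique_values = set(
    [
        item
        for item in crosstab_df.loc[:, ["candidate_values", "benchmark_values"]]
        .to_numpy()
        .ravel()
        if not isinstance(item, list)
    ]
)
print(unique_values)  # {1, 2}
```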
