
Commit

Merge pull request #163 from NOAA-OWP/tabulation_performance
Address tabulation performance
fernando-aristizabal authored Oct 12, 2023
2 parents ddd0ff4 + 0054ea3 commit 224e333
Showing 28 changed files with 782 additions and 793 deletions.
2 changes: 1 addition & 1 deletion README.MD
@@ -15,7 +15,7 @@ for comparison are included. The comparisons are based on scoring
philosophies for three statistical data types including categorical,
continuous, and probabilistic.

See the full documentation [here](noaa-owp.github.io/gval/).
See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:

2 changes: 1 addition & 1 deletion docs/markdown/01_INTRO.MD
@@ -7,7 +7,7 @@ GVAL (pronounced "g-val") is a high-level Python framework to evaluate the skill

GVAL is intended to work on raster and vector files as xarray and geopandas objects, respectively. Abilities to prepare or homogenize maps for comparison are included. The comparisons are based on scoring philosophies for three statistical data types including categorical, continuous, and probabilistic.

See the full documentation [here](noaa-owp.github.io/gval/).
See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:
- Our current public API and output formats are likely to change in the future.
2 changes: 1 addition & 1 deletion docs/sphinx/PYPI_README.MD
@@ -15,7 +15,7 @@ for comparison are included. The comparisons are based on scoring
philosophies for three statistical data types including categorical,
continuous, and probabilistic.

See the full documentation [here](noaa-owp.github.io/gval/).
See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:

2 changes: 2 additions & 0 deletions docs/sphinx/SPHINX_README.MD
@@ -15,6 +15,8 @@ for comparison are included. The comparisons are based on scoring
philosophies for three statistical data types including categorical,
continuous, and probabilistic.

See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:

- Our current public API and output formats are likely to change in the
22 changes: 12 additions & 10 deletions docs/sphinx/SphinxContinuousTutorial.ipynb

Large diffs are not rendered by default.

44 changes: 19 additions & 25 deletions docs/sphinx/SphinxMulticatTutorial.ipynb

Large diffs are not rendered by default.

74 changes: 39 additions & 35 deletions docs/sphinx/SphinxTutorial.ipynb

Large diffs are not rendered by default.

22 changes: 12 additions & 10 deletions notebooks/Continuous Comparison Tutorial.ipynb

Large diffs are not rendered by default.

44 changes: 19 additions & 25 deletions notebooks/Multi-Class Categorical Statistics.ipynb

Large diffs are not rendered by default.

76 changes: 40 additions & 36 deletions notebooks/Tutorial.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ authors = [
requires-python = ">=3.8"
keywords = ["geospatial", "evaluations"]
license = {text = "MIT"}
version = "0.2.2"
version = "0.2.3"
dynamic = ["readme", "dependencies"]


105 changes: 37 additions & 68 deletions src/gval/accessors/gval_xarray.py
@@ -21,6 +21,7 @@
from gval.comparison.compute_categorical_metrics import _compute_categorical_metrics
from gval.comparison.compute_continuous_metrics import _compute_continuous_metrics
from gval.attributes.attributes import _attribute_tracking_xarray
from gval.utils.loading_datasets import _parse_string_attributes
from gval.utils.schemas import Crosstab_df, Metrics_df, AttributeTrackingDf
from gval.utils.visualize import _map_plot
from gval.comparison.pairing_functions import difference
@@ -40,7 +41,7 @@ class GVALXarray:
"""

def __init__(self, xarray_obj):
self._obj = xarray_obj
self._obj = _parse_string_attributes(xarray_obj)
self.data_type = type(xarray_obj)
self.agreement_map_format = "raster"

@@ -232,12 +233,7 @@ def categorical_compare(
)

crosstab_df = candidate.gval.compute_crosstab(
benchmark_map=benchmark,
allow_candidate_values=allow_candidate_values,
allow_benchmark_values=allow_benchmark_values,
exclude_value=exclude_value,
comparison_function=comparison_function,
subsampling_df=subsampling_df,
agreement_map=agreement_map, subsampling_df=subsampling_df
)

metrics_df = _compute_categorical_metrics(
@@ -250,6 +246,7 @@
sampling_average=subsampling_average,
)

vector_agreement = self.agreement_map_format == "vector"
if attribute_tracking:
results = self.__handle_attribute_tracking(
candidate_map=candidate,
@@ -263,11 +260,17 @@
else:
attributes_df = results

del candidate, benchmark
agreement_map = (
agreement_map.gval.vectorize_data()
if vector_agreement
else agreement_map
)

return agreement_map, crosstab_df, metrics_df, attributes_df

del candidate, benchmark
agreement_map = (
agreement_map.gval.vectorize_data() if vector_agreement else agreement_map
)

return agreement_map, crosstab_df, metrics_df

@@ -378,9 +381,7 @@ def continuous_compare(

# If sampling_df return type gives three values assign all vars results, otherwise only agreement map results
agreement_map, candidate_map, benchmark_map = (
results
if subsampling_df is not None
else (results, self._obj, benchmark_map)
results if subsampling_df is not None else (results, candidate, benchmark)
)

metrics_df = _compute_continuous_metrics(
@@ -405,8 +406,12 @@
else:
attributes_df = results

del candidate_map, benchmark_map

return agreement_map, metrics_df, attributes_df

del candidate_map, benchmark_map

return agreement_map, metrics_df

def homogenize(
@@ -549,6 +554,7 @@ def compute_agreement_map(
allow_benchmark_values=allow_benchmark_values,
nodata=nodata,
encode_nodata=encode_nodata,
continuous=continuous,
)

# Preserve sampled maps for continuous statistics, otherwise delete
@@ -558,9 +564,6 @@
else:
del candidate_copy, benchmark_copy

if self.agreement_map_format == "vector":
agreement_map = agreement_map.gval.vectorize_data()

agreement_maps.append(agreement_map)

if subsampling_df is not None:
@@ -575,40 +578,21 @@

return agreement_maps[0]

@Comparison.comparison_function_from_string
def compute_crosstab(
self,
benchmark_map: Union[xr.Dataset, xr.DataArray],
allow_candidate_values: Optional[Iterable[Number]] = None,
allow_benchmark_values: Optional[Iterable[Number]] = None,
exclude_value: Optional[Number] = None,
comparison_function: Optional[
Union[Callable, nb.np.ufunc.dufunc.DUFunc, np.ufunc, np.vectorize, str]
] = "szudzik",
pairing_dict: Optional[Dict[Tuple[Number, Number], Number]] = None,
agreement_map: Optional[
Union[xr.DataArray, xr.Dataset, Iterable[Union[xr.DataArray, xr.Dataset]]]
] = None,
subsampling_df: Optional[gpd.GeoDataFrame] = None,
) -> DataFrame[Crosstab_df]:
"""
Crosstab 2 or 3-dimensional xarray DataArray to produce Crosstab DataFrame.
Parameters
----------
benchmark_map : Union[xr.Dataset, xr.DataArray]
agreement_map : Union[xr.Dataset, xr.DataArray], default = None
Benchmark map, {dimension}-dimensional.
allow_candidate_values : Optional[Iterable[Union[int,float]]], default = None
Sequence of values in candidate to include in crosstab. Remaining values are excluded.
allow_benchmark_values : Optional[Iterable[Union[int,float]]], default = None
Sequence of values in benchmark to include in crosstab. Remaining values are excluded.
exclude_value : Optional[Number], default = None
Value to exclude from crosstab. This could be used to denote a no data value if masking wasn't used. By default, NaNs are not cross-tabulated.
comparison_function : Optional[Union[Callable, nb.np.ufunc.dufunc.DUFunc, np.ufunc, np.vectorize, str]], default = "szudzik"
Function to compute agreement values. If None, then no agreement values are computed.
pairing_dict: Optional[Dict[Tuple[Number, Number], Number]], default = None
When "pairing_dict" is used for the comparison_function argument, a pairing dictionary can be passed by user. A pairing dictionary is structured as `{(c, b) : a}` where `(c, b)` is a tuple of the candidate and benchmark value pairing, respectively, and `a` is the value for the agreement array to be used for this pairing.
If None is passed for pairing_dict, the allow_candidate_values and allow_benchmark_values arguments are required. For this case, the pairings in these two iterables will be paired in the order provided and an agreement value will be assigned to each pairing starting with 0 and ending with the number of possible pairings.
A pairing dictionary can be used by the user to note which values to allow and which to ignore for comparisons. It can also be used to decide how nans are handled for cases where either the candidate and benchmark maps have nans or both.
subsampling_df: Optional[gpd.GeoDataFrame], default = None
DataFrame with spatial geometries and method types to subsample
@@ -618,38 +602,23 @@ def compute_crosstab(
DataFrame[Crosstab_df]
Crosstab DataFrame
"""
self.check_same_type(benchmark_map)

results = (
subsample(
candidate=self._obj,
benchmark=benchmark_map,
subsampling_df=subsampling_df,
# Use self if agreement_map argument is not provided otherwise use agreement_map parameter
if agreement_map is not None:
agreement_map = (
agreement_map if isinstance(agreement_map, list) else [agreement_map]
)
if subsampling_df is not None
else [[self._obj, benchmark_map]]
)
else:
agreement_map = [self._obj]

# Create cross-tabulation table for each agreement map and concatenate them
crosstabs = []
for idx, (candidate, benchmark) in enumerate(results):
if isinstance(self._obj, xr.Dataset):
crosstab = _crosstab_Datasets(
candidate,
benchmark,
allow_candidate_values,
allow_benchmark_values,
exclude_value,
comparison_function,
)
else:
crosstab = _crosstab_DataArrays(
candidate,
benchmark,
allow_candidate_values,
allow_benchmark_values,
exclude_value,
comparison_function,
)
for idx, agreement in enumerate(agreement_map):
crosstab = (
_crosstab_Datasets(agreement)
if isinstance(self._obj, xr.Dataset)
else _crosstab_DataArrays(agreement)
)

if subsampling_df is not None:
crosstab.insert(
@@ -728,7 +697,7 @@ def cat_plot(
legend_labels: list = None,
plot_bands: Union[str, list] = "all",
colorbar_label: Union[str, list] = "",
basemap: xyzservices.lib.TileProvider = cx.providers.Stamen.Terrain,
basemap: xyzservices.lib.TileProvider = cx.providers.OpenStreetMap.Mapnik,
):
"""
Plots categorical Map for xarray object
@@ -775,7 +744,7 @@ def cont_plot(
figsize: Tuple[int, int] = None,
plot_bands: Union[str, list] = "all",
colorbar_label: Union[str, list] = "",
basemap: xyzservices.lib.TileProvider = cx.providers.Stamen.Terrain,
basemap: xyzservices.lib.TileProvider = cx.providers.OpenStreetMap.Mapnik,
):
"""
Plots categorical Map for xarray object
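The net effect of the gval_xarray.py changes above is that cross-tabulation is now driven by the agreement map itself: `compute_crosstab` accepts an `agreement_map` (or a list of them) plus an optional `subsampling_df`, and no longer re-pairs the candidate and benchmark rasters, while the agreement map's attributes carry the `pairing_dictionary` described in the docstring. The sketch below is illustrative only — the values and the flattened toy array are hypothetical and it bypasses gval's accessors entirely — but it shows why counting agreement values plus a reverse pairing dictionary is enough to rebuild the candidate/benchmark contingency table.

```python
import numpy as np

# Hypothetical pairing dictionary, structured as described in the docstring above:
# {(candidate_value, benchmark_value): agreement_value}
pairing_dictionary = {
    (0, 0): 0,  # true negative
    (0, 2): 1,  # false negative
    (2, 0): 2,  # false positive
    (2, 2): 3,  # true positive
}

# Toy agreement "raster", flattened. With the pairing dictionary stored on the
# agreement map, the crosstab only needs counts of each agreement value,
# mapped back to its (candidate, benchmark) pair.
agreement_values = np.array([0, 3, 3, 1, 2, 3, 0, 0])
reverse = {a: cb for cb, a in pairing_dictionary.items()}
counts = {
    reverse[a]: int(c)
    for a, c in zip(*np.unique(agreement_values, return_counts=True))
}
print(counts)  # {(0, 0): 3, (0, 2): 1, (2, 0): 1, (2, 2): 3}
```

Tabulating a single precomputed array avoids a second pass over both input maps, which is presumably the performance gain the PR title refers to.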
53 changes: 51 additions & 2 deletions src/gval/comparison/agreement.py
@@ -16,16 +16,17 @@

from typing import Iterable, Optional, Union, Tuple, Callable, Dict
from numbers import Number
from itertools import product

import numpy as np
import xarray as xr
import numba as nb

import dask

from gval.comparison.pairing_functions import (
_make_pairing_dict_fn,
)
from gval.utils.loading_datasets import _handle_xarray_memory
from gval.utils.loading_datasets import _handle_xarray_memory, _check_dask_array


def _compute_agreement_map(
@@ -39,6 +40,7 @@ def _compute_agreement_map(
allow_benchmark_values: Optional[Iterable[Number]] = None,
nodata: Optional[Number] = None,
encode_nodata: Optional[bool] = False,
continuous: Optional[bool] = False,
) -> Union[xr.DataArray, xr.Dataset]:
"""
Computes agreement map as xarray from candidate and benchmark xarray's.
@@ -179,6 +181,53 @@ def _manage_information_loss(agreement_map, crs, nodata, encode_nodata, dtype):
comparison_function, *ufunc_args, **apply_ufunc_kwargs
)

is_dask = _check_dask_array(candidate_map)

def get_unique_values(candidate, benchmark):
unique_c = (
dask.array.unique(candidate.data).compute()
if is_dask
else np.unique(candidate)
)
unique_b = (
dask.array.unique(benchmark.data).compute()
if is_dask
else np.unique(benchmark)
)

return unique_c, unique_b

# Add pairing dictionary and reverse pairing dictionary to agreement map attributes
if pairing_dict is not None and not continuous:
agreement_map.attrs["pairing_dictionary"] = pairing_dict

if pairing_dict is None and not continuous:
if allow_candidate_values is None or allow_benchmark_values is None:
if isinstance(candidate_map, xr.Dataset):
for idx, var in enumerate(candidate_map.data_vars):
agreement_map[var].attrs["pairing_dictionary"] = {
(x, y): comparison_function(x, y)
for x, y in product(
*get_unique_values(candidate_map[var], benchmark_map[var])
)
}

if idx == 0:
agreement_map.attrs

else:
agreement_map.attrs["pairing_dictionary"] = {
(x, y): comparison_function(x, y)
for x, y in product(
*get_unique_values(candidate_map, benchmark_map)
)
}
else:
agreement_map.attrs["pairing_dictionary"] = {
(x, y): comparison_function(x, y)
for x, y in product(allow_candidate_values, allow_benchmark_values)
}

if isinstance(candidate_map, xr.DataArray):
agreement_map = _manage_information_loss(
agreement_map=agreement_map,
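In agreement.py, `_compute_agreement_map` now attaches a `pairing_dictionary` attribute built by applying the comparison function to every combination of unique candidate and benchmark values (`itertools.product`), using `dask.array.unique` when the arrays are dask-backed. Below is a minimal sketch of that idea using the standard Szudzik "elegant" pairing; the `szudzik` helper and the value lists are hypothetical stand-ins, not the implementation in `gval.comparison.pairing_functions`.

```python
from itertools import product

def szudzik(a, b):
    # Standard Szudzik "elegant" pairing for non-negative integers:
    # maps each (a, b) pair to a unique single value.
    a, b = int(a), int(b)
    return a * a + a + b if a >= b else a + b * b

# Hypothetical unique values found in a candidate and a benchmark map.
unique_candidate = [0, 1, 2]
unique_benchmark = [0, 2]

pairing_dictionary = {
    (c, b): szudzik(c, b) for c, b in product(unique_candidate, unique_benchmark)
}
print(pairing_dictionary)
# {(0, 0): 0, (0, 2): 4, (1, 0): 2, (1, 2): 5, (2, 0): 6, (2, 2): 8}
```

Because the pairing is one-to-one for non-negative integer inputs, each agreement value can later be mapped back to exactly one (candidate, benchmark) pair when tabulating.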
8 changes: 7 additions & 1 deletion src/gval/comparison/compute_categorical_metrics.py
@@ -51,7 +51,13 @@ def _handle_positive_negative_categories(

# finds the unique values in the sample's candidate and benchmark values
unique_values = set(
crosstab_df.loc[:, ["candidate_values", "benchmark_values"]].to_numpy().ravel()
[
item
for item in crosstab_df.loc[:, ["candidate_values", "benchmark_values"]]
.to_numpy()
.ravel()
if not isinstance(item, list)
]
)

# this checks that user passed positive or negative categories exist in sample df
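The change to `_handle_positive_negative_categories` skips list-valued cells when collecting unique candidate/benchmark values from the crosstab DataFrame. Lists are unhashable, so without the filter the `set(...)` call would raise a TypeError. A toy, self-contained example of the filtering follows; the DataFrame contents are hypothetical, and the list entry is assumed to come from aggregated or subsampled results.

```python
import pandas as pd

# Hypothetical crosstab rows: one cell holds a list, which should not be
# treated as a single unique category (and cannot be added to a set).
crosstab_df = pd.DataFrame(
    {
        "candidate_values": [1, 2, [1, 2]],
        "benchmark_values": [1, 2, 2],
    },
    dtype=object,
)

unique_values = set(
    [
        item
        for item in crosstab_df.loc[:, ["candidate_values", "benchmark_values"]]
        .to_numpy()
        .ravel()
        if not isinstance(item, list)
    ]
)
print(unique_values)  # {1, 2}
```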
