diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7cd9461..2ba7c98 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -41,4 +41,4 @@ jobs:
 
       # Run the unit tests
       - name: Test with pytest
-        run: python -m pytest --verbose tests/
+        run: python -m pytest --cov evalem --verbose tests/
diff --git a/evalem/__init__.py b/evalem/__init__.py
index a50c934..edff7d3 100644
--- a/evalem/__init__.py
+++ b/evalem/__init__.py
@@ -1,7 +1,12 @@
 __version__ = "0.0.4-alpha"
 
 from ._base.evaluators import Evaluator  # noqa
-from ._base.pipelines import EvaluationPipeline, SimpleEvaluationPipeline  # noqa
+from ._base.pipelines import (  # noqa
+    EvaluationPipeline,
+    NamedSimpleEvaluationPipeline,
+    SimpleEvaluationPipeline,
+)
+from ._base.structures import MetricResult  # noqa
 from .nlp.models import (  # noqa
     QuestionAnsweringHFPipelineWrapper,
     TextClassificationHFPipelineWrapper,
diff --git a/evalem/_base/abc.py b/evalem/_base/abc.py
index f993490..beb396d 100755
--- a/evalem/_base/abc.py
+++ b/evalem/_base/abc.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 from abc import ABC
+from itertools import count
 from typing import Any
 
 
@@ -26,6 +27,38 @@ def __repr__(self) -> str:
         return f"[{self.__classname__}]"
 
 
+class InstanceCountMixin:
+    """
+    This mixin is used to autogenerate names for
+    individual objects.
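+
+    Illustrative usage with a hypothetical subclass `Foo`
+    (assumes the shared counter starts at 0, i.e. no instances yet):
+
+        class Foo(InstanceCountMixin):
+            pass
+
+        foo = Foo()
+        foo.name            # -> "Foo:0" (autogenerated)
+        foo.name = "my-foo"
+        foo.name            # -> "my-foo"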
+    """
+
+    _ids = count(0)
+
+    def __init__(self):
+        self.idx = next(self._ids)
+        self._name = None
+
+    @property
+    def name(self):
+        return self._name or f"{self.__class__.__name__}:{self.idx}"
+
+    @name.setter
+    def name(self, name: str):
+        self._name = name
+
+
 def main():
     pass
 
diff --git a/evalem/_base/evaluators.py b/evalem/_base/evaluators.py
index 8672e39..869f5cd 100755
--- a/evalem/_base/evaluators.py
+++ b/evalem/_base/evaluators.py
@@ -114,7 +114,6 @@ def _type_check_metrics(
         metrics = [metrics] if not isinstance(metrics, Iterable) else metrics
         for _metric in metrics:
             if not isinstance(_metric, Metric):
-                print(_metric)
                 raise TypeError(
                     f"Invalid type for metric={_metric}. Expected type of [Metric]. Got {type(_metric)}",
                 )
@@ -158,12 +157,9 @@ def evaluate(
         Returns:
-            Mapping (dict) of metric name to corresponding metric output
+            List of metric results, one per metric
         """
-        return dict(
+        return list(
             map(
-                lambda m: (
-                    m.__classname__,
-                    m(predictions=predictions, references=references, **kwargs),
-                ),
+                lambda m: m(predictions=predictions, references=references, **kwargs),
                 self.metrics,
             ),
         )
diff --git a/evalem/_base/metrics.py b/evalem/_base/metrics.py
index e5c9a1c..06a9867 100755
--- a/evalem/_base/metrics.py
+++ b/evalem/_base/metrics.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 
+from __future__ import annotations
+
 from abc import abstractmethod
 from typing import Iterable, List, Tuple
 
@@ -11,7 +13,7 @@
 from .structures import (
     EvaluationPredictionInstance,
     EvaluationReferenceInstance,
-    MetricOutput,
+    MetricResult,
     SinglePredictionInstance,
 )
 
@@ -40,7 +42,7 @@ def compute(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         """
         The actual entrypoint method to perform evaluation and give
         output metric.
@@ -71,7 +73,7 @@ def __call__(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         """
         The actual entrypoint method to perform evaluation and give
         output metric.
@@ -208,7 +210,7 @@ def compute(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         predictions = format_to_jury(predictions)
         references = format_to_jury(references)
 
@@ -223,7 +225,8 @@ def compute(
             if isinstance(v, dict) and "score" in v:
                 res["score"] = v.get("score", None)
             res[k] = v
-        return res
+        res["metric_name"] = self.__classname__
+        return MetricResult.from_dict(res)
 
 
 class PrecisionMetric(JuryBasedMetric, BasicMetric):
@@ -256,7 +259,7 @@ def compute(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         # converts all the structure into list of string
         predictions, references = format_to_jury(predictions), format_to_jury(
             references,
@@ -265,12 +268,19 @@ def compute(
 
         predictions, references = self._flatten_references(predictions, references)
 
         labels = self.__get_labels(predictions, references)
-        return dict(
-            confusion_matrix=confusion_matrix(references, predictions, labels=labels),
-            labels=labels,
-            flattened=True,
-            total_items=len(predictions),
-            empty_items=0,
+        return MetricResult.from_dict(
+            dict(
+                metric_name="ConfusionMatrix",
+                confusion_matrix=confusion_matrix(
+                    references,
+                    predictions,
+                    labels=labels,
+                ),
+                labels=labels,
+                flattened=True,
+                total_items=len(predictions),
+                empty_items=0,
+            ),
         )
 
     def __get_labels(
diff --git a/evalem/_base/pipelines.py b/evalem/_base/pipelines.py
index 08fef47..b884bc7 100755
--- a/evalem/_base/pipelines.py
+++ b/evalem/_base/pipelines.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 
 from abc import abstractmethod
-from typing import Any, Iterable, List, Mapping, Type, Union
+from typing import Any, Iterable, List, Mapping, Optional, Type, Union
 
-from .abc import AbstractBase
+from .abc import AbstractBase, InstanceCountMixin
 from .evaluators import Evaluator
 from .models import ModelWrapper
 from .structures import EvaluationReferenceInstance, MetricOutput
@@ -45,9 +45,9 @@ class SimpleEvaluationPipeline(EvaluationPipeline):
 
     .. code-block: python
 
-        from evalem.pipelines import SimpleEvaluationPipeline
-        from evalem.models import TextClassificationHFPipelineWrapper
-        from evalem.evaluators import TextClassificationEvaluator
+        from evalem import SimpleEvaluationPipeline
+        from evalem.nlp.models import TextClassificationHFPipelineWrapper
+        from evalem.nlp.evaluators import TextClassificationEvaluator
 
         model = TextClassificationHFPipelineWrapper()
         evaluator = TextClassificationEvaluator()
@@ -97,6 +97,48 @@ def run(
         )
 
 
+class NamedSimpleEvaluationPipeline(InstanceCountMixin, SimpleEvaluationPipeline):
+    """
+
+    This is a named version of SimpleEvaluationPipeline that uses a single model
+    and a list of evaluators to run the evaluation.
+
+    Args:
+        ```model```: ```Type[ModelWrapper]```
+            Wrapped model to do the inference.
+        ```evaluators```: ```Union[Evaluator, Iterable[Evaluator]]```
+            Either a single evaluator or an iterable of evaluators.
+            Note: If a single evaluator is provided, it'll be wrapped into
+            an iterable ultimately.
+        ```name```: ```Optional[str]```
+            Name of the pipeline. If not provided, the name is autogenerated
+            (via `evalem._base.abc.InstanceCountMixin`).
+
+    Usage:
+
+        from evalem import NamedSimpleEvaluationPipeline
+        from evalem.nlp.models import TextClassificationHFPipelineWrapper
+        from evalem.nlp.evaluators import TextClassificationEvaluator
+
+        model = TextClassificationHFPipelineWrapper()
+        evaluator = TextClassificationEvaluator()
+        pipe = NamedSimpleEvaluationPipeline(model=model, evaluators=evaluator)
+        print(pipe.name)  # autogenerated if no name is given
+
+        results = pipe(inputs, references)
+    """
+
+    def __init__(
+        self,
+        model: Type[ModelWrapper],
+        evaluators: Union[Evaluator, Iterable[Evaluator]],
+        name: Optional[str] = None,
+    ) -> None:
+        InstanceCountMixin.__init__(self)
+        SimpleEvaluationPipeline.__init__(self, model=model, evaluators=evaluators)
+        self.name = name
+
+
 def main():
     pass
 
diff --git a/evalem/_base/structures.py b/evalem/_base/structures.py
index f09bb81..8b9e538 100755
--- a/evalem/_base/structures.py
+++ b/evalem/_base/structures.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from copy import deepcopy
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Type, Union
@@ -41,6 +42,56 @@ class ReferenceDTO(EvaluationDTO):
     pass
 
 
+@dataclass(frozen=True)
+class MetricResult:
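+    """
+    Container for the result of a single metric run.
+
+    `score` holds the primary scalar value, `metric_name` identifies the
+    metric, and any keys not captured by the other fields end up in
+    `extra` (see `from_dict` below).
+
+    Illustrative construction (hypothetical values):
+
+        MetricResult.from_dict(
+            {"metric_name": "accuracy", "score": 0.93, "total_items": 100},
+        )
+    """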
+    score: float
+    total_items: int
+    metric_name: str
+    empty_items: int = 0
+    extra: Optional[dict] = None
+
+    @classmethod
+    def from_dict(cls, dct: dict) -> MetricResult:
+        dct = deepcopy(dct)
+        return cls(
+            score=dct.pop("score", None),
+            total_items=dct.pop("total_items", None),
+            metric_name=dct.pop("metric_name", None),
+            empty_items=dct.pop("empty_items", 0),
+            extra=dct,
+        )
+
+    def as_dict(self) -> dict:
+        return asdict(self)
+
+    def to_dict(self) -> dict:
+        return asdict(self)
+
+    def __hash__(self) -> int:
+        score = self.score if isinstance(self.score, (float, int)) else 0
+        return hash(
+            (
+                self.metric_name,
+                round(score, 3),
+                self.total_items,
+                self.empty_items,
+            ),
+        )
+
+
 ImageTensor = Union[np.ndarray, torch.Tensor]
 
 # Represents type instance for any single downstream prediction
@@ -67,6 +118,6 @@
 EvaluationReferenceInstance = Union[SingleReferenceInstance, MultipleReferenceInstance]
 
 EvaluationOutput = Union[int, float, Dict[str, Union[str, int, float]]]
-MetricOutput = Union[int, float, Dict[str, Union[str, int, float]]]
+MetricOutput = Union[int, float, Dict[str, Union[str, int, float]], MetricResult]
 
 PathType = Union[str, Path]
diff --git a/evalem/misc/utils.py b/evalem/misc/utils.py
index 5d92509..a84bafc 100755
--- a/evalem/misc/utils.py
+++ b/evalem/misc/utils.py
@@ -1,6 +1,11 @@
 #!/usr/bin/env python3
 
-from typing import Iterable, List, Union
+from itertools import chain
+from typing import Any, Iterable, List, Union
+
+import numpy as np
+import pandas as pd
+from loguru import logger
 
 from .._base.structures import EvaluationDTO, PredictionInstance, ReferenceInstance
 
@@ -38,3 +43,88 @@ def _dtofy(instance):
         return list(map(format_to_jury, instances))
     else:
         return instances
+
+
+def flatten_list(nested_list: Union[list, tuple, set]) -> List[Any]:
+    """
+    Flattens a nested list of lists.
+    Elements that are not themselves lists are handled as well
+    (e.g.: `[[1, 2], 3]`).
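+
+    Example (illustrative):
+
+        >>> flatten_list([[1, 2], [3, [4, 5]], 6])
+        [1, 2, 3, 4, 5, 6]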
+    """
+    return list(
+        chain.from_iterable(
+            flatten_list(x) if isinstance(x, (list, set, tuple)) else [x]
+            for x in nested_list
+        ),
+    )
+
+
+def build_comparison_table(
+    *eval_pipes,
+    inputs,
+    references,
+    **eval_params,
+) -> Union[dict, pd.DataFrame]:
+    """
+    A utility that runs the provided evaluation pipelines
+    and generates a comparison table.
+
+    Note:
+        Assumes the same set of inputs and references is run through
+        each evaluation pipeline.
+
+    Args:
+        ```eval_pipes```: ```Type[EvaluationPipeline]```
+            Evaluation pipeline objects
+        ```inputs```: ```Any```
+            Inputs that are fed to each pipeline for the forward pass
+        ```references```: ```EvaluationReferenceInstance```
+            References/ground-truths for the evaluation.
+            See `evalem._base.structures.EvaluationReferenceInstance` for the type
+
+    Returns:
+        Either a pandas DataFrame or a dict.
+        If the DataFrame creation fails, a dict is returned instead.
+
+        In the DataFrame, the index is the metric name and each remaining
+        column holds one pipeline's score for that metric.
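+
+    Illustrative usage (model/pipeline setup omitted; `pipe_1` and
+    `pipe_2` stand for any evaluation pipeline instances):
+
+        table = build_comparison_table(
+            pipe_1,
+            pipe_2,
+            inputs=inputs,
+            references=references,
+        )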
+    """
+    results = map(lambda ep: ep(inputs=inputs, references=references, **eval_params), eval_pipes)
+    comparison_map = {}
+    dfs = []
+    n_items_tracker = []
+    for idx, (ep, result) in enumerate(zip(eval_pipes, results)):
+        name = f"eval-pipe-{idx}" if not hasattr(ep, "name") else ep.name
+        metrics = set(flatten_list(result))
+        n_items_tracker.extend([m.total_items for m in metrics])
+        comparison_map[name] = metrics
+
+        df = pd.DataFrame(
+            map(lambda m: {"metric": m.metric_name, name: m.score}, metrics),
+        )
+        df.set_index("metric", inplace=True)
+        dfs.append(df)
+    logger.info(
+        f"On average, {int(np.mean(n_items_tracker))} items were evaluated per metric.",
+    )
+    res = comparison_map
+    try:
+        res = pd.concat(dfs, join="outer", axis=1)
+    except Exception:  # noqa
+        logger.warning("Failed to create pd.DataFrame table. Falling back to dict.")
+    return res
diff --git a/evalem/nlp/metrics/basics.py b/evalem/nlp/metrics/basics.py
index 6232d1b..68a1f9d 100755
--- a/evalem/nlp/metrics/basics.py
+++ b/evalem/nlp/metrics/basics.py
@@ -1,9 +1,11 @@
 #!/usr/bin/env python3
 
+import dataclasses
+
 from ..._base.metrics import JuryBasedMetric
 from ..._base.structures import (
     EvaluationReferenceInstance,
-    MetricOutput,
+    MetricResult,
     SinglePredictionInstance,
 )
 from ._base import NLPMetric
@@ -18,7 +20,7 @@ def compute(
         predictions: SinglePredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         # This metric doesn't support multi-reference format.
         # So, we flatten everything:
         # Single Prediction, Multi-Ref -> Single Prediction, Single-Ref
@@ -28,6 +30,12 @@
             references=references,
             **kwargs,
         )
-        result["score"] = result.get("exact_match", None)
-        result["flattened"] = True
+
+        extra = result.extra
+        extra["flattened"] = True
+        result = dataclasses.replace(
+            result,
+            score=result.extra.get("exact_match", None),
+            extra=extra,
+        )
         return result
diff --git a/evalem/nlp/metrics/semantics.py b/evalem/nlp/metrics/semantics.py
index 389ad2e..ab8b1a6 100755
--- a/evalem/nlp/metrics/semantics.py
+++ b/evalem/nlp/metrics/semantics.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 
+
+import dataclasses
 from typing import Optional
 
 import numpy as np
@@ -9,7 +11,7 @@
 from ..._base.structures import (
     EvaluationPredictionInstance,
     EvaluationReferenceInstance,
-    MetricOutput,
+    MetricResult,
 )
 from ...misc.utils import format_to_jury
 from ._base import NLPMetric
@@ -87,7 +89,7 @@ def compute(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         device = kwargs.pop("device", self.device)
         model_type = kwargs.pop("model_type", self.model_type)
         result = super().compute(
@@ -101,7 +103,9 @@
         # and want to just have mean/average.
         if not self.per_instance_score:
             for _key in ["precision", "recall", "f1"]:
-                result["bertscore"][_key] = np.mean(result["bertscore"][_key])
+                result.extra["bertscore"][_key] = np.mean(
+                    result.extra["bertscore"][_key],
+                )
         return result
 
 
@@ -148,7 +152,7 @@ def compute(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         predictions = format_to_jury(predictions)
         references = format_to_jury(references)
 
@@ -157,12 +161,14 @@
         # Low-level access to Bartscorer directly
         # See: https://github.com/neulab/BARTScore
         score = np.mean(self.scorer.scorer.score(predictions, references, **kwargs))
-        return dict(
+        return MetricResult(
             score=score,
-            model_checkpoint=self.scorer.model_checkpoint,
-            model_weights=self.scorer.model_weights,
             total_items=len(predictions),
-            flattened=True,
+            metric_name="BartScore",
+            extra=dict(
+                flattened=True,
+                model_checkpoint=self.scorer.model_checkpoint,
+            ),
         )
 
 
@@ -240,6 +246,23 @@ class RougeMetric(JuryBasedMetric, SemanticMetric):
     def __init__(self) -> None:
         super().__init__(metrics="rouge")
 
+    def compute(
+        self,
+        predictions: EvaluationPredictionInstance,
+        references: EvaluationReferenceInstance,
+        **kwargs,
+    ) -> MetricResult:
+        result = super().compute(
+            predictions=predictions,
+            references=references,
+            **kwargs,
+        )
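+        # jury reports the individual variants under extra["rouge"]
+        # (rouge1/rouge2/rougeL/rougeLsum); collapse them into a single
+        # mean value so that `MetricResult.score` stays a scalar.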
+        score = float(np.mean(list((result.extra or {}).get("rouge", {}).values())))
+        return dataclasses.replace(result, score=score)
+
 
 def main():
     pass
diff --git a/pyproject.toml b/pyproject.toml
index 3099d1e..a75f816 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ dependencies = [
     "pandas==1.5.3",
     "pyarrow==11.0.0",
    "pytest==7.2.1",
+    "pytest-cov==4.0.0",
     "sacrebleu==2.3.1",
     "scikit-learn==1.2.1",
     "sentencepiece==0.1.99",
diff --git a/requirements.txt b/requirements.txt
index fff3296..bd8fa47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ optimum==1.8.8
 pandas==1.5.3
 pyarrow==11.0.0
 pytest==7.2.1
+pytest-cov==4.0.0
 sacrebleu==2.3.1
 scikit-learn==1.2.1
 sentencepiece==0.1.99
diff --git a/tests/metrics/_base.py b/tests/metrics/_base.py
index 7dec9e5..7dd3678 100755
--- a/tests/metrics/_base.py
+++ b/tests/metrics/_base.py
@@ -1,15 +1,18 @@
 # flake8: noqa
 #!/usr/bin/env python3
 
+from pprint import pprint
+
 import pytest
 
+from evalem._base.structures import MetricResult
+
 from .fixtures import predictions, references
 
 
 @pytest.mark.metrics
 class BaseMetricTest:
     _metric_cls = None
-    _key = None
 
     @pytest.fixture(autouse=True, scope="class")
     def metric_result(self, predictions, references):
@@ -28,10 +31,11 @@ def test_metric_return_type(self, metric_result):
         """
-        Check if return type of each metric is a dictionary
+        Check if the return type of each metric is a MetricResult
         """
-        assert isinstance(metric_result, dict)
+        assert isinstance(metric_result, MetricResult)
 
-    def test_metric_return_keys(self, metric_result):
-        assert "score" in metric_result
+    def test_metric_score(self, metric_result):
+        pprint(metric_result)
+        assert 0 <= metric_result.score <= 1
 
 
 def main():
diff --git a/tests/metrics/test_basics.py b/tests/metrics/test_basics.py
index 5b3b181..1c7d679 100755
--- a/tests/metrics/test_basics.py
+++ b/tests/metrics/test_basics.py
@@ -16,42 +16,22 @@ class TestAccuracyMetric(BaseMetricTest):
     _metric_cls = AccuracyMetric
-    _key = "accuracy"
-
-    def test_metric_score(self, metric_result):
-        assert metric_result[self._key]["score"] >= 0
 
 
 class TestF1Metric(BaseMetricTest):
     _metric_cls = F1Metric
-    _key = "f1"
-
-    def test_metric_score(self, metric_result):
-        assert metric_result[self._key]["score"] >= 0
 
 
 class TestPrecisionMetric(BaseMetricTest):
     _metric_cls = PrecisionMetric
-    _key = "precision"
-
-    def test_metric_score(self, metric_result):
-        assert metric_result[self._key]["score"] >= 0
 
 
 class TestRecallMetric(BaseMetricTest):
     _metric_cls = RecallMetric
-    _key = "recall"
-
-    def test_metric_score(self, metric_result):
-        assert metric_result[self._key]["score"] >= 0
 
 
 class TestConfusionMatrix(BaseMetricTest):
     _metric_cls = ConfusionMatrix
-    _key = "confusion_matrix"
-
-    def test_metric_return_keys(self, metric_result):
-        assert self._key in metric_result
 
     def test_metric_score(self, metric_result):
-        assert isinstance(metric_result[self._key], np.ndarray)
+        assert isinstance(metric_result.extra["confusion_matrix"], np.ndarray)
diff --git a/tests/metrics/test_semantics.py b/tests/metrics/test_semantics.py
index 36dc6eb..9e1a326 100755
--- a/tests/metrics/test_semantics.py
+++ b/tests/metrics/test_semantics.py
@@ -1,6 +1,7 @@
 # flake8: noqa
 #!/usr/bin/env python3
 
+import math
 from pprint import pprint
 
 from evalem.nlp.metrics import (
@@ -17,61 +18,46 @@ class TestBertScore(BaseMetricTest):
     _metric_cls = BertScore
-    _key = "bertscore"
 
     def test_metric_score(self, metric_result):
-        assert -1 <= metric_result["score"] <= 1
+        assert -1 <= metric_result.score <= 1
 
 
 class TestBartScore(BaseMetricTest):
     _metric_cls = BartScore
-    _key = "bartscore"
 
     def test_metric_score(self, metric_result):
-        assert -10 <= metric_result["score"] <= 10
+        assert -math.inf <= metric_result.score <= math.inf
 
 
 class TestBleuMetric(BaseMetricTest):
     _metric_cls = BleuMetric
-    _key = "bleu"
-
-    def test_metric_score(self, metric_result):
-        pprint(metric_result)
-        assert 0 <= metric_result["score"] <= 1
 
 
 class TestSacreBleuMetric(BaseMetricTest):
     _metric_cls = SacreBleuMetric
-    _key = "sacrebleu"
-
-    def test_metric_score(self, metric_result):
-        pprint(metric_result)
-        assert 0 <= metric_result["score"] <= 1
 
 
 class TestMeteorMetric(BaseMetricTest):
     _metric_cls = MeteorMetric
-    _key = "meteor"
-
-    def test_metric_score(self, metric_result):
-        pprint(metric_result)
-        assert 0 <= metric_result["score"] <= 1
 
 
 class TestRougeMetric(BaseMetricTest):
     _metric_cls = RougeMetric
-    _key = "rouge"
 
     def test_metric_return_keys(self, metric_result):
-        assert self._key in metric_result
-        assert "rouge1" in metric_result[self._key]
-        assert "rouge2" in metric_result[self._key]
-        assert "rougeL" in metric_result[self._key]
-        assert "rougeLsum" in metric_result[self._key]
+        key = "rouge"
+        assert key in metric_result.extra
+        assert "rouge1" in metric_result.extra[key]
+        assert "rouge2" in metric_result.extra[key]
+        assert "rougeL" in metric_result.extra[key]
+        assert "rougeLsum" in metric_result.extra[key]
 
     def test_metric_score(self, metric_result):
+        key = "rouge"
         pprint(metric_result)
-        assert 0 <= metric_result[self._key]["rouge1"] <= 1
-        assert 0 <= metric_result[self._key]["rouge2"] <= 1
-        assert 0 <= metric_result[self._key]["rougeL"] <= 1
-        assert 0 <= metric_result[self._key]["rougeLsum"] <= 1
+        assert 0 <= metric_result.score <= 1
+        assert 0 <= metric_result.extra[key]["rouge1"] <= 1
+        assert 0 <= metric_result.extra[key]["rouge2"] <= 1
+        assert 0 <= metric_result.extra[key]["rougeL"] <= 1
+        assert 0 <= metric_result.extra[key]["rougeLsum"] <= 1