Build comparison table #34

Merged
merged 15 commits on Jul 17, 2023
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -41,4 +41,4 @@ jobs:

# Run the unit tests
- name: Test with pytest
run: python -m pytest --verbose tests/
run: python -m pytest --cov evalem --verbose tests/
7 changes: 6 additions & 1 deletion evalem/__init__.py
@@ -1,7 +1,12 @@
__version__ = "0.0.4-alpha"

from ._base.evaluators import Evaluator # noqa
from ._base.pipelines import EvaluationPipeline, SimpleEvaluationPipeline # noqa
from ._base.pipelines import ( # noqa
EvaluationPipeline,
NamedSimpleEvaluationPipeline,
SimpleEvaluationPipeline,
)
from ._base.structures import MetricResult # noqa
from .nlp.models import ( # noqa
QuestionAnsweringHFPipelineWrapper,
TextClassificationHFPipelineWrapper,
22 changes: 22 additions & 0 deletions evalem/_base/abc.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

from abc import ABC
from itertools import count
from typing import Any


@@ -26,6 +27,27 @@ def __repr__(self) -> str:
return f"[{self.__classname__}]"


class InstanceCountMixin:
"""
This mixin autogenerates a name for each
individual object, unless a name is set explicitly.
"""

_ids = count(0)

def __init__(self):
self.idx = next(self._ids)
self._name = None

@property
def name(self):
return self._name or f"{self.__class__.__name__}:{self.idx}"

@name.setter
def name(self, name: str):
self._name = name


def main():
pass

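A minimal sketch of the naming behaviour this mixin provides, assuming the `evalem._base.abc` module from this branch is importable; `DummyPipeline` is a hypothetical subclass used purely for illustration:

from evalem._base.abc import InstanceCountMixin


class DummyPipeline(InstanceCountMixin):
    # hypothetical subclass, only here to exercise the mixin
    pass


first, second = DummyPipeline(), DummyPipeline()
print(first.name)   # e.g. "DummyPipeline:0" (index depends on how many instances already exist)
print(second.name)  # e.g. "DummyPipeline:1"

second.name = "my-pipeline"  # the setter overrides the autogenerated name
print(second.name)           # "my-pipeline"

Note that `_ids` lives on the mixin itself, so the counter is shared across all subclasses rather than being per-class.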
8 changes: 2 additions & 6 deletions evalem/_base/evaluators.py
@@ -114,7 +114,6 @@ def _type_check_metrics(
metrics = [metrics] if not isinstance(metrics, Iterable) else metrics
for _metric in metrics:
if not isinstance(_metric, Metric):
print(_metric)
raise TypeError(
f"Invalid type for metric={_metric}. Expected type of [Metric]. Got {type(_metric)}",
)
@@ -158,12 +157,9 @@ def evaluate(
Returns:
List of metric results, one per configured metric
"""
return dict(
return list(
map(
lambda m: (
m.__classname__,
m(predictions=predictions, references=references, **kwargs),
),
lambda m: m(predictions=predictions, references=references, **kwargs),
self.metrics,
),
)
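With this change, an evaluator run yields a list of per-metric outputs instead of a dict keyed by metric class name. A self-contained sketch of consuming that shape; the `MetricResult` values below are hand-built stand-ins and the metric names are illustrative:

from evalem._base.structures import MetricResult

# stand-ins for the list an evaluator run now returns
results = [
    MetricResult(score=0.91, total_items=100, metric_name="AccuracyMetric"),
    MetricResult(score=0.88, total_items=100, metric_name="F1Metric"),
]

# rebuild the old {metric_name: output} view if downstream code still expects it
by_name = {res.metric_name: res.score for res in results}
print(by_name)  # {'AccuracyMetric': 0.91, 'F1Metric': 0.88}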
34 changes: 22 additions & 12 deletions evalem/_base/metrics.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from __future__ import annotations

from abc import abstractmethod
from typing import Iterable, List, Tuple

@@ -11,7 +13,7 @@
from .structures import (
EvaluationPredictionInstance,
EvaluationReferenceInstance,
MetricOutput,
MetricResult,
SinglePredictionInstance,
)

@@ -40,7 +42,7 @@ def compute(
predictions: EvaluationPredictionInstance,
references: EvaluationReferenceInstance,
**kwargs,
) -> MetricOutput:
) -> MetricResult:
"""
The actual entrypoint method to perform evaluation and give output metric.

@@ -71,7 +73,7 @@ def __call__(
predictions: EvaluationPredictionInstance,
references: EvaluationReferenceInstance,
**kwargs,
) -> MetricOutput:
) -> MetricResult:
"""
The actual entrypoint method to perform evaluation and give output metric.

@@ -208,7 +210,7 @@ def compute(
predictions: EvaluationPredictionInstance,
references: EvaluationReferenceInstance,
**kwargs,
) -> MetricOutput:
) -> MetricResult:
predictions = format_to_jury(predictions)
references = format_to_jury(references)

@@ -223,7 +225,8 @@ def compute(
if isinstance(v, dict) and "score" in v:
res["score"] = v.get("score", None)
res[k] = v
return res
res["metric_name"] = self.__classname__
return MetricResult.from_dict(res)


class PrecisionMetric(JuryBasedMetric, BasicMetric):
@@ -256,7 +259,7 @@ def compute(
predictions: EvaluationPredictionInstance,
references: EvaluationReferenceInstance,
**kwargs,
) -> MetricOutput:
) -> MetricResult:
# converts all the structures into a list of strings
predictions, references = format_to_jury(predictions), format_to_jury(
references,
@@ -265,12 +268,19 @@ def compute(
predictions, references = self._flatten_references(predictions, references)

labels = self.__get_labels(predictions, references)
return dict(
confusion_matrix=confusion_matrix(references, predictions, labels=labels),
labels=labels,
flattened=True,
total_items=len(predictions),
empty_items=0,
return MetricResult.from_dict(
dict(
metric_name="ConfusionMatrix",
confusion_matrix=confusion_matrix(
references,
predictions,
labels=labels,
),
labels=labels,
flattened=True,
total_items=len(predictions),
empty_items=0,
),
)

def __get_labels(
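The jury-based `compute` now lifts the nested per-metric score to the top level and wraps the whole payload in a `MetricResult`. A standalone sketch of that post-processing step, using an illustrative jury-style dict (the exact keys a real jury run emits may differ):

from evalem._base.structures import MetricResult

# illustrative jury-style output for a single metric
jury_output = {
    "total_items": 2,
    "empty_items": 0,
    "precision": {"score": 0.75},
}

res = {}
for k, v in jury_output.items():
    # lift the nested score to the top level, keep the raw payload as-is
    if isinstance(v, dict) and "score" in v:
        res["score"] = v.get("score", None)
    res[k] = v
res["metric_name"] = "PrecisionMetric"

result = MetricResult.from_dict(res)
print(result.score)        # 0.75
print(result.metric_name)  # PrecisionMetric
print(result.extra)        # leftover keys: {'precision': {'score': 0.75}}

Anything `from_dict` does not recognise (here the raw `precision` payload) ends up under `extra`.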
52 changes: 47 additions & 5 deletions evalem/_base/pipelines.py
@@ -1,9 +1,9 @@
#!/usr/bin/env python3

from abc import abstractmethod
from typing import Any, Iterable, List, Mapping, Type, Union
from typing import Any, Iterable, List, Mapping, Optional, Type, Union

from .abc import AbstractBase
from .abc import AbstractBase, InstanceCountMixin
from .evaluators import Evaluator
from .models import ModelWrapper
from .structures import EvaluationReferenceInstance, MetricOutput
@@ -45,9 +45,9 @@ class SimpleEvaluationPipeline(EvaluationPipeline):

.. code-block: python

from evalem.pipelines import SimpleEvaluationPipeline
from evalem.models import TextClassificationHFPipelineWrapper
from evalem.evaluators import TextClassificationEvaluator
from evalem import SimpleEvaluationPipeline
from evalem.nlp.models import TextClassificationHFPipelineWrapper
from evalem.nlp.evaluators import TextClassificationEvaluator

model = TextClassificationHFPipelineWrapper()
evaluator = TextClassificationEvaluator()
@@ -97,6 +97,48 @@ def run(
)


class NamedSimpleEvaluationPipeline(InstanceCountMixin, SimpleEvaluationPipeline):
"""

This is a named version of SimpleEvaluationPipeline that uses a single model
and a list of evaluators to run the evaluation.

Args:
```model```: ```Type[ModelWrapper]```
Wrapped model to do the inference.
```evaluators```: ```Union[Evaluator, Iterable[Evaluator]]```
Either a single evaluator or an iterable of evaluators
Note: If a single evaluator is provided, it will ultimately
be wrapped into an iterable.
```name```: ```Optional[str]```
Name of the pipeline. If not provided, a name is autogenerated
(via `evalem._base.abc.InstanceCountMixin`).

Usage:

from evalem import NamedSimpleEvaluationPipeline
from evalem.nlp.models import TextClassificationHFPipelineWrapper
from evalem.nlp.evaluators import TextClassificationEvaluator

model = TextClassificationHFPipelineWrapper()
evaluator = TextClassificationEvaluator()
pipe = NamedSimpleEvaluationPipeline(model=model, evaluators=evaluator, name="my-pipeline")

results = pipe(inputs, references)
"""

def __init__(
self,
model: Type[ModelWrapper],
evaluators: Union[Evaluator, Iterable[Evaluator]],
name: Optional[str] = None,
) -> None:
InstanceCountMixin.__init__(self)
SimpleEvaluationPipeline.__init__(self, model=model, evaluators=evaluators)
self.name = name


def main():
pass

40 changes: 39 additions & 1 deletion evalem/_base/structures.py
@@ -2,6 +2,7 @@

from __future__ import annotations

from copy import deepcopy
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Type, Union
@@ -41,6 +42,43 @@ class ReferenceDTO(EvaluationDTO):
pass


@dataclass(frozen=True)
class MetricResult:
score: float
total_items: int
metric_name: str
empty_items: int = 0
extra: Optional[dict] = None

@classmethod
def from_dict(cls, dct: dict) -> MetricResult:
dct = deepcopy(dct)
return cls(
score=dct.pop("score", None),
total_items=dct.pop("total_items", None),
metric_name=dct.pop("metric_name", None),
empty_items=dct.pop("empty_items", 0),
extra=dct,
)

def as_dict(self) -> dict:
return asdict(self)

def to_dict(self) -> dict:
return asdict(self)

def __hash__(self) -> int:
score = self.score if isinstance(self.score, (float, int)) else 0
return hash(
(
self.metric_name,
round(score, 3),
self.total_items,
self.empty_items,
),
)


ImageTensor = Union[np.ndarray, torch.Tensor]

# Represents type instance for any single downstream prediction
@@ -67,6 +105,6 @@ class ReferenceDTO(EvaluationDTO):
EvaluationReferenceInstance = Union[SingleReferenceInstance, MultipleReferenceInstance]

EvaluationOutput = Union[int, float, Dict[str, Union[str, int, float]]]
MetricOutput = Union[int, float, Dict[str, Union[str, int, float]]]
MetricOutput = Union[int, float, Dict[str, Union[str, int, float]], MetricResult]

PathType = Union[str, Path]
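Because `extra` may hold an unhashable dict, the explicit `__hash__` (which skips `extra` and rounds the score) is presumably what keeps `MetricResult` usable in sets, as `build_comparison_table` below relies on. A quick sketch, with an illustrative metric name:

from evalem._base.structures import MetricResult

a = MetricResult(score=0.5, total_items=10, metric_name="AccuracyMetric")
b = MetricResult(score=0.5, total_items=10, metric_name="AccuracyMetric")

# identical results collapse when collected into a set
print(len({a, b}))  # 1
print(a.to_dict())
# {'score': 0.5, 'total_items': 10, 'metric_name': 'AccuracyMetric', 'empty_items': 0, 'extra': None}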
77 changes: 76 additions & 1 deletion evalem/misc/utils.py
@@ -1,6 +1,11 @@
#!/usr/bin/env python3

from typing import Iterable, List, Union
from itertools import chain
from typing import Any, Iterable, List, Union

import numpy as np
import pandas as pd
from loguru import logger

from .._base.structures import EvaluationDTO, PredictionInstance, ReferenceInstance

@@ -38,3 +43,73 @@ def _dtofy(instance):
return list(map(format_to_jury, instances))
else:
return instances


def flatten_list(nested_list: Union[list, tuple, set]) -> List[Any]:
"""
Flattens a nested list of lists.
Also handles the case where an element is not itself a list
(e.g. `[[1, 2], 3]`).
"""
return list(
chain.from_iterable(
flatten_list(x) if isinstance(x, (list, set, tuple)) else [x]
for x in nested_list
),
)


def build_comparison_table(
*eval_pipes,
inputs,
references,
**eval_params,
) -> Union[dict, pd.DataFrame]:
"""
A utility that runs the provided evaluation pipelines
and generates a comparison table.

Note:
Assumes the same set of inputs and references is run through
each of the evaluation pipelines.

Args:
```eval_pipes```: ```Type[EvaluationPipeline]```
Evaluation pipeline objects
```inputs```: ```Any```
Inputs that are fed to each pipeline for forward pass
```references```: ```EvaluationReferenceInstance```
References/ground-truths for the evaluation.
See `evalem._base.structures.EvaluationReferenceInstance` for type

Returns:
Returns either a pandas DataFrame or dict.
If pandas dataframe creation fails, it returns a dict.

For the dataframe, the index is the metric name and the remaining columns
hold each pipeline's score for that metric.
"""
results = map(lambda ep: ep(inputs=inputs, references=references), eval_pipes)
comparison_map = {}
dfs = []
n_items_tracker = []
for idx, (ep, result) in enumerate(zip(eval_pipes, results)):
name = f"eval-pipe-{idx}" if not hasattr(ep, "name") else ep.name
metrics = set(flatten_list(result))
n_items_tracker.extend([m.total_items for m in metrics])
comparison_map[name] = metrics

df = pd.DataFrame(
map(lambda m: {"metric": m.metric_name, name: m.score}, metrics),
)
df.set_index("metric", inplace=True)
dfs.append(df)
logger.info(
f"{int(np.mean(n_items_tracker))} total items are evaluated on average.",
)
res = comparison_map
try:
res = pd.concat(dfs, join="outer", axis=1)
except: # noqa
logger.warning("Failed to create pd.DataFrame table. Fallback to dict")
return res
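Putting the pieces together, a rough end-to-end sketch of the new comparison table. The model and evaluator classes follow the docstring examples above; the input texts, reference labels, and pipeline names are made-up placeholders, and the exact expected input/reference format is an assumption here, so treat this as a sketch rather than a verified run:

from evalem import NamedSimpleEvaluationPipeline
from evalem.misc.utils import build_comparison_table
from evalem.nlp.evaluators import TextClassificationEvaluator
from evalem.nlp.models import TextClassificationHFPipelineWrapper

# two pipelines sharing the same evaluator; the names become the table columns
pipe_a = NamedSimpleEvaluationPipeline(
    model=TextClassificationHFPipelineWrapper(),
    evaluators=TextClassificationEvaluator(),
    name="baseline",
)
pipe_b = NamedSimpleEvaluationPipeline(
    model=TextClassificationHFPipelineWrapper(),  # swap in a differently configured model here
    evaluators=TextClassificationEvaluator(),
    name="candidate",
)

# placeholder data (format assumed)
inputs = ["what a great movie!", "utterly boring."]
references = ["positive", "negative"]

table = build_comparison_table(pipe_a, pipe_b, inputs=inputs, references=references)
print(table)  # pandas DataFrame indexed by metric name, or a dict if DataFrame creation fails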