Build comparison table #34

Merged
merged 15 commits on Jul 17, 2023
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -41,4 +41,4 @@ jobs:

# Run the unit tests
- name: Test with pytest
run: python -m pytest --verbose tests/
run: python -m pytest --cov evalem --verbose tests/
7 changes: 6 additions & 1 deletion evalem/__init__.py
@@ -1,7 +1,12 @@
__version__ = "0.0.4-alpha"

from ._base.evaluators import Evaluator # noqa
from ._base.pipelines import EvaluationPipeline, SimpleEvaluationPipeline # noqa
from ._base.pipelines import ( # noqa
EvaluationPipeline,
NamedSimpleEvaluationPipeline,
SimpleEvaluationPipeline,
)
from ._base.structures import MetricResult # noqa
from .nlp.models import ( # noqa
QuestionAnsweringHFPipelineWrapper,
TextClassificationHFPipelineWrapper,
22 changes: 22 additions & 0 deletions evalem/_base/abc.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

from abc import ABC
from itertools import count
from typing import Any


@@ -26,6 +27,27 @@ def __repr__(self) -> str:
return f"[{self.__classname__}]"


class InstanceCountMixin:
"""
This mixin autogenerates a name for each
individual object, unless a name is set explicitly.
"""

_ids = count(0)

def __init__(self):
self.idx = next(self._ids)
self._name = None

@property
def name(self):
return self._name or f"{self.__class__.__name__}:{self.idx}"

@name.setter
def name(self, name: str):
self._name = name


def main():
pass

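A minimal sketch of the naming behaviour this mixin provides, assuming the `evalem._base.abc` module from this branch is importable; `DummyPipeline` is a hypothetical subclass used purely for illustration:

from evalem._base.abc import InstanceCountMixin


class DummyPipeline(InstanceCountMixin):
    # hypothetical subclass, only here to exercise the mixin
    pass


first, second = DummyPipeline(), DummyPipeline()
print(first.name)   # e.g. "DummyPipeline:0" (index depends on how many instances already exist)
print(second.name)  # e.g. "DummyPipeline:1"

second.name = "my-pipeline"  # the setter overrides the autogenerated name
print(second.name)           # "my-pipeline"

Note that `_ids` lives on the mixin itself, so the counter is shared across all subclasses rather than being per-class.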
8 changes: 2 additions & 6 deletions evalem/_base/evaluators.py
@@ -114,7 +114,6 @@ def _type_check_metrics(
metrics = [metrics] if not isinstance(metrics, Iterable) else metrics
for _metric in metrics:
if not isinstance(_metric, Metric):
print(_metric)
raise TypeError(
f"Invalid type for metric={_metric}. Expected type of [Metric]. Got {type(_metric)}",
)
@@ -158,12 +157,9 @@ def evaluate(
Returns:
List of metric results, one per configured metric
"""
return dict(
return list(
map(
lambda m: (
m.__classname__,
m(predictions=predictions, references=references, **kwargs),
),
lambda m: m(predictions=predictions, references=references, **kwargs),
self.metrics,
),
)
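With this change, an evaluator run yields a list of per-metric outputs instead of a dict keyed by metric class name. A self-contained sketch of consuming that shape; the `MetricResult` values below are hand-built stand-ins and the metric names are illustrative:

from evalem._base.structures import MetricResult

# stand-ins for the list an evaluator run now returns
results = [
    MetricResult(score=0.91, total_items=100, metric_name="AccuracyMetric"),
    MetricResult(score=0.88, total_items=100, metric_name="F1Metric"),
]

# rebuild the old {metric_name: output} view if downstream code still expects it
by_name = {res.metric_name: res.score for res in results}
print(by_name)  # {'AccuracyMetric': 0.91, 'F1Metric': 0.88}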
34 changes: 22 additions & 12 deletions evalem/_base/metrics.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from __future__ import annotations

from abc import abstractmethod
from typing import Iterable, List, Tuple

@@ -11,7 +13,7 @@
from .structures import (
EvaluationPredictionInstance,
EvaluationReferenceInstance,
MetricOutput,
MetricResult,
SinglePredictionInstance,
)

@@ -40,7 +42,7 @@ def compute(
predictions: EvaluationPredictionInstance,
references: EvaluationReferenceInstance,
**kwargs,
) -> MetricOutput:
) -> MetricResult:
"""
The actual entrypoint method to perform evaluation and give output metric.

@@ -71,7 +73,7 @@ def __call__(
predictions: EvaluationPredictionInstance,
references: EvaluationReferenceInstance,
**kwargs,
) -> MetricOutput:
) -> MetricResult:
"""
The actual entrypoint method to perform evaluation and give output metric.

@@ -208,7 +210,7 @@ def compute(
predictions: EvaluationPredictionInstance,
references: EvaluationReferenceInstance,
**kwargs,
) -> MetricOutput:
) -> MetricResult:
predictions = format_to_jury(predictions)
references = format_to_jury(references)

@@ -223,7 +225,8 @@ def compute(
if isinstance(v, dict) and "score" in v:
res["score"] = v.get("score", None)
res[k] = v
return res
res["metric_name"] = self.__classname__
return MetricResult.from_dict(res)


class PrecisionMetric(JuryBasedMetric, BasicMetric):
@@ -256,7 +259,7 @@ def compute(
predictions: EvaluationPredictionInstance,
references: EvaluationReferenceInstance,
**kwargs,
) -> MetricOutput:
) -> MetricResult:
# converts all the structures into a list of strings
predictions, references = format_to_jury(predictions), format_to_jury(
references,
@@ -265,12 +268,19 @@ def compute(
predictions, references = self._flatten_references(predictions, references)

labels = self.__get_labels(predictions, references)
return dict(
confusion_matrix=confusion_matrix(references, predictions, labels=labels),
labels=labels,
flattened=True,
total_items=len(predictions),
empty_items=0,
return MetricResult.from_dict(
dict(
metric_name="ConfusionMatrix",
confusion_matrix=confusion_matrix(
references,
predictions,
labels=labels,
),
labels=labels,
flattened=True,
total_items=len(predictions),
empty_items=0,
),
)

def __get_labels(
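The jury-based `compute` now lifts the nested per-metric score to the top level and wraps the whole payload in a `MetricResult`. A standalone sketch of that post-processing step, using an illustrative jury-style dict (the exact keys a real jury run emits may differ):

from evalem._base.structures import MetricResult

# illustrative jury-style output for a single metric
jury_output = {
    "total_items": 2,
    "empty_items": 0,
    "precision": {"score": 0.75},
}

res = {}
for k, v in jury_output.items():
    # lift the nested score to the top level, keep the raw payload as-is
    if isinstance(v, dict) and "score" in v:
        res["score"] = v.get("score", None)
    res[k] = v
res["metric_name"] = "PrecisionMetric"

result = MetricResult.from_dict(res)
print(result.score)        # 0.75
print(result.metric_name)  # PrecisionMetric
print(result.extra)        # leftover keys: {'precision': {'score': 0.75}}

Anything `from_dict` does not recognise (here the raw `precision` payload) ends up under `extra`.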
52 changes: 47 additions & 5 deletions evalem/_base/pipelines.py
@@ -1,9 +1,9 @@
#!/usr/bin/env python3

from abc import abstractmethod
from typing import Any, Iterable, List, Mapping, Type, Union
from typing import Any, Iterable, List, Mapping, Optional, Type, Union

from .abc import AbstractBase
from .abc import AbstractBase, InstanceCountMixin
from .evaluators import Evaluator
from .models import ModelWrapper
from .structures import EvaluationReferenceInstance, MetricOutput
@@ -45,9 +45,9 @@ class SimpleEvaluationPipeline(EvaluationPipeline):

.. code-block: python

from evalem.pipelines import SimpleEvaluationPipeline
from evalem.models import TextClassificationHFPipelineWrapper
from evalem.evaluators import TextClassificationEvaluator
from evalem import SimpleEvaluationPipeline
from evalem.nlp.models import TextClassificationHFPipelineWrapper
from evalem.nlp.evaluators import TextClassificationEvaluator

model = TextClassificationHFPipelineWrapper()
evaluator = TextClassificationEvaluator()
@@ -97,6 +97,48 @@ def run(
)


class NamedSimpleEvaluationPipeline(InstanceCountMixin, SimpleEvaluationPipeline):
"""

This is a named version of SimpleEvaluationPipeline that uses a single model
and a list of evaluators to run the evaluation.

Args:
```model```: ```Type[ModelWrapper]```
Wrapped model to do the inference.
```evaluators```: ```Union[Evaluator, Iterable[Evaluator]]```
Either a single evaluator or an iterable of evaluators
Note: If a single evaluator is provided, it will ultimately
be wrapped into an iterable.
```name```: ```Optional[str]```
Name of the pipeline. If not provided, a name is autogenerated
(via `evalem._base.abc.InstanceCountMixin`).

Usage:

from evalem import NamedSimpleEvaluationPipeline
from evalem.nlp.models import TextClassificationHFPipelineWrapper
from evalem.nlp.evaluators import TextClassificationEvaluator

model = TextClassificationHFPipelineWrapper()
evaluator = TextClassificationEvaluator()
pipe = NamedSimpleEvaluationPipeline(model=model, evaluators=evaluator, name="my-pipeline")

results = pipe(inputs, references)
"""

def __init__(
self,
model: Type[ModelWrapper],
evaluators: Union[Evaluator, Iterable[Evaluator]],
name: Optional[str] = None,
) -> None:
InstanceCountMixin.__init__(self)
SimpleEvaluationPipeline.__init__(self, model=model, evaluators=evaluators)
self.name = name


def main():
pass

40 changes: 39 additions & 1 deletion evalem/_base/structures.py
@@ -2,6 +2,7 @@

from __future__ import annotations

from copy import deepcopy
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Type, Union
@@ -41,6 +42,43 @@ class ReferenceDTO(EvaluationDTO):
pass


@dataclass(frozen=True)
class MetricResult:
score: float
total_items: int
metric_name: str
empty_items: int = 0
extra: Optional[dict] = None

@classmethod
def from_dict(cls, dct: dict) -> MetricResult:
dct = deepcopy(dct)
return cls(
score=dct.pop("score", None),
total_items=dct.pop("total_items", None),
metric_name=dct.pop("metric_name", None),
empty_items=dct.pop("empty_items", 0),
extra=dct,
)

def as_dict(self) -> dict:
return asdict(self)

def to_dict(self) -> dict:
return asdict(self)

def __hash__(self) -> int:
score = self.score if isinstance(self.score, (float, int)) else 0
return hash(
(
self.metric_name,
round(score, 3),
self.total_items,
self.empty_items,
),
)


ImageTensor = Union[np.ndarray, torch.Tensor]

# Represents type instance for any single downstream prediction
@@ -67,6 +105,6 @@ class ReferenceDTO(EvaluationDTO):
EvaluationReferenceInstance = Union[SingleReferenceInstance, MultipleReferenceInstance]

EvaluationOutput = Union[int, float, Dict[str, Union[str, int, float]]]
MetricOutput = Union[int, float, Dict[str, Union[str, int, float]]]
MetricOutput = Union[int, float, Dict[str, Union[str, int, float]], MetricResult]

PathType = Union[str, Path]
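Because `extra` may hold an unhashable dict, the explicit `__hash__` (which skips `extra` and rounds the score) is presumably what keeps `MetricResult` usable in sets, as `build_comparison_table` below relies on. A quick sketch, with an illustrative metric name:

from evalem._base.structures import MetricResult

a = MetricResult(score=0.5, total_items=10, metric_name="AccuracyMetric")
b = MetricResult(score=0.5, total_items=10, metric_name="AccuracyMetric")

# identical results collapse when collected into a set
print(len({a, b}))  # 1
print(a.to_dict())
# {'score': 0.5, 'total_items': 10, 'metric_name': 'AccuracyMetric', 'empty_items': 0, 'extra': None}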
77 changes: 76 additions & 1 deletion evalem/misc/utils.py
@@ -1,6 +1,11 @@
#!/usr/bin/env python3

from typing import Iterable, List, Union
from itertools import chain
from typing import Any, Iterable, List, Union

import numpy as np
import pandas as pd
from loguru import logger

from .._base.structures import EvaluationDTO, PredictionInstance, ReferenceInstance

@@ -38,3 +43,73 @@ def _dtofy(instance):
return list(map(format_to_jury, instances))
else:
return instances


def flatten_list(nested_list: Union[list, tuple, set]) -> List[Any]:
"""
Flattens a nested list of lists.
Also handles the case where an element is not itself a list
(e.g. `[[1, 2], 3]`).
"""
return list(
chain.from_iterable(
flatten_list(x) if isinstance(x, (list, set, tuple)) else [x]
for x in nested_list
),
)


def build_comparison_table(
*eval_pipes,
inputs,
references,
**eval_params,
) -> Union[dict, pd.DataFrame]:
"""
A utility that runs the provided evaluation pipelines
and generates a comparison table.

Note:
Assumes the same set of inputs and references is run through
each of the evaluation pipelines.

Args:
```eval_pipes```: ```Type[EvaluationPipeline]```
Evaluation pipeline objects
```inputs```: ```Any```
Inputs that are fed to each pipeline for forward pass
```references```: ```EvaluationReferenceInstance```
References/ground-truths for the evaluation.
See `evalem._base.structures.EvaluationReferenceInstance` for type

Returns:
Returns either a pandas DataFrame or dict.
If pandas dataframe creation fails, it returns a dict.

For the dataframe, the index is the metric name and the remaining columns
hold each pipeline's score for that metric.
"""
results = map(lambda ep: ep(inputs=inputs, references=references), eval_pipes)
comparison_map = {}
dfs = []
n_items_tracker = []
for idx, (ep, result) in enumerate(zip(eval_pipes, results)):
name = f"eval-pipe-{idx}" if not hasattr(ep, "name") else ep.name
metrics = set(flatten_list(result))
n_items_tracker.extend([m.total_items for m in metrics])
comparison_map[name] = metrics

df = pd.DataFrame(
map(lambda m: {"metric": m.metric_name, name: m.score}, metrics),
)
df.set_index("metric", inplace=True)
dfs.append(df)
logger.info(
f"{int(np.mean(n_items_tracker))} total items are evaluated on average.",
)
res = comparison_map
try:
res = pd.concat(dfs, join="outer", axis=1)
except: # noqa
logger.warning("Failed to create pd.DataFrame table. Fallback to dict")
return res
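Putting the pieces together, a rough end-to-end sketch of the new comparison table. The model and evaluator classes follow the docstring examples above; the input texts, reference labels, and pipeline names are made-up placeholders, and the exact expected input/reference format is an assumption here, so treat this as a sketch rather than a verified run:

from evalem import NamedSimpleEvaluationPipeline
from evalem.misc.utils import build_comparison_table
from evalem.nlp.evaluators import TextClassificationEvaluator
from evalem.nlp.models import TextClassificationHFPipelineWrapper

# two pipelines sharing the same evaluator; the names become the table columns
pipe_a = NamedSimpleEvaluationPipeline(
    model=TextClassificationHFPipelineWrapper(),
    evaluators=TextClassificationEvaluator(),
    name="baseline",
)
pipe_b = NamedSimpleEvaluationPipeline(
    model=TextClassificationHFPipelineWrapper(),  # swap in a differently configured model here
    evaluators=TextClassificationEvaluator(),
    name="candidate",
)

# placeholder data (format assumed)
inputs = ["what a great movie!", "utterly boring."]
references = ["positive", "negative"]

table = build_comparison_table(pipe_a, pipe_b, inputs=inputs, references=references)
print(table)  # pandas DataFrame indexed by metric name, or a dict if DataFrame creation fails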