[alpha] Improvements to ModelWrapper and better QA/Classification implementation (#8)

* Add barebones unit test for jury format conversion for references

* Add more tests for jury format conversion

* Fix test function name

* Add misc datasets loader module

See `evalem.misc.datasets`.
We now have a `datasets.get_squad_v2(...)` function (usage sketch below).
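
A minimal usage sketch; only the function name and the returned `inputs`/`references` keys are taken from this commit, no other arguments are assumed:

```python
# Hedged sketch: get_squad_v2 returns a dict with "inputs" and "references".
from evalem.misc.datasets import get_squad_v2

data = get_squad_v2()
inputs, references = data["inputs"], data["references"]
print(len(inputs), len(references))
```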

* Add device parameter to DefaultQAWrapper

* Update __init__ files with respective module imports

* Add metrics for computing semantic similarity

Now we have `metrics.semantics.SemanticMetric`.
There are 2 implementations for now (see the usage sketch after this list):
- `metrics.semantics.BertScore`
- `metrics.semantics.BartScore`
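
A usage sketch for `BertScore`, assuming only the constructor arguments and `compute(...)` signature visible in the diff below:

```python
from evalem.metrics.semantics import BertScore

metric = BertScore(model_type="bert-base-uncased", device="cpu")
result = metric.compute(
    predictions=["The cat sat on the mat."],
    references=["A cat was sitting on the mat."],
)
# With the default per_instance_score=False, result["bertscore"] holds
# mean precision/recall/f1 values.
print(result["bertscore"])
```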

* Add preprocessing and post-processing mechanism to ModelWrapper

We make use of 2 kwargs accepted by any model wrapper (see the sketch below):
- `inputs_preprocessor` (maps inputs to a specific format, defaults to
  identity)
- `predictions_postprocessor` (maps model outputs to a specific format,
  defaults to identity)

Also `models.QuestionAnsweringHFPipelineWrapper` is created.
`models.DefaultQAModelWrapper` is now deprecated.
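
A sketch of how these kwargs are meant to be used, mirroring the docstring example added to `models/_base.py` in this commit (the model name and inputs are only illustrations):

```python
from transformers import pipeline

from evalem.models import HFPipelineWrapper

wrapped_model = HFPipelineWrapper(
    pipeline("question-answering", model="deepset/roberta-base-squad2"),
    # post-process raw pipeline outputs into plain answer strings
    predictions_postprocessor=lambda xs: [x["answer"] for x in xs],
)

inputs = [
    {"question": "Who wrote Hamlet?",
     "context": "Hamlet is a tragedy written by William Shakespeare."},
    {"question": "Where was Shakespeare born?",
     "context": "William Shakespeare was born in Stratford-upon-Avon."},
]
predictions = wrapped_model.predict(inputs)
# predictions is (roughly) ["William Shakespeare", "Stratford-upon-Avon"]
```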

* Update the HFPipelineWrapper docstring

* Implement HF pipeline wrapper/evaluator for text classification

See `models.defaults.TextClassificationHFPipelineWrapper`.
Also improve the construction of the HF pipeline object in the existing
wrapper.

`evaluators.basics.TextClassificationEvaluator` is also added.
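
A hedged end-to-end sketch; the constructor arguments and the evaluator call signature are assumptions based only on the names this commit introduces:

```python
from evalem.evaluators import TextClassificationEvaluator
from evalem.misc.datasets import get_imdb
from evalem.models import TextClassificationHFPipelineWrapper

data = get_imdb(data_type="test", nsamples=16, shuffle=True)

# assumed: the wrapper falls back to a default sentiment model when
# constructed without arguments
wrapped_model = TextClassificationHFPipelineWrapper()
predictions = wrapped_model.predict(data["inputs"])

evaluator = TextClassificationEvaluator()
# assumed call signature: evaluator(predictions=..., references=...)
results = evaluator(predictions=predictions, references=data["references"])
```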

* Add per_instance_score flag to BertScore metric

This flag is used to return precision/recall/f1 scores per prediction
instance.

* Default to bert-base-uncased for BertScore
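
A small sketch illustrating the two bullets above (default model plus the new flag); the argument names come from the diff in `evalem/metrics/semantics.py`:

```python
from evalem.metrics.semantics import BertScore

# per-instance scores: precision/recall/f1 come back as lists,
# one entry per prediction; model_type defaults to "bert-base-uncased"
metric = BertScore(per_instance_score=True)
result = metric.compute(
    predictions=["a quick brown fox", "hello world"],
    references=["the quick brown fox", "hello there, world"],
)
print(result["bertscore"]["f1"])  # list of two scores instead of a mean
```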

* Bugfix tokenizer parameter in QuestionAnsweringHFPipelineWrapper and
TextClassificationHFPipelineWrapper

Previously, the tokenizer was set to a default value. However, that is
incorrect: we want the tokenizer to be the one the provided model was
trained with. So `tokenizer` now defaults to None.
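
A sketch of the fixed behaviour; apart from `tokenizer`, the constructor arguments shown here are assumptions:

```python
from evalem.models import QuestionAnsweringHFPipelineWrapper

# tokenizer defaults to None, so the underlying HF pipeline loads the
# tokenizer that ships with whichever model the wrapper uses
wrapped_model = QuestionAnsweringHFPipelineWrapper()

# an explicit tokenizer can still be passed when needed, e.g.
# QuestionAnsweringHFPipelineWrapper(tokenizer=my_tokenizer)
```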

* Change BertScore's per_instance_score behaviour to compute mean
NISH1001 authored Mar 10, 2023
1 parent e0bbdd9 commit 6e3c4a6
Showing 9 changed files with 284 additions and 54 deletions.
1 change: 1 addition & 0 deletions evalem/evaluators/__init__.py
@@ -1,2 +1,3 @@
# flake8: noqa
from ._base import Evaluator
from .basics import QAEvaluator, TextClassificationEvaluator
26 changes: 25 additions & 1 deletion evalem/evaluators/basics.py
@@ -1,6 +1,13 @@
#!/usr/bin/env python3

from ..metrics import AccuracyMetric, ExactMatchMetric, F1Metric
from ..metrics import (
AccuracyMetric,
ConfusionMatrix,
ExactMatchMetric,
F1Metric,
PrecisionMetric,
RecallMetric,
)
from ._base import Evaluator


@@ -30,6 +37,23 @@ def __init__(self) -> None:
)


class TextClassificationEvaluator(BasicEvaluator):
"""
An evaluator for text classification tasks.
"""

def __init__(self) -> None:
super().__init__(
metrics=[
AccuracyMetric(),
F1Metric(),
PrecisionMetric(),
RecallMetric(),
ConfusionMatrix(),
],
)


def main():
pass

8 changes: 0 additions & 8 deletions evalem/metrics/basics.py
@@ -101,11 +101,3 @@ def __get_labels(
Get unique list of labels across predictions + references.
"""
return sorted(set(predictions).union(references))


def main():
pass


if __name__ == "__main__":
main()
16 changes: 14 additions & 2 deletions evalem/metrics/semantics.py
@@ -37,6 +37,10 @@ class BertScore(SemanticMetric):
https://github.com/Tiiiger/bert_score/blob/master/bert_score/utils.py
```device```: ```str```
Which device to run the model on? Defaults to "cpu".
```per_instance_score```: ```bool```
If enabled, per-instance precision, recall and f1 scores are also
returned in the computation result.
Otherwise, only the mean precision, recall and f1 are returned.
```debug```: ```bool```
Enable debugging log? Defaults to False.
@@ -68,12 +72,14 @@ class BertScore(SemanticMetric):

def __init__(
self,
model_type: str = "roberta-large",
model_type: str = "bert-base-uncased",
device: str = "cpu",
per_instance_score: bool = False,
debug: bool = False,
) -> None:
super().__init__(metrics="bertscore", device=device, debug=debug)
self.model_type = model_type
self.per_instance_score = per_instance_score

def compute(
self,
@@ -83,13 +89,19 @@ def compute(
) -> MetricOutput:
device = kwargs.pop("device", self.device)
model_type = kwargs.pop("model_type", self.model_type)
return super().compute(
result = super().compute(
predictions=predictions,
references=references,
model_type=model_type,
device=device,
**kwargs,
)
# unless per-instance scores are requested, collapse the list of
# precision/recall/f1 values into their mean/average
if not self.per_instance_score:
for _key in ["precision", "recall", "f1"]:
result["bertscore"][_key] = np.mean(result["bertscore"][_key])
return result


class BartScore(SemanticMetric):
38 changes: 38 additions & 0 deletions evalem/misc/datasets.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from typing import Dict

from datasets import load_dataset


@@ -41,6 +43,42 @@ def get_squad_v2(
return dict(inputs=inputs, references=references)


def get_imdb(
data_type: str = "test",
nsamples: int = 1000,
shuffle: bool = False,
) -> Dict[str, list]:
"""
This loads the imdb text classification dataset using the HuggingFace datasets module.
Args:
```data_type```: ```str```
Either "train" or "test"
```nsamples```: ```int```
How many samples to load?
If set to 0 or None, the full split is loaded.
```shuffle```: ```bool```
If enabled, shuffles the data prior to sampling/filtering.
Returns:
Returns a dict with 2 keys:
- `inputs`: ```List[str]```, the raw review texts
- `references`: ```List[str]```, the corresponding
"NEGATIVE"/"POSITIVE" labels
"""
nsamples = nsamples or 0
data = load_dataset("imdb")[data_type]
data = data.shuffle(seed=42) if shuffle else data
data = data.select(range(nsamples)) if nsamples > 0 else data

label_map = ["NEGATIVE", "POSITIVE"]
inputs = [(d["text"], label_map[d["label"]]) for d in data]
inputs, references = zip(*inputs)
return dict(inputs=list(inputs), references=list(references))


def main():
pass

6 changes: 5 additions & 1 deletion evalem/models/__init__.py
@@ -1,3 +1,7 @@
# flake8: noqa
from ._base import HFLMWrapper, HFPipelineWrapper, ModelWrapper
from .defaults import DefaultQAModelWrapper
from .defaults import (
DefaultQAModelWrapper,
QuestionAnsweringHFPipelineWrapper,
TextClassificationHFPipelineWrapper,
)
97 changes: 81 additions & 16 deletions evalem/models/_base.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3

from abc import abstractmethod
from typing import Iterable, Type
from typing import Callable, Iterable, Type

from transformers import Pipeline as HF_Pipeline
from transformers import PreTrainedModel, PreTrainedTokenizerBase
@@ -16,19 +16,47 @@ class ModelWrapper(AbstractBase):
all the upstream models into a nice wrapper.
All the downstream implementations of `ModelWrapper` should implement
the `predict(...)` method.
the `_predict(...)` method, which is itself called by the `.predict(...)` method.
Args:
```model```:
Input model that's being wrapped for common interface
```debug```: ```bool```
If enabled, debugging logs could be printed
```kwargs```:
- ```inputs_preprocessor```
A `Callable` to apply on inputs.
- ```predictions_postprocessor```
A `Callable` to apply on model outputs/predictions.
Note:
In order to convert to task-specific downstream format, we provide
`_map_predictions(...)` method which user can override. By default,
it is an identity that doesn't change the format egested by the model.
- Override `_preprocess_inputs` method to change data format for
model input. Defaults to identity (no change).
- Override `_postprocess_predictions` to convert predictions to
task-specific downstream format. Defaults to identity (no change).
"""

def __init__(self, model, debug: bool = False, **kwargs) -> None:
def __init__(
self,
model,
debug: bool = False,
**kwargs,
) -> None:
super().__init__(debug=debug)
self.model = model

@abstractmethod
# specifies how the input format conversion is done
self.inputs_preprocessor: Callable = (
kwargs.get("inputs_preprocessor", self._preprocess_inputs)
or self._preprocess_inputs
)

# specifies how the predictions formatting is done
self.predictions_postprocessor: Callable = (
kwargs.get("predictions_postprocessor", self._postprocess_predictions)
or self._postprocess_predictions
)

def predict(
self,
inputs: Iterable,
@@ -45,23 +73,51 @@ def predict(
Returns:
Iterable of predicted instance
"""
raise NotImplementedError()
inputs = self.inputs_preprocessor(inputs, **kwargs)
predictions = self._predict(inputs, **kwargs)
return self.predictions_postprocessor(predictions, **kwargs)

def __call__(
@abstractmethod
def _predict(
self,
inputs: Iterable,
**kwargs,
) -> Iterable[EvaluationPredictionInstance]:
return self.predict(inputs, **kwargs)
"""
Entrypoint method for predicting using the wrapped model
Args:
```inputs```
Represent input dataset whose format depends on
downstream tasks.
Returns:
Iterable of predicted instance
"""
raise NotImplementedError()

def _map_predictions(self, predictions: Iterable):
def _preprocess_inputs(self, inputs: Iterable, **kwargs) -> Iterable:
"""
A helper method to transform inputs into a format suitable for the model to ingest.
By default, it's an identity function.
"""
return inputs

def _postprocess_predictions(self, predictions: Iterable, **kwargs):
"""
A helper method to transform predictions from the models
into any downstream format. By default, it's an identity function.
"""
# default -> Identity
return predictions

def __call__(
self,
inputs: Iterable,
**kwargs,
) -> Iterable[EvaluationPredictionInstance]:
return self.predict(inputs, **kwargs)


class HFWrapper(ModelWrapper):
"""
@@ -86,8 +142,9 @@ def __init__(
self,
model: Type[PreTrainedModel],
tokenizer: Type[PreTrainedTokenizerBase],
**kwargs,
) -> None:
super().__init__(model=model)
super().__init__(model=model, **kwargs)
self.tokenizer = tokenizer


@@ -113,21 +170,29 @@ class HFPipelineWrapper(HFWrapper):
pipe = hf_pipeline("question-answering")
wrapped_model = HFPipelineWrapper(pipe)
# Or: if you want to specify how to post-process predictions,
# provide the processor explicitly.
wrapped_model = HFPipelineWrapper(
pipeline("question-answering", model="deepset/roberta-base-squad2"),
predictions_postprocessor=lambda xs: list(map(lambda x: x["answer"], xs))
)
# compute predictions
# and pass them to an evaluator along with references
predictions = wrapped_model.predict(<inputs>)
"""

def __init__(self, pipeline: Type[HF_Pipeline], debug: bool = False) -> None:
def __init__(self, pipeline: Type[HF_Pipeline], **kwargs) -> None:
"""
Args:
```pipeline```:
A HuggingFace pipeline object used for prediction
"""
super().__init__(model=pipeline)
super().__init__(model=pipeline, **kwargs)

def predict(self, inputs, **kwargs):
return self._map_predictions(self.model(inputs))
def _predict(self, inputs, **kwargs):
return self.model(inputs, **kwargs)

@property
def pipeline(self) -> HF_Pipeline: