[alpha] Improvements to ModelWrapper and better QA/Classification implementation (#8)

* Add barebones unit test for jury format conversion for references

* Add more tests for jury format conversion

* Fix test function name

* Add misc datasets loader module

See `evalem.misc.datasets`.
We now have a `datasets.get_squad_v2(...)` function (usage sketch below).
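
A minimal usage sketch; only the function name and the returned `inputs`/`references` keys are taken from this commit, no other arguments are assumed:

```python
# Hedged sketch: get_squad_v2 returns a dict with "inputs" and "references".
from evalem.misc.datasets import get_squad_v2

data = get_squad_v2()
inputs, references = data["inputs"], data["references"]
print(len(inputs), len(references))
```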

* Add device parameter to DefaultQAWrapper

* Update __init__ files with respective module imports

* Add metrics for computing semantic similarity

Now we have `metrics.semantics.SemanticMetric`.
There are 2 implementations for now (see the usage sketch after this list):
- `metrics.semantics.BertScore`
- `metrics.semantics.BartScore`
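
A usage sketch for `BertScore`, assuming only the constructor arguments and `compute(...)` signature visible in the diff below:

```python
from evalem.metrics.semantics import BertScore

metric = BertScore(model_type="bert-base-uncased", device="cpu")
result = metric.compute(
    predictions=["The cat sat on the mat."],
    references=["A cat was sitting on the mat."],
)
# With the default per_instance_score=False, result["bertscore"] holds
# mean precision/recall/f1 values.
print(result["bertscore"])
```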

* Add preprocessing and post-processing mechanism to ModelWrapper

We make use of 2 kwargs accepted by any model wrapper (see the sketch below):
- `inputs_preprocessor` (maps inputs to a specific format, defaults to
  identity)
- `predictions_postprocessor` (maps model outputs to a specific format,
  defaults to identity)

Also `models.QuestionAnsweringHFPipelineWrapper` is created.
`models.DefaultQAModelWrapper` is now deprecated.
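
A sketch of how these kwargs are meant to be used, mirroring the docstring example added to `models/_base.py` in this commit (the model name and inputs are only illustrations):

```python
from transformers import pipeline

from evalem.models import HFPipelineWrapper

wrapped_model = HFPipelineWrapper(
    pipeline("question-answering", model="deepset/roberta-base-squad2"),
    # post-process raw pipeline outputs into plain answer strings
    predictions_postprocessor=lambda xs: [x["answer"] for x in xs],
)

inputs = [
    {"question": "Who wrote Hamlet?",
     "context": "Hamlet is a tragedy written by William Shakespeare."},
    {"question": "Where was Shakespeare born?",
     "context": "William Shakespeare was born in Stratford-upon-Avon."},
]
predictions = wrapped_model.predict(inputs)
# predictions is (roughly) ["William Shakespeare", "Stratford-upon-Avon"]
```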

* Update the HFPipelineWrapper docstring

* Implement HF pipeline wrapper/evaluator for text classification

See `models.defaults.TextClassificationHFPipelineWrapper`.
Also improve the construction of the HF pipeline object in the existing
wrapper.

`evaluators.basics.TextClassificationEvaluator` is also added.
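
A hedged end-to-end sketch; the constructor arguments and the evaluator call signature are assumptions based only on the names this commit introduces:

```python
from evalem.evaluators import TextClassificationEvaluator
from evalem.misc.datasets import get_imdb
from evalem.models import TextClassificationHFPipelineWrapper

data = get_imdb(data_type="test", nsamples=16, shuffle=True)

# assumed: the wrapper falls back to a default sentiment model when
# constructed without arguments
wrapped_model = TextClassificationHFPipelineWrapper()
predictions = wrapped_model.predict(data["inputs"])

evaluator = TextClassificationEvaluator()
# assumed call signature: evaluator(predictions=..., references=...)
results = evaluator(predictions=predictions, references=data["references"])
```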

* Add per_instance_score flag to BertScore metric

This flag is used to return precision/recall/f1 scores per prediction
instance.

* Default to bert-base-uncased for BertScore
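
A small sketch illustrating the two bullets above (default model plus the new flag); the argument names come from the diff in `evalem/metrics/semantics.py`:

```python
from evalem.metrics.semantics import BertScore

# per-instance scores: precision/recall/f1 come back as lists,
# one entry per prediction; model_type defaults to "bert-base-uncased"
metric = BertScore(per_instance_score=True)
result = metric.compute(
    predictions=["a quick brown fox", "hello world"],
    references=["the quick brown fox", "hello there, world"],
)
print(result["bertscore"]["f1"])  # list of two scores instead of a mean
```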

* Bugfix tokenizer parameter in QuestionAnsweringHFPipelineWrapper and
TextClassificationHFPipelineWrapper

Previously, the tokenizer was set to a default value. However, that is
incorrect: we want the tokenizer to be the one the provided model was
trained with. So `tokenizer` now defaults to None.
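
A sketch of the fixed behaviour; apart from `tokenizer`, the constructor arguments shown here are assumptions:

```python
from evalem.models import QuestionAnsweringHFPipelineWrapper

# tokenizer defaults to None, so the underlying HF pipeline loads the
# tokenizer that ships with whichever model the wrapper uses
wrapped_model = QuestionAnsweringHFPipelineWrapper()

# an explicit tokenizer can still be passed when needed, e.g.
# QuestionAnsweringHFPipelineWrapper(tokenizer=my_tokenizer)
```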

* Change BertScore's per_instance_score behaviour to compute mean
NISH1001 authored Mar 10, 2023
1 parent e0bbdd9 commit 6e3c4a6
Showing 9 changed files with 284 additions and 54 deletions.
1 change: 1 addition & 0 deletions evalem/evaluators/__init__.py
@@ -1,2 +1,3 @@
# flake8: noqa
from ._base import Evaluator
from .basics import QAEvaluator, TextClassificationEvaluator
26 changes: 25 additions & 1 deletion evalem/evaluators/basics.py
@@ -1,6 +1,13 @@
#!/usr/bin/env python3

from ..metrics import AccuracyMetric, ExactMatchMetric, F1Metric
from ..metrics import (
AccuracyMetric,
ConfusionMatrix,
ExactMatchMetric,
F1Metric,
PrecisionMetric,
RecallMetric,
)
from ._base import Evaluator


@@ -30,6 +37,23 @@ def __init__(self) -> None:
)


class TextClassificationEvaluator(BasicEvaluator):
"""
An evaluator for text classification tasks.
"""

def __init__(self) -> None:
super().__init__(
metrics=[
AccuracyMetric(),
F1Metric(),
PrecisionMetric(),
RecallMetric(),
ConfusionMatrix(),
],
)


def main():
pass

8 changes: 0 additions & 8 deletions evalem/metrics/basics.py
@@ -101,11 +101,3 @@ def __get_labels(
Get unique list of labels across predictions + references.
"""
return sorted(set(predictions).union(references))


def main():
pass


if __name__ == "__main__":
main()
16 changes: 14 additions & 2 deletions evalem/metrics/semantics.py
@@ -37,6 +37,10 @@ class BertScore(SemanticMetric):
https://github.com/Tiiiger/bert_score/blob/master/bert_score/utils.py
```device```: ```str```
Which device to run the model on? Defaults to "cpu".
```per_instance_score```: ```bool```
If enabled, per-instance precision, recall and f1 scores are also
returned in the computation result.
Otherwise, only the mean precision, recall and f1 are returned.
```debug```: ```bool```
Enable debugging log? Defaults to False.
@@ -68,12 +72,14 @@ class BertScore(SemanticMetric):

def __init__(
self,
model_type: str = "roberta-large",
model_type: str = "bert-base-uncased",
device: str = "cpu",
per_instance_score: bool = False,
debug: bool = False,
) -> None:
super().__init__(metrics="bertscore", device=device, debug=debug)
self.model_type = model_type
self.per_instance_score = per_instance_score

def compute(
self,
@@ -83,13 +89,19 @@ def compute(
) -> MetricOutput:
device = kwargs.pop("device", self.device)
model_type = kwargs.pop("model_type", self.model_type)
return super().compute(
result = super().compute(
predictions=predictions,
references=references,
model_type=model_type,
device=device,
**kwargs,
)
# unless per-instance scores are requested, collapse the list of
# precision/recall/f1 values into their mean/average
if not self.per_instance_score:
for _key in ["precision", "recall", "f1"]:
result["bertscore"][_key] = np.mean(result["bertscore"][_key])
return result


class BartScore(SemanticMetric):
38 changes: 38 additions & 0 deletions evalem/misc/datasets.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from typing import Dict

from datasets import load_dataset


@@ -41,6 +43,42 @@ def get_squad_v2(
return dict(inputs=inputs, references=references)


def get_imdb(
data_type: str = "test",
nsamples: int = 1000,
shuffle: bool = False,
) -> Dict[str, list]:
"""
This loads the imdb text classification dataset using the HuggingFace datasets module.
Args:
```data_type```: ```str```
Either "train" or "test"
```nsamples```: ```int```
How many samples to load?
If set to 0 or None, the full split is loaded.
```shuffle```: ```bool```
If enabled, shuffles the data prior to sampling/filtering.
Returns:
Returns a dict with 2 keys:
- `inputs`: ```List[str]```, the raw review texts
- `references`: ```List[str]```, the corresponding
"NEGATIVE"/"POSITIVE" labels
"""
nsamples = nsamples or 0
data = load_dataset("imdb")[data_type]
data = data.shuffle(seed=42) if shuffle else data
data = data.select(range(nsamples)) if nsamples > 0 else data

label_map = ["NEGATIVE", "POSITIVE"]
inputs = [(d["text"], label_map[d["label"]]) for d in data]
inputs, references = zip(*inputs)
return dict(inputs=list(inputs), references=list(references))


def main():
pass

6 changes: 5 additions & 1 deletion evalem/models/__init__.py
@@ -1,3 +1,7 @@
# flake8: noqa
from ._base import HFLMWrapper, HFPipelineWrapper, ModelWrapper
from .defaults import DefaultQAModelWrapper
from .defaults import (
DefaultQAModelWrapper,
QuestionAnsweringHFPipelineWrapper,
TextClassificationHFPipelineWrapper,
)
97 changes: 81 additions & 16 deletions evalem/models/_base.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3

from abc import abstractmethod
from typing import Iterable, Type
from typing import Callable, Iterable, Type

from transformers import Pipeline as HF_Pipeline
from transformers import PreTrainedModel, PreTrainedTokenizerBase
@@ -16,19 +16,47 @@ class ModelWrapper(AbstractBase):
all the upstream models into a nice wrapper.
All the downstream implementations of `ModelWrapper` should implement
the `predict(...)` method.
the `_predict(...)` method, which is itself called by the `.predict(...)` method.
Args:
```model```:
Input model that's being wrapped for common interface
```debug```: ```bool```
If enabled, debugging logs could be printed
```kwargs```:
- ```inputs_preprocessor```
A `Callable` to apply on inputs.
- ```predictions_postprocessor```
A `Callable` to apply on model outputs/predictions.
Note:
In order to convert to task-specific downstream format, we provide
`_map_predictions(...)` method which user can override. By default,
it is an identity that doesn't change the format egested by the model.
- Override `_preprocess_inputs` method to change data format for
model input. Defaults to identity (no change).
- Override `_postprocess_predictions` to convert predictions to
task-specific downstream format. Defaults to identity (no change).
"""

def __init__(self, model, debug: bool = False, **kwargs) -> None:
def __init__(
self,
model,
debug: bool = False,
**kwargs,
) -> None:
super().__init__(debug=debug)
self.model = model

@abstractmethod
# specifies how the input format conversion is done
self.inputs_preprocessor: Callable = (
kwargs.get("inputs_preprocessor", self._preprocess_inputs)
or self._preprocess_inputs
)

# specifies how the predictions formatting is done
self.predictions_postprocessor: Callable = (
kwargs.get("predictions_postprocessor", self._postprocess_predictions)
or self._postprocess_predictions
)

def predict(
self,
inputs: Iterable,
@@ -45,23 +73,51 @@ def predict(
Returns:
Iterable of predicted instance
"""
raise NotImplementedError()
inputs = self.inputs_preprocessor(inputs, **kwargs)
predictions = self._predict(inputs, **kwargs)
return self.predictions_postprocessor(predictions, **kwargs)

def __call__(
@abstractmethod
def _predict(
self,
inputs: Iterable,
**kwargs,
) -> Iterable[EvaluationPredictionInstance]:
return self.predict(inputs, **kwargs)
"""
Entrypoint method for predicting using the wrapped model
Args:
```inputs```
Represent input dataset whose format depends on
downstream tasks.
Returns:
Iterable of predicted instance
"""
raise NotImplementedError()

def _map_predictions(self, predictions: Iterable):
def _preprocess_inputs(self, inputs: Iterable, **kwargs) -> Iterable:
"""
A helper method to transform inputs into a format suitable for the model to ingest.
By default, it's an identity function.
"""
return inputs

def _postprocess_predictions(self, predictions: Iterable, **kwargs):
"""
A helper method to transform predictions from the models
into any downstream format. By default, it's an identity function.
"""
# default -> Identity
return predictions

def __call__(
self,
inputs: Iterable,
**kwargs,
) -> Iterable[EvaluationPredictionInstance]:
return self.predict(inputs, **kwargs)


class HFWrapper(ModelWrapper):
"""
@@ -86,8 +142,9 @@ def __init__(
self,
model: Type[PreTrainedModel],
tokenizer: Type[PreTrainedTokenizerBase],
**kwargs,
) -> None:
super().__init__(model=model)
super().__init__(model=model, **kwargs)
self.tokenizer = tokenizer


@@ -113,21 +170,29 @@ class HFPipelineWrapper(HFWrapper):
pipe = hf_pipeline("question-answering")
wrapped_model = HFPipelineWrapper(pipe)
# Or: if you want to specify how to post-process predictions,
# provide the processor explicitly.
wrapped_model = HFPipelineWrapper(
pipeline("question-answering", model="deepset/roberta-base-squad2"),
predictions_postprocessor=lambda xs: list(map(lambda x: x["answer"], xs))
)
# compute predictions
# and pass them to an evaluator along with references
predictions = wrapped_model.predict(<inputs>)
"""

def __init__(self, pipeline: Type[HF_Pipeline], debug: bool = False) -> None:
def __init__(self, pipeline: Type[HF_Pipeline], **kwargs) -> None:
"""
Args:
```pipeline```:
A HuggingFace pipeline object used for prediction
"""
super().__init__(model=pipeline)
super().__init__(model=pipeline, **kwargs)

def predict(self, inputs, **kwargs):
return self._map_predictions(self.model(inputs))
def _predict(self, inputs, **kwargs):
return self.model(inputs, **kwargs)

@property
def pipeline(self) -> HF_Pipeline: