diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7cd9461..2ba7c98 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -41,4 +41,4 @@ jobs:
 
       # Run the unit tests
       - name: Test with pytest
-        run: python -m pytest --verbose tests/
+        run: python -m pytest --cov evalem --verbose tests/
diff --git a/evalem/__init__.py b/evalem/__init__.py
index a50c934..edff7d3 100644
--- a/evalem/__init__.py
+++ b/evalem/__init__.py
@@ -1,7 +1,12 @@
 __version__ = "0.0.4-alpha"
 
 from ._base.evaluators import Evaluator  # noqa
-from ._base.pipelines import EvaluationPipeline, SimpleEvaluationPipeline  # noqa
+from ._base.pipelines import (  # noqa
+    EvaluationPipeline,
+    NamedSimpleEvaluationPipeline,
+    SimpleEvaluationPipeline,
+)
+from ._base.structures import MetricResult  # noqa
 from .nlp.models import (  # noqa
     QuestionAnsweringHFPipelineWrapper,
     TextClassificationHFPipelineWrapper,
diff --git a/evalem/_base/abc.py b/evalem/_base/abc.py
index f993490..beb396d 100755
--- a/evalem/_base/abc.py
+++ b/evalem/_base/abc.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 from abc import ABC
+from itertools import count
 from typing import Any
 
 
@@ -26,6 +27,38 @@ def __repr__(self) -> str:
         return f"[{self.__classname__}]"
 
 
+class InstanceCountMixin:
+    """
+    This mixin is used to autogenerate names for
+    individual objects.
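+
+    Illustrative usage with a hypothetical subclass `Foo`
+    (assumes the shared counter starts at 0, i.e. no instances yet):
+
+        class Foo(InstanceCountMixin):
+            pass
+
+        foo = Foo()
+        foo.name            # -> "Foo:0" (autogenerated)
+        foo.name = "my-foo"
+        foo.name            # -> "my-foo"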
+    """
+
+    _ids = count(0)
+
+    def __init__(self):
+        self.idx = next(self._ids)
+        self._name = None
+
+    @property
+    def name(self):
+        return self._name or f"{self.__class__.__name__}:{self.idx}"
+
+    @name.setter
+    def name(self, name: str):
+        self._name = name
+
+
 def main():
     pass
 
diff --git a/evalem/_base/evaluators.py b/evalem/_base/evaluators.py
index 8672e39..869f5cd 100755
--- a/evalem/_base/evaluators.py
+++ b/evalem/_base/evaluators.py
@@ -114,7 +114,6 @@ def _type_check_metrics(
         metrics = [metrics] if not isinstance(metrics, Iterable) else metrics
         for _metric in metrics:
             if not isinstance(_metric, Metric):
-                print(_metric)
                 raise TypeError(
                     f"Invalid type for metric={_metric}. Expected type of [Metric]. Got {type(_metric)}",
                 )
@@ -158,12 +157,9 @@ def evaluate(
         Returns:
-            Mapping (dict) of metric name to corresponding metric output
+            List of metric results, one per metric
         """
-        return dict(
+        return list(
             map(
-                lambda m: (
-                    m.__classname__,
-                    m(predictions=predictions, references=references, **kwargs),
-                ),
+                lambda m: m(predictions=predictions, references=references, **kwargs),
                 self.metrics,
             ),
         )
diff --git a/evalem/_base/metrics.py b/evalem/_base/metrics.py
index e5c9a1c..06a9867 100755
--- a/evalem/_base/metrics.py
+++ b/evalem/_base/metrics.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 
+from __future__ import annotations
+
 from abc import abstractmethod
 from typing import Iterable, List, Tuple
 
@@ -11,7 +13,7 @@
 from .structures import (
     EvaluationPredictionInstance,
     EvaluationReferenceInstance,
-    MetricOutput,
+    MetricResult,
     SinglePredictionInstance,
 )
 
@@ -40,7 +42,7 @@ def compute(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         """
         The actual entrypoint method to perform evaluation and give
         output metric.
@@ -71,7 +73,7 @@ def __call__(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         """
         The actual entrypoint method to perform evaluation and give
         output metric.
@@ -208,7 +210,7 @@ def compute(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         predictions = format_to_jury(predictions)
         references = format_to_jury(references)
 
@@ -223,7 +225,8 @@ def compute(
             if isinstance(v, dict) and "score" in v:
                 res["score"] = v.get("score", None)
             res[k] = v
-        return res
+        res["metric_name"] = self.__classname__
+        return MetricResult.from_dict(res)
 
 
 class PrecisionMetric(JuryBasedMetric, BasicMetric):
@@ -256,7 +259,7 @@ def compute(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         # converts all the structure into list of string
         predictions, references = format_to_jury(predictions), format_to_jury(
             references,
@@ -265,12 +268,19 @@ def compute(
 
         predictions, references = self._flatten_references(predictions, references)
 
         labels = self.__get_labels(predictions, references)
-        return dict(
-            confusion_matrix=confusion_matrix(references, predictions, labels=labels),
-            labels=labels,
-            flattened=True,
-            total_items=len(predictions),
-            empty_items=0,
+        return MetricResult.from_dict(
+            dict(
+                metric_name="ConfusionMatrix",
+                confusion_matrix=confusion_matrix(
+                    references,
+                    predictions,
+                    labels=labels,
+                ),
+                labels=labels,
+                flattened=True,
+                total_items=len(predictions),
+                empty_items=0,
+            ),
         )
 
     def __get_labels(
diff --git a/evalem/_base/pipelines.py b/evalem/_base/pipelines.py
index 08fef47..b884bc7 100755
--- a/evalem/_base/pipelines.py
+++ b/evalem/_base/pipelines.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 
 from abc import abstractmethod
-from typing import Any, Iterable, List, Mapping, Type, Union
+from typing import Any, Iterable, List, Mapping, Optional, Type, Union
 
-from .abc import AbstractBase
+from .abc import AbstractBase, InstanceCountMixin
 from .evaluators import Evaluator
 from .models import ModelWrapper
 from .structures import EvaluationReferenceInstance, MetricOutput
@@ -45,9 +45,9 @@ class SimpleEvaluationPipeline(EvaluationPipeline):
 
     .. code-block: python
 
-        from evalem.pipelines import SimpleEvaluationPipeline
-        from evalem.models import TextClassificationHFPipelineWrapper
-        from evalem.evaluators import TextClassificationEvaluator
+        from evalem import SimpleEvaluationPipeline
+        from evalem.nlp.models import TextClassificationHFPipelineWrapper
+        from evalem.nlp.evaluators import TextClassificationEvaluator
 
         model = TextClassificationHFPipelineWrapper()
         evaluator = TextClassificationEvaluator()
@@ -97,6 +97,48 @@ def run(
         )
 
 
+class NamedSimpleEvaluationPipeline(InstanceCountMixin, SimpleEvaluationPipeline):
+    """
+
+    This is a named version of SimpleEvaluationPipeline that uses a single model
+    and a list of evaluators to run the evaluation.
+
+    Args:
+        ```model```: ```Type[ModelWrapper]```
+            Wrapped model to do the inference.
+        ```evaluators```: ```Union[Evaluator, Iterable[Evaluator]]```
+            Either a single evaluator or an iterable of evaluators.
+            Note: If a single evaluator is provided, it'll be wrapped into
+            an iterable ultimately.
+        ```name```: ```Optional[str]```
+            Name of the pipeline. If not provided, the name is autogenerated
+            (via `evalem._base.abc.InstanceCountMixin`).
+
+    Usage:
+
+        from evalem import NamedSimpleEvaluationPipeline
+        from evalem.nlp.models import TextClassificationHFPipelineWrapper
+        from evalem.nlp.evaluators import TextClassificationEvaluator
+
+        model = TextClassificationHFPipelineWrapper()
+        evaluator = TextClassificationEvaluator()
+        pipe = NamedSimpleEvaluationPipeline(model=model, evaluators=evaluator)
+        print(pipe.name)  # autogenerated if no name is given
+
+        results = pipe(inputs, references)
+    """
+
+    def __init__(
+        self,
+        model: Type[ModelWrapper],
+        evaluators: Union[Evaluator, Iterable[Evaluator]],
+        name: Optional[str] = None,
+    ) -> None:
+        InstanceCountMixin.__init__(self)
+        SimpleEvaluationPipeline.__init__(self, model=model, evaluators=evaluators)
+        self.name = name
+
+
 def main():
     pass
 
diff --git a/evalem/_base/structures.py b/evalem/_base/structures.py
index f09bb81..8b9e538 100755
--- a/evalem/_base/structures.py
+++ b/evalem/_base/structures.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from copy import deepcopy
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Type, Union
@@ -41,6 +42,56 @@ class ReferenceDTO(EvaluationDTO):
     pass
 
 
+@dataclass(frozen=True)
+class MetricResult:
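+    """
+    Container for the result of a single metric run.
+
+    `score` holds the primary scalar value, `metric_name` identifies the
+    metric, and any keys not captured by the other fields end up in
+    `extra` (see `from_dict` below).
+
+    Illustrative construction (hypothetical values):
+
+        MetricResult.from_dict(
+            {"metric_name": "accuracy", "score": 0.93, "total_items": 100},
+        )
+    """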
+    score: float
+    total_items: int
+    metric_name: str
+    empty_items: int = 0
+    extra: Optional[dict] = None
+
+    @classmethod
+    def from_dict(cls, dct: dict) -> MetricResult:
+        dct = deepcopy(dct)
+        return cls(
+            score=dct.pop("score", None),
+            total_items=dct.pop("total_items", None),
+            metric_name=dct.pop("metric_name", None),
+            empty_items=dct.pop("empty_items", 0),
+            extra=dct,
+        )
+
+    def as_dict(self) -> dict:
+        return asdict(self)
+
+    def to_dict(self) -> dict:
+        return asdict(self)
+
+    def __hash__(self) -> int:
+        score = self.score if isinstance(self.score, (float, int)) else 0
+        return hash(
+            (
+                self.metric_name,
+                round(score, 3),
+                self.total_items,
+                self.empty_items,
+            ),
+        )
+
+
 ImageTensor = Union[np.ndarray, torch.Tensor]
 
 # Represents type instance for any single downstream prediction
@@ -67,6 +118,6 @@
 EvaluationReferenceInstance = Union[SingleReferenceInstance, MultipleReferenceInstance]
 
 EvaluationOutput = Union[int, float, Dict[str, Union[str, int, float]]]
-MetricOutput = Union[int, float, Dict[str, Union[str, int, float]]]
+MetricOutput = Union[int, float, Dict[str, Union[str, int, float]], MetricResult]
 
 PathType = Union[str, Path]
diff --git a/evalem/misc/utils.py b/evalem/misc/utils.py
index 5d92509..a84bafc 100755
--- a/evalem/misc/utils.py
+++ b/evalem/misc/utils.py
@@ -1,6 +1,11 @@
 #!/usr/bin/env python3
 
-from typing import Iterable, List, Union
+from itertools import chain
+from typing import Any, Iterable, List, Union
+
+import numpy as np
+import pandas as pd
+from loguru import logger
 
 from .._base.structures import EvaluationDTO, PredictionInstance, ReferenceInstance
 
@@ -38,3 +43,88 @@ def _dtofy(instance):
         return list(map(format_to_jury, instances))
     else:
         return instances
+
+
+def flatten_list(nested_list: Union[list, tuple, set]) -> List[Any]:
+    """
+    Flattens a nested list of lists.
+    Elements that are not themselves lists are handled as well
+    (e.g.: `[[1, 2], 3]`).
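+
+    Example (illustrative):
+
+        >>> flatten_list([[1, 2], [3, [4, 5]], 6])
+        [1, 2, 3, 4, 5, 6]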
+    """
+    return list(
+        chain.from_iterable(
+            flatten_list(x) if isinstance(x, (list, set, tuple)) else [x]
+            for x in nested_list
+        ),
+    )
+
+
+def build_comparison_table(
+    *eval_pipes,
+    inputs,
+    references,
+    **eval_params,
+) -> Union[dict, pd.DataFrame]:
+    """
+    A utility that runs the provided evaluation pipelines
+    and generates a comparison table.
+
+    Note:
+        Assumes the same set of inputs and references is run through
+        each evaluation pipeline.
+
+    Args:
+        ```eval_pipes```: ```Type[EvaluationPipeline]```
+            Evaluation pipeline objects
+        ```inputs```: ```Any```
+            Inputs that are fed to each pipeline for the forward pass
+        ```references```: ```EvaluationReferenceInstance```
+            References/ground-truths for the evaluation.
+            See `evalem._base.structures.EvaluationReferenceInstance` for the type
+
+    Returns:
+        Either a pandas DataFrame or a dict.
+        If the DataFrame creation fails, a dict is returned instead.
+
+        In the DataFrame, the index is the metric name and each remaining
+        column holds one pipeline's score for that metric.
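+
+    Illustrative usage (model/pipeline setup omitted; `pipe_1` and
+    `pipe_2` stand for any evaluation pipeline instances):
+
+        table = build_comparison_table(
+            pipe_1,
+            pipe_2,
+            inputs=inputs,
+            references=references,
+        )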
+    """
+    results = map(lambda ep: ep(inputs=inputs, references=references, **eval_params), eval_pipes)
+    comparison_map = {}
+    dfs = []
+    n_items_tracker = []
+    for idx, (ep, result) in enumerate(zip(eval_pipes, results)):
+        name = f"eval-pipe-{idx}" if not hasattr(ep, "name") else ep.name
+        metrics = set(flatten_list(result))
+        n_items_tracker.extend([m.total_items for m in metrics])
+        comparison_map[name] = metrics
+
+        df = pd.DataFrame(
+            map(lambda m: {"metric": m.metric_name, name: m.score}, metrics),
+        )
+        df.set_index("metric", inplace=True)
+        dfs.append(df)
+    logger.info(
+        f"On average, {int(np.mean(n_items_tracker))} items were evaluated per metric.",
+    )
+    res = comparison_map
+    try:
+        res = pd.concat(dfs, join="outer", axis=1)
+    except Exception:  # noqa
+        logger.warning("Failed to create pd.DataFrame table. Falling back to dict.")
+    return res
diff --git a/evalem/nlp/metrics/basics.py b/evalem/nlp/metrics/basics.py
index 6232d1b..68a1f9d 100755
--- a/evalem/nlp/metrics/basics.py
+++ b/evalem/nlp/metrics/basics.py
@@ -1,9 +1,11 @@
 #!/usr/bin/env python3
 
+import dataclasses
+
 from ..._base.metrics import JuryBasedMetric
 from ..._base.structures import (
     EvaluationReferenceInstance,
-    MetricOutput,
+    MetricResult,
     SinglePredictionInstance,
 )
 from ._base import NLPMetric
@@ -18,7 +20,7 @@ def compute(
         predictions: SinglePredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         # This metric doesn't support multi-reference format.
         # So, we flatten everything:
         # Single Prediction, Multi-Ref -> Single Prediction, Single-Ref
@@ -28,6 +30,12 @@
             references=references,
             **kwargs,
         )
-        result["score"] = result.get("exact_match", None)
-        result["flattened"] = True
+
+        extra = result.extra
+        extra["flattened"] = True
+        result = dataclasses.replace(
+            result,
+            score=result.extra.get("exact_match", None),
+            extra=extra,
+        )
         return result
diff --git a/evalem/nlp/metrics/semantics.py b/evalem/nlp/metrics/semantics.py
index 389ad2e..ab8b1a6 100755
--- a/evalem/nlp/metrics/semantics.py
+++ b/evalem/nlp/metrics/semantics.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 
+
+import dataclasses
 from typing import Optional
 
 import numpy as np
@@ -9,7 +11,7 @@
 from ..._base.structures import (
     EvaluationPredictionInstance,
     EvaluationReferenceInstance,
-    MetricOutput,
+    MetricResult,
 )
 from ...misc.utils import format_to_jury
 from ._base import NLPMetric
@@ -87,7 +89,7 @@ def compute(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         device = kwargs.pop("device", self.device)
         model_type = kwargs.pop("model_type", self.model_type)
         result = super().compute(
@@ -101,7 +103,9 @@
         # and want to just have mean/average.
         if not self.per_instance_score:
             for _key in ["precision", "recall", "f1"]:
-                result["bertscore"][_key] = np.mean(result["bertscore"][_key])
+                result.extra["bertscore"][_key] = np.mean(
+                    result.extra["bertscore"][_key],
+                )
         return result
 
 
@@ -148,7 +152,7 @@ def compute(
         predictions: EvaluationPredictionInstance,
         references: EvaluationReferenceInstance,
         **kwargs,
-    ) -> MetricOutput:
+    ) -> MetricResult:
         predictions = format_to_jury(predictions)
         references = format_to_jury(references)
 
@@ -157,12 +161,14 @@
         # Low-level access to Bartscorer directly
         # See: https://github.com/neulab/BARTScore
         score = np.mean(self.scorer.scorer.score(predictions, references, **kwargs))
-        return dict(
+        return MetricResult(
             score=score,
-            model_checkpoint=self.scorer.model_checkpoint,
-            model_weights=self.scorer.model_weights,
             total_items=len(predictions),
-            flattened=True,
+            metric_name="BartScore",
+            extra=dict(
+                flattened=True,
+                model_checkpoint=self.scorer.model_checkpoint,
+            ),
         )
 
 
@@ -240,6 +246,23 @@ class RougeMetric(JuryBasedMetric, SemanticMetric):
     def __init__(self) -> None:
         super().__init__(metrics="rouge")
 
+    def compute(
+        self,
+        predictions: EvaluationPredictionInstance,
+        references: EvaluationReferenceInstance,
+        **kwargs,
+    ) -> MetricResult:
+        result = super().compute(
+            predictions=predictions,
+            references=references,
+            **kwargs,
+        )
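+        # jury reports the individual variants under extra["rouge"]
+        # (rouge1/rouge2/rougeL/rougeLsum); collapse them into a single
+        # mean value so that `MetricResult.score` stays a scalar.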
+        score = float(np.mean(list((result.extra or {}).get("rouge", {}).values())))
+        return dataclasses.replace(result, score=score)
+
 
 def main():
     pass
diff --git a/pyproject.toml b/pyproject.toml
index 3099d1e..a75f816 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ dependencies = [
     "pandas==1.5.3",
     "pyarrow==11.0.0",
    "pytest==7.2.1",
+    "pytest-cov==4.0.0",
     "sacrebleu==2.3.1",
     "scikit-learn==1.2.1",
     "sentencepiece==0.1.99",
diff --git a/requirements.txt b/requirements.txt
index fff3296..bd8fa47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ optimum==1.8.8
 pandas==1.5.3
 pyarrow==11.0.0
 pytest==7.2.1
+pytest-cov==4.0.0
 sacrebleu==2.3.1
 scikit-learn==1.2.1
 sentencepiece==0.1.99
diff --git a/tests/metrics/_base.py b/tests/metrics/_base.py
index 7dec9e5..7dd3678 100755
--- a/tests/metrics/_base.py
+++ b/tests/metrics/_base.py
@@ -1,15 +1,18 @@
 # flake8: noqa
 #!/usr/bin/env python3
 
+from pprint import pprint
+
 import pytest
 
+from evalem._base.structures import MetricResult
+
 from .fixtures import predictions, references
 
 
 @pytest.mark.metrics
 class BaseMetricTest:
     _metric_cls = None
-    _key = None
 
     @pytest.fixture(autouse=True, scope="class")
     def metric_result(self, predictions, references):
@@ -28,10 +31,11 @@ def test_metric_return_type(self, metric_result):
         """
-        Check if return type of each metric is a dictionary
+        Check if the return type of each metric is a MetricResult
         """
-        assert isinstance(metric_result, dict)
+        assert isinstance(metric_result, MetricResult)
 
-    def test_metric_return_keys(self, metric_result):
-        assert "score" in metric_result
+    def test_metric_score(self, metric_result):
+        pprint(metric_result)
+        assert 0 <= metric_result.score <= 1
 
 
 def main():
diff --git a/tests/metrics/test_basics.py b/tests/metrics/test_basics.py
index 5b3b181..1c7d679 100755
--- a/tests/metrics/test_basics.py
+++ b/tests/metrics/test_basics.py
@@ -16,42 +16,22 @@ class TestAccuracyMetric(BaseMetricTest):
     _metric_cls = AccuracyMetric
-    _key = "accuracy"
-
-    def test_metric_score(self, metric_result):
-        assert metric_result[self._key]["score"] >= 0
 
 
 class TestF1Metric(BaseMetricTest):
     _metric_cls = F1Metric
-    _key = "f1"
-
-    def test_metric_score(self, metric_result):
-        assert metric_result[self._key]["score"] >= 0
 
 
 class TestPrecisionMetric(BaseMetricTest):
     _metric_cls = PrecisionMetric
-    _key = "precision"
-
-    def test_metric_score(self, metric_result):
-        assert metric_result[self._key]["score"] >= 0
 
 
 class TestRecallMetric(BaseMetricTest):
     _metric_cls = RecallMetric
-    _key = "recall"
-
-    def test_metric_score(self, metric_result):
-        assert metric_result[self._key]["score"] >= 0
 
 
 class TestConfusionMatrix(BaseMetricTest):
     _metric_cls = ConfusionMatrix
-    _key = "confusion_matrix"
-
-    def test_metric_return_keys(self, metric_result):
-        assert self._key in metric_result
 
     def test_metric_score(self, metric_result):
-        assert isinstance(metric_result[self._key], np.ndarray)
+        assert isinstance(metric_result.extra["confusion_matrix"], np.ndarray)
diff --git a/tests/metrics/test_semantics.py b/tests/metrics/test_semantics.py
index 36dc6eb..9e1a326 100755
--- a/tests/metrics/test_semantics.py
+++ b/tests/metrics/test_semantics.py
@@ -1,6 +1,7 @@
 # flake8: noqa
 #!/usr/bin/env python3
 
+import math
 from pprint import pprint
 
 from evalem.nlp.metrics import (
@@ -17,61 +18,46 @@ class TestBertScore(BaseMetricTest):
     _metric_cls = BertScore
-    _key = "bertscore"
 
     def test_metric_score(self, metric_result):
-        assert -1 <= metric_result["score"] <= 1
+        assert -1 <= metric_result.score <= 1
 
 
 class TestBartScore(BaseMetricTest):
     _metric_cls = BartScore
-    _key = "bartscore"
 
     def test_metric_score(self, metric_result):
-        assert -10 <= metric_result["score"] <= 10
+        assert -math.inf <= metric_result.score <= math.inf
 
 
 class TestBleuMetric(BaseMetricTest):
     _metric_cls = BleuMetric
-    _key = "bleu"
-
-    def test_metric_score(self, metric_result):
-        pprint(metric_result)
-        assert 0 <= metric_result["score"] <= 1
 
 
 class TestSacreBleuMetric(BaseMetricTest):
     _metric_cls = SacreBleuMetric
-    _key = "sacrebleu"
-
-    def test_metric_score(self, metric_result):
-        pprint(metric_result)
-        assert 0 <= metric_result["score"] <= 1
 
 
 class TestMeteorMetric(BaseMetricTest):
     _metric_cls = MeteorMetric
-    _key = "meteor"
-
-    def test_metric_score(self, metric_result):
-        pprint(metric_result)
-        assert 0 <= metric_result["score"] <= 1
 
 
 class TestRougeMetric(BaseMetricTest):
     _metric_cls = RougeMetric
-    _key = "rouge"
 
     def test_metric_return_keys(self, metric_result):
-        assert self._key in metric_result
-        assert "rouge1" in metric_result[self._key]
-        assert "rouge2" in metric_result[self._key]
-        assert "rougeL" in metric_result[self._key]
-        assert "rougeLsum" in metric_result[self._key]
+        key = "rouge"
+        assert key in metric_result.extra
+        assert "rouge1" in metric_result.extra[key]
+        assert "rouge2" in metric_result.extra[key]
+        assert "rougeL" in metric_result.extra[key]
+        assert "rougeLsum" in metric_result.extra[key]
 
     def test_metric_score(self, metric_result):
+        key = "rouge"
         pprint(metric_result)
-        assert 0 <= metric_result[self._key]["rouge1"] <= 1
-        assert 0 <= metric_result[self._key]["rouge2"] <= 1
-        assert 0 <= metric_result[self._key]["rougeL"] <= 1
-        assert 0 <= metric_result[self._key]["rougeLsum"] <= 1
+        assert 0 <= metric_result.score <= 1
+        assert 0 <= metric_result.extra[key]["rouge1"] <= 1
+        assert 0 <= metric_result.extra[key]["rouge2"] <= 1
+        assert 0 <= metric_result.extra[key]["rougeL"] <= 1
+        assert 0 <= metric_result.extra[key]["rougeLsum"] <= 1