Updated German clustering tasks
KennethEnevoldsen committed Apr 23, 2024
1 parent 36312ac commit a69c3bc
Showing 19 changed files with 534 additions and 13 deletions.
86 changes: 81 additions & 5 deletions mteb/abstasks/AbsTaskClusteringFast.py
@@ -7,7 +7,7 @@
import numpy as np
import sklearn
import sklearn.cluster
-from datasets import DatasetDict
+from datasets import Dataset, DatasetDict
from sklearn.metrics.cluster import v_measure_score

from .AbsTask import AbsTask
@@ -121,10 +121,14 @@ def _evaluate_monolingual(
_dataset = dataset[split]

rng_state = random.Random(self.seed)
-        example_indices = rng_state.sample(
-            range(len(_dataset)), k=self.max_documents_to_embed
-        )
-        downsampled_dataset = _dataset.select(example_indices)
+
+        if len(_dataset) > self.max_documents_to_embed:
+            example_indices = rng_state.sample(
+                range(len(_dataset)), k=self.max_documents_to_embed
+            )
+            downsampled_dataset = _dataset.select(example_indices)
+        else:
+            downsampled_dataset = _dataset

logger.info(f"Encoding {len(downsampled_dataset)} sentences...")

@@ -140,3 +144,75 @@ def _evaluate_monolingual(
)

return {"v_measures": v_measures, "v_measure": float(np.mean(v_measures))}


def clustering_downsample(
dataset: DatasetDict, seed: int, max_samples_in_cluster: int = 2048
) -> DatasetDict:
"""In cases where it is not possible to convert the dataset to a fast version, we can downsample the dataset to speed up the evaluation.
    This might be necessary when the clusters in the dataset are not sampled from the same distribution.
"""
rng_state = random.Random(seed)

ds = {}
for split in dataset:
_docs = []
_labels = []

n_clusters = len(dataset[split])

for i in range(n_clusters):
labels = dataset[split]["labels"][i]
sentences = dataset[split]["sentences"][i]

n_sample = min(max_samples_in_cluster, len(sentences))

# sample n_sample from each cluster
idxs = rng_state.sample(range(len(sentences)), n_sample)
_docs.append([sentences[idx] for idx in idxs])
_labels.append([labels[idx] for idx in idxs])

ds[split] = Dataset.from_dict({"sentences": _docs, "labels": _labels})
return DatasetDict(ds)
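
# A hypothetical usage sketch of clustering_downsample (toy data, not part of
# this commit). Each row of a split is one cluster set holding parallel lists
# of sentences and labels; sampling keeps at most max_samples_in_cluster pairs
# per set, with a fixed seed for reproducibility.
toy = DatasetDict(
    {
        "test": Dataset.from_dict(
            {
                "sentences": [["a", "b", "c"], ["d", "e"]],
                "labels": [["x", "y", "x"], ["y", "y"]],
            }
        )
    }
)
small = clustering_downsample(toy, seed=42, max_samples_in_cluster=2)
assert all(len(s) <= 2 for s in small["test"]["sentences"])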


def convert_to_fast(
dataset: DatasetDict, seed: int, max_size: int = 100_000
) -> DatasetDict:
"""Converts a clustering dataset to a fast version. This concats the cluster into two columns, sentences and labels.
It additionally downsamples the dataset to max_size.
"""
categories = None
rng_state = random.Random(seed)

ds = {}
for split in dataset:
sent_set = set()
labels = []
sentences = []
n_clusters = len(dataset[split])
for i in range(n_clusters):
lab = dataset[split]["labels"][i]
sents = dataset[split]["sentences"][i]
for l, s in zip(lab, sents):
if s not in sent_set:
labels.append(l)
sentences.append(s)
sent_set.add(s) # ensuring no duplicates

        # check that the clusters are drawn from the same label distribution
if categories is None:
categories = set(labels)
else:
assert (
categories == set(labels)
), "The clusters are not sampled from the same distribution as they have different labels."

ds[split] = Dataset.from_dict({"sentences": sentences, "labels": labels})

if len(ds[split]) > max_size:
idxs = rng_state.sample(range(len(ds[split])), max_size)
ds[split] = ds[split].select(idxs)

return DatasetDict(ds)
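
To illustrate what convert_to_fast produces, a hedged sketch on toy data (the data and printed values are illustrative, not from this commit): the per-row cluster sets are flattened into single sentences and labels columns, and a sentence occurring in several cluster sets is kept only once.

from datasets import Dataset, DatasetDict

nested = DatasetDict(
    {
        "test": Dataset.from_dict(
            {
                "sentences": [["a", "b"], ["b", "c"]],
                "labels": [["x", "y"], ["y", "x"]],
            }
        )
    }
)
flat = convert_to_fast(nested, seed=42)
print(flat["test"]["sentences"])  # ['a', 'b', 'c'] -- the duplicate 'b' is dropped
print(flat["test"]["labels"])     # ['x', 'y', 'x']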
4 changes: 4 additions & 0 deletions mteb/tasks/Clustering/__init__.py
@@ -1,9 +1,13 @@
from __future__ import annotations

from .deu.BlurbsClusteringP2P import *
from .deu.BlurbsClusteringP2PFast import *
from .deu.BlurbsClusteringS2S import *
from .deu.BlurbsClusteringS2SFast import *
from .deu.TenKGnadClusteringP2P import *
from .deu.TenKGnadClusteringP2PFast import *
from .deu.TenKGnadClusteringS2S import *
from .deu.TenKGnadClusteringS2SFast import *
from .eng.ArxivClusteringP2P import *
from .eng.ArxivClusteringP2PFast import *
from .eng.ArxivClusteringS2S import *
2 changes: 2 additions & 0 deletions mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py
@@ -5,6 +5,8 @@


class BlurbsClusteringP2P(AbsTaskClustering):
superseeded_by = "BlurbsClusteringP2P.v2"

metadata = TaskMetadata(
name="BlurbsClusteringP2P",
description="Clustering of book titles+blurbs. Clustering of 28 sets, either on the main or secondary genre.",
50 changes: 50 additions & 0 deletions mteb/tasks/Clustering/deu/BlurbsClusteringP2PFast.py
@@ -0,0 +1,50 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
from mteb.abstasks.AbsTaskClusteringFast import clustering_downsample
from mteb.abstasks.TaskMetadata import TaskMetadata


class BlurbsClusteringP2PFast(AbsTaskClustering):
    # A faster version of BlurbsClusteringP2P. Since the clusters are not sampled
    # from the same distribution, we can't use AbsTaskClusteringFast; instead we
    # simply downsample each cluster.

metadata = TaskMetadata(
name="BlurbsClusteringP2P.v2",
description="Clustering of book titles+blurbs. Clustering of 28 sets, either on the main or secondary genre.",
reference="https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html",
dataset={
"path": "slvnwhrl/blurbs-clustering-p2p",
"revision": "a2dd5b02a77de3466a3eaa98ae586b5610314496",
},
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["deu-Latn"],
main_score="v_measure",
date=(
"1900-01-01",
"2019-12-31",
), # since it is books it is likely to be from the 20th century -> paper from 2019
form=["written"],
domains=["Fiction"],
task_subtypes=["Thematic clustering"],
license="cc-by-nc-4.0",
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@inproceedings{Remus2019GermEval2T,
title={GermEval 2019 Task 1: Hierarchical Classification of Blurbs},
author={Steffen Remus and Rami Aly and Chris Biemann},
booktitle={Conference on Natural Language Processing},
year={2019},
url={https://api.semanticscholar.org/CorpusID:208334484}
}""",
n_samples={"test": 50268},
avg_character_length={"test": 664.09},
)

def dataset_transform(self):
ds = clustering_downsample(self.dataset, self.seed)
self.dataset = ds
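
For reference, a sketch of how the new .v2 task could be run; the model choice here is an arbitrary assumption, and the snippet is not part of this commit:

from mteb import MTEB
from sentence_transformers import SentenceTransformer

from mteb.tasks.Clustering.deu.BlurbsClusteringP2PFast import BlurbsClusteringP2PFast

model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
evaluation = MTEB(tasks=[BlurbsClusteringP2PFast()])
evaluation.run(model, output_folder="results")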
2 changes: 2 additions & 0 deletions mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py
@@ -6,6 +6,8 @@


class BlurbsClusteringS2S(AbsTaskClustering):
superseeded_by = "BlurbsClusteringS2S.v2"

metadata = TaskMetadata(
name="BlurbsClusteringS2S",
description="Clustering of book titles. Clustering of 28 sets, either on the main or secondary genre.",
50 changes: 50 additions & 0 deletions mteb/tasks/Clustering/deu/BlurbsClusteringS2SFast.py
@@ -0,0 +1,50 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
from mteb.abstasks.AbsTaskClusteringFast import clustering_downsample
from mteb.abstasks.TaskMetadata import TaskMetadata


class BlurbsClusteringS2SFast(AbsTaskClustering):
    # A faster version of BlurbsClusteringS2S. Since the clusters are not sampled
    # from the same distribution, we can't use AbsTaskClusteringFast; instead we
    # simply downsample each cluster.

metadata = TaskMetadata(
name="BlurbsClusteringS2S.v2",
description="Clustering of book titles. Clustering of 28 sets, either on the main or secondary genre.",
reference="https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html",
dataset={
"path": "slvnwhrl/blurbs-clustering-s2s",
"revision": "22793b6a6465bf00120ad525e38c51210858132c",
},
type="Clustering",
category="s2s",
eval_splits=["test"],
eval_langs=["deu-Latn"],
main_score="v_measure",
date=(
"1900-01-01",
"2019-12-31",
), # since it is books it is likely to be from the 20th century -> paper from 2019
form=["written"],
domains=["Fiction"],
task_subtypes=["Thematic clustering"],
license="cc-by-nc-4.0",
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@inproceedings{Remus2019GermEval2T,
title={GermEval 2019 Task 1: Hierarchical Classification of Blurbs},
author={Steffen Remus and Rami Aly and Chris Biemann},
booktitle={Conference on Natural Language Processing},
year={2019},
url={https://api.semanticscholar.org/CorpusID:208334484}
}""",
n_samples={"test": 50268},
avg_character_length={"test": 23.02},
)

def dataset_transform(self):
ds = clustering_downsample(self.dataset, self.seed)
self.dataset = ds
2 changes: 2 additions & 0 deletions mteb/tasks/Clustering/deu/TenKGnadClusteringP2P.py
@@ -6,6 +6,8 @@


class TenKGnadClusteringP2P(AbsTaskClustering):
superseeded_by = "TenKGnadClusteringP2P.v2"

metadata = TaskMetadata(
name="TenKGnadClusteringP2P",
description="Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category.",
40 changes: 40 additions & 0 deletions mteb/tasks/Clustering/deu/TenKGnadClusteringP2PFast.py
@@ -0,0 +1,40 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast, convert_to_fast
from mteb.abstasks.TaskMetadata import TaskMetadata


class TenKGnadClusteringP2PFast(AbsTaskClusteringFast):
metadata = TaskMetadata(
name="TenKGnadClusteringP2P.v2",
description="Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category.",
reference="https://tblock.github.io/10kGNAD/",
dataset={
"path": "slvnwhrl/tenkgnad-clustering-p2p",
"revision": "5c59e41555244b7e45c9a6be2d720ab4bafae558",
},
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["deu-Latn"],
main_score="v_measure",
date=(
"2000-01-01",
"2020-12-31",
), # since it is news it is guessed that it is from 2000 to 2020
form=["written"],
domains=["News", "Non-fiction"],
task_subtypes=None,
license="cc-by-sa-4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation=None, # none found
n_samples={"test": 10275}, # due to duplicates
avg_character_length={"test": 2641.03},
)

def dataset_transform(self) -> None:
ds = convert_to_fast(self.dataset, self.seed) # type: ignore
self.dataset = ds
2 changes: 2 additions & 0 deletions mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py
@@ -6,6 +6,8 @@


class TenKGnadClusteringS2S(AbsTaskClustering):
superseeded_by = "TenKGnadClusteringS2S.v2"

metadata = TaskMetadata(
name="TenKGnadClusteringS2S",
description="Clustering of news article titles. Clustering of 10 splits on the news article category.",
40 changes: 40 additions & 0 deletions mteb/tasks/Clustering/deu/TenKGnadClusteringS2SFast.py
@@ -0,0 +1,40 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast, convert_to_fast
from mteb.abstasks.TaskMetadata import TaskMetadata


class TenKGnadClusteringS2SFast(AbsTaskClusteringFast):
metadata = TaskMetadata(
name="TenKGnadClusteringS2S.v2",
description="Clustering of news article titles. Clustering of 10 splits on the news article category.",
reference="https://tblock.github.io/10kGNAD/",
dataset={
"path": "slvnwhrl/tenkgnad-clustering-s2s",
"revision": "6cddbe003f12b9b140aec477b583ac4191f01786",
},
type="Clustering",
category="s2s",
eval_splits=["test"],
eval_langs=["deu-Latn"],
main_score="v_measure",
date=(
"2000-01-01",
"2020-12-31",
), # since it is news it is guessed that it is from 2000 to 2020
form=["written"],
domains=["News", "Non-fiction"],
task_subtypes=None,
license="cc-by-sa-4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation=None, # none found
n_samples={"test": 10275}, # due to duplicates
avg_character_length={"test": 50.96},
)

def dataset_transform(self) -> None:
ds = convert_to_fast(self.dataset, self.seed) # type: ignore
self.dataset = ds
17 changes: 9 additions & 8 deletions mteb/tasks/Clustering/eng/ArxivClusteringP2PFast.py
@@ -2,11 +2,15 @@

import datasets

-from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast
+from mteb.abstasks.AbsTaskClusteringFast import clustering_downsample
+from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
from mteb.abstasks.TaskMetadata import TaskMetadata


-class ArxivClusteringP2PFast(AbsTaskClusteringFast):
+class ArxivClusteringP2PFast(AbsTaskClustering):
+    # A faster version of the dataset. Since the clusters are not sampled from
+    # the same distribution, we can't use AbsTaskClusteringFast; instead we
+    # simply downsample each cluster.

metadata = TaskMetadata(
name="ArxivClusteringP2P.v2",
description="Clustering of titles+abstract from arxiv. Clustering of 30 sets, either on the main or secondary category",
@@ -34,10 +38,7 @@ class ArxivClusteringP2PFast(AbsTaskClusteringFast):
avg_character_length={"test": 1009.98},
)

-    def dataset_transform(self):
-        sent = self.dataset["test"]["sentences"][0]  # type: ignore
-        lab = self.dataset["test"]["labels"][0]  # type: ignore
-
-        self.dataset["test"] = datasets.Dataset.from_dict(  # type: ignore
-            {"sentences": sent, "labels": lab}
-        )
+    def dataset_transform(self):
+        ds = clustering_downsample(self.dataset, self.seed)
+        self.dataset = ds