Updated German clustering tasks
KennethEnevoldsen committed Apr 23, 2024
1 parent 36312ac commit a69c3bc
Showing 19 changed files with 534 additions and 13 deletions.
86 changes: 81 additions & 5 deletions mteb/abstasks/AbsTaskClusteringFast.py
@@ -7,7 +7,7 @@
import numpy as np
import sklearn
import sklearn.cluster
-from datasets import DatasetDict
+from datasets import Dataset, DatasetDict
from sklearn.metrics.cluster import v_measure_score

from .AbsTask import AbsTask
@@ -121,10 +121,14 @@ def _evaluate_monolingual(
_dataset = dataset[split]

rng_state = random.Random(self.seed)
-        example_indices = rng_state.sample(
-            range(len(_dataset)), k=self.max_documents_to_embed
-        )
-        downsampled_dataset = _dataset.select(example_indices)
+
+        if len(_dataset) > self.max_documents_to_embed:
+            example_indices = rng_state.sample(
+                range(len(_dataset)), k=self.max_documents_to_embed
+            )
+            downsampled_dataset = _dataset.select(example_indices)
+        else:
+            downsampled_dataset = _dataset

logger.info(f"Encoding {len(downsampled_dataset)} sentences...")

@@ -140,3 +144,75 @@ def _evaluate_monolingual(
)

return {"v_measures": v_measures, "v_measure": float(np.mean(v_measures))}


def clustering_downsample(
dataset: DatasetDict, seed: int, max_samples_in_cluster: int = 2048
) -> DatasetDict:
"""In cases where it is not possible to convert the dataset to a fast version, we can downsample the dataset to speed up the evaluation.
    This might be necessary when the clusters in the dataset are not sampled from the same distribution.
"""
rng_state = random.Random(seed)

ds = {}
for split in dataset:
_docs = []
_labels = []

n_clusters = len(dataset[split])

for i in range(n_clusters):
labels = dataset[split]["labels"][i]
sentences = dataset[split]["sentences"][i]

n_sample = min(max_samples_in_cluster, len(sentences))

# sample n_sample from each cluster
idxs = rng_state.sample(range(len(sentences)), n_sample)
_docs.append([sentences[idx] for idx in idxs])
_labels.append([labels[idx] for idx in idxs])

ds[split] = Dataset.from_dict({"sentences": _docs, "labels": _labels})
return DatasetDict(ds)
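
# A hypothetical usage sketch of clustering_downsample (toy data, not part of
# this commit). Each row of a split is one cluster set holding parallel lists
# of sentences and labels; sampling keeps at most max_samples_in_cluster pairs
# per set, with a fixed seed for reproducibility.
toy = DatasetDict(
    {
        "test": Dataset.from_dict(
            {
                "sentences": [["a", "b", "c"], ["d", "e"]],
                "labels": [["x", "y", "x"], ["y", "y"]],
            }
        )
    }
)
small = clustering_downsample(toy, seed=42, max_samples_in_cluster=2)
assert all(len(s) <= 2 for s in small["test"]["sentences"])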


def convert_to_fast(
dataset: DatasetDict, seed: int, max_size: int = 100_000
) -> DatasetDict:
"""Converts a clustering dataset to a fast version. This concats the cluster into two columns, sentences and labels.
It additionally downsamples the dataset to max_size.
"""
categories = None
rng_state = random.Random(seed)

ds = {}
for split in dataset:
sent_set = set()
labels = []
sentences = []
n_clusters = len(dataset[split])
for i in range(n_clusters):
lab = dataset[split]["labels"][i]
sents = dataset[split]["sentences"][i]
for l, s in zip(lab, sents):
if s not in sent_set:
labels.append(l)
sentences.append(s)
sent_set.add(s) # ensuring no duplicates

        # check that the clusters are drawn from the same label distribution
if categories is None:
categories = set(labels)
else:
assert (
categories == set(labels)
), "The clusters are not sampled from the same distribution as they have different labels."

ds[split] = Dataset.from_dict({"sentences": sentences, "labels": labels})

if len(ds[split]) > max_size:
idxs = rng_state.sample(range(len(ds[split])), max_size)
ds[split] = ds[split].select(idxs)

return DatasetDict(ds)
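
To illustrate what convert_to_fast produces, a hedged sketch on toy data (the data and printed values are illustrative, not from this commit): the per-row cluster sets are flattened into single sentences and labels columns, and a sentence occurring in several cluster sets is kept only once.

from datasets import Dataset, DatasetDict

nested = DatasetDict(
    {
        "test": Dataset.from_dict(
            {
                "sentences": [["a", "b"], ["b", "c"]],
                "labels": [["x", "y"], ["y", "x"]],
            }
        )
    }
)
flat = convert_to_fast(nested, seed=42)
print(flat["test"]["sentences"])  # ['a', 'b', 'c'] -- the duplicate 'b' is dropped
print(flat["test"]["labels"])     # ['x', 'y', 'x']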
4 changes: 4 additions & 0 deletions mteb/tasks/Clustering/__init__.py
@@ -1,9 +1,13 @@
from __future__ import annotations

from .deu.BlurbsClusteringP2P import *
from .deu.BlurbsClusteringP2PFast import *
from .deu.BlurbsClusteringS2S import *
from .deu.BlurbsClusteringS2SFast import *
from .deu.TenKGnadClusteringP2P import *
from .deu.TenKGnadClusteringP2PFast import *
from .deu.TenKGnadClusteringS2S import *
from .deu.TenKGnadClusteringS2SFast import *
from .eng.ArxivClusteringP2P import *
from .eng.ArxivClusteringP2PFast import *
from .eng.ArxivClusteringS2S import *
2 changes: 2 additions & 0 deletions mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py
@@ -5,6 +5,8 @@


class BlurbsClusteringP2P(AbsTaskClustering):
superseeded_by = "BlurbsClusteringP2P.v2"

metadata = TaskMetadata(
name="BlurbsClusteringP2P",
description="Clustering of book titles+blurbs. Clustering of 28 sets, either on the main or secondary genre.",
50 changes: 50 additions & 0 deletions mteb/tasks/Clustering/deu/BlurbsClusteringP2PFast.py
@@ -0,0 +1,50 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
from mteb.abstasks.AbsTaskClusteringFast import clustering_downsample
from mteb.abstasks.TaskMetadata import TaskMetadata


class BlurbsClusteringP2PFast(AbsTaskClustering):
    # A faster version of BlurbsClusteringP2P. Since the clusters are not sampled
    # from the same distribution, we can't use AbsTaskClusteringFast; instead we
    # simply downsample each cluster.

metadata = TaskMetadata(
name="BlurbsClusteringP2P.v2",
description="Clustering of book titles+blurbs. Clustering of 28 sets, either on the main or secondary genre.",
reference="https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html",
dataset={
"path": "slvnwhrl/blurbs-clustering-p2p",
"revision": "a2dd5b02a77de3466a3eaa98ae586b5610314496",
},
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["deu-Latn"],
main_score="v_measure",
date=(
"1900-01-01",
"2019-12-31",
), # since it is books it is likely to be from the 20th century -> paper from 2019
form=["written"],
domains=["Fiction"],
task_subtypes=["Thematic clustering"],
license="cc-by-nc-4.0",
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@inproceedings{Remus2019GermEval2T,
title={GermEval 2019 Task 1: Hierarchical Classification of Blurbs},
author={Steffen Remus and Rami Aly and Chris Biemann},
booktitle={Conference on Natural Language Processing},
year={2019},
url={https://api.semanticscholar.org/CorpusID:208334484}
}""",
n_samples={"test": 50268},
avg_character_length={"test": 664.09},
)

def dataset_transform(self):
ds = clustering_downsample(self.dataset, self.seed)
self.dataset = ds
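
For reference, a sketch of how the new .v2 task could be run; the model choice here is an arbitrary assumption, and the snippet is not part of this commit:

from mteb import MTEB
from sentence_transformers import SentenceTransformer

from mteb.tasks.Clustering.deu.BlurbsClusteringP2PFast import BlurbsClusteringP2PFast

model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
evaluation = MTEB(tasks=[BlurbsClusteringP2PFast()])
evaluation.run(model, output_folder="results")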
2 changes: 2 additions & 0 deletions mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py
@@ -6,6 +6,8 @@


class BlurbsClusteringS2S(AbsTaskClustering):
superseeded_by = "BlurbsClusteringS2S.v2"

metadata = TaskMetadata(
name="BlurbsClusteringS2S",
description="Clustering of book titles. Clustering of 28 sets, either on the main or secondary genre.",
50 changes: 50 additions & 0 deletions mteb/tasks/Clustering/deu/BlurbsClusteringS2SFast.py
@@ -0,0 +1,50 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
from mteb.abstasks.AbsTaskClusteringFast import clustering_downsample
from mteb.abstasks.TaskMetadata import TaskMetadata


class BlurbsClusteringS2SFast(AbsTaskClustering):
    # A faster version of BlurbsClusteringS2S. Since the clusters are not sampled
    # from the same distribution, we can't use AbsTaskClusteringFast; instead we
    # simply downsample each cluster.

metadata = TaskMetadata(
name="BlurbsClusteringS2S.v2",
description="Clustering of book titles. Clustering of 28 sets, either on the main or secondary genre.",
reference="https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html",
dataset={
"path": "slvnwhrl/blurbs-clustering-s2s",
"revision": "22793b6a6465bf00120ad525e38c51210858132c",
},
type="Clustering",
category="s2s",
eval_splits=["test"],
eval_langs=["deu-Latn"],
main_score="v_measure",
date=(
"1900-01-01",
"2019-12-31",
), # since it is books it is likely to be from the 20th century -> paper from 2019
form=["written"],
domains=["Fiction"],
task_subtypes=["Thematic clustering"],
license="cc-by-nc-4.0",
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@inproceedings{Remus2019GermEval2T,
title={GermEval 2019 Task 1: Hierarchical Classification of Blurbs},
author={Steffen Remus and Rami Aly and Chris Biemann},
booktitle={Conference on Natural Language Processing},
year={2019},
url={https://api.semanticscholar.org/CorpusID:208334484}
}""",
n_samples={"test": 50268},
avg_character_length={"test": 23.02},
)

def dataset_transform(self):
ds = clustering_downsample(self.dataset, self.seed)
self.dataset = ds
2 changes: 2 additions & 0 deletions mteb/tasks/Clustering/deu/TenKGnadClusteringP2P.py
@@ -6,6 +6,8 @@


class TenKGnadClusteringP2P(AbsTaskClustering):
superseeded_by = "TenKGnadClusteringP2P.v2"

metadata = TaskMetadata(
name="TenKGnadClusteringP2P",
description="Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category.",
40 changes: 40 additions & 0 deletions mteb/tasks/Clustering/deu/TenKGnadClusteringP2PFast.py
@@ -0,0 +1,40 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast, convert_to_fast
from mteb.abstasks.TaskMetadata import TaskMetadata


class TenKGnadClusteringP2PFast(AbsTaskClusteringFast):
metadata = TaskMetadata(
name="TenKGnadClusteringP2P.v2",
description="Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category.",
reference="https://tblock.github.io/10kGNAD/",
dataset={
"path": "slvnwhrl/tenkgnad-clustering-p2p",
"revision": "5c59e41555244b7e45c9a6be2d720ab4bafae558",
},
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["deu-Latn"],
main_score="v_measure",
date=(
"2000-01-01",
"2020-12-31",
), # since it is news it is guessed that it is from 2000 to 2020
form=["written"],
domains=["News", "Non-fiction"],
task_subtypes=None,
license="cc-by-sa-4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation=None, # none found
n_samples={"test": 10275}, # due to duplicates
avg_character_length={"test": 2641.03},
)

def dataset_transform(self) -> None:
ds = convert_to_fast(self.dataset, self.seed) # type: ignore
self.dataset = ds
2 changes: 2 additions & 0 deletions mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py
@@ -6,6 +6,8 @@


class TenKGnadClusteringS2S(AbsTaskClustering):
superseeded_by = "TenKGnadClusteringS2S.v2"

metadata = TaskMetadata(
name="TenKGnadClusteringS2S",
description="Clustering of news article titles. Clustering of 10 splits on the news article category.",
40 changes: 40 additions & 0 deletions mteb/tasks/Clustering/deu/TenKGnadClusteringS2SFast.py
@@ -0,0 +1,40 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast, convert_to_fast
from mteb.abstasks.TaskMetadata import TaskMetadata


class TenKGnadClusteringS2SFast(AbsTaskClusteringFast):
metadata = TaskMetadata(
name="TenKGnadClusteringS2S.v2",
description="Clustering of news article titles. Clustering of 10 splits on the news article category.",
reference="https://tblock.github.io/10kGNAD/",
dataset={
"path": "slvnwhrl/tenkgnad-clustering-s2s",
"revision": "6cddbe003f12b9b140aec477b583ac4191f01786",
},
type="Clustering",
category="s2s",
eval_splits=["test"],
eval_langs=["deu-Latn"],
main_score="v_measure",
date=(
"2000-01-01",
"2020-12-31",
), # since it is news it is guessed that it is from 2000 to 2020
form=["written"],
domains=["News", "Non-fiction"],
task_subtypes=None,
license="cc-by-sa-4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation=None, # none found
n_samples={"test": 10275}, # due to duplicates
avg_character_length={"test": 50.96},
)

def dataset_transform(self) -> None:
ds = convert_to_fast(self.dataset, self.seed) # type: ignore
self.dataset = ds
17 changes: 9 additions & 8 deletions mteb/tasks/Clustering/eng/ArxivClusteringP2PFast.py
@@ -2,11 +2,15 @@

import datasets

-from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast
+from mteb.abstasks.AbsTaskClusteringFast import clustering_downsample
+from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
from mteb.abstasks.TaskMetadata import TaskMetadata


-class ArxivClusteringP2PFast(AbsTaskClusteringFast):
+class ArxivClusteringP2PFast(AbsTaskClustering):
+    # A faster version of the dataset. Since the clusters are not sampled from
+    # the same distribution, we can't use AbsTaskClusteringFast; instead we
+    # simply downsample each cluster.

metadata = TaskMetadata(
name="ArxivClusteringP2P.v2",
description="Clustering of titles+abstract from arxiv. Clustering of 30 sets, either on the main or secondary category",
@@ -34,10 +38,7 @@ class ArxivClusteringP2PFast(AbsTaskClusteringFast):
avg_character_length={"test": 1009.98},
)

-    def dataset_transform(self):
-        sent = self.dataset["test"]["sentences"][0]  # type: ignore
-        lab = self.dataset["test"]["labels"][0]  # type: ignore
-
-        self.dataset["test"] = datasets.Dataset.from_dict(  # type: ignore
-            {"sentences": sent, "labels": lab}
-        )
+    def dataset_transform(self):
+        ds = clustering_downsample(self.dataset, self.seed)
+        self.dataset = ds