-
Notifications
You must be signed in to change notification settings - Fork 289
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
36312ac
commit a69c3bc
Showing
19 changed files
with
534 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.AbsTaskClustering import AbsTaskClustering | ||
from mteb.abstasks.AbsTaskClusteringFast import clustering_downsample | ||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
|
||
class BlurbsClusteringP2PFast(AbsTaskClustering):
    """Faster variant of the BlurbsClusteringP2P task.

    The clustering sets are not sampled from the same distribution, so
    AbsTaskClusteringFast cannot be used for this task; each cluster is
    simply downsampled instead.
    """

    metadata = TaskMetadata(
        # --- identity and data source ---
        name="BlurbsClusteringP2P.v2",
        description="Clustering of book titles+blurbs. Clustering of 28 sets, either on the main or secondary genre.",
        reference="https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html",
        dataset={
            "path": "slvnwhrl/blurbs-clustering-p2p",
            "revision": "a2dd5b02a77de3466a3eaa98ae586b5610314496",
        },
        # --- evaluation setup ---
        type="Clustering",
        category="p2p",
        task_subtypes=["Thematic clustering"],
        eval_splits=["test"],
        eval_langs=["deu-Latn"],
        main_score="v_measure",
        # --- descriptive annotations ---
        # Date range is a guess: the texts are books, so likely from the
        # 20th century onwards; the accompanying paper is from 2019.
        date=("1900-01-01", "2019-12-31"),
        form=["written"],
        domains=["Fiction"],
        dialect=[],
        text_creation="found",
        annotations_creators="derived",
        socioeconomic_status="mixed",
        license="cc-by-nc-4.0",
        # --- dataset statistics ---
        n_samples={"test": 50268},
        avg_character_length={"test": 664.09},
        bibtex_citation="""@inproceedings{Remus2019GermEval2T,
title={GermEval 2019 Task 1: Hierarchical Classification of Blurbs},
author={Steffen Remus and Rami Aly and Chris Biemann},
booktitle={Conference on Natural Language Processing},
year={2019},
url={https://api.semanticscholar.org/CorpusID:208334484}
}""",
    )

    def dataset_transform(self):
        """Replace ``self.dataset`` with the output of ``clustering_downsample``.

        The helper is called with the currently loaded dataset and the
        task's ``self.seed``.
        """
        self.dataset = clustering_downsample(self.dataset, self.seed)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.AbsTaskClustering import AbsTaskClustering | ||
from mteb.abstasks.AbsTaskClusteringFast import clustering_downsample | ||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
|
||
class BlurbsClusteringS2S(AbsTaskClustering):
    """Faster variant of the Blurbs S2S clustering task.

    The clustering sets are not sampled from the same distribution, so
    AbsTaskClusteringFast cannot be used for this task; each cluster is
    simply downsampled instead.
    """

    # NOTE(review): unlike BlurbsClusteringP2PFast, this class name carries no
    # `Fast` suffix although the task is registered as ".v2" — confirm that it
    # cannot shadow a same-named original task class wherever both are imported.

    metadata = TaskMetadata(
        # --- identity and data source ---
        name="BlurbsClusteringS2S.v2",
        description="Clustering of book titles. Clustering of 28 sets, either on the main or secondary genre.",
        reference="https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html",
        dataset={
            "path": "slvnwhrl/blurbs-clustering-s2s",
            "revision": "22793b6a6465bf00120ad525e38c51210858132c",
        },
        # --- evaluation setup ---
        type="Clustering",
        category="s2s",
        task_subtypes=["Thematic clustering"],
        eval_splits=["test"],
        eval_langs=["deu-Latn"],
        main_score="v_measure",
        # --- descriptive annotations ---
        # Date range is a guess: the texts are books, so likely from the
        # 20th century onwards; the accompanying paper is from 2019.
        date=("1900-01-01", "2019-12-31"),
        form=["written"],
        domains=["Fiction"],
        dialect=[],
        text_creation="found",
        annotations_creators="derived",
        socioeconomic_status="mixed",
        license="cc-by-nc-4.0",
        # --- dataset statistics ---
        n_samples={"test": 50268},
        avg_character_length={"test": 23.02},
        bibtex_citation="""@inproceedings{Remus2019GermEval2T,
title={GermEval 2019 Task 1: Hierarchical Classification of Blurbs},
author={Steffen Remus and Rami Aly and Chris Biemann},
booktitle={Conference on Natural Language Processing},
year={2019},
url={https://api.semanticscholar.org/CorpusID:208334484}
}""",
    )

    def dataset_transform(self):
        """Replace ``self.dataset`` with the output of ``clustering_downsample``.

        The helper is called with the currently loaded dataset and the
        task's ``self.seed``.
        """
        self.dataset = clustering_downsample(self.dataset, self.seed)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast, convert_to_fast | ||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
|
||
class TenKGnadClusteringP2PFast(AbsTaskClusteringFast):
    """10kGNAD P2P clustering task, registered as ``TenKGnadClusteringP2P.v2``.

    Built on AbsTaskClusteringFast; the loaded dataset is passed through
    ``convert_to_fast`` in :meth:`dataset_transform`.
    """

    metadata = TaskMetadata(
        # --- identity and data source ---
        name="TenKGnadClusteringP2P.v2",
        description="Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category.",
        reference="https://tblock.github.io/10kGNAD/",
        dataset={
            "path": "slvnwhrl/tenkgnad-clustering-p2p",
            "revision": "5c59e41555244b7e45c9a6be2d720ab4bafae558",
        },
        # --- evaluation setup ---
        type="Clustering",
        category="p2p",
        task_subtypes=None,
        eval_splits=["test"],
        eval_langs=["deu-Latn"],
        main_score="v_measure",
        # --- descriptive annotations ---
        # Date range is a guess: the texts are news, assumed to be from
        # 2000 to 2020.
        date=("2000-01-01", "2020-12-31"),
        form=["written"],
        domains=["News", "Non-fiction"],
        dialect=[],
        text_creation="found",
        annotations_creators="derived",
        socioeconomic_status="medium",
        license="cc-by-sa-4.0",
        # --- dataset statistics ---
        n_samples={"test": 10275},  # due to duplicates
        avg_character_length={"test": 2641.03},
        bibtex_citation=None,  # none found
    )

    def dataset_transform(self) -> None:
        """Replace ``self.dataset`` with the output of ``convert_to_fast``.

        The helper is called with the currently loaded dataset and the
        task's ``self.seed``.
        """
        self.dataset = convert_to_fast(self.dataset, self.seed)  # type: ignore
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast, convert_to_fast | ||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
|
||
class TenKGnadClusteringS2S(AbsTaskClusteringFast):
    """10kGNAD S2S clustering task, registered as ``TenKGnadClusteringS2S.v2``.

    Built on AbsTaskClusteringFast; the loaded dataset is passed through
    ``convert_to_fast`` in :meth:`dataset_transform`.
    """

    # NOTE(review): unlike TenKGnadClusteringP2PFast, this class name carries no
    # `Fast` suffix although the task is registered as ".v2" — confirm that it
    # cannot shadow a same-named original task class wherever both are imported.

    metadata = TaskMetadata(
        # --- identity and data source ---
        name="TenKGnadClusteringS2S.v2",
        description="Clustering of news article titles. Clustering of 10 splits on the news article category.",
        reference="https://tblock.github.io/10kGNAD/",
        dataset={
            "path": "slvnwhrl/tenkgnad-clustering-s2s",
            "revision": "6cddbe003f12b9b140aec477b583ac4191f01786",
        },
        # --- evaluation setup ---
        type="Clustering",
        category="s2s",
        task_subtypes=None,
        eval_splits=["test"],
        eval_langs=["deu-Latn"],
        main_score="v_measure",
        # --- descriptive annotations ---
        # Date range is a guess: the texts are news, assumed to be from
        # 2000 to 2020.
        date=("2000-01-01", "2020-12-31"),
        form=["written"],
        domains=["News", "Non-fiction"],
        dialect=[],
        text_creation="found",
        annotations_creators="derived",
        socioeconomic_status="medium",
        license="cc-by-sa-4.0",
        # --- dataset statistics ---
        n_samples={"test": 10275},  # due to duplicates
        avg_character_length={"test": 50.96},
        bibtex_citation=None,  # none found
    )

    def dataset_transform(self) -> None:
        """Replace ``self.dataset`` with the output of ``convert_to_fast``.

        The helper is called with the currently loaded dataset and the
        task's ``self.seed``.
        """
        self.dataset = convert_to_fast(self.dataset, self.seed)  # type: ignore
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.