# Patch summary: add the GujaratiNewsClassification task.
#
# Files in this patch:
#   docs/mmteb/points/493.jsonl (new)
#       Two JSON lines recording contribution points: "New dataset": 6 for
#       SaitejaUtpala and "Review PR": 2 for isaac-chung.
#   mteb/tasks/Classification/__init__.py
#       Adds a wildcard import of the new module, inserted between the
#       existing `.fra` and `.hin` entries (the visible entries are in
#       alphabetical order by language folder).
#   mteb/tasks/Classification/guj/GujaratiNewsClassification.py (new)
#       Defines GujaratiNewsClassification, a subclass of
#       AbsTaskClassification whose `metadata` is a TaskMetadata instance:
#       dataset "mlexplorer008/gujarati_news_classification" pinned to
#       revision 1a5f2fa2914bfeff4fcdc6fff4194fa8ec8fa19e, evaluated on the
#       "test" split, language "guj-Gujr", main score "accuracy".
#       `dataset_transform` renames the dataset column "headline" to "text"
#       and stores the result back on `self.dataset`.
#   mteb/tasks/Classification/guj/__init__.py (new)
#       Created with no content hunk.
#   results/intfloat/multilingual-e5-small/GujaratiNewsClassification.json (new)
#   results/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/GujaratiNewsClassification.json (new)
#       Recorded "test" scores for two models, both tagged with
#       mteb_version 1.7.5 and the same dataset revision as the task.
#       Both files are recorded without a trailing newline.
#
# NOTE(review): `n_samples` and `avg_character_length` list a "train" entry
#   although `eval_splits` contains only "test" -- confirm whether these
#   fields are meant to cover evaluated splits only.
# NOTE(review): the "headline" -> "text" rename presumably matches the column
#   name the classification base class reads -- confirm against
#   AbsTaskClassification.
diff --git a/docs/mmteb/points/493.jsonl b/docs/mmteb/points/493.jsonl
new file mode 100644
index 0000000000..74e5bf5ef2
--- /dev/null
+++ b/docs/mmteb/points/493.jsonl
@@ -0,0 +1,2 @@
+{"GitHub": "SaitejaUtpala", "New dataset": 6}
+{"GitHub": "isaac-chung", "Review PR": 2}
diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py
index 8f022742c5..970d7ff821 100644
--- a/mteb/tasks/Classification/__init__.py
+++ b/mteb/tasks/Classification/__init__.py
@@ -27,6 +27,7 @@
 from .fas.PersianFoodSentimentClassification import *
 from .fil.FilipinoHateSpeechClassification import *
 from .fra.MovieReviewSentimentClassification import *
+from .guj.GujaratiNewsClassification import *
 from .hin.HindiDiscourseClassification import *
 from .hrv.CroatianSentimentClassification import *
 from .ind.IndonesianIdClickbaitClassification import *
diff --git a/mteb/tasks/Classification/guj/GujaratiNewsClassification.py b/mteb/tasks/Classification/guj/GujaratiNewsClassification.py
new file mode 100644
index 0000000000..7f221781c4
--- /dev/null
+++ b/mteb/tasks/Classification/guj/GujaratiNewsClassification.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from mteb.abstasks import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class GujaratiNewsClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="GujaratiNewsClassification",
+        description="A Gujarati dataset for 3-class classification of Gujarati news articles",
+        reference="https://github.com/goru001/nlp-for-gujarati",
+        dataset={
+            "path": "mlexplorer008/gujarati_news_classification",
+            "revision": "1a5f2fa2914bfeff4fcdc6fff4194fa8ec8fa19e",
+        },
+        type="Classification",
+        category="s2s",
+        date=("2014-01-01", "2018-01-01"),
+        eval_splits=["test"],
+        eval_langs=["guj-Gujr"],
+        main_score="accuracy",
+        form=["written"],
+        domains=["News"],
+        task_subtypes=["Topic classification"],
+        license="MIT",
+        socioeconomic_status="mixed",
+        annotations_creators="derived",
+        dialect=[],
+        text_creation="found",
+        bibtex_citation=None,
+        n_samples={"train": 5269, "test": 1318},
+        avg_character_length={"train": 61.95, "test": 61.91},
+    )
+
+    def dataset_transform(self):
+        self.dataset = self.dataset.rename_column("headline", "text")
diff --git a/mteb/tasks/Classification/guj/__init__.py b/mteb/tasks/Classification/guj/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/results/intfloat/multilingual-e5-small/GujaratiNewsClassification.json b/results/intfloat/multilingual-e5-small/GujaratiNewsClassification.json
new file mode 100644
index 0000000000..546bd6a250
--- /dev/null
+++ b/results/intfloat/multilingual-e5-small/GujaratiNewsClassification.json
@@ -0,0 +1,13 @@
+{
+  "dataset_revision": "1a5f2fa2914bfeff4fcdc6fff4194fa8ec8fa19e",
+  "mteb_dataset_name": "GujaratiNewsClassification",
+  "mteb_version": "1.7.5",
+  "test": {
+    "accuracy": 0.746661608497724,
+    "accuracy_stderr": 0.02690309965897991,
+    "evaluation_time": 51.96,
+    "f1": 0.7159097333700342,
+    "f1_stderr": 0.03088120006373275,
+    "main_score": 0.746661608497724
+  }
+}
\ No newline at end of file
diff --git a/results/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/GujaratiNewsClassification.json b/results/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/GujaratiNewsClassification.json
new file mode 100644
index 0000000000..0a6ca5f272
--- /dev/null
+++ b/results/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/GujaratiNewsClassification.json
@@ -0,0 +1,13 @@
+{
+  "dataset_revision": "1a5f2fa2914bfeff4fcdc6fff4194fa8ec8fa19e",
+  "mteb_dataset_name": "GujaratiNewsClassification",
+  "mteb_version": "1.7.5",
+  "test": {
+    "accuracy": 0.7687405159332322,
+    "accuracy_stderr": 0.023005273029738673,
+    "evaluation_time": 34.18,
+    "f1": 0.7421112851603834,
+    "f1_stderr": 0.02646365364154126,
+    "main_score": 0.7687405159332322
+  }
+}
\ No newline at end of file