Skip to content

Commit

Permalink
fix: Add Urdu Latin Sentiment Classification Dataset (#535)
Browse files (browse the repository at this point in the history)
* Add Urdu Roman Sentiment Classification Dataset

* Add review changes

* Add points
  • Loading branch information
bp-high authored Apr 24, 2024
1 parent cf1c8d1 commit aed0c75
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 0 deletions.
3 changes: 3 additions & 0 deletions docs/mmteb/points/535.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"GitHub": "bp-high", "New dataset": 2}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
{"GitHub": "asparius", "Review PR": 2}
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
from .tur.TurkishMovieSentimentClassification import *
from .tur.TurkishProductSentimentClassification import *
from .uig.UyghurSentimentClassification import *
from .urd.UrduRomanSentimentClassification import *
from .vie.VieStudentFeedbackClassification import *
from .zho.CMTEBClassification import *
from .zho.YueOpenriceReviewClassification import (
Expand Down
49 changes: 49 additions & 0 deletions mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from __future__ import annotations

from mteb.abstasks import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class UrduRomanSentimentClassification(AbsTaskClassification):
    """Sentiment classification task on Urdu written in Latin (Roman) script.

    The metadata below points at the ``roman_urdu`` dataset, evaluates on its
    ``train`` split and reports ``f1`` as the main score.
    """

    metadata = TaskMetadata(
        # --- Identity and provenance -------------------------------------
        name="UrduRomanSentimentClassification",
        description="The Roman Urdu dataset is a data corpus comprising of more than 20000 records tagged for sentiment (Positive, Negative, Neutral)",
        reference="https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set",
        dataset={
            "path": "roman_urdu",
            "revision": "566be6449bb30b9b9f2b59173391647fe0ca3224",
        },
        # --- Task definition ---------------------------------------------
        type="Classification",
        category="s2s",
        date=("2018-01-01", "2018-08-28"),
        eval_splits=["train"],
        eval_langs=["urd-Latn"],
        main_score="f1",
        # --- Descriptive annotations -------------------------------------
        form=["written"],
        domains=["Social"],
        task_subtypes=["Sentiment/Hate speech"],
        license="MIT",
        socioeconomic_status="mixed",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        # The citation text is kept flush-left on purpose: any indentation
        # inside the triple-quoted literal would become part of the string.
        bibtex_citation="""
@misc{misc_roman_urdu_data_set_458,
author = {Sharf,Zareen},
title = {{Roman Urdu Data Set}},
year = {2018},
howpublished = {UCI Machine Learning Repository},
note = {{DOI}: https://doi.org/10.24432/C58325}
}
""",
        # --- Statistics for the evaluated split --------------------------
        n_samples={"train": 2048},
        avg_character_length={"train": 68.248},
    )

    def dataset_transform(self):
        """Rename the raw columns, then subsample the ``train`` split.

        ``sentence`` becomes ``text`` and ``sentiment`` becomes ``label``;
        afterwards ``self.dataset`` is replaced by whatever
        ``stratified_subsampling`` returns for the ``train`` split, seeded
        with ``self.seed``.
        """
        column_mapping = {"sentence": "text", "sentiment": "label"}
        self.dataset = self.dataset.rename_columns(column_mapping)

        # Store the renamed dataset first, then subsample it, so the sequence
        # of writes to ``self.dataset`` matches the two-step original.
        subsampled = self.stratified_subsampling(
            self.dataset,
            seed=self.seed,
            splits=["train"],
        )
        self.dataset = subsampled
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"dataset_revision": "566be6449bb30b9b9f2b59173391647fe0ca3224",
"mteb_dataset_name": "UrduRomanSentimentClassification",
"mteb_version": "1.7.6",
"train": {
"accuracy": 0.41708984375,
"accuracy_stderr": 0.041740365018761084,
"evaluation_time": 16.99,
"f1": 0.4036945993529546,
"f1_stderr": 0.03620053354369278,
"main_score": 0.4036945993529546
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"dataset_revision": "566be6449bb30b9b9f2b59173391647fe0ca3224",
"mteb_dataset_name": "UrduRomanSentimentClassification",
"mteb_version": "1.7.6",
"train": {
"accuracy": 0.387158203125,
"accuracy_stderr": 0.026019893444995425,
"evaluation_time": 11.84,
"f1": 0.3767185373070528,
"f1_stderr": 0.022533999516671163,
"main_score": 0.3767185373070528
}
}

0 comments on commit aed0c75

Please sign in to comment.