Skip to content

Commit

Permalink
feat: Forbid task metadata and add upload functions (#1362)
Browse files Browse the repository at this point in the history
* init

* find all weird repos

* move to mteb WikipediaRetrievalMultilingual

* add base upload utils

* retrieval, classification, bitextmining

* test retrieval

* test retrieval

* test task uploaded

* update tasks

* working version

* remove comments

* lint

* move upload

* fix tests

* fix test

* move upload to task

* Update mteb/tasks/Retrieval/multilingual/WikipediaRetrievalMultilingual.py

Co-authored-by: Kenneth Enevoldsen <[email protected]>

* fix: hatespeech filipino (#1522)

* fix FilipinoHateSpeechClassification

* update tests

* lint

---------

Co-authored-by: Kenneth Enevoldsen <[email protected]>
  • Loading branch information
Samoed and KennethEnevoldsen authored Dec 4, 2024
1 parent d54fb75 commit dec5d6a
Show file tree
Hide file tree
Showing 42 changed files with 618 additions and 422 deletions.
36 changes: 36 additions & 0 deletions mteb/abstasks/AbsTask.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,42 @@ def filter_languages(
self.hf_subsets = subsets_to_keep
return self

def _upload_dataset_to_hub(self, repo_name: str, fields: list[str]) -> None:
    """Upload the loaded dataset to the HuggingFace Hub, keeping only *fields*.

    For multilingual tasks, each language config in ``metadata.eval_langs`` is
    pushed as a separate config of the repository; otherwise the dataset is
    pushed as a single (default) config.

    Args:
        repo_name: The name of the repository to push the dataset to.
        fields: The column names to keep from every split.
    """

    def _to_dataset_dict(split_dict) -> DatasetDict:
        # Project every split down to the requested fields only.
        return DatasetDict(
            {
                split: Dataset.from_dict(
                    {field: split_dict[split][field] for field in fields}
                )
                for split in split_dict
            }
        )

    if self.is_multilingual:
        for config in self.metadata.eval_langs:
            logger.info(f"Converting {config} of {self.metadata.name}")
            _to_dataset_dict(self.dataset[config]).push_to_hub(
                repo_name, config, commit_message=f"Add {config} dataset"
            )
    else:
        _to_dataset_dict(self.dataset).push_to_hub(
            repo_name, commit_message="Add dataset"
        )

def _push_dataset_to_hub(self, repo_name: str) -> None:
    # Task-type-specific hook: subclasses override this to convert their own
    # data layout and upload it. Invoked by push_dataset_to_hub() below after
    # the data has been loaded.
    raise NotImplementedError

def push_dataset_to_hub(self, repo_name: str) -> None:
    """Push the dataset to the HuggingFace Hub.

    Loads the data first if it has not been loaded yet, then delegates the
    actual conversion/upload to the subclass's ``_push_dataset_to_hub``.

    Args:
        repo_name: The name of the repository to push the dataset to.
    """
    if not self.data_loaded:
        self.load_data()

    self._push_dataset_to_hub(repo_name)

@property
def eval_splits(self) -> list[str]:
if self._eval_splits:
Expand Down
45 changes: 44 additions & 1 deletion mteb/abstasks/AbsTaskBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
from typing import Any

from datasets import Dataset
from datasets import Dataset, DatasetDict

from mteb.encoder_interface import Encoder

Expand Down Expand Up @@ -191,3 +191,46 @@ def _calculate_metrics_from_split(
max_sentence2_length=max(s2_len),
unique_sentence2=unique_sentence2,
)

def _push_dataset_to_hub(self, repo_name: str) -> None:
    """Upload the bitext-mining data to the Hub as sentence1/sentence2 pairs.

    Multilingual tasks push one config per entry in ``metadata.eval_langs``;
    monolingual tasks push a single default config.
    """
    if self.is_multilingual:
        for config in self.metadata.eval_langs:
            logger.info(f"Converting {config} of {self.metadata.name}")

            sentences = {}
            if self.parallel_subsets:
                # If there are parallel subsets, process them
                for split in self.dataset:
                    # Config names look like "<lang1>-<lang2>"; the two sides
                    # are stored as columns named after each language.
                    # NOTE(review): config.split("-") is loop-invariant and
                    # could be hoisted out of the split loop.
                    sent_1, sent_2 = config.split("-")
                    sentences[split] = Dataset.from_dict(
                        {
                            "sentence1": self.dataset[split][sent_1],
                            "sentence2": self.dataset[split][sent_2],
                        }
                    )
            else:
                # Handle the non-parallel subset case
                # First (and, presumably, only) column pair — TODO confirm
                # get_pairs() returns a single pair here.
                sent_1, sent_2 = self.get_pairs(self.parallel_subsets)[0]
                for split in self.dataset[config]:
                    sentences[split] = Dataset.from_dict(
                        {
                            "sentence1": self.dataset[config][split][sent_1],
                            "sentence2": self.dataset[config][split][sent_2],
                        }
                    )
            sentences = DatasetDict(sentences)
            sentences.push_to_hub(
                repo_name, config, commit_message=f"Add {config} subset"
            )
    else:
        sentences = {}
        for split in self.dataset:
            # NOTE(review): this call is loop-invariant (same result each
            # split) and could be hoisted above the loop.
            sent_1, sent_2 = self.get_pairs(self.parallel_subsets)[0]
            sentences[split] = Dataset.from_dict(
                {
                    "sentence1": self.dataset[split][sent_1],
                    "sentence2": self.dataset[split][sent_2],
                }
            )
        sentences = DatasetDict(sentences)
        # NOTE(review): unlike the multilingual branch, no commit_message is
        # passed here — confirm whether that asymmetry is intended.
        sentences.push_to_hub(repo_name)
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,6 @@ def _calculate_metrics_from_split(
str(label): {"count": count} for label, count in label_count.items()
},
)

def _push_dataset_to_hub(self, repo_name: str) -> None:
    # Classification data is flat "text"/"label" columns, so the generic
    # uploader on AbsTask handles the conversion directly.
    self._upload_dataset_to_hub(repo_name, ["text", "label"])
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskClustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,6 @@ def _calculate_metrics_from_split(
for label, value in label_counter.items()
},
)

def _push_dataset_to_hub(self, repo_name: str) -> None:
    # Clustering data keeps its "sentences"/"labels" columns; the generic
    # uploader on AbsTask handles the conversion directly.
    self._upload_dataset_to_hub(repo_name, ["sentences", "labels"])
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskClusteringFast.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,9 @@ def _calculate_metrics_from_split(
},
)

def _push_dataset_to_hub(self, repo_name: str) -> None:
    # Same columns as AbsTaskClustering ("sentences"/"labels"); delegate to
    # the generic uploader on AbsTask.
    self._upload_dataset_to_hub(repo_name, ["sentences", "labels"])


def clustering_downsample(
dataset: DatasetDict, seed: int, max_samples_in_cluster: int = 2048
Expand Down
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskMultilabelClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,6 @@ def _calculate_metrics_from_split(
for label, value in label_count.items()
},
)

def _push_dataset_to_hub(self, repo_name: str) -> None:
    # Multilabel classification uses the same "text"/"label" columns as
    # single-label classification; delegate to the generic uploader.
    self._upload_dataset_to_hub(repo_name, ["text", "label"])
5 changes: 4 additions & 1 deletion mteb/abstasks/AbsTaskPairClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def _evaluate_subset(
encode_kwargs: dict[str, str] = {},
**kwargs,
) -> ScoresDict:
data_split = dataset[0]
data_split = dataset[0] if len(dataset) == 1 else dataset
logging.getLogger(
"sentence_transformers.evaluation.PairClassificationEvaluator"
).setLevel(logging.WARN)
Expand Down Expand Up @@ -152,3 +152,6 @@ def _calculate_metrics_from_split(
str(label): {"count": count} for label, count in label_count.items()
},
)

def _push_dataset_to_hub(self, repo_name: str) -> None:
    # Pair classification stores the pair plus a plural "labels" column
    # (unlike classification's singular "label"); delegate to the generic
    # uploader on AbsTask.
    self._upload_dataset_to_hub(repo_name, ["sentence1", "sentence2", "labels"])
1 change: 0 additions & 1 deletion mteb/abstasks/AbsTaskReranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
logger = logging.getLogger(__name__)

OLD_FORMAT_RERANKING_TASKS = [
"AskUbuntuDupQuestions",
"MindSmallReranking",
"SciDocsRR",
"StackOverflowDupQuestions",
Expand Down
174 changes: 173 additions & 1 deletion mteb/abstasks/AbsTaskRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from time import time
from typing import Any

from datasets import Dataset, DatasetDict

from mteb.abstasks.TaskMetadata import HFSubset

from ..evaluation.evaluators import RetrievalEvaluator
Expand Down Expand Up @@ -223,7 +225,7 @@ def load_data(self, **kwargs):
if top_ranked:
if self.top_ranked is None:
self.top_ranked = {}
self.top_ranked = {
self.top_ranked[lang] = {
split: {
tr["query-id"]: tr["corpus-ids"] for tr in top_ranked
}
Expand Down Expand Up @@ -515,6 +517,176 @@ def _calculate_metrics_from_split(
max_top_ranked_per_query=max_top_ranked_per_query,
)

def _push_dataset_to_hub(self, repo_name: str) -> None:
    """Upload the retrieval data (queries, corpus, qrels, and — when present —
    instructions and top-ranked candidates) to the Hub.

    Each component is pushed as its own config of *repo_name*; multilingual
    tasks prefix the config name with the language config (e.g.
    ``"<lang>-queries"``).
    """

    def format_text_field(text):
        """Formats the text field to match loader expectations."""
        # Corpus entries may be plain strings or {"title": ..., "text": ...}
        # dicts; flatten dicts into "title text".
        if isinstance(text, str):
            return text
        return f"{text.get('title', '')} {text.get('text', '')}".strip()

    if self.is_multilingual:
        for config in self.queries:
            logger.info(f"Converting {config} of {self.metadata.name}")

            # Queries: one row per query id, columns "_id"/"text".
            queries_dataset = {}
            for split in self.queries[config]:
                queries_dataset[split] = Dataset.from_list(
                    [
                        {
                            "_id": idx,
                            "text": text,
                        }
                        for idx, text in self.queries[config][split].items()
                    ]
                )
            queries_dataset = DatasetDict(queries_dataset)
            queries_dataset.push_to_hub(repo_name, f"{config}-queries")

            # Corpus: flatten dict entries; keep "title" as its own column.
            corpus_dataset = {}
            for split in self.corpus[config]:
                corpus_dataset[split] = Dataset.from_list(
                    [
                        {
                            "_id": idx,
                            "text": format_text_field(text),
                            "title": text.get("title", "")
                            if isinstance(text, dict)
                            else "",
                        }
                        for idx, text in self.corpus[config][split].items()
                    ]
                )

            corpus_dataset = DatasetDict(corpus_dataset)
            corpus_dataset.push_to_hub(repo_name, f"{config}-corpus")

            # Qrels: one row per (query, document) relevance judgement.
            relevant_docs_dataset = {}
            for split in self.relevant_docs[config]:
                relevant_docs_dataset[split] = Dataset.from_list(
                    [
                        {"query-id": query_id, "corpus-id": doc_id, "score": score}
                        for query_id, docs in self.relevant_docs[config][
                            split
                        ].items()
                        for doc_id, score in docs.items()
                    ]
                )
            relevant_docs_dataset = DatasetDict(relevant_docs_dataset)
            relevant_docs_dataset.push_to_hub(repo_name, f"{config}-qrels")

            # Optional per-query instructions.
            if self.instructions:
                instructions_dataset = {}
                for split in self.instructions[config]:
                    instructions_dataset[split] = Dataset.from_list(
                        [
                            {
                                "query-id": idx,
                                "instruction": text,
                            }
                            for idx, text in self.instructions[config][
                                split
                            ].items()
                        ]
                    )
                instructions_dataset = DatasetDict(instructions_dataset)
                instructions_dataset.push_to_hub(repo_name, f"{config}-instruction")
            # Optional top-ranked candidates (reranking-style tasks).
            if self.top_ranked:
                top_ranked_dataset = {}
                for split in self.top_ranked[config]:
                    top_ranked_dataset[split] = Dataset.from_list(
                        [
                            {
                                "query-id": query_id,
                                "corpus-ids": docs,
                            }
                            for query_id, docs in self.top_ranked[config][
                                split
                            ].items()
                        ]
                    )
                top_ranked_dataset = DatasetDict(top_ranked_dataset)
                top_ranked_dataset.push_to_hub(repo_name, f"{config}-top_ranked")
    else:
        if "default" in self.queries:
            # old rerankers have additional default split
            # NOTE(review): this unwrap mutates self.queries/self.corpus/etc.
            # in place — the task object is altered by pushing it. Confirm
            # that is acceptable (a second push, or an evaluation after a
            # push, would see the unwrapped structure).
            self.queries = self.queries["default"]
            self.corpus = self.corpus["default"]
            self.relevant_docs = self.relevant_docs["default"]
            if self.instructions:
                self.instructions = self.instructions["default"]
            if self.top_ranked:
                self.top_ranked = self.top_ranked["default"]

        # Queries: one row per query id, columns "_id"/"text".
        queries_dataset = {}
        for split in self.queries:
            queries_dataset[split] = Dataset.from_list(
                [
                    {
                        "_id": idx,
                        "text": text,
                    }
                    for idx, text in self.queries[split].items()
                ]
            )
        queries_dataset = DatasetDict(queries_dataset)
        queries_dataset.push_to_hub(repo_name, "queries")
        # Corpus: flatten dict entries; keep "title" as its own column.
        corpus_dataset = {}
        for split in self.corpus:
            corpus_dataset[split] = Dataset.from_list(
                [
                    {
                        "_id": idx,
                        "text": format_text_field(text),
                        "title": text.get("title", "")
                        if isinstance(text, dict)
                        else "",
                    }
                    for idx, text in self.corpus[split].items()
                ]
            )

        corpus_dataset = DatasetDict(corpus_dataset)
        corpus_dataset.push_to_hub(repo_name, "corpus")
        # Qrels: one row per (query, document) relevance judgement.
        relevant_docs_dataset = {}
        for split in self.relevant_docs:
            relevant_docs_dataset[split] = Dataset.from_list(
                [
                    {"query-id": query_id, "corpus-id": doc_id, "score": score}
                    for query_id, docs in self.relevant_docs[split].items()
                    for doc_id, score in docs.items()
                ]
            )
        relevant_docs_dataset = DatasetDict(relevant_docs_dataset)
        # NOTE(review): qrels are pushed under the "default" config here,
        # while the multilingual branch uses "<config>-qrels" — confirm the
        # loader expects this asymmetry.
        relevant_docs_dataset.push_to_hub(repo_name, "default")
        # Optional per-query instructions.
        if self.instructions:
            instructions_dataset = {}
            for split in self.instructions:
                instructions_dataset[split] = Dataset.from_list(
                    [
                        {
                            "query-id": idx,
                            "instruction": text,
                        }
                        for idx, text in self.instructions[split].items()
                    ]
                )
            instructions_dataset = DatasetDict(instructions_dataset)
            instructions_dataset.push_to_hub(repo_name, "instruction")
        # Optional top-ranked candidates (reranking-style tasks).
        if self.top_ranked:
            top_ranked_dataset = {}
            for split in self.top_ranked:
                top_ranked_dataset[split] = Dataset.from_list(
                    [
                        {
                            "query-id": query_id,
                            "corpus-ids": docs,
                        }
                        for query_id, docs in self.top_ranked[split].items()
                    ]
                )
            top_ranked_dataset = DatasetDict(top_ranked_dataset)
            top_ranked_dataset.push_to_hub(repo_name, "top_ranked")


def calculate_queries_length(queries: dict[str, str]) -> list[int] | None:
queries_lens = []
Expand Down
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskSTS.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,6 @@ def _calculate_metrics_from_split(
avg_score=avg_score,
max_score=max(score),
)

def _push_dataset_to_hub(self, repo_name: str) -> None:
    # STS data is a sentence pair plus a numeric similarity "score" column;
    # delegate to the generic uploader on AbsTask.
    self._upload_dataset_to_hub(repo_name, ["sentence1", "sentence2", "score"])
23 changes: 22 additions & 1 deletion mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
AnyUrl,
BaseModel,
BeforeValidator,
ConfigDict,
TypeAdapter,
field_validator,
)
Expand Down Expand Up @@ -197,6 +198,24 @@ class DescriptiveStatistics(TypedDict):
logger = logging.getLogger(__name__)


class MetadataDatasetDict(TypedDict, total=False):
    """A dictionary describing how to load the dataset (all keys optional).

    Args:
        path: The path to the dataset.
        revision: The revision of the dataset.
        name: The name of the dataset config.
        split: The split of the dataset.
        trust_remote_code: Whether to trust the remote code.
    """

    path: str
    revision: str
    name: str
    split: str
    trust_remote_code: bool


class TaskMetadata(BaseModel):
"""Metadata for a task.
Expand Down Expand Up @@ -228,7 +247,9 @@ class TaskMetadata(BaseModel):
bibtex_citation: The BibTeX citation for the dataset. Should be an empty string if no citation is available.
"""

dataset: dict
model_config = ConfigDict(extra="forbid")

dataset: MetadataDatasetDict

name: str
description: str
Expand Down
Loading

0 comments on commit dec5d6a

Please sign in to comment.