From dec5d6a94b0b7adfa9d3b47f1fd9eb6041845008 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Thu, 5 Dec 2024 03:19:31 +0500 Subject: [PATCH] feat: Forbid task metadata and add upload functions (#1362) * init * find all weird repos * move to mteb WikipediaRetrievalMultilingual * add base upload utils * retrieval, classification, bitextmining * test retrieval * test retrieval * test task uploaded * update tasks * working version * remove comments * lint * move upload * fix tests * fix test * move upload to task * Update mteb/tasks/Retrieval/multilingual/WikipediaRetrievalMultilingual.py Co-authored-by: Kenneth Enevoldsen * fix: hatespeech filipino (#1522) * fix FilipinoHateSpeechClassification * update tests * lint --------- Co-authored-by: Kenneth Enevoldsen --- mteb/abstasks/AbsTask.py | 36 +++ mteb/abstasks/AbsTaskBitextMining.py | 45 ++- mteb/abstasks/AbsTaskClassification.py | 3 + mteb/abstasks/AbsTaskClustering.py | 3 + mteb/abstasks/AbsTaskClusteringFast.py | 3 + .../AbsTaskMultilabelClassification.py | 3 + mteb/abstasks/AbsTaskPairClassification.py | 5 +- mteb/abstasks/AbsTaskReranking.py | 1 - mteb/abstasks/AbsTaskRetrieval.py | 174 +++++++++++- mteb/abstasks/AbsTaskSTS.py | 3 + mteb/abstasks/TaskMetadata.py | 23 +- mteb/abstasks/dataloaders.py | 6 +- .../IndicXnliPairClassification.json | 268 ++++++++++++++++++ .../multilingual/IWSLT2017BitextMining.py | 39 +-- .../AmazonCounterfactualClassification.py | 3 +- .../AmazonReviewsClassification.py | 5 +- .../Clustering/deu/BlurbsClusteringP2P.py | 1 - .../Clustering/deu/BlurbsClusteringS2S.py | 1 - .../Clustering/deu/TenKGnadClusteringS2S.py | 1 - .../Clustering/eng/BigPatentClustering.py | 1 - mteb/tasks/Clustering/eng/RedditClustering.py | 1 - .../Clustering/eng/RedditClusteringP2P.py | 1 - .../Clustering/eng/StackExchangeClustering.py | 1 - .../eng/StackExchangeClusteringP2P.py | 1 - .../Clustering/fra/AlloProfClusteringP2P.py | 1 - .../Clustering/fra/AlloProfClusteringS2S.py | 1 - 
mteb/tasks/Clustering/fra/HALClusteringS2S.py | 1 - .../multilingual/IndicReviewsClusteringP2P.py | 36 +-- mteb/tasks/Clustering/zho/CMTEBClustering.py | 4 - .../multilingual/mFollowIR.py | 8 +- .../InstructionRetrieval/eng/InstructIR.py | 11 - mteb/tasks/PairClassification/__init__.py | 3 + .../IndicXnliPairClassification.py | 6 +- .../PairClassification/multilingual/XNLI.py | 30 +- .../Reranking/eng/AskUbuntuDupQuestions.py | 4 +- mteb/tasks/Reranking/zho/CMTEBReranking.py | 3 - mteb/tasks/Retrieval/eng/BrightRetrieval.py | 6 +- .../WikipediaRetrievalMultilingual.py | 86 +----- mteb/tasks/Retrieval/zho/CMTEBRetrieval.py | 157 ++-------- .../STS/multilingual/IndicCrosslingualSTS.py | 31 +- tests/test_TaskMetadata.py | 23 +- tests/test_load_results/test_mteb_results.py | 1 - 42 files changed, 618 insertions(+), 422 deletions(-) create mode 100644 mteb/descriptive_stats/PairClassification/IndicXnliPairClassification.json diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 8b9edfd52c..e82878c803 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -300,6 +300,42 @@ def filter_languages( self.hf_subsets = subsets_to_keep return self + def _upload_dataset_to_hub(self, repo_name: str, fields: list[str]) -> None: + if self.is_multilingual: + for config in self.metadata.eval_langs: + logger.info(f"Converting {config} of {self.metadata.name}") + sentences = {} + for split in self.dataset[config]: + sentences[split] = Dataset.from_dict( + {field: self.dataset[config][split][field] for field in fields} + ) + sentences = DatasetDict(sentences) + sentences.push_to_hub( + repo_name, config, commit_message=f"Add {config} dataset" + ) + else: + sentences = {} + for split in self.dataset: + sentences[split] = Dataset.from_dict( + {field: self.dataset[split][field] for field in fields} + ) + sentences = DatasetDict(sentences) + sentences.push_to_hub(repo_name, commit_message="Add dataset") + + def _push_dataset_to_hub(self, repo_name: str) -> 
None: + raise NotImplementedError + + def push_dataset_to_hub(self, repo_name: str) -> None: + """Push the dataset to the HuggingFace Hub. + + Args: + repo_name: The name of the repository to push the dataset to. + """ + if not self.data_loaded: + self.load_data() + + self._push_dataset_to_hub(repo_name) + @property def eval_splits(self) -> list[str]: if self._eval_splits: diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py index 59d64039fd..4be4ec1562 100644 --- a/mteb/abstasks/AbsTaskBitextMining.py +++ b/mteb/abstasks/AbsTaskBitextMining.py @@ -3,7 +3,7 @@ import logging from typing import Any -from datasets import Dataset +from datasets import Dataset, DatasetDict from mteb.encoder_interface import Encoder @@ -191,3 +191,46 @@ def _calculate_metrics_from_split( max_sentence2_length=max(s2_len), unique_sentence2=unique_sentence2, ) + + def _push_dataset_to_hub(self, repo_name: str) -> None: + if self.is_multilingual: + for config in self.metadata.eval_langs: + logger.info(f"Converting {config} of {self.metadata.name}") + + sentences = {} + if self.parallel_subsets: + # If there are parallel subsets, process them + for split in self.dataset: + sent_1, sent_2 = config.split("-") + sentences[split] = Dataset.from_dict( + { + "sentence1": self.dataset[split][sent_1], + "sentence2": self.dataset[split][sent_2], + } + ) + else: + # Handle the non-parallel subset case + sent_1, sent_2 = self.get_pairs(self.parallel_subsets)[0] + for split in self.dataset[config]: + sentences[split] = Dataset.from_dict( + { + "sentence1": self.dataset[config][split][sent_1], + "sentence2": self.dataset[config][split][sent_2], + } + ) + sentences = DatasetDict(sentences) + sentences.push_to_hub( + repo_name, config, commit_message=f"Add {config} subset" + ) + else: + sentences = {} + for split in self.dataset: + sent_1, sent_2 = self.get_pairs(self.parallel_subsets)[0] + sentences[split] = Dataset.from_dict( + { + "sentence1": 
self.dataset[split][sent_1], + "sentence2": self.dataset[split][sent_2], + } + ) + sentences = DatasetDict(sentences) + sentences.push_to_hub(repo_name) diff --git a/mteb/abstasks/AbsTaskClassification.py b/mteb/abstasks/AbsTaskClassification.py index 55766190fe..5e48dfab49 100644 --- a/mteb/abstasks/AbsTaskClassification.py +++ b/mteb/abstasks/AbsTaskClassification.py @@ -257,3 +257,6 @@ def _calculate_metrics_from_split( str(label): {"count": count} for label, count in label_count.items() }, ) + + def _push_dataset_to_hub(self, repo_name: str) -> None: + self._upload_dataset_to_hub(repo_name, ["text", "label"]) diff --git a/mteb/abstasks/AbsTaskClustering.py b/mteb/abstasks/AbsTaskClustering.py index 3b5d0f492d..095c44435c 100644 --- a/mteb/abstasks/AbsTaskClustering.py +++ b/mteb/abstasks/AbsTaskClustering.py @@ -141,3 +141,6 @@ def _calculate_metrics_from_split( for label, value in label_counter.items() }, ) + + def _push_dataset_to_hub(self, repo_name: str) -> None: + self._upload_dataset_to_hub(repo_name, ["sentences", "labels"]) diff --git a/mteb/abstasks/AbsTaskClusteringFast.py b/mteb/abstasks/AbsTaskClusteringFast.py index 61c82e9535..af600eb7e0 100644 --- a/mteb/abstasks/AbsTaskClusteringFast.py +++ b/mteb/abstasks/AbsTaskClusteringFast.py @@ -268,6 +268,9 @@ def _calculate_metrics_from_split( }, ) + def _push_dataset_to_hub(self, repo_name: str) -> None: + self._upload_dataset_to_hub(repo_name, ["sentences", "labels"]) + def clustering_downsample( dataset: DatasetDict, seed: int, max_samples_in_cluster: int = 2048 diff --git a/mteb/abstasks/AbsTaskMultilabelClassification.py b/mteb/abstasks/AbsTaskMultilabelClassification.py index 16c7ac86ff..1c3cba33e5 100644 --- a/mteb/abstasks/AbsTaskMultilabelClassification.py +++ b/mteb/abstasks/AbsTaskMultilabelClassification.py @@ -297,3 +297,6 @@ def _calculate_metrics_from_split( for label, value in label_count.items() }, ) + + def _push_dataset_to_hub(self, repo_name: str) -> None: + 
self._upload_dataset_to_hub(repo_name, ["text", "label"]) diff --git a/mteb/abstasks/AbsTaskPairClassification.py b/mteb/abstasks/AbsTaskPairClassification.py index 2b4c82c01f..4d39fd2c3f 100644 --- a/mteb/abstasks/AbsTaskPairClassification.py +++ b/mteb/abstasks/AbsTaskPairClassification.py @@ -81,7 +81,7 @@ def _evaluate_subset( encode_kwargs: dict[str, str] = {}, **kwargs, ) -> ScoresDict: - data_split = dataset[0] + data_split = dataset[0] if len(dataset) == 1 else dataset logging.getLogger( "sentence_transformers.evaluation.PairClassificationEvaluator" ).setLevel(logging.WARN) @@ -152,3 +152,6 @@ def _calculate_metrics_from_split( str(label): {"count": count} for label, count in label_count.items() }, ) + + def _push_dataset_to_hub(self, repo_name: str) -> None: + self._upload_dataset_to_hub(repo_name, ["sentence1", "sentence2", "labels"]) diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index b4a5cffd25..2ae1f5c359 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -11,7 +11,6 @@ logger = logging.getLogger(__name__) OLD_FORMAT_RERANKING_TASKS = [ - "AskUbuntuDupQuestions", "MindSmallReranking", "SciDocsRR", "StackOverflowDupQuestions", diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 573a0efd12..b8f255b356 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -7,6 +7,8 @@ from time import time from typing import Any +from datasets import Dataset, DatasetDict + from mteb.abstasks.TaskMetadata import HFSubset from ..evaluation.evaluators import RetrievalEvaluator @@ -223,7 +225,7 @@ def load_data(self, **kwargs): if top_ranked: if self.top_ranked is None: self.top_ranked = {} - self.top_ranked = { + self.top_ranked[lang] = { split: { tr["query-id"]: tr["corpus-ids"] for tr in top_ranked } @@ -515,6 +517,176 @@ def _calculate_metrics_from_split( max_top_ranked_per_query=max_top_ranked_per_query, ) + def 
_push_dataset_to_hub(self, repo_name: str) -> None: + def format_text_field(text): + """Formats the text field to match loader expectations.""" + if isinstance(text, str): + return text + return f"{text.get('title', '')} {text.get('text', '')}".strip() + + if self.is_multilingual: + for config in self.queries: + logger.info(f"Converting {config} of {self.metadata.name}") + + queries_dataset = {} + for split in self.queries[config]: + queries_dataset[split] = Dataset.from_list( + [ + { + "_id": idx, + "text": text, + } + for idx, text in self.queries[config][split].items() + ] + ) + queries_dataset = DatasetDict(queries_dataset) + queries_dataset.push_to_hub(repo_name, f"{config}-queries") + + corpus_dataset = {} + for split in self.corpus[config]: + corpus_dataset[split] = Dataset.from_list( + [ + { + "_id": idx, + "text": format_text_field(text), + "title": text.get("title", "") + if isinstance(text, dict) + else "", + } + for idx, text in self.corpus[config][split].items() + ] + ) + + corpus_dataset = DatasetDict(corpus_dataset) + corpus_dataset.push_to_hub(repo_name, f"{config}-corpus") + + relevant_docs_dataset = {} + for split in self.relevant_docs[config]: + relevant_docs_dataset[split] = Dataset.from_list( + [ + {"query-id": query_id, "corpus-id": doc_id, "score": score} + for query_id, docs in self.relevant_docs[config][ + split + ].items() + for doc_id, score in docs.items() + ] + ) + relevant_docs_dataset = DatasetDict(relevant_docs_dataset) + relevant_docs_dataset.push_to_hub(repo_name, f"{config}-qrels") + + if self.instructions: + instructions_dataset = {} + for split in self.instructions[config]: + instructions_dataset[split] = Dataset.from_list( + [ + { + "query-id": idx, + "instruction": text, + } + for idx, text in self.instructions[config][ + split + ].items() + ] + ) + instructions_dataset = DatasetDict(instructions_dataset) + instructions_dataset.push_to_hub(repo_name, f"{config}-instruction") + if self.top_ranked: + top_ranked_dataset = {} + 
for split in self.top_ranked[config]: + top_ranked_dataset[split] = Dataset.from_list( + [ + { + "query-id": query_id, + "corpus-ids": docs, + } + for query_id, docs in self.top_ranked[config][ + split + ].items() + ] + ) + top_ranked_dataset = DatasetDict(top_ranked_dataset) + top_ranked_dataset.push_to_hub(repo_name, f"{config}-top_ranked") + else: + if "default" in self.queries: + # old rerankers have additional default split + self.queries = self.queries["default"] + self.corpus = self.corpus["default"] + self.relevant_docs = self.relevant_docs["default"] + if self.instructions: + self.instructions = self.instructions["default"] + if self.top_ranked: + self.top_ranked = self.top_ranked["default"] + + queries_dataset = {} + for split in self.queries: + queries_dataset[split] = Dataset.from_list( + [ + { + "_id": idx, + "text": text, + } + for idx, text in self.queries[split].items() + ] + ) + queries_dataset = DatasetDict(queries_dataset) + queries_dataset.push_to_hub(repo_name, "queries") + corpus_dataset = {} + for split in self.corpus: + corpus_dataset[split] = Dataset.from_list( + [ + { + "_id": idx, + "text": format_text_field(text), + "title": text.get("title", "") + if isinstance(text, dict) + else "", + } + for idx, text in self.corpus[split].items() + ] + ) + + corpus_dataset = DatasetDict(corpus_dataset) + corpus_dataset.push_to_hub(repo_name, "corpus") + relevant_docs_dataset = {} + for split in self.relevant_docs: + relevant_docs_dataset[split] = Dataset.from_list( + [ + {"query-id": query_id, "corpus-id": doc_id, "score": score} + for query_id, docs in self.relevant_docs[split].items() + for doc_id, score in docs.items() + ] + ) + relevant_docs_dataset = DatasetDict(relevant_docs_dataset) + relevant_docs_dataset.push_to_hub(repo_name, "default") + if self.instructions: + instructions_dataset = {} + for split in self.instructions: + instructions_dataset[split] = Dataset.from_list( + [ + { + "query-id": idx, + "instruction": text, + } + for idx, text 
in self.instructions[split].items() + ] + ) + instructions_dataset = DatasetDict(instructions_dataset) + instructions_dataset.push_to_hub(repo_name, "instruction") + if self.top_ranked: + top_ranked_dataset = {} + for split in self.top_ranked: + top_ranked_dataset[split] = Dataset.from_list( + [ + { + "query-id": query_id, + "corpus-ids": docs, + } + for query_id, docs in self.top_ranked[split].items() + ] + ) + top_ranked_dataset = DatasetDict(top_ranked_dataset) + top_ranked_dataset.push_to_hub(repo_name, "top_ranked") + def calculate_queries_length(queries: dict[str, str]) -> list[int] | None: queries_lens = [] diff --git a/mteb/abstasks/AbsTaskSTS.py b/mteb/abstasks/AbsTaskSTS.py index 0a7cb820ea..2183cc08af 100644 --- a/mteb/abstasks/AbsTaskSTS.py +++ b/mteb/abstasks/AbsTaskSTS.py @@ -136,3 +136,6 @@ def _calculate_metrics_from_split( avg_score=avg_score, max_score=max(score), ) + + def _push_dataset_to_hub(self, repo_name: str) -> None: + self._upload_dataset_to_hub(repo_name, ["sentence1", "sentence2", "score"]) diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index d1fd1fd4df..65e7ddbb86 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -11,6 +11,7 @@ AnyUrl, BaseModel, BeforeValidator, + ConfigDict, TypeAdapter, field_validator, ) @@ -197,6 +198,24 @@ class DescriptiveStatistics(TypedDict): logger = logging.getLogger(__name__) +class MetadataDatasetDict(TypedDict, total=False): + """A dictionary containing the dataset path and revision. + + Args: + path: The path to the dataset. + revision: The revision of the dataset. + name: The name the dataset config. + split: The split of the dataset. + trust_remote_code: Whether to trust the remote code. + """ + + path: str + revision: str + name: str + split: str + trust_remote_code: bool + + class TaskMetadata(BaseModel): """Metadata for a task. @@ -228,7 +247,9 @@ class TaskMetadata(BaseModel): bibtex_citation: The BibTeX citation for the dataset. 
Should be an empty string if no citation is available. """ - dataset: dict + model_config = ConfigDict(extra="forbid") + + dataset: MetadataDatasetDict name: str description: str diff --git a/mteb/abstasks/dataloaders.py b/mteb/abstasks/dataloaders.py index ba5d180ca5..2c2c1a33af 100644 --- a/mteb/abstasks/dataloaders.py +++ b/mteb/abstasks/dataloaders.py @@ -126,7 +126,7 @@ def load( logger.info("Loading Queries...") self._load_queries(config) - if any(c.endswith("top_ranked") for c in configs) in configs or ( + if any(c.endswith("top_ranked") for c in configs) or ( not self.hf_repo and self.top_ranked_file ): logger.info("Loading Top Ranked") @@ -258,7 +258,7 @@ def _load_qrels(self, split: str, config: str | None = None): self.qrels = qrels_ds def _load_top_ranked(self, config: str | None = None): - config = f"top_ranked-{config}" if config is not None else "top_ranked" + config = f"{config}-top_ranked" if config is not None else "top_ranked" if self.hf_repo: top_ranked_ds = load_dataset( self.hf_repo, @@ -303,7 +303,7 @@ def _load_top_ranked(self, config: str | None = None): self.top_ranked = top_ranked_ds def _load_instructions(self, config: str | None = None): - config = f"instruction-{config}" if config is not None else "instruction" + config = f"{config}-instruction" if config is not None else "instruction" if self.hf_repo: instructions_ds = load_dataset( self.hf_repo, diff --git a/mteb/descriptive_stats/PairClassification/IndicXnliPairClassification.json b/mteb/descriptive_stats/PairClassification/IndicXnliPairClassification.json new file mode 100644 index 0000000000..c8b78b1d60 --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/IndicXnliPairClassification.json @@ -0,0 +1,268 @@ +{ + "test": { + "num_samples": 36740, + "number_of_characters": 5676171, + "unique_pairs": 36736, + "min_sentence1_length": 6, + "avg_sentence1_length": 103.40577027762656, + "max_sentence1_length": 795, + "unique_sentence1": 18367, + "min_sentence2_length": 6, + 
"avg_sentence2_length": 51.08990201415351, + "max_sentence2_length": 529, + "unique_sentence2": 36730, + "unique_labels": 2, + "labels": { + "0": { + "count": 18370 + }, + "1": { + "count": 18370 + } + }, + "hf_subset_descriptive_stats": { + "as": { + "num_samples": 3340, + "number_of_characters": 497266, + "unique_pairs": 3339, + "min_sentence1_length": 14, + "avg_sentence1_length": 99.08083832335329, + "max_sentence1_length": 399, + "unique_sentence1": 1670, + "min_sentence2_length": 11, + "avg_sentence2_length": 49.80119760479042, + "max_sentence2_length": 268, + "unique_sentence2": 3338, + "unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + }, + "bn": { + "num_samples": 3340, + "number_of_characters": 497318, + "unique_pairs": 3340, + "min_sentence1_length": 11, + "avg_sentence1_length": 100.6119760479042, + "max_sentence1_length": 433, + "unique_sentence1": 1670, + "min_sentence2_length": 10, + "avg_sentence2_length": 48.28562874251497, + "max_sentence2_length": 183, + "unique_sentence2": 3340, + "unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + }, + "gu": { + "num_samples": 3340, + "number_of_characters": 487756, + "unique_pairs": 3340, + "min_sentence1_length": 15, + "avg_sentence1_length": 98.30059880239521, + "max_sentence1_length": 266, + "unique_sentence1": 1670, + "min_sentence2_length": 9, + "avg_sentence2_length": 47.73413173652695, + "max_sentence2_length": 160, + "unique_sentence2": 3340, + "unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + }, + "hi": { + "num_samples": 3340, + "number_of_characters": 516591, + "unique_pairs": 3340, + "min_sentence1_length": 15, + "avg_sentence1_length": 104.58203592814371, + "max_sentence1_length": 281, + "unique_sentence1": 1669, + "min_sentence2_length": 10, + "avg_sentence2_length": 50.08592814371257, + "max_sentence2_length": 173, + "unique_sentence2": 3339, + 
"unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + }, + "kn": { + "num_samples": 3340, + "number_of_characters": 536983, + "unique_pairs": 3340, + "min_sentence1_length": 8, + "avg_sentence1_length": 107.52874251497006, + "max_sentence1_length": 281, + "unique_sentence1": 1669, + "min_sentence2_length": 8, + "avg_sentence2_length": 53.24461077844311, + "max_sentence2_length": 178, + "unique_sentence2": 3339, + "unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + }, + "ml": { + "num_samples": 3340, + "number_of_characters": 544776, + "unique_pairs": 3339, + "min_sentence1_length": 8, + "avg_sentence1_length": 107.6185628742515, + "max_sentence1_length": 316, + "unique_sentence1": 1670, + "min_sentence2_length": 9, + "avg_sentence2_length": 55.48802395209581, + "max_sentence2_length": 194, + "unique_sentence2": 3338, + "unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + }, + "mr": { + "num_samples": 3340, + "number_of_characters": 491967, + "unique_pairs": 3340, + "min_sentence1_length": 15, + "avg_sentence1_length": 98.40059880239521, + "max_sentence1_length": 477, + "unique_sentence1": 1670, + "min_sentence2_length": 12, + "avg_sentence2_length": 48.89491017964072, + "max_sentence2_length": 173, + "unique_sentence2": 3340, + "unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + }, + "or": { + "num_samples": 3340, + "number_of_characters": 500985, + "unique_pairs": 3340, + "min_sentence1_length": 15, + "avg_sentence1_length": 99.90838323353293, + "max_sentence1_length": 304, + "unique_sentence1": 1670, + "min_sentence2_length": 10, + "avg_sentence2_length": 50.08712574850299, + "max_sentence2_length": 529, + "unique_sentence2": 3339, + "unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + }, + "pa": { + "num_samples": 3340, + 
"number_of_characters": 510238, + "unique_pairs": 3340, + "min_sentence1_length": 15, + "avg_sentence1_length": 102.82035928143712, + "max_sentence1_length": 246, + "unique_sentence1": 1669, + "min_sentence2_length": 10, + "avg_sentence2_length": 49.945508982035925, + "max_sentence2_length": 169, + "unique_sentence2": 3340, + "unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + }, + "ta": { + "num_samples": 3340, + "number_of_characters": 588359, + "unique_pairs": 3339, + "min_sentence1_length": 6, + "avg_sentence1_length": 117.50419161676646, + "max_sentence1_length": 795, + "unique_sentence1": 1670, + "min_sentence2_length": 6, + "avg_sentence2_length": 58.65119760479042, + "max_sentence2_length": 208, + "unique_sentence2": 3339, + "unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + }, + "te": { + "num_samples": 3340, + "number_of_characters": 503932, + "unique_pairs": 3339, + "min_sentence1_length": 15, + "avg_sentence1_length": 101.10718562874251, + "max_sentence1_length": 681, + "unique_sentence1": 1670, + "min_sentence2_length": 11, + "avg_sentence2_length": 49.77065868263473, + "max_sentence2_length": 183, + "unique_sentence2": 3339, + "unique_labels": 2, + "labels": { + "0": { + "count": 1670 + }, + "1": { + "count": 1670 + } + } + } + } + } +} \ No newline at end of file diff --git a/mteb/tasks/BitextMining/multilingual/IWSLT2017BitextMining.py b/mteb/tasks/BitextMining/multilingual/IWSLT2017BitextMining.py index ee83b6f5ca..dfe4e4bfa7 100644 --- a/mteb/tasks/BitextMining/multilingual/IWSLT2017BitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/IWSLT2017BitextMining.py @@ -1,7 +1,5 @@ from __future__ import annotations -import datasets - from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata @@ -40,9 +38,8 @@ class 
IWSLT2017BitextMining(AbsTaskBitextMining, MultilingualTask): metadata = TaskMetadata( name="IWSLT2017BitextMining", dataset={ - "path": "IWSLT/iwslt2017", - "revision": "c18a4f81a47ae6fa079fe9d32db288ddde38451d", - "trust_remote_code": True, + "path": "mteb/IWSLT2017BitextMining", + "revision": "14034eed1824a54d866c93a988319b77b2e90217", }, description="The IWSLT 2017 Multilingual Task addresses text translation, including zero-shot translation, with a single MT system across all directions including English, German, Dutch, Italian and Romanian.", reference="https://aclanthology.org/2017.iwslt-1.1/", @@ -82,35 +79,3 @@ class IWSLT2017BitextMining(AbsTaskBitextMining, MultilingualTask): } """, ) - - def load_data(self, **kwargs): - """Load dataset from HuggingFace hub and convert it to the standard format.""" - if self.data_loaded: - return - - self.dataset = {} - for lang in self.hf_subsets: - self.dataset[lang] = datasets.load_dataset( - name=f"iwslt2017-{lang}", - **self.metadata_dict["dataset"], - ) - - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - def create_columns(row, lang): - l1, l2 = lang.split("-") - row["sentence1"] = row["translation"][l1] - row["sentence2"] = row["translation"][l2] - return row - - # Convert to standard format - dataset = {} - for lang in self.hf_subsets: - dataset[lang] = {} - for split in _SPLITS: - dataset[lang][split] = self.dataset[lang][split].map( - lambda x: create_columns(x, lang=lang) - ) - self.dataset = dataset diff --git a/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py b/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py index 112d4e0b27..168f990d8d 100644 --- a/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py +++ b/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py @@ -10,8 +10,7 @@ class AmazonCounterfactualClassification(MultilingualTask, AbsTaskClassification 
name="AmazonCounterfactualClassification", dataset={ "path": "mteb/amazon_counterfactual", - "revision": "e8379541af4e31359cca9fbcf4b00f2671dba205", - "trust_remote_code": True, + "revision": "1f7e6a9d6fa6e64c53d146e428565640410c0df1", }, description=( "A collection of Amazon customer reviews annotated for counterfactual detection pair classification." diff --git a/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py b/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py index 774ad9f01d..a3db9785bb 100644 --- a/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py +++ b/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py @@ -9,9 +9,8 @@ class AmazonReviewsClassification(MultilingualTask, AbsTaskClassification): metadata = TaskMetadata( name="AmazonReviewsClassification", dataset={ - "path": "mteb/amazon_reviews_multi", - "revision": "1399c76144fd37290681b995c656ef9b2e06e26d", - "trust_remote_code": True, + "path": "mteb/AmazonReviewsClassification", + "revision": "6b5d328eaae8ef408dd7d775040245cf86f92e9d", }, description="A collection of Amazon reviews specifically designed to aid research in multilingual text classification.", reference="https://arxiv.org/abs/2010.02573", diff --git a/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py b/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py index 17ac058740..79f9f1badd 100644 --- a/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py +++ b/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py @@ -25,7 +25,6 @@ class BlurbsClusteringP2P(AbsTaskClustering): eval_langs=["deu-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py b/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py index 67366ed13d..79596448fa 100644 --- a/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py +++ b/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py @@ -33,7 +33,6 @@ class 
BlurbsClusteringS2S(AbsTaskClustering): eval_langs=["deu-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py b/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py index 66b8bc0f1d..037f3a35a6 100644 --- a/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py +++ b/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py @@ -23,7 +23,6 @@ class TenKGnadClusteringS2S(AbsTaskClustering): eval_langs=["deu-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/eng/BigPatentClustering.py b/mteb/tasks/Clustering/eng/BigPatentClustering.py index 7df254ab51..a107d98204 100644 --- a/mteb/tasks/Clustering/eng/BigPatentClustering.py +++ b/mteb/tasks/Clustering/eng/BigPatentClustering.py @@ -29,7 +29,6 @@ class BigPatentClustering(AbsTaskClustering): eval_langs=["eng-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/eng/RedditClustering.py b/mteb/tasks/Clustering/eng/RedditClustering.py index c9efbe954a..07d9575d11 100644 --- a/mteb/tasks/Clustering/eng/RedditClustering.py +++ b/mteb/tasks/Clustering/eng/RedditClustering.py @@ -86,7 +86,6 @@ class RedditClustering(AbsTaskClustering): eval_langs=["eng-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/eng/RedditClusteringP2P.py b/mteb/tasks/Clustering/eng/RedditClusteringP2P.py index 1e8d51cdfa..b0bfbb041f 100644 --- a/mteb/tasks/Clustering/eng/RedditClusteringP2P.py +++ b/mteb/tasks/Clustering/eng/RedditClusteringP2P.py @@ -30,7 +30,6 @@ class RedditClusteringP2P(AbsTaskClustering): eval_langs=["eng-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/eng/StackExchangeClustering.py 
b/mteb/tasks/Clustering/eng/StackExchangeClustering.py index b123ab5bd1..dea016d854 100644 --- a/mteb/tasks/Clustering/eng/StackExchangeClustering.py +++ b/mteb/tasks/Clustering/eng/StackExchangeClustering.py @@ -88,7 +88,6 @@ class StackExchangeClustering(AbsTaskClustering): eval_langs=["eng-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py index d6bb252304..c411138e9f 100644 --- a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py +++ b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py @@ -92,7 +92,6 @@ class StackExchangeClusteringP2P(AbsTaskClustering): eval_langs=["eng-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py b/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py index d48175172c..a1d3a3ac8e 100644 --- a/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py +++ b/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py @@ -31,7 +31,6 @@ class AlloProfClusteringP2P(AbsTaskClustering): eval_langs=["fra-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py b/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py index 74f5bddcaa..1a51cd86f7 100644 --- a/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py +++ b/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py @@ -31,7 +31,6 @@ class AlloProfClusteringS2S(AbsTaskClustering): eval_langs=["fra-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/fra/HALClusteringS2S.py b/mteb/tasks/Clustering/fra/HALClusteringS2S.py index 7b1f40e3e6..ef066fd3af 100644 --- a/mteb/tasks/Clustering/fra/HALClusteringS2S.py +++ 
b/mteb/tasks/Clustering/fra/HALClusteringS2S.py @@ -33,7 +33,6 @@ class HALClusteringS2S(AbsTaskClustering): eval_langs=["fra-Latn"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, diff --git a/mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py b/mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py index 8f649a745b..8310f023df 100644 --- a/mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py +++ b/mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py @@ -1,10 +1,5 @@ from __future__ import annotations -from typing import Any - -import datasets -import numpy as np - from mteb.abstasks.AbsTaskClustering import AbsTaskClustering from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata @@ -30,9 +25,8 @@ class IndicReviewsClusteringP2P(AbsTaskClustering, MultilingualTask): metadata = TaskMetadata( name="IndicReviewsClusteringP2P", dataset={ - "path": "ai4bharat/IndicSentiment", - "revision": "ccb472517ce32d103bba9d4f5df121ed5a6592a4", - "trust_remote_code": True, + "path": "mteb/IndicReviewsClusteringP2P", + "revision": "add94d3b9154cc561bbad0e16ee66ebf5941f8a4", }, description="Clustering of reviews from IndicSentiment dataset. 
Clustering of 14 sets on the generic categories label.", reference="https://arxiv.org/abs/2212.05409", @@ -57,29 +51,3 @@ class IndicReviewsClusteringP2P(AbsTaskClustering, MultilingualTask): doi = {10.18653/v1/2023.acl-long.693} }""", ) - - def load_data(self, **kwargs: Any) -> None: - """Load dataset from HuggingFace hub""" - if self.data_loaded: - return - self.dataset = {} - for lang in self.hf_subsets: - self.dataset[lang] = datasets.load_dataset( - name=f"translation-{lang}", - **self.metadata_dict["dataset"], - ) - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self) -> None: - for lang in self.hf_subsets: - self.dataset[lang].pop("validation") - - texts = self.dataset[lang]["test"]["INDIC REVIEW"] - labels = self.dataset[lang]["test"]["GENERIC CATEGORIES"] - - new_format = { - "sentences": [split.tolist() for split in np.array_split(texts, 5)], - "labels": [split.tolist() for split in np.array_split(labels, 5)], - } - self.dataset[lang]["test"] = datasets.Dataset.from_dict(new_format) diff --git a/mteb/tasks/Clustering/zho/CMTEBClustering.py b/mteb/tasks/Clustering/zho/CMTEBClustering.py index fa0704b098..7b79fa616b 100644 --- a/mteb/tasks/Clustering/zho/CMTEBClustering.py +++ b/mteb/tasks/Clustering/zho/CMTEBClustering.py @@ -143,7 +143,6 @@ class CLSClusteringS2S(AbsTaskClustering): eval_langs=["cmn-Hans"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, @@ -179,7 +178,6 @@ class CLSClusteringP2P(AbsTaskClustering): eval_langs=["cmn-Hans"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, @@ -325,7 +323,6 @@ class ThuNewsClusteringS2S(AbsTaskClustering): eval_langs=["cmn-Hans"], main_score="v_measure", date=None, - form=None, domains=None, task_subtypes=None, license=None, @@ -368,7 +365,6 @@ class ThuNewsClusteringP2P(AbsTaskClustering): eval_langs=["cmn-Hans"], main_score="v_measure", date=None, - form=None, domains=None, 
task_subtypes=None, license=None, diff --git a/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py b/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py index 7a2974dea2..b42f86b7c5 100644 --- a/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py +++ b/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py @@ -1,5 +1,7 @@ from __future__ import annotations +from logging import getLogger + import datasets from mteb.abstasks.MultilingualTask import MultilingualTask @@ -7,6 +9,8 @@ from ....abstasks.AbsTaskReranking import AbsTaskReranking +logger = getLogger(__name__) + _LANGUAGES = { "fas": ["fas-Arab"], "rus": ["rus-Cyrl"], @@ -58,7 +62,7 @@ def load_data( loading_lang = lang.split("-")[1] # don't care about the eng part else: loading_lang = lang - print(f"Loading data for {lang} from {loading_lang}") + logger.info(f"Loading data for {lang} from {loading_lang}") # Load corpus data corpus_data = datasets.load_dataset( @@ -176,7 +180,7 @@ def load_data(self, **kwargs): self.data_loaded = True -class mFollowIR(MultilingualTask, AbsTaskReranking): +class mFollowIR(AbsTaskReranking, MultilingualTask): metadata = TaskMetadata( name="mFollowIR", description="This tasks measures retrieval instruction following ability on NeuCLIR narratives for the mFollowIR benchmark on the Farsi, Russian, and Chinese languages.", diff --git a/mteb/tasks/InstructionRetrieval/eng/InstructIR.py b/mteb/tasks/InstructionRetrieval/eng/InstructIR.py index 910a3a5bae..1e2f40cd25 100644 --- a/mteb/tasks/InstructionRetrieval/eng/InstructIR.py +++ b/mteb/tasks/InstructionRetrieval/eng/InstructIR.py @@ -35,15 +35,4 @@ class InstructIR(AbsTaskRetrieval): archivePrefix={{arXiv}}, primaryClass={{cs.CL}} }""", - descriptive_stats={ - "n_samples": {"test": 2255}, - "test": { - "num_samples": 375, - "num_positive": 375, - "num_negative": 375, - "avg_query_len": 50.205333333333336, - "avg_positive_len": 6.013333333333334, - "avg_negative_len": 13.986666666666666, - }, - }, ) diff --git 
a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index c2057a4952..1193728659 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -11,6 +11,9 @@ from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * from .kor.KlueNLI import * +from .multilingual.IndicXnliPairClassification import ( + IndicXnliPairClassification as IndicXnliPairClassification, +) from .multilingual.OpusparcusPC import * from .multilingual.PawsXPairClassification import * from .multilingual.RTE3 import * diff --git a/mteb/tasks/PairClassification/multilingual/IndicXnliPairClassification.py b/mteb/tasks/PairClassification/multilingual/IndicXnliPairClassification.py index 33fa179737..2c6ef5f2c1 100644 --- a/mteb/tasks/PairClassification/multilingual/IndicXnliPairClassification.py +++ b/mteb/tasks/PairClassification/multilingual/IndicXnliPairClassification.py @@ -1,8 +1,10 @@ from __future__ import annotations -from mteb.abstasks import AbsTaskPairClassification, MultilingualTask +from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata +from ....abstasks.AbsTaskPairClassification import AbsTaskPairClassification + _LANGUAGES = { "as": ["asm-Beng"], "bn": ["ben-Beng"], @@ -56,7 +58,7 @@ class IndicXnliPairClassification(AbsTaskPairClassification, MultilingualTask): author = {Aggarwal, Divyanshu and Gupta, Vivek and Kunchukuttan, Anoop}, title = {IndicXNLI: Evaluating Multilingual Inference for Indian Languages}, publisher = {arXiv}, - year = {2022}, + year = {2022}, copyright = {Creative Commons Attribution 4.0 International} } """, diff --git a/mteb/tasks/PairClassification/multilingual/XNLI.py b/mteb/tasks/PairClassification/multilingual/XNLI.py index 8f3f795bad..c72d1694a3 100644 --- a/mteb/tasks/PairClassification/multilingual/XNLI.py +++ b/mteb/tasks/PairClassification/multilingual/XNLI.py @@ -110,8 +110,8 @@ class XNLIV2(MultilingualTask, 
AbsTaskPairClassification): metadata = TaskMetadata( name="XNLIV2", dataset={ - "path": "mteb/xnli2.0-multi-pair", - "revision": "5b7d477a8c62cdd18e2fed7e015497c20b4371ad", + "path": "mteb/XNLIV2", + "revision": "06108371a8bceee5024a527c4330baa29eb5a013", }, description=""" This is subset of 'XNLI 2.0: Improving XNLI dataset and performance on Cross Lingual Understanding' @@ -140,30 +140,4 @@ class XNLIV2(MultilingualTask, AbsTaskPairClassification): organization={IEEE} } """, - # average of premise and hypothesis ) - - def dataset_transform(self): - _dataset = {} - for lang in self.hf_subsets: - _dataset[lang] = {} - self.dataset[lang] = self.stratified_subsampling( - self.dataset[lang], seed=self.seed, splits=self.metadata.eval_splits - ) - for split in self.metadata.eval_splits: - # 0=entailment, 2=contradiction. Filter out neutral to match the task. - # Then map entailment as positive (1) and contradiction as negative (0). - hf_dataset = self.dataset[lang][split].filter( - lambda x: x["label"] in [0, 2] - ) - hf_dataset = hf_dataset.map( - lambda example: {"label": 0 if example["label"] == 2 else 1} - ) - _dataset[lang][split] = [ - { - "sentence1": hf_dataset["premise"], - "sentence2": hf_dataset["hypothesis"], - "labels": hf_dataset["label"], - } - ] - self.dataset = _dataset diff --git a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py index 51d39c770c..a6e9dac45a 100644 --- a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py +++ b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py @@ -11,8 +11,8 @@ class AskUbuntuDupQuestions(AbsTaskReranking): description="AskUbuntu Question Dataset - Questions from AskUbuntu with manual annotations marking pairs of questions as similar or non-similar", reference="https://github.com/taolei87/askubuntu", dataset={ - "path": "mteb/askubuntudupquestions-reranking", - "revision": "2000358ca161889fa9c082cb41daa8dcfb161a54", + "path": "mteb/AskUbuntuDupQuestions", + "revision": 
"c5691e3c48741d5f83b5cc8e630653d7a8cfc048", }, type="Reranking", category="s2s", diff --git a/mteb/tasks/Reranking/zho/CMTEBReranking.py b/mteb/tasks/Reranking/zho/CMTEBReranking.py index d6ff57a2a9..34ed14e342 100644 --- a/mteb/tasks/Reranking/zho/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zho/CMTEBReranking.py @@ -20,7 +20,6 @@ class T2Reranking(AbsTaskReranking): eval_langs=["cmn-Hans"], main_score="map_at_1000", date=None, - form=None, domains=None, task_subtypes=None, license=None, @@ -55,7 +54,6 @@ class MMarcoReranking(AbsTaskReranking): eval_langs=["cmn-Hans"], main_score="map_at_1000", date=None, - form=None, domains=None, task_subtypes=None, license=None, @@ -127,7 +125,6 @@ class CMedQAv2(AbsTaskReranking): eval_langs=["cmn-Hans"], main_score="map_at_1000", date=None, - form=None, domains=["Medical", "Written"], task_subtypes=None, license=None, diff --git a/mteb/tasks/Retrieval/eng/BrightRetrieval.py b/mteb/tasks/Retrieval/eng/BrightRetrieval.py index 4a9b2e743d..37256918d2 100644 --- a/mteb/tasks/Retrieval/eng/BrightRetrieval.py +++ b/mteb/tasks/Retrieval/eng/BrightRetrieval.py @@ -40,18 +40,16 @@ class BrightRetrieval(MultilingualTask, AbsTaskRetrieval): "revision": "a75a0eb", }, reference="https://huggingface.co/datasets/xlangai/BRIGHT", - description=("Bright retrieval dataset."), + description="Bright retrieval dataset.", type="Retrieval", category="s2p", eval_splits=EVAL_SPLITS, eval_langs=DOMAINS_langs, main_score="ndcg_at_10", date=("2024-03-01", "2024-06-01"), - form=["written"], - domains=["Non-fiction"], + domains=["Written", "Non-fiction"], task_subtypes=["Article retrieval"], license="cc-by-4.0", - socioeconomic_status="low", annotations_creators="derived", dialect=[], sample_creation="found", diff --git a/mteb/tasks/Retrieval/multilingual/WikipediaRetrievalMultilingual.py b/mteb/tasks/Retrieval/multilingual/WikipediaRetrievalMultilingual.py index a78fa4110d..f51ebac2f2 100644 --- 
a/mteb/tasks/Retrieval/multilingual/WikipediaRetrievalMultilingual.py +++ b/mteb/tasks/Retrieval/multilingual/WikipediaRetrievalMultilingual.py @@ -1,7 +1,5 @@ from __future__ import annotations -from datasets import load_dataset - from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata @@ -27,76 +25,14 @@ } -# adapted from MIRACLRetrieval -def _load_data( - path: str, - langs: list, - split: str, - cache_dir: str = None, - revision_queries: str = None, - revision_corpus: str = None, - revision_qrels: str = None, -): - queries = {lang: {split: {}} for lang in langs} - corpus = {lang: {split: {}} for lang in langs} - qrels = {lang: {split: {}} for lang in langs} - - for lang in langs: - queries_path = path - corpus_path = path.replace("queries", "corpus") - qrels_path = path.replace("queries", "qrels") - queries_lang = load_dataset( - queries_path, - lang, - split=split, - cache_dir=cache_dir, - revision=revision_queries, - ) - corpus_lang = load_dataset( - corpus_path, - lang, - split=split, - cache_dir=cache_dir, - revision=revision_corpus, - ) - qrels_lang = load_dataset( - qrels_path, - lang, - split=split, - cache_dir=cache_dir, - revision=revision_qrels, - ) - # don't pass on titles to make task harder - corpus_lang_dict = {doc["_id"]: {"text": doc["text"]} for doc in corpus_lang} - queries_lang_dict = {query["_id"]: query["text"] for query in queries_lang} - # qrels_lang_dict = {qrel["query-id"]: {qrel["corpus-id"]: qrel["score"]} for qrel in qrels_lang} - - qrels_lang_dict = {} - for qrel in qrels_lang: - if qrel["score"] == 0.5: - continue - # score = 0 if qrel["score"] == 0.5 else qrel["score"] - # score = int(score) - score = int(qrel["score"]) - qrels_lang_dict[qrel["query-id"]] = {qrel["corpus-id"]: score} - - corpus[lang][split] = corpus_lang_dict - queries[lang][split] = queries_lang_dict - qrels[lang][split] = qrels_lang_dict - - return corpus, queries, qrels - - -class 
WikipediaRetrievalMultilingual(MultilingualTask, AbsTaskRetrieval): +class WikipediaRetrievalMultilingual(AbsTaskRetrieval, MultilingualTask): metadata = TaskMetadata( name="WikipediaRetrievalMultilingual", description="The dataset is derived from Cohere's wikipedia-2023-11 dataset and contains synthetically generated queries.", reference="https://huggingface.co/datasets/ellamind/wikipedia-2023-11-retrieval-multilingual-queries", dataset={ - "path": "ellamind/wikipedia-2023-11-retrieval-multilingual-queries", - "revision": "3b6ea595c94bac3448a2ad167ca2e06abd340d6e", # avoid validation error - "revision_corpus": "f20ac0c449c85358d3d5c72a95f92f1eddc98aa5", - "revision_qrels": "ec88a7bb2da034d538e98e3122d2c98530ca1c8d", + "path": "mteb/WikipediaRetrievalMultilingual", + "revision": "5f6c91d21f2f5b9afb663858d19848fbd223c775", }, type="Retrieval", category="s2p", @@ -113,19 +49,3 @@ class WikipediaRetrievalMultilingual(MultilingualTask, AbsTaskRetrieval): sample_creation="LM-generated and verified", bibtex_citation="", ) - - def load_data(self, **kwargs): - if self.data_loaded: - return - - self.corpus, self.queries, self.relevant_docs = _load_data( - path=self.metadata_dict["dataset"]["path"], - langs=self.hf_subsets, - split=self.metadata_dict["eval_splits"][0], - cache_dir=kwargs.get("cache_dir", None), - revision_queries=self.metadata_dict["dataset"]["revision"], - revision_corpus=self.metadata_dict["dataset"]["revision_corpus"], - revision_qrels=self.metadata_dict["dataset"]["revision_qrels"], - ) - - self.data_loaded = True diff --git a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py index ad26652ccd..d579245439 100644 --- a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py @@ -1,31 +1,10 @@ from __future__ import annotations -from collections import defaultdict - -from datasets import DatasetDict, load_dataset - from mteb.abstasks.TaskMetadata import TaskMetadata from 
....abstasks.AbsTaskRetrieval import AbsTaskRetrieval -def load_retrieval_data(dataset_path, dataset_revision, qrel_revision, eval_splits): - eval_split = eval_splits[0] - dataset = load_dataset(dataset_path, revision=dataset_revision) - qrels = load_dataset(dataset_path + "-qrels", revision=qrel_revision)[eval_split] - - corpus = {e["id"]: {"text": e["text"]} for e in dataset["corpus"]} - queries = {e["id"]: e["text"] for e in dataset["queries"]} - relevant_docs = defaultdict(dict) - for e in qrels: - relevant_docs[e["qid"]][e["pid"]] = e["score"] - - corpus = DatasetDict({eval_split: corpus}) - queries = DatasetDict({eval_split: queries}) - relevant_docs = DatasetDict({eval_split: relevant_docs}) - return corpus, queries, relevant_docs - - class T2Retrieval(AbsTaskRetrieval): ignore_identical_ids = True @@ -34,9 +13,8 @@ class T2Retrieval(AbsTaskRetrieval): description="T2Ranking: A large-scale Chinese Benchmark for Passage Ranking", reference="https://arxiv.org/abs/2304.03679", dataset={ - "path": "C-MTEB/T2Retrieval", - "revision": "8731a845f1bf500a4f111cf1070785c793d10e64", - "qrel_revision": "1c83b8d1544e529875e3f6930f3a1fcf749a8e97", + "path": "mteb/T2Retrieval", + "revision": "cf778c0ea4168ec5174a34d888d6453e4cde9222", }, type="Retrieval", category="s2p", @@ -64,18 +42,6 @@ class T2Retrieval(AbsTaskRetrieval): }, ) - def load_data(self, **kwargs): - if self.data_loaded: - return - - self.corpus, self.queries, self.relevant_docs = load_retrieval_data( - self.metadata_dict["dataset"]["path"], - self.metadata_dict["dataset"]["revision"], - self.metadata_dict["dataset"]["qrel_revision"], - self.metadata_dict["eval_splits"], - ) - self.data_loaded = True - class MMarcoRetrieval(AbsTaskRetrieval): ignore_identical_ids = True @@ -85,9 +51,8 @@ class MMarcoRetrieval(AbsTaskRetrieval): description="MMarcoRetrieval", reference="https://arxiv.org/abs/2309.07597", dataset={ - "path": "C-MTEB/MMarcoRetrieval", - "revision": "539bbde593d947e2a124ba72651aafc09eb33fc2", - 
"qrel_revision": "bae08bb7bddbedb96c7e7db52018a55167b67f89", + "path": "mteb/MMarcoRetrieval", + "revision": "4940a7b26bf53463cfe3435bb8e201963e9c31ae", }, type="Retrieval", category="s2p", @@ -115,18 +80,6 @@ class MMarcoRetrieval(AbsTaskRetrieval): }, ) - def load_data(self, **kwargs): - if self.data_loaded: - return - - self.corpus, self.queries, self.relevant_docs = load_retrieval_data( - self.metadata_dict["dataset"]["path"], - self.metadata_dict["dataset"]["revision"], - self.metadata_dict["dataset"]["qrel_revision"], - self.metadata_dict["eval_splits"], - ) - self.data_loaded = True - class DuRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( @@ -134,9 +87,8 @@ class DuRetrieval(AbsTaskRetrieval): description="A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine", reference="https://aclanthology.org/2022.emnlp-main.357.pdf", dataset={ - "path": "C-MTEB/DuRetrieval", - "revision": "a1a333e290fe30b10f3f56498e3a0d911a693ced", - "qrel_revision": "497b7bd1bbb25cb3757ff34d95a8be50a3de2279", + "path": "mteb/DuRetrieval", + "revision": "313c81b51311893c8fd09ca432f96b841ed0ebb3", }, type="Retrieval", category="s2p", @@ -164,18 +116,6 @@ class DuRetrieval(AbsTaskRetrieval): }, ) - def load_data(self, **kwargs): - if self.data_loaded: - return - - self.corpus, self.queries, self.relevant_docs = load_retrieval_data( - self.metadata_dict["dataset"]["path"], - self.metadata_dict["dataset"]["revision"], - self.metadata_dict["dataset"]["qrel_revision"], - self.metadata_dict["eval_splits"], - ) - self.data_loaded = True - class CovidRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( @@ -183,9 +123,8 @@ class CovidRetrieval(AbsTaskRetrieval): description="COVID-19 news articles", reference="https://arxiv.org/abs/2203.03367", dataset={ - "path": "C-MTEB/CovidRetrieval", - "revision": "1271c7809071a13532e05f25fb53511ffce77117", - "qrel_revision": "a9f41b7cdf24785531d12417ce0d1157ed4b39ca", + "path": "mteb/CovidRetrieval", + "revision": 
"9c6dc4b276bb47c3ff725bbc5ffcafd56dded38b", }, type="Retrieval", category="s2p", @@ -206,18 +145,6 @@ class CovidRetrieval(AbsTaskRetrieval): }, ) - def load_data(self, **kwargs): - if self.data_loaded: - return - - self.corpus, self.queries, self.relevant_docs = load_retrieval_data( - self.metadata_dict["dataset"]["path"], - self.metadata_dict["dataset"]["revision"], - self.metadata_dict["dataset"]["qrel_revision"], - self.metadata_dict["eval_splits"], - ) - self.data_loaded = True - class CmedqaRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( @@ -225,9 +152,8 @@ class CmedqaRetrieval(AbsTaskRetrieval): description="Online medical consultation text. Used the CMedQAv2 as its underlying dataset.", reference="https://aclanthology.org/2022.emnlp-main.357.pdf", dataset={ - "path": "C-MTEB/CmedqaRetrieval", - "revision": "cd540c506dae1cf9e9a59c3e06f42030d54e7301", - "qrel_revision": "279d737f36c731c8ff6e2b055f31fe02216fa23d", + "path": "mteb/CmedqaRetrieval", + "revision": "c476f85bf03d6642ec66bf54b9a551c88108bbb4", }, type="Retrieval", category="s2p", @@ -248,18 +174,6 @@ class CmedqaRetrieval(AbsTaskRetrieval): }, ) - def load_data(self, **kwargs): - if self.data_loaded: - return - - self.corpus, self.queries, self.relevant_docs = load_retrieval_data( - self.metadata_dict["dataset"]["path"], - self.metadata_dict["dataset"]["revision"], - self.metadata_dict["dataset"]["qrel_revision"], - self.metadata_dict["eval_splits"], - ) - self.data_loaded = True - class EcomRetrieval(AbsTaskRetrieval): ignore_identical_ids = True @@ -269,9 +183,8 @@ class EcomRetrieval(AbsTaskRetrieval): description="EcomRetrieval", reference="https://arxiv.org/abs/2203.03367", dataset={ - "path": "C-MTEB/EcomRetrieval", - "revision": "687de13dc7294d6fd9be10c6945f9e8fec8166b9", - "qrel_revision": "39c90699b034ec22ac45b3abf5b0bbb5ffd421f9", + "path": "mteb/EcomRetrieval", + "revision": "fa705ce5418e91636b1eaeaf43f34c15aa3f5a8a", }, type="Retrieval", category="s2p", @@ -292,18 +205,6 @@ class 
EcomRetrieval(AbsTaskRetrieval): }, ) - def load_data(self, **kwargs): - if self.data_loaded: - return - - self.corpus, self.queries, self.relevant_docs = load_retrieval_data( - self.metadata_dict["dataset"]["path"], - self.metadata_dict["dataset"]["revision"], - self.metadata_dict["dataset"]["qrel_revision"], - self.metadata_dict["eval_splits"], - ) - self.data_loaded = True - class MedicalRetrieval(AbsTaskRetrieval): ignore_identical_ids = True @@ -313,9 +214,8 @@ class MedicalRetrieval(AbsTaskRetrieval): description="MedicalRetrieval", reference="https://arxiv.org/abs/2203.03367", dataset={ - "path": "C-MTEB/MedicalRetrieval", - "revision": "2039188fb5800a9803ba5048df7b76e6fb151fc6", - "qrel_revision": "37b8efec53c54c3d9c6af212f6710b62ccdf895c", + "path": "mteb/MedicalRetrieval", + "revision": "023ae3b2c6b96f583c4ff9b3f9239c93f7885c20", }, type="Retrieval", category="s2p", @@ -336,18 +236,6 @@ class MedicalRetrieval(AbsTaskRetrieval): }, ) - def load_data(self, **kwargs): - if self.data_loaded: - return - - self.corpus, self.queries, self.relevant_docs = load_retrieval_data( - self.metadata_dict["dataset"]["path"], - self.metadata_dict["dataset"]["revision"], - self.metadata_dict["dataset"]["qrel_revision"], - self.metadata_dict["eval_splits"], - ) - self.data_loaded = True - class VideoRetrieval(AbsTaskRetrieval): ignore_identical_ids = True @@ -357,9 +245,8 @@ class VideoRetrieval(AbsTaskRetrieval): description="VideoRetrieval", reference="https://arxiv.org/abs/2203.03367", dataset={ - "path": "C-MTEB/VideoRetrieval", - "revision": "58c2597a5943a2ba48f4668c3b90d796283c5639", - "qrel_revision": "faa71382b6a29cf1778d1f436b963e75cb5b927c", + "path": "mteb/VideoRetrieval", + "revision": "146a9d5e4fd7a9c182b6b92cccb6a3753994305c", }, type="Retrieval", category="s2p", @@ -379,15 +266,3 @@ class VideoRetrieval(AbsTaskRetrieval): "query": "Given a video search query, retrieve the titles of relevant videos" }, ) - - def load_data(self, **kwargs): - if self.data_loaded: 
- return - - self.corpus, self.queries, self.relevant_docs = load_retrieval_data( - self.metadata_dict["dataset"]["path"], - self.metadata_dict["dataset"]["revision"], - self.metadata_dict["dataset"]["qrel_revision"], - self.metadata_dict["eval_splits"], - ) - self.data_loaded = True diff --git a/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py b/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py index b5a5c67b86..0f37f78a80 100644 --- a/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py +++ b/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py @@ -20,22 +20,12 @@ } -def categorize_float(float_value): - left_bound = int(float_value) - right_bound = left_bound + 1 - if float_value - left_bound < right_bound - float_value: - return left_bound - else: - return right_bound - - class IndicCrosslingualSTS(AbsTaskSTS, MultilingualTask): - fast_loading = True metadata = TaskMetadata( name="IndicCrosslingualSTS", dataset={ - "path": "mteb/indic_sts", - "revision": "0ca7b87dda68ef4ebb2f50a20a62b9dbebcac3e4", + "path": "mteb/IndicCrosslingualSTS", + "revision": "f0366eb5a20087355c0e131162bbed943ba54b51", }, description="This is a Semantic Textual Similarity testset between English and 12 high-resource Indic languages.", reference="https://huggingface.co/datasets/jaygala24/indic_sts", @@ -81,20 +71,3 @@ def metadata_dict(self) -> dict[str, str]: metadata_dict["min_score"] = 0 metadata_dict["max_score"] = 5 return metadata_dict - - def dataset_transform(self) -> None: - # Convert to standard format - for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].rename_columns( - {"english_sentence": "sentence1", "indic_sentence": "sentence2"} - ) - self.dataset[lang] = ( - self.dataset[lang] - .map(lambda x: {"label": round(x["score"])}) - .class_encode_column("label") - ) - self.dataset[lang]["test"] = self.dataset[lang]["test"].train_test_split( - test_size=256, - seed=self.seed, - stratify_by_column="label", - )["test"] diff --git a/tests/test_TaskMetadata.py 
b/tests/test_TaskMetadata.py index 19cfd0be75..873264bc80 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -1,8 +1,7 @@ from __future__ import annotations -import logging - import pytest +from pydantic import ValidationError from mteb import AbsTask from mteb.abstasks.TaskMetadata import TaskMetadata @@ -178,6 +177,7 @@ "TamilNewsClassification", "TenKGnadClusteringP2P.v2", "TenKGnadClusteringS2S.v2", + "IndicXnliPairClassification", ] @@ -259,8 +259,8 @@ def test_given_missing_revision_path_then_it_throws(): def test_given_none_revision_path_then_it_logs_warning(caplog): - with caplog.at_level(logging.WARNING): - my_task = TaskMetadata( + with pytest.raises(ValidationError): + TaskMetadata( name="MyTask", dataset={"path": "test/dataset", "revision": None}, description="testing", @@ -281,18 +281,6 @@ def test_given_none_revision_path_then_it_logs_warning(caplog): bibtex_citation="", ) - assert my_task.dataset["revision"] is None - - warning_logs = [ - record for record in caplog.records if record.levelname == "WARNING" - ] - assert len(warning_logs) == 1 - assert ( - warning_logs[0].message - == "Revision missing for the dataset test/dataset. " - + "It is encourage to specify a dataset revision for reproducability." - ) - def test_unfilled_metadata_is_not_filled(): assert ( @@ -511,10 +499,11 @@ def test_disallow_trust_remote_code_in_new_datasets(): "MLSUMClusteringS2S.v2", "SwednClusteringP2P", "SwednClusteringS2S", + "IndicXnliPairClassification", ] assert ( - 135 == len(exceptions) + 136 == len(exceptions) ), "The number of exceptions has changed. Please do not add new datasets to this list." 
exceptions = [] diff --git a/tests/test_load_results/test_mteb_results.py b/tests/test_load_results/test_mteb_results.py index 6c22b390f3..84071b735f 100644 --- a/tests/test_load_results/test_mteb_results.py +++ b/tests/test_load_results/test_mteb_results.py @@ -34,7 +34,6 @@ class DummyTask(AbsTask): annotations_creators="derived", dialect=[], bibtex_citation="", - descriptive_stats={}, modalities=["text"], sample_creation="created", )