From c2c2c6b65184bb7bf0a8e31579739c52001ffa4a Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Tue, 3 Dec 2024 16:31:38 +0100
Subject: [PATCH] fix: Added arctic models #1515

Register the Snowflake arctic-embed family (xs, s, m, m-long, l) as
ModelMeta entries and add the arctic_models module to the import list
and to model_modules in mteb/models/overview.py.

Also correct the existing snowflake-arctic-embed-m-v1.5 entry:
embed_dim 256 -> 768, similarity_fn_name "cosine_similarity" ->
"cosine", and record that it supersedes
Snowflake/snowflake-arctic-embed-m.

---
 mteb/models/arctic_models.py | 130 ++++++++++++++++++++++++++++++++++-
 mteb/models/overview.py      |   2 +
 2 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py
index 5f3d41a97..6e7141b01 100644
--- a/mteb/models/arctic_models.py
+++ b/mteb/models/arctic_models.py
@@ -22,9 +22,135 @@
     n_parameters=109_000_000,
     memory_usage=None,
     max_tokens=512,
-    embed_dim=256,
+    embed_dim=768,
     license="apache-2.0",
     reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5",
-    similarity_fn_name="cosine_similarity",
+    similarity_fn_name="cosine",
     use_instructions=False,
+    adapted_from=None,
+    supersedes="Snowflake/snowflake-arctic-embed-m",
+)
+
+
+arctic_embed_xs = ModelMeta(
+    loader=partial(
+        sentence_transformers_loader,
+        model_name="Snowflake/snowflake-arctic-embed-xs",
+        revision="742da4f66e1823b5b4dbe6c320a1375a1fd85f9e",
+    ),
+    name="Snowflake/snowflake-arctic-embed-xs",
+    revision="742da4f66e1823b5b4dbe6c320a1375a1fd85f9e",
+    release_date="2024-07-08",  # initial commit of hf model.
+    languages=["eng_Latn"],
+    open_weights=True,
+    framework=["Sentence Transformers", "PyTorch"],
+    n_parameters=22_600_000,
+    memory_usage=None,
+    max_tokens=512,
+    embed_dim=384,
+    license="apache-2.0",
+    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-xs",
+    similarity_fn_name="cosine",
+    use_instructions=False,
+    adapted_from="sentence-transformers/all-MiniLM-L6-v2",
+    supersedes=None,
+)
+
+
+arctic_embed_s = ModelMeta(
+    loader=partial(
+        sentence_transformers_loader,
+        model_name="Snowflake/snowflake-arctic-embed-s",
+        revision="d3c1d2d433dd0fdc8e9ca01331a5f225639e798f",
+    ),
+    name="Snowflake/snowflake-arctic-embed-s",
+    revision="d3c1d2d433dd0fdc8e9ca01331a5f225639e798f",
+    release_date="2024-04-12",  # initial commit of hf model.
+    languages=["eng_Latn"],
+    open_weights=True,
+    framework=["Sentence Transformers", "PyTorch"],
+    n_parameters=32_200_000,
+    memory_usage=None,
+    max_tokens=512,
+    embed_dim=384,
+    license="apache-2.0",
+    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-s",
+    similarity_fn_name="cosine",
+    use_instructions=False,
+    adapted_from="intfloat/e5-small-unsupervised",
+    supersedes=None,
+)
+
+
+arctic_embed_m = ModelMeta(
+    loader=partial(
+        sentence_transformers_loader,
+        model_name="Snowflake/snowflake-arctic-embed-m",
+        revision="cc17beacbac32366782584c8752220405a0f3f40",
+    ),
+    name="Snowflake/snowflake-arctic-embed-m",
+    revision="cc17beacbac32366782584c8752220405a0f3f40",
+    release_date="2024-04-12",  # initial commit of hf model.
+    languages=["eng_Latn"],
+    open_weights=True,
+    framework=["Sentence Transformers", "PyTorch"],
+    n_parameters=109_000_000,
+    memory_usage=None,
+    max_tokens=512,
+    embed_dim=768,
+    license="apache-2.0",
+    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m",
+    similarity_fn_name="cosine",
+    use_instructions=False,
+    adapted_from="intfloat/e5-base-unsupervised",
+    supersedes=None,
+)
+
+arctic_embed_m_long = ModelMeta(
+    loader=partial(
+        sentence_transformers_loader,
+        model_name="Snowflake/snowflake-arctic-embed-m-long",
+        revision="89d0f6ab196eead40b90cb6f9fefec01a908d2d1",
+    ),
+    name="Snowflake/snowflake-arctic-embed-m-long",
+    revision="89d0f6ab196eead40b90cb6f9fefec01a908d2d1",
+    release_date="2024-04-12",  # initial commit of hf model.
+    languages=["eng_Latn"],
+    open_weights=True,
+    framework=["Sentence Transformers", "PyTorch"],
+    n_parameters=137_000_000,
+    memory_usage=None,
+    max_tokens=2048,
+    embed_dim=768,
+    license="apache-2.0",
+    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long",
+    similarity_fn_name="cosine",
+    use_instructions=False,
+    adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised",
+    supersedes=None,
+)
+
+
+arctic_embed_l = ModelMeta(
+    loader=partial(
+        sentence_transformers_loader,
+        model_name="Snowflake/snowflake-arctic-embed-l",
+        revision="9a9e5834d2e89cdd8bb72b64111dde496e4fe78c",
+    ),
+    name="Snowflake/snowflake-arctic-embed-l",
+    revision="9a9e5834d2e89cdd8bb72b64111dde496e4fe78c",
+    release_date="2024-04-12",  # initial commit of hf model.
+    languages=["eng_Latn"],
+    open_weights=True,
+    framework=["Sentence Transformers", "PyTorch"],
+    n_parameters=335_000_000,
+    memory_usage=None,
+    max_tokens=512,
+    embed_dim=1024,
+    license="apache-2.0",
+    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-l",
+    similarity_fn_name="cosine",
+    use_instructions=False,
+    adapted_from="intfloat/e5-large-unsupervised",
+    supersedes=None,
 )
diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index f54a085d0..8341e42cd 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -11,6 +11,7 @@
 from mteb.encoder_interface import Encoder
 from mteb.model_meta import ModelMeta
 from mteb.models import (
+    arctic_models,
     bge_models,
     bm25,
     cohere_models,
@@ -39,6 +40,7 @@
 logger = logging.getLogger(__name__)
 
 model_modules = [
+    arctic_models,
     bge_models,
     bm25,
     cohere_models,