From 8a3bc757bb61d512e50356af50401669ad57ede4 Mon Sep 17 00:00:00 2001 From: Farzad Abdolhosseini Date: Sun, 9 Jun 2024 18:35:17 -0700 Subject: [PATCH] Local tokenizer and processor for more consistent CI (#16) * saving local versions of tokenizer and processor for faster CI * cr * try again * and again * debug * verbose tests * Move files * Revert "verbose tests" This reverts commit bf63a436702f4e072992305ada9c3d82e2f71bf0. * clean up * Reapply "verbose tests" This reverts commit aa093820eabd31ffc545e07cdf7ff9f35bbac2b1. * more logging again * blarg * remove logs --------- Co-authored-by: juberti --- .gitattributes | 1 + .../special_tokens_map.json | 3 +++ .../Meta-Llama-3-8B-Instruct/tokenizer.json | 3 +++ .../tokenizer_config.json | 3 +++ .../preprocessor_config.json | 3 +++ .../special_tokens_map.json | 3 +++ .../wav2vec2-base-960h/tokenizer_config.json | 3 +++ .../assets/hf/wav2vec2-base-960h/vocab.json | 3 +++ ultravox/inference/infer_test.py | 21 +++++++------------ 9 files changed, 29 insertions(+), 14 deletions(-) create mode 100644 .gitattributes create mode 100644 ultravox/assets/hf/Meta-Llama-3-8B-Instruct/special_tokens_map.json create mode 100644 ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer.json create mode 100644 ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer_config.json create mode 100644 ultravox/assets/hf/wav2vec2-base-960h/preprocessor_config.json create mode 100644 ultravox/assets/hf/wav2vec2-base-960h/special_tokens_map.json create mode 100644 ultravox/assets/hf/wav2vec2-base-960h/tokenizer_config.json create mode 100644 ultravox/assets/hf/wav2vec2-base-960h/vocab.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..7fe70d7f --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.json filter=lfs diff=lfs merge=lfs -text diff --git a/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/special_tokens_map.json b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/special_tokens_map.json new file mode 100644 index 00000000..aa6b9fcf --- /dev/null +++ b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/special_tokens_map.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f38c73729248f6c127296386e3cdde96e254636cc58b4169d3fd32328d9a8ec +size 296 diff --git a/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer.json b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer.json new file mode 100644 index 00000000..9a62752e --- /dev/null +++ b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e134af98b985517b4f068e3755ae90d4e9cd2d45d328325dc503f1c6b2d06cc7 +size 9085698 diff --git a/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer_config.json b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer_config.json new file mode 100644 index 00000000..a251eecd --- /dev/null +++ b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0e3a7cce6e4d787e85eb1c24d548420e0d7fe2c7a214e192795c46e40d75bb +size 50977 diff --git a/ultravox/assets/hf/wav2vec2-base-960h/preprocessor_config.json b/ultravox/assets/hf/wav2vec2-base-960h/preprocessor_config.json new file mode 100644 index 00000000..a81343fa --- /dev/null +++ b/ultravox/assets/hf/wav2vec2-base-960h/preprocessor_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:617bd0950f8cc9ac4062e8c73a7be60305ca5790a243df55fa6f44fb671b55b1 +size 257 diff --git a/ultravox/assets/hf/wav2vec2-base-960h/special_tokens_map.json b/ultravox/assets/hf/wav2vec2-base-960h/special_tokens_map.json new file mode 100644 index 00000000..0805c80d --- /dev/null +++ b/ultravox/assets/hf/wav2vec2-base-960h/special_tokens_map.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9046da57c270c8e74d0f38832b4adce269c9d914ef21d2a0925e7772152dd793 +size 96 diff --git a/ultravox/assets/hf/wav2vec2-base-960h/tokenizer_config.json b/ultravox/assets/hf/wav2vec2-base-960h/tokenizer_config.json new file mode 100644 index 00000000..0bf31239 --- /dev/null +++ b/ultravox/assets/hf/wav2vec2-base-960h/tokenizer_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7fbc59c63ec955c3d12862fad36d1b919fa6a94e54911297ad649db7822dce1 +size 1147 diff --git a/ultravox/assets/hf/wav2vec2-base-960h/vocab.json b/ultravox/assets/hf/wav2vec2-base-960h/vocab.json new file mode 100644 index 00000000..e25f0805 --- /dev/null +++ b/ultravox/assets/hf/wav2vec2-base-960h/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4178db26b3c7570f6a47f14ac6a1c7b32950b8c2800fb097287e53776934f1c5 +size 358 diff --git a/ultravox/inference/infer_test.py b/ultravox/inference/infer_test.py index afc5b061..ec165dad 100644 --- a/ultravox/inference/infer_test.py +++ b/ultravox/inference/infer_test.py @@ -1,5 +1,3 @@ -import logging -import os from unittest import mock import numpy as np @@ -12,23 +10,21 @@ from ultravox.inference import infer from ultravox.model import ultravox_processing -os.environ["TOKENIZERS_PARALLELISM"] = "false" - +# We cache these files in our repo to make CI faster and also +# work properly for external contributions (since Llama 3 is gated). @pytest.fixture(scope="module") def tokenizer(): - logging.info("Loading tokenizer") - yield transformers.AutoTokenizer.from_pretrained( - "meta-llama/Meta-Llama-3-8B-Instruct" + return transformers.AutoTokenizer.from_pretrained( + "./assets/hf/Meta-Llama-3-8B-Instruct", local_files_only=True ) - logging.info("Tearing down tokenizer") @pytest.fixture(scope="module") def audio_processor(): - logging.info("Loading audio processor") - yield transformers.AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") - logging.info("Tearing down audio processor") + return transformers.AutoProcessor.from_pretrained( + "./assets/hf/wav2vec2-base-960h", local_files_only=True + ) class FakeInference(infer.LocalInference): @@ -50,9 +46,6 @@ def __init__( self.model.device = "cpu" self.model.generate = mock.MagicMock(return_value=[range(25)]) - def __del__(self): - logging.info("Tearing down inference") - EXPECTED_TOKEN_IDS_START = [128000, 128006, 882, 128007] EXPECTED_TOKEN_IDS_END = [128009, 128006, 78191, 128007, 271]