From 8a3bc757bb61d512e50356af50401669ad57ede4 Mon Sep 17 00:00:00 2001
From: Farzad Abdolhosseini <farzad@fixie.ai>
Date: Sun, 9 Jun 2024 18:35:17 -0700
Subject: [PATCH] Local tokenizer and processor for more consistent CI (#16)

* saving local versions of tokenizer and processor for faster CI

* cr

* try again

* and again

* debug

* verbose tests

* Move files

* Revert "verbose tests"

This reverts commit bf63a436702f4e072992305ada9c3d82e2f71bf0.

* clean up

* Reapply "verbose tests"

This reverts commit aa093820eabd31ffc545e07cdf7ff9f35bbac2b1.

* more logging again

* blarg

* remove logs

---------

Co-authored-by: juberti <justin@fixie.ai>
---
 .gitattributes                                |  1 +
 .../special_tokens_map.json                   |  3 +++
 .../Meta-Llama-3-8B-Instruct/tokenizer.json   |  3 +++
 .../tokenizer_config.json                     |  3 +++
 .../preprocessor_config.json                  |  3 +++
 .../special_tokens_map.json                   |  3 +++
 .../wav2vec2-base-960h/tokenizer_config.json  |  3 +++
 .../assets/hf/wav2vec2-base-960h/vocab.json   |  3 +++
 ultravox/inference/infer_test.py              | 21 +++++++------------
 9 files changed, 29 insertions(+), 14 deletions(-)
 create mode 100644 .gitattributes
 create mode 100644 ultravox/assets/hf/Meta-Llama-3-8B-Instruct/special_tokens_map.json
 create mode 100644 ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer.json
 create mode 100644 ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer_config.json
 create mode 100644 ultravox/assets/hf/wav2vec2-base-960h/preprocessor_config.json
 create mode 100644 ultravox/assets/hf/wav2vec2-base-960h/special_tokens_map.json
 create mode 100644 ultravox/assets/hf/wav2vec2-base-960h/tokenizer_config.json
 create mode 100644 ultravox/assets/hf/wav2vec2-base-960h/vocab.json

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..7fe70d7f
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.json filter=lfs diff=lfs merge=lfs -text
diff --git a/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/special_tokens_map.json b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/special_tokens_map.json
new file mode 100644
index 00000000..aa6b9fcf
--- /dev/null
+++ b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/special_tokens_map.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f38c73729248f6c127296386e3cdde96e254636cc58b4169d3fd32328d9a8ec
+size 296
diff --git a/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer.json b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer.json
new file mode 100644
index 00000000..9a62752e
--- /dev/null
+++ b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e134af98b985517b4f068e3755ae90d4e9cd2d45d328325dc503f1c6b2d06cc7
+size 9085698
diff --git a/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer_config.json b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer_config.json
new file mode 100644
index 00000000..a251eecd
--- /dev/null
+++ b/ultravox/assets/hf/Meta-Llama-3-8B-Instruct/tokenizer_config.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0e3a7cce6e4d787e85eb1c24d548420e0d7fe2c7a214e192795c46e40d75bb
+size 50977
diff --git a/ultravox/assets/hf/wav2vec2-base-960h/preprocessor_config.json b/ultravox/assets/hf/wav2vec2-base-960h/preprocessor_config.json
new file mode 100644
index 00000000..a81343fa
--- /dev/null
+++ b/ultravox/assets/hf/wav2vec2-base-960h/preprocessor_config.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:617bd0950f8cc9ac4062e8c73a7be60305ca5790a243df55fa6f44fb671b55b1
+size 257
diff --git a/ultravox/assets/hf/wav2vec2-base-960h/special_tokens_map.json b/ultravox/assets/hf/wav2vec2-base-960h/special_tokens_map.json
new file mode 100644
index 00000000..0805c80d
--- /dev/null
+++ b/ultravox/assets/hf/wav2vec2-base-960h/special_tokens_map.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9046da57c270c8e74d0f38832b4adce269c9d914ef21d2a0925e7772152dd793
+size 96
diff --git a/ultravox/assets/hf/wav2vec2-base-960h/tokenizer_config.json b/ultravox/assets/hf/wav2vec2-base-960h/tokenizer_config.json
new file mode 100644
index 00000000..0bf31239
--- /dev/null
+++ b/ultravox/assets/hf/wav2vec2-base-960h/tokenizer_config.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7fbc59c63ec955c3d12862fad36d1b919fa6a94e54911297ad649db7822dce1
+size 1147
diff --git a/ultravox/assets/hf/wav2vec2-base-960h/vocab.json b/ultravox/assets/hf/wav2vec2-base-960h/vocab.json
new file mode 100644
index 00000000..e25f0805
--- /dev/null
+++ b/ultravox/assets/hf/wav2vec2-base-960h/vocab.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4178db26b3c7570f6a47f14ac6a1c7b32950b8c2800fb097287e53776934f1c5
+size 358
diff --git a/ultravox/inference/infer_test.py b/ultravox/inference/infer_test.py
index afc5b061..ec165dad 100644
--- a/ultravox/inference/infer_test.py
+++ b/ultravox/inference/infer_test.py
@@ -1,5 +1,3 @@
-import logging
-import os
 from unittest import mock
 
 import numpy as np
@@ -12,23 +10,21 @@
 from ultravox.inference import infer
 from ultravox.model import ultravox_processing
 
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
 
+# We keep local copies of these files in the repo so that CI is faster and
+# also works for external contributors (the Llama 3 repo on the Hub is gated).
 @pytest.fixture(scope="module")
 def tokenizer():
-    logging.info("Loading tokenizer")
-    yield transformers.AutoTokenizer.from_pretrained(
-        "meta-llama/Meta-Llama-3-8B-Instruct"
+    return transformers.AutoTokenizer.from_pretrained(
+        "./assets/hf/Meta-Llama-3-8B-Instruct", local_files_only=True
     )
-    logging.info("Tearing down tokenizer")
 
 
 @pytest.fixture(scope="module")
 def audio_processor():
-    logging.info("Loading audio processor")
-    yield transformers.AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
-    logging.info("Tearing down audio processor")
+    return transformers.AutoProcessor.from_pretrained(
+        "./assets/hf/wav2vec2-base-960h", local_files_only=True
+    )
 
 
 class FakeInference(infer.LocalInference):
@@ -50,9 +46,6 @@ def __init__(
         self.model.device = "cpu"
         self.model.generate = mock.MagicMock(return_value=[range(25)])
 
-    def __del__(self):
-        logging.info("Tearing down inference")
-
 
 EXPECTED_TOKEN_IDS_START = [128000, 128006, 882, 128007]
 EXPECTED_TOKEN_IDS_END = [128009, 128006, 78191, 128007, 271]