From b4a4fc553e77a6527d642c7de8ae4d2305cba8da Mon Sep 17 00:00:00 2001
From: zqhuang211 <zqhuang@gmail.com>
Date: Thu, 15 Aug 2024 18:40:52 -0700
Subject: [PATCH] Update default config to ultravox_v0.3 (#84)

Add a release configuration for training `ultravox-v0_3` (https://huggingface.co/fixie-ai/ultravox-v0_3)
and make it the default config in mcloud.yaml. To start training, follow the setup instructions in
README.md and then run `mcli run -f mcloud.yaml --follow`.
---
 mcloud.yaml                                   |  2 +-
 ultravox/training/configs/meta_config.yaml    |  2 +-
 ultravox/training/configs/release_config.yaml | 45 +++++++++++++++++++
 3 files changed, 47 insertions(+), 2 deletions(-)
 create mode 100644 ultravox/training/configs/release_config.yaml

diff --git a/mcloud.yaml b/mcloud.yaml
index 280e258f..99788954 100644
--- a/mcloud.yaml
+++ b/mcloud.yaml
@@ -14,4 +14,4 @@ command: >-
 env_variables:
   MLFLOW_TRACKING_URI: databricks
   UV_BRANCH: main
-  TRAIN_ARGS: --config_path ultravox/training/configs/llama3_whisper_kd.yaml
+  TRAIN_ARGS: --config_path ultravox/training/configs/release_config.yaml
\ No newline at end of file
diff --git a/ultravox/training/configs/meta_config.yaml b/ultravox/training/configs/meta_config.yaml
index 62f622d3..9ccb4348 100644
--- a/ultravox/training/configs/meta_config.yaml
+++ b/ultravox/training/configs/meta_config.yaml
@@ -3,7 +3,7 @@ audio_model: "facebook/wav2vec2-base-960h"
 
 data_sets: ["gigaspeech"]
 val_sets: ["heysquad_human", "anyinstruct", "soda", "peoplespeech"]
-stop_strategy: "last_exhausted"
+stop_strategy: "LAST_EXHAUSTED"
 
 train_on_inputs: False
 shuffle_data: True
diff --git a/ultravox/training/configs/release_config.yaml b/ultravox/training/configs/release_config.yaml
new file mode 100644
index 00000000..973656a7
--- /dev/null
+++ b/ultravox/training/configs/release_config.yaml
@@ -0,0 +1,45 @@
+# SLM with Ultravox & Llama 3.1, trained with knowledge distillation.
+exp_name: "ultravox-v0_3"
+
+# Make sure to accept the license agreement on the Hugging Face Hub.
+text_model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
+audio_model: "openai/whisper-small"
+
+
+loss_config:
+  # Choose from ["KL_Divergence", "CrossEntropy"], default is "KL_Divergence"
+  loss_function: "KL_Divergence"
+
+# Temporarily remove heysquad_human from val_sets as it causes the training to fail.
+val_sets: ["anyinstruct", "soda", "peoplespeech"]
+
+batch_size: 24
+max_steps: 7200 # 7200 steps x 8 x 24 (batch_size) = 1,382,400 samples
+
+data_sets: []
+data_dicts:
+  - path: "fixie-ai/librispeech_asr"
+    name: "clean"
+    splits:
+      - "train.100" # 28_539 samples
+      - "train.360" # 104_014 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text }}"
+    weight: 1
+  - path: "fixie-ai/librispeech_asr"
+    name: "other"
+    splits:
+      - "train.500" # 148_688 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text }}"
+    weight: 1
+  - path: "fixie-ai/common_voice_17_0"
+    name: "en"
+    splits:
+      - "train" # 1_101_170 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text_proc.format_asr_text(sentence) }}"
+    weight: 8