From b4a4fc553e77a6527d642c7de8ae4d2305cba8da Mon Sep 17 00:00:00 2001 From: zqhuang211 Date: Thu, 15 Aug 2024 18:40:52 -0700 Subject: [PATCH] Update default config to ultravox_v0.3 (#84) Add a release configuration to train `ultravox-v0_3` (https://huggingface.co/fixie-ai/ultravox-v0_3). Simply run `mcli run -f mcloud.yaml --follow` after following the instructions in README.md --- mcloud.yaml | 2 +- ultravox/training/configs/meta_config.yaml | 2 +- ultravox/training/configs/release_config.yaml | 45 +++++++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 ultravox/training/configs/release_config.yaml diff --git a/mcloud.yaml b/mcloud.yaml index 280e258f..99788954 100644 --- a/mcloud.yaml +++ b/mcloud.yaml @@ -14,4 +14,4 @@ command: >- env_variables: MLFLOW_TRACKING_URI: databricks UV_BRANCH: main - TRAIN_ARGS: --config_path ultravox/training/configs/llama3_whisper_kd.yaml + TRAIN_ARGS: --config_path ultravox/training/configs/release_config.yaml \ No newline at end of file diff --git a/ultravox/training/configs/meta_config.yaml b/ultravox/training/configs/meta_config.yaml index 62f622d3..9ccb4348 100644 --- a/ultravox/training/configs/meta_config.yaml +++ b/ultravox/training/configs/meta_config.yaml @@ -3,7 +3,7 @@ audio_model: "facebook/wav2vec2-base-960h" data_sets: ["gigaspeech"] val_sets: ["heysquad_human", "anyinstruct", "soda", "peoplespeech"] -stop_strategy: "last_exhausted" +stop_strategy: "LAST_EXHAUSTED" train_on_inputs: False shuffle_data: True diff --git a/ultravox/training/configs/release_config.yaml b/ultravox/training/configs/release_config.yaml new file mode 100644 index 00000000..973656a7 --- /dev/null +++ b/ultravox/training/configs/release_config.yaml @@ -0,0 +1,45 @@ +# SLM with ultravox & llama3.1, trained with knowledge distillation. 
+exp_name: "ultravox-v0_3"
+
+# Make sure to accept the license agreement on the Hugging Face Hub
+text_model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
+audio_model: "openai/whisper-small"
+
+# Loss configuration (see the allowed values below).
+loss_config:
+  # Choose from ["KL_Divergence", "CrossEntropy"], default is "KL_Divergence"
+  loss_function: "KL_Divergence"
+
+# Temporarily remove heysquad_human from val_sets as it causes the training to fail.
+val_sets: ["anyinstruct", "soda", "peoplespeech"]
+
+batch_size: 24
+max_steps: 7200  # 7200 steps x 8 x 24 (batch_size) = 1,382,400 samples
+
+data_sets: []  # left empty; the datasets for this config are listed in data_dicts
+data_dicts:
+  - path: "fixie-ai/librispeech_asr"
+    name: "clean"
+    splits:
+      - "train.100"  # 28_539 samples
+      - "train.360"  # 104_014 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text }}"
+    weight: 1
+  - path: "fixie-ai/librispeech_asr"
+    name: "other"
+    splits:
+      - "train.500"  # 148_688 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text }}"
+    weight: 1
+  - path: "fixie-ai/common_voice_17_0"
+    name: "en"
+    splits:
+      - "train"  # 1_101_170 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text_proc.format_asr_text(sentence) }}"  # built from `sentence`, not `text` as in the entries above
+    weight: 8  # NOTE(review): presumably a relative sampling weight vs. the entries above - confirm