Integrating OAI evals post training (#85)
* allow lower Python version for Lambda Cloud and add ultravox-vllm

* integrate oaievals

* evaluations using oaievalset

* make sure pipeline can be loaded correctly

* force 1 GPU and set max_num_samples

* log eval Table to W&B + make text-only eval optional
farzadab authored Aug 27, 2024
1 parent 8b6a3aa commit 638a7a6
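
Per the bullets above, post-training evaluation now runs through the fixie-ai fork of OpenAI's evals (assuming the fork keeps upstream's oaieval/oaievalset entry points), with per-sample results logged to Weights & Biases as a Table. A minimal sketch of the W&B logging pattern; the column names, project name, and logging key are illustrative, not taken from this diff:

import wandb

# Hypothetical per-sample eval results; in the real pipeline these would come
# from the oaievals run.
results = [
    ("sample_001.wav", "yes", "yes", 1.0),
    ("sample_002.wav", "no", "yes", 0.0),
]

run = wandb.init(project="ultravox-evals")  # project name is illustrative
table = wandb.Table(
    columns=["audio", "expected", "generated", "score"],
    data=[list(row) for row in results],
)
run.log({"eval/results": table})  # rendered as an interactive table in the W&B UI
run.finish()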
Showing 9 changed files with 2,944 additions and 356 deletions.
2,861 changes: 2,695 additions & 166 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -8,7 +8,7 @@ license = "MIT"
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.11"
+python = ">=3.10,<4.0"
 torch = ">=2.4"
 transformers = {version = ">=4.43.1", extras = ["torch"]}
 bitsandbytes = "~0.42.0"
@@ -29,6 +29,7 @@ tensorboardx = "~2.6.2.2"
 wandb = "~0.17.1"
 sacrebleu = "^2.4.2"
 tenacity = "^9.0.0"
+evals = {git = "https://github.com/fixie-ai/evals"}
 
 [tool.poetry.group.dev.dependencies]
 black = "~24.4.2"
2 changes: 1 addition & 1 deletion setup.sh
@@ -1,7 +1,7 @@
 cd $HOME
 mkdir workspace
 cd workspace
-git clone git@github.com:fixie-ai/ultravox.git -b main
+git clone https://github.com/fixie-ai/ultravox.git -b main
 cd ultravox
 mkdir -p ~/.local/bin
 curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to ~/.local/bin
124 changes: 122 additions & 2 deletions ultravox/model/ultravox_model.py
@@ -9,13 +9,13 @@
 import transformers.activations
 import transformers.modeling_outputs
 import transformers.models
+from transformers.models.whisper import modeling_whisper as whisper
 
 # We must use relative import in this directory to allow uploading to HF Hub
 # Even "from . import X" pattern doesn't work (undocumented and unclear why)
 from .ultravox_config import LossConfig
 from .ultravox_config import LossFunction
 from .ultravox_config import UltravoxConfig
-from .whisper_model_modified import WhisperEncoder as ModifiedWhisperEncoder


class UltravoxModel(transformers.LlamaPreTrainedModel):
@@ -268,7 +268,7 @@ def prepare_inputs_for_generation(
     @classmethod
     def _create_audio_tower(
         cls, config: UltravoxConfig
-    ) -> Union[transformers.Wav2Vec2Model, ModifiedWhisperEncoder]:
+    ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]:
         if config.audio_model_id is not None:
             if "whisper" in config.audio_model_id:
                 audio_tower = ModifiedWhisperEncoder.from_pretrained(
@@ -496,6 +496,126 @@ def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
        return hidden_states


class ModifiedWhisperEncoder(whisper.WhisperEncoder):
    """
    Encoder portion of OpenAI's Whisper model.
    This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes:
    1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder
    2. allow less than 30 seconds of audio padding to be passed in:
        - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal
        - embed_pos is now sliced to match the length of `inputs_embeds`
    Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
    """

    base_model_prefix = "model.encoder"

    def forward(
        self,
        input_features,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        expected_seq_length = (
            self.config.max_source_positions
            * self.conv1.stride[0]
            * self.conv2.stride[0]
        )
        if input_features.shape[-1] > expected_seq_length:
            raise ValueError(
                f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))

        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions.weight[: inputs_embeds.size(-2)]

        hidden_states = inputs_embeds + embed_pos
        hidden_states = nn.functional.dropout(
            hidden_states, p=self.dropout, training=self.training
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            assert head_mask.size()[0] == (
                len(self.layers)
            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            to_drop = False
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:  # skip the layer
                    to_drop = True

            if to_drop:
                layer_outputs = (None, None)
            else:
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.__call__,
                        hidden_states,
                        None,
                        (head_mask[idx] if head_mask is not None else None),
                        output_attentions,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        None,
                        layer_head_mask=(
                            head_mask[idx] if head_mask is not None else None
                        ),
                        output_attentions=output_attentions,
                    )

                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, encoder_states, all_attentions]
                if v is not None
            )
        return transformers.modeling_outputs.BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )


UltravoxConfig.register_for_auto_class()
UltravoxModel.register_for_auto_class()

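For context on fix 2 in the docstring above: expected_seq_length works out to max_source_positions (1500) times the two conv strides (1 and 2), i.e. 3000 mel frames, or 30 seconds at Whisper's 100 frames per second; the relaxed check admits anything shorter, and embed_pos is sliced to match. A small sketch of exercising the modified encoder, assuming an illustrative checkpoint (whisper-base, d_model 512):

import torch

# base_model_prefix = "model.encoder" lets the encoder weights load directly
# from a full Whisper checkpoint (fix 1 in the docstring).
encoder = ModifiedWhisperEncoder.from_pretrained("openai/whisper-base")
encoder.eval()

# 80 mel bins x 1000 frames (~10 s): shorter than the 3000-frame (30 s) input
# the stock HF encoder insists on.
mels = torch.randn(1, 80, 1000)
with torch.no_grad():
    out = encoder(input_features=mels)
print(out.last_hidden_state.shape)  # torch.Size([1, 500, 512]): conv2's stride of 2 halves the frames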
3 changes: 3 additions & 0 deletions ultravox/model/ultravox_processing.py
@@ -56,6 +56,9 @@ def __init__(
         assert (
             self.audio_token_replacement is not None
         ), "The tokenizer has no EOS token. Cannot recover."
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+
         super().__init__(audio_processor=audio_processor, tokenizer=tokenizer)
 
     @classmethod
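The pad-token fallback added above matters because many decoder-only tokenizers ship without a pad token, and batched padding requires one. A quick illustration of the failure mode being handled, using GPT-2 purely as an example of a pad-less tokenizer:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
print(tok.pad_token_id)  # None: padding a batch would raise an error

# Reuse EOS as PAD, mirroring the fallback added in this diff.
tok.pad_token_id = tok.eos_token_id
batch = tok(["hi", "a longer input"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)  # both rows padded to the longest sequence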
141 changes: 0 additions & 141 deletions ultravox/model/whisper_model_modified.py

This file was deleted.

1 change: 1 addition & 0 deletions ultravox/training/config_base.py
@@ -43,6 +43,7 @@ class TrainConfig:
     eval_num_samples: int = 100
     eval_max_new_tokens: Optional[int] = None
     eval_num_procs: int = 8
+    eval_text_only: bool = False
     num_prompts: int = 1
     # number of data loader workers
     num_workers: int = 8 if torch.cuda.is_available() else 1
2 changes: 1 addition & 1 deletion ultravox/training/configs/meta_config.yaml
@@ -11,7 +11,7 @@ max_audio_duration_secs: 16
 
 val_num_samples: 64
 val_steps: 1000
-eval_num_samples: 256
+eval_num_samples: 1024
 eval_max_new_tokens: 32
 eval_num_procs: 16