Integrating OAI evals post training (#85)
* allow lower Python version for Lambda Cloud and add ultravox-vllm

* integrate oaievals

* evaluations using oaievalset

* make sure pipeline can be loaded correctly

* force 1 GPU and set max_num_samples

* log eval Table to W&B + make text-only eval optional
farzadab authored Aug 27, 2024
1 parent 8b6a3aa commit 638a7a6
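
Per the bullets above, post-training evaluation now runs through the fixie-ai fork of OpenAI's evals (assuming the fork keeps upstream's oaieval/oaievalset entry points), with per-sample results logged to Weights & Biases as a Table. A minimal sketch of the W&B logging pattern; the column names, project name, and logging key are illustrative, not taken from this diff:

import wandb

# Hypothetical per-sample eval results; in the real pipeline these would come
# from the oaievals run.
results = [
    ("sample_001.wav", "yes", "yes", 1.0),
    ("sample_002.wav", "no", "yes", 0.0),
]

run = wandb.init(project="ultravox-evals")  # project name is illustrative
table = wandb.Table(
    columns=["audio", "expected", "generated", "score"],
    data=[list(row) for row in results],
)
run.log({"eval/results": table})  # rendered as an interactive table in the W&B UI
run.finish()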
Showing 9 changed files with 2,944 additions and 356 deletions.
2,861 changes: 2,695 additions & 166 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -8,7 +8,7 @@ license = "MIT"
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.11"
+python = ">=3.10,<4.0"
 torch = ">=2.4"
 transformers = {version = ">=4.43.1", extras = ["torch"]}
 bitsandbytes = "~0.42.0"
@@ -29,6 +29,7 @@ tensorboardx = "~2.6.2.2"
 wandb = "~0.17.1"
 sacrebleu = "^2.4.2"
 tenacity = "^9.0.0"
+evals = {git = "https://github.com/fixie-ai/evals"}
 
 [tool.poetry.group.dev.dependencies]
 black = "~24.4.2"
2 changes: 1 addition & 1 deletion setup.sh
@@ -1,7 +1,7 @@
 cd $HOME
 mkdir workspace
 cd workspace
-git clone git@github.com:fixie-ai/ultravox.git -b main
+git clone https://github.com/fixie-ai/ultravox.git -b main
 cd ultravox
 mkdir -p ~/.local/bin
 curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to ~/.local/bin
124 changes: 122 additions & 2 deletions ultravox/model/ultravox_model.py
@@ -9,13 +9,13 @@
 import transformers.activations
 import transformers.modeling_outputs
 import transformers.models
+from transformers.models.whisper import modeling_whisper as whisper
 
 # We must use relative import in this directory to allow uploading to HF Hub
 # Even "from . import X" pattern doesn't work (undocumented and unclear why)
 from .ultravox_config import LossConfig
 from .ultravox_config import LossFunction
 from .ultravox_config import UltravoxConfig
-from .whisper_model_modified import WhisperEncoder as ModifiedWhisperEncoder


class UltravoxModel(transformers.LlamaPreTrainedModel):
@@ -268,7 +268,7 @@ def prepare_inputs_for_generation(
     @classmethod
     def _create_audio_tower(
         cls, config: UltravoxConfig
-    ) -> Union[transformers.Wav2Vec2Model, ModifiedWhisperEncoder]:
+    ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]:
         if config.audio_model_id is not None:
             if "whisper" in config.audio_model_id:
                 audio_tower = ModifiedWhisperEncoder.from_pretrained(
@@ -496,6 +496,126 @@ def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
        return hidden_states


class ModifiedWhisperEncoder(whisper.WhisperEncoder):
    """
    Encoder portion of OpenAI's Whisper model.
    This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes:
    1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder
    2. allow less than 30 seconds of audio padding to be passed in:
        - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal
        - embed_pos is now sliced to match the length of `inputs_embeds`
    Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
    """

    base_model_prefix = "model.encoder"

    def forward(
        self,
        input_features,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        expected_seq_length = (
            self.config.max_source_positions
            * self.conv1.stride[0]
            * self.conv2.stride[0]
        )
        if input_features.shape[-1] > expected_seq_length:
            raise ValueError(
                f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))

        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions.weight[: inputs_embeds.size(-2)]

        hidden_states = inputs_embeds + embed_pos
        hidden_states = nn.functional.dropout(
            hidden_states, p=self.dropout, training=self.training
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            assert head_mask.size()[0] == (
                len(self.layers)
            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            to_drop = False
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:  # skip the layer
                    to_drop = True

            if to_drop:
                layer_outputs = (None, None)
            else:
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.__call__,
                        hidden_states,
                        None,
                        (head_mask[idx] if head_mask is not None else None),
                        output_attentions,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        None,
                        layer_head_mask=(
                            head_mask[idx] if head_mask is not None else None
                        ),
                        output_attentions=output_attentions,
                    )

                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, encoder_states, all_attentions]
                if v is not None
            )
        return transformers.modeling_outputs.BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )


UltravoxConfig.register_for_auto_class()
UltravoxModel.register_for_auto_class()

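For context on fix 2 in the docstring above: expected_seq_length works out to max_source_positions (1500) times the two conv strides (1 and 2), i.e. 3000 mel frames, or 30 seconds at Whisper's 100 frames per second; the relaxed check admits anything shorter, and embed_pos is sliced to match. A small sketch of exercising the modified encoder, assuming an illustrative checkpoint (whisper-base, d_model 512):

import torch

# base_model_prefix = "model.encoder" lets the encoder weights load directly
# from a full Whisper checkpoint (fix 1 in the docstring).
encoder = ModifiedWhisperEncoder.from_pretrained("openai/whisper-base")
encoder.eval()

# 80 mel bins x 1000 frames (~10 s): shorter than the 3000-frame (30 s) input
# the stock HF encoder insists on.
mels = torch.randn(1, 80, 1000)
with torch.no_grad():
    out = encoder(input_features=mels)
print(out.last_hidden_state.shape)  # torch.Size([1, 500, 512]): conv2's stride of 2 halves the frames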
3 changes: 3 additions & 0 deletions ultravox/model/ultravox_processing.py
@@ -56,6 +56,9 @@ def __init__(
         assert (
             self.audio_token_replacement is not None
         ), "The tokenizer has no EOS token. Cannot recover."
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+
         super().__init__(audio_processor=audio_processor, tokenizer=tokenizer)
 
     @classmethod
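The pad-token fallback added above matters because many decoder-only tokenizers ship without a pad token, and batched padding requires one. A quick illustration of the failure mode being handled, using GPT-2 purely as an example of a pad-less tokenizer:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
print(tok.pad_token_id)  # None: padding a batch would raise an error

# Reuse EOS as PAD, mirroring the fallback added in this diff.
tok.pad_token_id = tok.eos_token_id
batch = tok(["hi", "a longer input"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)  # both rows padded to the longest sequence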
141 changes: 0 additions & 141 deletions ultravox/model/whisper_model_modified.py

This file was deleted.

1 change: 1 addition & 0 deletions ultravox/training/config_base.py
@@ -43,6 +43,7 @@ class TrainConfig:
     eval_num_samples: int = 100
     eval_max_new_tokens: Optional[int] = None
     eval_num_procs: int = 8
+    eval_text_only: bool = False
     num_prompts: int = 1
     # number of data loader workers
     num_workers: int = 8 if torch.cuda.is_available() else 1
2 changes: 1 addition & 1 deletion ultravox/training/configs/meta_config.yaml
@@ -11,7 +11,7 @@ max_audio_duration_secs: 16
 
 val_num_samples: 64
 val_steps: 1000
-eval_num_samples: 256
+eval_num_samples: 1024
 eval_max_new_tokens: 32
 eval_num_procs: 16