diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 00000000..656c0044 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Run tests + +on: + push: + branches: [main, prod] + pull_request: + branches: [main, prod] + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + lfs: true + + - uses: extractions/setup-just@v1 + + - name: Set up Python 3.11 + uses: actions/setup-python@v2 + with: + python-version: 3.11 + + - name: Install dependencies + run: | + just install + + - name: Run checks + run: | + just check + + - name: Run tests + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + just test diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f6513a5d --- /dev/null +++ b/.gitignore @@ -0,0 +1,191 @@ +.vscode/ +.venv/ +.hermit/ +runs/ +outputs/ +wandb/ + +artifacts/ + +output_audio_processor/ +output_tokenizer/ + +*.csv +*.json +epd_eval/ +.git/ +env_vars.sh +sync_watch +clearml.conf +aml.md +.DS_Store +*.safetensors +*.pt + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ +.vscode/ + +.neptune/ +mds_output/ +mlruns/ +output/ + diff --git a/Justfile b/Justfile new file mode 100644 index 00000000..4e1b3a07 --- /dev/null +++ b/Justfile @@ -0,0 +1,67 @@ +export WANDB_PROJECT:="ultravox" +export WANDB_LOG_MODEL:="checkpoint" +export PROJECT_DIR:="ultravox" +export VENV_NAME:="venv" +export MCLOUD_CLUSTER:="r7z22" +export MCLOUD_INSTANCE:="oci.bm.gpu.b4.8" + +default: format check test + +create-venv: + pip install --upgrade virtualenv # older virtualenv had some issues in Debian + python -m venv ${VENV_NAME} + just install + +install: + # Install torch 2.2.1 if needed, not present in requirements.txt + just python -c \"import torch\" 2>/dev/null || just pip install torch==2.2.1 + just pip install -r requirements.txt + just pip install -r requirements-dev.txt + just python -m pip install types-requests + +format: + . ./activate ${VENV_NAME} && autoflake ${PROJECT_DIR} --remove-all-unused-imports --quiet --in-place -r --exclude third_party --exclude ultravox/model/gazelle + . 
./activate ${VENV_NAME} && isort ${PROJECT_DIR} --force-single-line-imports + . ./activate ${VENV_NAME} && black ${PROJECT_DIR} + +check: + . ./activate ${VENV_NAME} && black ${PROJECT_DIR} --check + . ./activate ${VENV_NAME} && isort ${PROJECT_DIR} --check --force-single-line-imports + . ./activate ${VENV_NAME} && autoflake ${PROJECT_DIR} --check --quiet --remove-all-unused-imports -r --exclude third_party --exclude ultravox/model/gazelle + . ./activate ${VENV_NAME} && mypy ${PROJECT_DIR} + +test *ARGS=".": + . ./activate ${VENV_NAME} && cd ${PROJECT_DIR} && pytest --ignore third_party {{ARGS}} + +@python *FLAGS: + . ./activate ${VENV_NAME} && python {{FLAGS}} + +@pip *FLAGS: + . ./activate ${VENV_NAME} && pip {{FLAGS}} + +train *FLAGS: + just python -m ultravox.training.train {{FLAGS}} + +train_asr *FLAGS: + just train --config_path ultravox/training/configs/asr_tinyllama.yaml {{FLAGS}} + +browse *FLAGS: + just python -m ultravox.tools.data_tool {{FLAGS}} + +infer *FLAGS: + just python -m ultravox.tools.infer_tool {{FLAGS}} + +eval *FLAGS: + just python -m ultravox.tools.eval_tool {{FLAGS}} + +mds *FLAGS: + just python -m ultravox.tools.mds_tool {{FLAGS}} + +gradio *FLAGS: + just python -m ultravox.tools.gradio_demo {{FLAGS}} + +run *FLAGS: + mcli run -f mcloud.yaml --follow {{FLAGS}} + +mcloud *FLAGS: + mcli interactive {{FLAGS}} --cluster ${MCLOUD_CLUSTER} --instance ${MCLOUD_INSTANCE} --name `whoami` --command "bash -c \"$(cat setup.sh)\"" diff --git a/README.md b/README.md new file mode 100644 index 00000000..05c36423 --- /dev/null +++ b/README.md @@ -0,0 +1,182 @@ +

+ + Ultravox + +

+ +

+An open, fast, and extensible multimodal LLM +

+ +# About + +Ultravox is a new kind of multimodal LLM that can understand text as well as human speech, without the need for a separate Automatic Speech Recognition (ASR) stage. Building on research like [AudioLM](https://arxiv.org/abs/2209.03143), [SeamlessM4T](https://ai.meta.com/blog/seamless-m4t/), [Gazelle](https://tincans.ai/slm), [SpeechGPT](https://github.com/0nutation/SpeechGPT/tree/main/speechgpt), and others, we've extended Meta's [Llama 3 model](https://llama.meta.com/llama3/) with a multimodal projector that converts audio directly into the high-dimensional space used by Llama 3. This direct coupling allows Ultravox to respond much more quickly than systems that combine separate ASR and LLM components. In the future this will also allow Ultravox to natively understand the paralinguistic cues of timing and emotion that are omnipresent in human speech. + +The current version of Ultravox (v0.1), when invoked with audio content, has a time-to-first-token (TTFT) of approximately 200ms, and a tokens-per-second rate of ~100, all using a Llama 3 8B backbone. While quite fast, we believe there is considerable room for improvement in these numbers. + +Ultravox currently takes in audio and emits streaming text. As we evolve the model, we'll train it to be able to emit a stream of speech tokens that can then be converted directly into raw audio by an appropriate unit vocoder. We'd love to work with interested parties to build this functionality! + +### Demo + +Coming soon! + +### Discord + +Join us on our Discord server [here](https://discord.gg/YhX5GjCH). + +### Inference Server + +You can try out Ultravox using your own audio content (as a WAV file), using the following curl command: + +``` +curl -X POST -H "Authorization: Bearer $ULTRAVOX_API_KEY" -d @data.json https://ultravox.api.fixie.ai/v1/chat/completions +``` + +where `data.json` contains: + +``` +{ + "model": "fixie-ai/ultravox-v0.1", + "content": [ + { + "type": "text", + "text": "What’s in <|audio|>?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": "data:audio/wav;base64,<BASE64_ENCODED_WAV>" + } + } + ], + "stream": true +} +``` + +### Model + +You can download the latest weights from the [Ultravox Hugging Face page](https://huggingface.co/fixie-ai/ultravox). + +### Architecture + +https://docs.google.com/presentation/d/1ey81xuuMzrJaBwztb_Rq24Cit37GQokD2aAes_KkGVI/edit + +# Contributing + +Read on if you're interested in training your own version of Ultravox. + +## Environment Setup (Mac) + +Install the basic tools: + +- [`Homebrew`](https://brew.sh) is a package manager for macOS that also mostly works for Linux. If you're running Debian or Ubuntu Linux, you can alternatively get by with apt. +- [`Just`](https://just.systems/man/en/) simplifies our shell workflows. It frequently functions as our interface to all the other tools. + +```bash +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" +brew update +brew install just +``` + +Create a Python virtual environment and install the necessary packages: + +```bash +just create-venv +``` + +For now we're using venv for Python virtual environments. +We may switch to `Poetry` in the future. + +### Mosaic Environment Setup + +You need to set up a few things to run on the Mosaic Platform. + +1. Install & log in to the Mosaic CLI + +```bash +pip install --upgrade mosaicml-cli + +mcli init + +mcli set api-key <new-value> +``` + +2. Set API keys for tools we use: + +```bash +# Hugging Face token for accessing gated data and models +mcli create secret env HF_TOKEN=hf_<your_token> + +# WandB token for logging experiments +mcli create secret env WANDB_PROJECT=ultravox +mcli create secret env WANDB_API_KEY=<your_wandb_key> + +# GCP credentials for accessing data (e.g. 
BoolQ) +# Get service_account.json file from Justin/Farzad and put it in the root dir, then +mcli create secret gcp +``` + +## Training + +```bash +just train +``` + +For DDP training make sure to use: +`torchrun --nproc_per_node=8 -m ultravox.training.train` + +### Local Training + +```bash +python -m ultravox.training.train --config_path ultravox/training/configs/asr_tinyllama.yaml --data_set 'dummy' --device cpu --batch_size 1 --exp_name <exp_name> +``` + +### MosaicML Training + +You need to set up your SSH key in the Mosaic Platform: https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/ssh.html#page-secrets-ssh + +```bash +## Create a new SSH key and add it to the Mosaic Platform +# ssh-keygen -f ~/.ssh/mclid_id_rsa +## add the **public** key to GitHub +# mcli create secret ssh ~/.ssh/mclid_id_rsa + +mcli run -f mcloud.yaml --follow +``` + +Other useful commands: + +```bash +mcli get clusters + +mcli util r7z2 +mcli get runs +mcli get runs --cluster r7z2 + +mcli run -f mcloud.yaml --follow +``` + +For interactive runs, we don't recommend using `--interactive`. Instead set the `command` to be something like +`sleep 3600` and then connect to it using `mcli connect <run_name> --tmux`. +This way your environment (code and packages) will be the same as the training environment. +The value `3600` (1 hour) is used as an example. + +IMPORTANT: Make sure to stop the machine when you're done with any job, especially interactive ones! + +### Running evaluations + +1. Use `infer_tool.py --json > file` to create a jsonl output from a given model/dataset combo, where each line contains two values: **question** and **answer**. +2. Use `eval_tool.py -f file` to evaluate the jsonl file, which will produce an average score for the model on the dataset. 
+ +## Misc + +Useful commands: + +```bash +just update # update dependencies +just format # run formatting (black, isort, autoflake) +just python # activate venv and run python +just pip # install a package in the venv using the right pip +``` + +The `legacy` directory contains some initial experiments. We'll pull in the useful parts as we go. diff --git a/activate b/activate new file mode 100755 index 00000000..fe47bd14 --- /dev/null +++ b/activate @@ -0,0 +1,6 @@ +#!/bin/sh +# If we are in CI, we want to use the existing venv due to disk space issues +set +u +if [ -z "${CI}" ]; then + . $1/bin/activate +fi diff --git a/docs/assets/Introducing Banner.svg b/docs/assets/Introducing Banner.svg new file mode 100644 index 00000000..7fd44084 --- /dev/null +++ b/docs/assets/Introducing Banner.svg @@ -0,0 +1,1422 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Introducing: + + \ No newline at end of file diff --git a/docs/assets/UV Hero Image (1).png b/docs/assets/UV Hero Image (1).png new file mode 100644 index 00000000..71a1724d Binary files /dev/null and b/docs/assets/UV Hero Image (1).png differ diff --git a/docs/assets/UV logo black.svg b/docs/assets/UV logo black.svg new file mode 100644 index 00000000..1834a701 --- /dev/null +++ b/docs/assets/UV logo black.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/assets/UV 
logo color dark.svg b/docs/assets/UV logo color dark.svg new file mode 100644 index 00000000..5bb5facf --- /dev/null +++ b/docs/assets/UV logo color dark.svg @@ -0,0 +1,57 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/assets/UV logo color light.svg b/docs/assets/UV logo color light.svg new file mode 100644 index 00000000..45402924 --- /dev/null +++ b/docs/assets/UV logo color light.svg @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/assets/UV logo white.svg b/docs/assets/UV logo white.svg new file mode 100644 index 00000000..33d0e339 --- /dev/null +++ b/docs/assets/UV logo white.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/assets/UV stacked Black.svg b/docs/assets/UV stacked Black.svg new file mode 100644 index 00000000..75a20841 --- /dev/null +++ b/docs/assets/UV stacked Black.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/assets/UV stacked color dark.svg b/docs/assets/UV stacked color dark.svg new file mode 100644 index 00000000..94e9a6f2 --- /dev/null +++ b/docs/assets/UV stacked color dark.svg @@ -0,0 +1,57 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/assets/UV stacked color light.svg b/docs/assets/UV stacked color light.svg new file mode 100644 index 00000000..e5166215 --- /dev/null +++ b/docs/assets/UV stacked color light.svg @@ -0,0 +1,52 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/assets/UV stacked white.svg b/docs/assets/UV stacked white.svg new file mode 100644 index 00000000..e119e4da --- /dev/null +++ b/docs/assets/UV stacked white.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/docs/assets/foo.txt b/docs/assets/foo.txt new file mode 100644 index 00000000..257cc564 --- /dev/null +++ b/docs/assets/foo.txt @@ -0,0 +1 @@ +foo diff --git a/legacy/Justfile b/legacy/Justfile new file mode 100644 index 00000000..13ebd912 --- /dev/null +++ b/legacy/Justfile @@ -0,0 +1,25 @@ +export WANDB_PROJECT:="ultravox" +export WANDB_LOG_MODEL:="checkpoint" +export PROJECT_DIR:="ultravox" + +poetry *FLAGS: + cd ${PROJECT_DIR} && poetry {{FLAGS}} + +python *FLAGS: + cd ${PROJECT_DIR} && poetry run python {{FLAGS}} + +format: + cd ${PROJECT_DIR} && poetry run autoflake . --remove-all-unused-imports --quiet --in-place -r --exclude third_party + cd ${PROJECT_DIR} && poetry run isort . --force-single-line-imports + cd ${PROJECT_DIR} && poetry run black . + +check: + cd ${PROJECT_DIR} && poetry check + cd ${PROJECT_DIR} && poetry run black . --check + cd ${PROJECT_DIR} && poetry run isort . --check --force-single-line-imports + cd ${PROJECT_DIR} && poetry run autoflake . --check --quiet --remove-all-unused-imports -r --exclude third_party + cd ${PROJECT_DIR} && poetry run mypy . + cd ${PROJECT_DIR} && poetry run deptry . 
+ +test *ARGS="--dist loadgroup -n auto .": + cd ${PROJECT_DIR} && poetry run pytest --ignore third_party {{ARGS}} diff --git a/legacy/README.md b/legacy/README.md new file mode 100644 index 00000000..4497d3d5 --- /dev/null +++ b/legacy/README.md @@ -0,0 +1,94 @@ +# UltraVox + +## AzureML + +### Installation and Config + +```bash +brew update && brew install azure-cli +az extension add --name ml --yes + +az login +az account set --subscription 520aa0b2-6a19-4a45-8c03-4c301d1f847a +az configure --defaults workspace=gpu-supercomput +``` + +```bash +az ml job create -f ./azureml/configs/audiollm.yml --web +``` + +## Random Documentation + +### LLM + AudioEnc (ours) vs SpeechGPT + +```python +# SpeechGPT adds new tokens to the embedding and then trains them +nn.Embedding(32000, 2048) + nn.Embedding(4000, 2048) # old text tokens + new audio tokens +nn.Embedding(36000, 2048) + +### +# In other words: +### + +# SpeechGPT tokenizes audio and text separately, then concatenates the embeddings +llm(embed(concat(audio_tokenizer(audio), text_token))) +## ------------------- vs ------------------- +# We create the audio embeddings directly from the audio and skip embedding the audio tokens +llm(concat(audio_enc(audio) * weight, embed(text_token))) +# This means we can easily propagate gradients to the audio encoder (i.e. train end to end) +``` + +### How does language modeling work with audio? 
+ +```python +# t[n] <- t[1..n-1] +# The brown fox jumps over the fence + +# a1 a2 a3 a4 The brown fox jumps over the fence +# samples: +# a1 a2 a3 -> a4 +# a1 a2 a3 a4 -> The +# a1 a2 a3 a4 The -> brown +# a1 a2 a3 a4 The brown -> fox +# a1 a2 a3 a4 The brown fox -> jumps +# a1 a2 a3 a4 The brown fox jumps -> over +# a1 a2 a3 a4 The brown fox jumps over -> the +# a1 a2 a3 a4 The brown fox jumps over the -> fence +``` + +## TODO + +- [ ] generation metrics (low_pri: added more metrics to cover shifts) +- [x] more metrics to cover shifts +- [x] torchrun +- [ ] shard dataset +- [ ] datasets.distributed.split_dataset_by_node +- [ ] cache preprocessed data +- [ ]