improve gradio
huseinzol05 committed Dec 12, 2024
1 parent 61d2532 commit 5237750
Showing 14 changed files with 3,747 additions and 34 deletions.
28 changes: 10 additions & 18 deletions gradio/f5-tts/app.py
@@ -49,13 +49,15 @@
overlap_length = None
speech_enhancement_hop_length = None

MODEL_NAME = os.environ.get('MODEL_NAME', 'mesolitica/Malaysian-F5-TTS')
VOCODER_NAME = os.environ.get('VOCODER_NAME', 'charactr/vocos-mel-24khz')
maxlen_text = int(os.environ.get('MAXLEN_TEXT', '1000'))
maxlen = 20000
maxlen_str = f'{maxlen // 1000} seconds'

examples = []
for f in glob('*.mp3'):
examples.append([f, '', 'Model Text to Speech TTS ini dibangunkan seratus peratus oleh Mesolitica, syarikat pemula di Malaysia yang membangunkan juga Malaysia Large Language Model mallam.', False, True, 0.15, 1.0])
examples.append([f, '', 'Model Text to Speech TTS ini dibangunkan seratus peratus oleh Mesolitica, syarikat pemula di Malaysia yang membangunkan juga Malaysia Large Language Model mallam.', False, 0.15, 1.0])

def load_speech_enhancement():
global speech_enhancement, hp, speech_enhancement_sr, chunk_length, overlap_length, speech_enhancement_hop_length
@@ -85,8 +87,8 @@ def load_asr_pipe():
def load_tts():
global model, vocoder
gr.Info('Loading TTS model.')
model = load_f5_tts('mesolitica/Malaysian-F5-TTS', device = device, dtype = torch.float16)
vocoder = load_vocoder('mesolitica/malaysian-vocos-mel-24khz', device = device)
model = load_f5_tts(MODEL_NAME, device = device, dtype = torch.float16)
vocoder = load_vocoder(VOCODER_NAME, device = device)
convert_char_to_pinyin(['helo'])
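With this change the TTS checkpoint, vocoder, and text-length cap come from environment variables instead of being hardcoded, so a deployment can swap models without editing app.py. A minimal launch sketch (the repo ids are the defaults from the diff above; the MAXLEN_TEXT value is only an illustrative override):

```python
# Sketch only: start the demo with the new env-var configuration overridden.
import os
import subprocess

env = dict(
    os.environ,
    MODEL_NAME="mesolitica/Malaysian-F5-TTS",   # any compatible F5-TTS checkpoint
    VOCODER_NAME="charactr/vocos-mel-24khz",    # any compatible Vocos mel vocoder
    MAXLEN_TEXT="500",                          # presumably caps the input text length
)
subprocess.run(["python3", "app.py"], env=env, check=True)
```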


@@ -163,7 +165,6 @@ def basic_tts(
ref_text_input,
gen_text_input,
reference_enhancement,
output_enhancement,
cross_fade_duration_slider,
speed_slider,
):
@@ -308,13 +309,10 @@ def basic_tts(
final_wave = new_wave
y = final_wave

if output_enhancement:
y = speech_enhancement_func(torch.tensor(y), sr, resample_back = False)
sr = speech_enhancement_sr
y = y.numpy()
e_y = speech_enhancement_func(torch.tensor(y), sr, resample_back = False)
e_y = e_y.numpy()

audio = (sr, y)
return [audio, ref_text_input]
return [(sr, y), (speech_enhancement_sr, e_y), ref_text_input]

with gr.Blocks(theme=theme) as demo:
gr.Markdown(
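Speech enhancement of the output is no longer optional: basic_tts now always produces an enhanced copy of the generated waveform and returns it alongside the raw synthesis and the reference text, which is what the two audio components added below consume. A reduced sketch of that return contract (the helper name is illustrative, not from the commit):

```python
# Illustrative sketch of the new basic_tts return shape: raw audio stays at the
# model sample rate, the enhanced copy comes back at the enhancer's sample rate.
import numpy as np
import torch

def finish_tts(y: np.ndarray, sr: int, enhance, enhanced_sr: int, ref_text: str):
    e_y = enhance(torch.tensor(y), sr, resample_back=False).numpy()
    # -> [audio_output, enhanced_audio_output, ref_text_input]
    return [(sr, y), (enhanced_sr, e_y), ref_text]
```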
@@ -348,11 +346,6 @@ def basic_tts(
info="Apply Speech Enhancement to reduce noise for reference audio, this will also increase generation time.",
value=False,
)
output_enhancement = gr.Checkbox(
label="Output Enhancement",
info="Apply Speech Enhancement to reduce noise for generated audio, this will also increase generation time.",
value=True,
)
speed_slider = gr.Slider(
label="Speed",
minimum=0.3,
@@ -370,6 +363,7 @@ def basic_tts(
info="Set the duration of the cross-fade between audio clips.",
)
audio_output = gr.Audio(label="Synthesized Audio", show_download_button = True)
enhanced_audio_output = gr.Audio(label="Enhanced Synthesized Audio", show_download_button = True)
generate_btn = gr.Button("Synthesize", variant="primary")

generate_btn.click(
@@ -379,11 +373,10 @@ def basic_tts(
ref_text_input,
gen_text_input,
reference_enhancement,
output_enhancement,
cross_fade_duration_slider,
speed_slider,
],
outputs=[audio_output, ref_text_input],
outputs=[audio_output, enhanced_audio_output, ref_text_input],
)
examples = gr.Examples(
examples=examples,
@@ -392,7 +385,6 @@ def basic_tts(
ref_text_input,
gen_text_input,
reference_enhancement,
output_enhancement,
cross_fade_duration_slider,
speed_slider,
],
Binary file added gradio/f5-tts/kj.mp3
Binary file not shown.
Binary file modified gradio/f5-tts/p-ramlee.mp3
Binary file not shown.
23 changes: 9 additions & 14 deletions session/f5-tts/README.md
@@ -1,6 +1,6 @@
# F5-TTS

## how to
## how to Speech Enhancement

1. Download dataset,

@@ -11,19 +11,15 @@ tar -xf 7z2301-linux-x64.tar.xz
pip3 install huggingface-hub wandb
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Emilia-annotated', repo_type='dataset', allow_patterns = 'filtered-24k_processed_24k.z*', local_dir = './')
snapshot_download(repo_id='mesolitica/Malaysian-Emilia', repo_type='dataset', allow_patterns = 'filtered-24k_processed.z*', local_dir = './')
"
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Emilia-annotated', repo_type='dataset', allow_patterns = 'malaysian-podcast_processed_24k.z*', local_dir = './')
snapshot_download(repo_id='mesolitica/Malaysian-Emilia', repo_type='dataset', allow_patterns = 'malaysian-podcast-processed.z*', local_dir = './')
"
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Emilia-annotated', repo_type='dataset', allow_patterns = 'sg-podcast_processed_24k.zip', local_dir = './')
"
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Emilia-annotated', repo_type='dataset', allow_patterns = 'parlimen-24k-chunk_processed_24k.z*', local_dir = './')
snapshot_download(repo_id='mesolitica/Malaysian-Emilia', repo_type='dataset', allow_patterns = 'sg-podcast_processed.zip', local_dir = './')
"
python3 -c "
from huggingface_hub import snapshot_download
@@ -33,12 +29,11 @@ python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Emilia', repo_type='dataset', allow_patterns = 'parlimen-24k-chunk_processed.z*', local_dir = './')
"
/workspace/7zz x filtered-24k_processed_24k.zip -y -mmt40
/workspace/7zz x malaysian-podcast_processed_24k.zip -y -mmt40
/workspace/7zz x sg-podcast_processed_24k.zip -y -mmt40
/workspace/7zz x parlimen-24k-chunk_processed_24k.zip -y -mmt40
/workspace/7zz x malaysian-cartoon.zip -y -mmt40
/workspace/7zz x filtered-24k_processed.zip -y -mmt40
/workspace/7zz x malaysian-podcast-processed.zip -y -mmt40
/workspace/7zz x sg-podcast_processed.zip -y -mmt40
/workspace/7zz x parlimen-24k-chunk_processed.zip -y -mmt40
/workspace/7zz x malaysian-cartoon.zip -y -mmt40
```

2. Install libraries,
@@ -47,7 +42,7 @@ snapshot_download(repo_id='mesolitica/Malaysian-Emilia', repo_type='dataset', al
git clone https://github.com/mesolitica/F5-TTS
cd F5-TTS
pip3 install -e .
pip3 install torchdiffeq x-transformers jieba pypinyin ema_pytorch accelerate==1.1.1
pip3 install torchdiffeq x-transformers jieba pypinyin ema_pytorch accelerate==1.1.1 torch==2.5.1 torchaudio==2.5.1
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Voice-Conversion', repo_type='dataset', allow_patterns = 'data/Emilia_Malaysian_pinyin/*', local_dir = './')
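The download steps above repeat the same snapshot_download-then-7zz pattern for every archive. A compact equivalent is sketched below; the repo id, patterns, and /workspace/7zz path are taken from the README, everything else is an assumption (malaysian-cartoon.zip, fetched in a step not shown here, follows the same pattern):

```python
# Sketch: fetch and extract the Malaysian-Emilia training archives in one loop.
import subprocess
from huggingface_hub import snapshot_download

archives = {
    "filtered-24k_processed.z*": "filtered-24k_processed.zip",
    "malaysian-podcast-processed.z*": "malaysian-podcast-processed.zip",
    "sg-podcast_processed.zip": "sg-podcast_processed.zip",
    "parlimen-24k-chunk_processed.z*": "parlimen-24k-chunk_processed.zip",
}
for pattern, zip_name in archives.items():
    snapshot_download(
        repo_id="mesolitica/Malaysian-Emilia",
        repo_type="dataset",
        allow_patterns=pattern,
        local_dir="./",
    )
    subprocess.run(["/workspace/7zz", "x", zip_name, "-y", "-mmt40"], check=True)
```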
17 changes: 17 additions & 0 deletions session/f5-tts/default_config.yaml
@@ -0,0 +1,17 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
enable_cpu_affinity: false
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
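The new accelerate config pins a single machine with 4 processes (one per GPU), static rendezvous, and no mixed precision. One way to point the trainer at it, sketched below; `accelerate launch --config_file` is standard accelerate CLI usage, but the relative paths are assumptions:

```python
# Sketch: launch the F5-TTS trainer under the new 4-GPU accelerate config.
import subprocess

subprocess.run(
    [
        "accelerate", "launch",
        "--config_file", "session/f5-tts/default_config.yaml",
        "session/f5-tts/train.py",
    ],
    check=True,
)
```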
4 changes: 2 additions & 2 deletions session/f5-tts/train.py
@@ -34,7 +34,7 @@
epochs = 100 # use linear decay, thus epochs control the slope
num_warmup_updates = 2000 # warmup steps
save_per_updates = 50000 # save checkpoint per steps
last_per_steps = 500 # save last checkpoint per steps
last_per_steps = 2000 # save last checkpoint per steps

# model params
if exp_name == "F5TTS_Base":
@@ -84,7 +84,7 @@ def main():
max_samples=max_samples,
grad_accumulation_steps=grad_accumulation_steps,
max_grad_norm=max_grad_norm,
wandb_project="CFM-TTS",
wandb_project="CFM-TTS-original",
wandb_run_name=exp_name,
wandb_resume_id=wandb_resume_id,
last_per_steps=last_per_steps,
19 changes: 19 additions & 0 deletions session/smollm2-speech-semantics/README.md
@@ -0,0 +1,19 @@
# Finetune SmolLM2 for speech semantic tokens

## how to

1. Clone the dataset,

```bash
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/smollm2-speech-semantic-multipack-2048', repo_type='dataset', local_dir = './smollm2-speech-semantic-multipack-2048')
"
```

2. Finetune,

```bash
smollm2-135m-speech.sh
smollm2-360m-speech.sh
```
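Before kicking off the finetune scripts, a quick check that the step-1 snapshot actually landed in the expected folder can save a failed run. A small, optional sketch (the directory name comes from the download command above; the layout itself is an assumption):

```python
# Sketch: sanity-check the downloaded multipack dataset before finetuning.
from pathlib import Path

root = Path("./smollm2-speech-semantic-multipack-2048")
files = [p for p in root.rglob("*") if p.is_file()]
total_gb = sum(p.stat().st_size for p in files) / 1e9
print(f"{len(files)} files, {total_gb:.1f} GB under {root}")
```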