improve gradio
huseinzol05 committed Dec 12, 2024
1 parent 61d2532 commit 5237750
Showing 14 changed files with 3,747 additions and 34 deletions.
28 changes: 10 additions & 18 deletions gradio/f5-tts/app.py
@@ -49,13 +49,15 @@
overlap_length = None
speech_enhancement_hop_length = None

MODEL_NAME = os.environ.get('MODEL_NAME', 'mesolitica/Malaysian-F5-TTS')
VOCODER_NAME = os.environ.get('VOCODER_NAME', 'charactr/vocos-mel-24khz')
maxlen_text = int(os.environ.get('MAXLEN_TEXT', '1000'))
maxlen = 20000
maxlen_str = f'{maxlen // 1000} seconds'

examples = []
for f in glob('*.mp3'):
examples.append([f, '', 'Model Text to Speech TTS ini dibangunkan seratus peratus oleh Mesolitica, syarikat pemula di Malaysia yang membangunkan juga Malaysia Large Language Model mallam.', False, True, 0.15, 1.0])
examples.append([f, '', 'Model Text to Speech TTS ini dibangunkan seratus peratus oleh Mesolitica, syarikat pemula di Malaysia yang membangunkan juga Malaysia Large Language Model mallam.', False, 0.15, 1.0])

def load_speech_enhancement():
global speech_enhancement, hp, speech_enhancement_sr, chunk_length, overlap_length, speech_enhancement_hop_length
@@ -85,8 +87,8 @@ def load_asr_pipe():
def load_tts():
global model, vocoder
gr.Info('Loading TTS model.')
model = load_f5_tts('mesolitica/Malaysian-F5-TTS', device = device, dtype = torch.float16)
vocoder = load_vocoder('mesolitica/malaysian-vocos-mel-24khz', device = device)
model = load_f5_tts(MODEL_NAME, device = device, dtype = torch.float16)
vocoder = load_vocoder(VOCODER_NAME, device = device)
convert_char_to_pinyin(['helo'])
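With this change the TTS checkpoint, vocoder, and text-length cap come from environment variables instead of being hardcoded, so a deployment can swap models without editing app.py. A minimal launch sketch (the repo ids are the defaults from the diff above; the MAXLEN_TEXT value is only an illustrative override):

```python
# Sketch only: start the demo with the new env-var configuration overridden.
import os
import subprocess

env = dict(
    os.environ,
    MODEL_NAME="mesolitica/Malaysian-F5-TTS",   # any compatible F5-TTS checkpoint
    VOCODER_NAME="charactr/vocos-mel-24khz",    # any compatible Vocos mel vocoder
    MAXLEN_TEXT="500",                          # presumably caps the input text length
)
subprocess.run(["python3", "app.py"], env=env, check=True)
```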


@@ -163,7 +165,6 @@ def basic_tts(
ref_text_input,
gen_text_input,
reference_enhancement,
output_enhancement,
cross_fade_duration_slider,
speed_slider,
):
@@ -308,13 +309,10 @@ def basic_tts(
final_wave = new_wave
y = final_wave

if output_enhancement:
y = speech_enhancement_func(torch.tensor(y), sr, resample_back = False)
sr = speech_enhancement_sr
y = y.numpy()
e_y = speech_enhancement_func(torch.tensor(y), sr, resample_back = False)
e_y = e_y.numpy()

audio = (sr, y)
return [audio, ref_text_input]
return [(sr, y), (speech_enhancement_sr, e_y), ref_text_input]

with gr.Blocks(theme=theme) as demo:
gr.Markdown(
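Speech enhancement of the output is no longer optional: basic_tts now always produces an enhanced copy of the generated waveform and returns it alongside the raw synthesis and the reference text, which is what the two audio components added below consume. A reduced sketch of that return contract (the helper name is illustrative, not from the commit):

```python
# Illustrative sketch of the new basic_tts return shape: raw audio stays at the
# model sample rate, the enhanced copy comes back at the enhancer's sample rate.
import numpy as np
import torch

def finish_tts(y: np.ndarray, sr: int, enhance, enhanced_sr: int, ref_text: str):
    e_y = enhance(torch.tensor(y), sr, resample_back=False).numpy()
    # -> [audio_output, enhanced_audio_output, ref_text_input]
    return [(sr, y), (enhanced_sr, e_y), ref_text]
```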
@@ -348,11 +346,6 @@ def basic_tts(
info="Apply Speech Enhancement to reduce noise for reference audio, this will also increase generation time.",
value=False,
)
output_enhancement = gr.Checkbox(
label="Output Enhancement",
info="Apply Speech Enhancement to reduce noise for generated audio, this will also increase generation time.",
value=True,
)
speed_slider = gr.Slider(
label="Speed",
minimum=0.3,
@@ -370,6 +363,7 @@ def basic_tts(
info="Set the duration of the cross-fade between audio clips.",
)
audio_output = gr.Audio(label="Synthesized Audio", show_download_button = True)
enhanced_audio_output = gr.Audio(label="Enhanced Synthesized Audio", show_download_button = True)
generate_btn = gr.Button("Synthesize", variant="primary")

generate_btn.click(
@@ -379,11 +373,10 @@ def basic_tts(
ref_text_input,
gen_text_input,
reference_enhancement,
output_enhancement,
cross_fade_duration_slider,
speed_slider,
],
outputs=[audio_output, ref_text_input],
outputs=[audio_output, enhanced_audio_output, ref_text_input],
)
examples = gr.Examples(
examples=examples,
@@ -392,7 +385,6 @@ def basic_tts(
ref_text_input,
gen_text_input,
reference_enhancement,
output_enhancement,
cross_fade_duration_slider,
speed_slider,
],
Binary file added gradio/f5-tts/kj.mp3
Binary file not shown.
Binary file modified gradio/f5-tts/p-ramlee.mp3
Binary file not shown.
23 changes: 9 additions & 14 deletions session/f5-tts/README.md
@@ -1,6 +1,6 @@
# F5-TTS

## how to
## how to Speech Enhancement

1. Download dataset,

@@ -11,19 +11,15 @@ tar -xf 7z2301-linux-x64.tar.xz
pip3 install huggingface-hub wandb
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Emilia-annotated', repo_type='dataset', allow_patterns = 'filtered-24k_processed_24k.z*', local_dir = './')
snapshot_download(repo_id='mesolitica/Malaysian-Emilia', repo_type='dataset', allow_patterns = 'filtered-24k_processed.z*', local_dir = './')
"
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Emilia-annotated', repo_type='dataset', allow_patterns = 'malaysian-podcast_processed_24k.z*', local_dir = './')
snapshot_download(repo_id='mesolitica/Malaysian-Emilia', repo_type='dataset', allow_patterns = 'malaysian-podcast-processed.z*', local_dir = './')
"
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Emilia-annotated', repo_type='dataset', allow_patterns = 'sg-podcast_processed_24k.zip', local_dir = './')
"
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Emilia-annotated', repo_type='dataset', allow_patterns = 'parlimen-24k-chunk_processed_24k.z*', local_dir = './')
snapshot_download(repo_id='mesolitica/Malaysian-Emilia', repo_type='dataset', allow_patterns = 'sg-podcast_processed.zip', local_dir = './')
"
python3 -c "
from huggingface_hub import snapshot_download
@@ -33,12 +29,11 @@ python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Emilia', repo_type='dataset', allow_patterns = 'parlimen-24k-chunk_processed.z*', local_dir = './')
"
/workspace/7zz x filtered-24k_processed_24k.zip -y -mmt40
/workspace/7zz x malaysian-podcast_processed_24k.zip -y -mmt40
/workspace/7zz x sg-podcast_processed_24k.zip -y -mmt40
/workspace/7zz x parlimen-24k-chunk_processed_24k.zip -y -mmt40
/workspace/7zz x malaysian-cartoon.zip -y -mmt40
/workspace/7zz x filtered-24k_processed.zip -y -mmt40
/workspace/7zz x malaysian-podcast-processed.zip -y -mmt40
/workspace/7zz x sg-podcast_processed.zip -y -mmt40
/workspace/7zz x parlimen-24k-chunk_processed.zip -y -mmt40
/workspace/7zz x malaysian-cartoon.zip -y -mmt40
```

2. Install libraries,
@@ -47,7 +42,7 @@ snapshot_download(repo_id='mesolitica/Malaysian-Emilia', repo_type='dataset', al
git clone https://github.com/mesolitica/F5-TTS
cd F5-TTS
pip3 install -e .
pip3 install torchdiffeq x-transformers jieba pypinyin ema_pytorch accelerate==1.1.1
pip3 install torchdiffeq x-transformers jieba pypinyin ema_pytorch accelerate==1.1.1 torch==2.5.1 torchaudio==2.5.1
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/Malaysian-Voice-Conversion', repo_type='dataset', allow_patterns = 'data/Emilia_Malaysian_pinyin/*', local_dir = './')
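The download steps above repeat the same snapshot_download-then-7zz pattern for every archive. A compact equivalent is sketched below; the repo id, patterns, and /workspace/7zz path are taken from the README, everything else is an assumption (malaysian-cartoon.zip, fetched in a step not shown here, follows the same pattern):

```python
# Sketch: fetch and extract the Malaysian-Emilia training archives in one loop.
import subprocess
from huggingface_hub import snapshot_download

archives = {
    "filtered-24k_processed.z*": "filtered-24k_processed.zip",
    "malaysian-podcast-processed.z*": "malaysian-podcast-processed.zip",
    "sg-podcast_processed.zip": "sg-podcast_processed.zip",
    "parlimen-24k-chunk_processed.z*": "parlimen-24k-chunk_processed.zip",
}
for pattern, zip_name in archives.items():
    snapshot_download(
        repo_id="mesolitica/Malaysian-Emilia",
        repo_type="dataset",
        allow_patterns=pattern,
        local_dir="./",
    )
    subprocess.run(["/workspace/7zz", "x", zip_name, "-y", "-mmt40"], check=True)
```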
17 changes: 17 additions & 0 deletions session/f5-tts/default_config.yaml
@@ -0,0 +1,17 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
enable_cpu_affinity: false
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
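The new accelerate config pins a single machine with 4 processes (one per GPU), static rendezvous, and no mixed precision. One way to point the trainer at it, sketched below; `accelerate launch --config_file` is standard accelerate CLI usage, but the relative paths are assumptions:

```python
# Sketch: launch the F5-TTS trainer under the new 4-GPU accelerate config.
import subprocess

subprocess.run(
    [
        "accelerate", "launch",
        "--config_file", "session/f5-tts/default_config.yaml",
        "session/f5-tts/train.py",
    ],
    check=True,
)
```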
4 changes: 2 additions & 2 deletions session/f5-tts/train.py
@@ -34,7 +34,7 @@
epochs = 100 # use linear decay, thus epochs control the slope
num_warmup_updates = 2000 # warmup steps
save_per_updates = 50000 # save checkpoint per steps
last_per_steps = 500 # save last checkpoint per steps
last_per_steps = 2000 # save last checkpoint per steps

# model params
if exp_name == "F5TTS_Base":
@@ -84,7 +84,7 @@ def main():
max_samples=max_samples,
grad_accumulation_steps=grad_accumulation_steps,
max_grad_norm=max_grad_norm,
wandb_project="CFM-TTS",
wandb_project="CFM-TTS-original",
wandb_run_name=exp_name,
wandb_resume_id=wandb_resume_id,
last_per_steps=last_per_steps,
19 changes: 19 additions & 0 deletions session/smollm2-speech-semantics/README.md
@@ -0,0 +1,19 @@
# Finetune SmolLM2 for speech semantic tokens

## how to

1. Clone the dataset,

```bash
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='mesolitica/smollm2-speech-semantic-multipack-2048', repo_type='dataset', local_dir = './smollm2-speech-semantic-multipack-2048')
"
```

2. Finetune,

```bash
smollm2-135m-speech.sh
smollm2-360m-speech.sh
```
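Before kicking off the finetune scripts, a quick check that the step-1 snapshot actually landed in the expected folder can save a failed run. A small, optional sketch (the directory name comes from the download command above; the layout itself is an assumption):

```python
# Sketch: sanity-check the downloaded multipack dataset before finetuning.
from pathlib import Path

root = Path("./smollm2-speech-semantic-multipack-2048")
files = [p for p in root.rglob("*") if p.is_file()]
total_gb = sum(p.stat().st_size for p in files) / 1e9
print(f"{len(files)} files, {total_gb:.1f} GB under {root}")
```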