Commit: released 1.2.6
huseinzol05 committed May 6, 2022
1 parent 0d0bf45 commit 77042bd
Showing 8 changed files with 116 additions and 107 deletions.
21 changes: 4 additions & 17 deletions docs/huggingface-repository.ipynb
@@ -22,14 +22,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed.\n",
"\n",
"**Starting Malaya-Speech 1.2.7, by default Malaya-Speech will use HuggingFace as backend repository**."
"Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed, and by default Malaya-Speech will use HuggingFace as backend repository."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -38,20 +36,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.2.7'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"malaya_speech.__version__"
]
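For context, a minimal sketch of the behaviour this notebook change documents. Only `malaya_speech.__version__` comes from the notebook itself; the comments summarize the markdown cell above:

```python
import malaya_speech

# Starting 1.2.6, model files are fetched from
# https://huggingface.co/huseinzol05 as the default backend repository,
# which gives better download speed.
print(malaya_speech.__version__)  # '1.2.6' after this commit
```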
21 changes: 4 additions & 17 deletions example/huggingface-repository/huggingface-repository.ipynb
@@ -22,14 +22,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed.\n",
"\n",
"**Starting Malaya-Speech 1.2.7, by default Malaya-Speech will use HuggingFace as backend repository**."
"Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed, and by default Malaya-Speech will use HuggingFace as backend repository."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -38,20 +36,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.2.7'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"malaya_speech.__version__"
]
5 changes: 3 additions & 2 deletions malaya_speech/__init__.py
@@ -9,7 +9,7 @@
from malaya_boilerplate.utils import get_home

version = '1.2'
-bump_version = '1.2.7'
+bump_version = '1.2.6'
__version__ = bump_version

package = 'malaya-speech'
@@ -57,5 +57,6 @@
padding,
split,
subword,
-tf_featurization)
+tf_featurization
+)
from .utils.read import load, resample
1 change: 1 addition & 0 deletions malaya_speech/train/model/hubert/model.py
@@ -279,5 +279,6 @@ def compute_pred(proj_x, target, label_embs):
"logit_u_list": logit_u_list,
"padding_mask": padding_mask,
"features_pen": features_pen,
+'x': x,
}
return result
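The new 'x' key exposes the encoder output sequence alongside the loss terms; the speaker-embedding script later in this commit reads it as `seq = r['x']` and pools the first timestep. A runnable sketch of that pooling, using a stand-in tensor since the real shapes depend on the config:

```python
import tensorflow as tf

# Stand-in for the model's return dict, which now also carries the
# encoder output under 'x' with shape (batch, time, dim).
r = {'x': tf.zeros((4, 100, 768))}

# Pool the first timestep as an utterance-level representation, as
# hubert-base.py in this commit does.
first_token = tf.squeeze(r['x'][:, 0:1, :], axis=1)  # shape (4, 768)
```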
28 changes: 28 additions & 0 deletions malaya_speech/utils/text.py
@@ -16,6 +16,34 @@
_rejected = '\'():;"'
_punct = ':;,.?'

+PRONUNCIATION = {
+'A': 'ae',
+'B': 'bi',
+'C': 'si',
+'D': 'di',
+'E': 'ei',
+'F': 'ef',
+'G': 'ji',
+'H': 'hesh',
+'I': 'ai',
+'J': 'jei',
+'K': 'kei',
+'L': 'el',
+'M': 'eim',
+'N': 'ein',
+'O': 'ou',
+'P': 'pi',
+'Q': 'qeu',
+'R': 'ar',
+'S': 'es',
+'T': 'ti',
+'U': 'yu',
+'V': 'vi',
+'W': 'dablui',
+'X': 'ex',
+'Y': 'wai',
+'Z': 'zed',
+}

TTS_SYMBOLS = (
[_pad, _start, _eos] + list(_special) + list(_punctuation) + list(_letters)
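The new PRONUNCIATION table maps uppercase letters to spoken forms, presumably for spelling out acronyms during TTS text normalization. A hypothetical helper (not part of this commit) showing the intended use:

```python
from malaya_speech.utils.text import PRONUNCIATION

def spell_acronym(word):
    # Hypothetical example: 'IMDA' -> 'ai eim di ae'.
    return ' '.join(PRONUNCIATION[c] for c in word if c in PRONUNCIATION)

print(spell_acronym('IMDA'))
```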
12 changes: 0 additions & 12 deletions malaya_speech/vocoder.py
@@ -42,11 +42,6 @@
'Quantized Size (MB)': 19.9,
'Mel loss': 0.4591,
},
-'universal-384': {
-'Size (MB)': 78.4,
-'Quantized Size (MB)': 19.9,
-'Mel loss': 0.4591,
-},
}

_mbmelgan_availability = {
@@ -83,11 +78,6 @@
'Quantized Size (MB)': 2.49,
'Mel loss': 0.5547,
},
-'universal-1024': {
-'Size (MB)': 72.8,
-'Quantized Size (MB)': 18.5,
-'Mel loss': 0.3617,
-},
'universal-768': {
'Size (MB)': 72.8,
'Quantized Size (MB)': 18.5,
@@ -147,7 +137,6 @@ def melgan(model: str = 'universal-1024', quantized: bool = False, **kwargs):
* ``'female-singlish'`` - MelGAN trained on Female Singlish voice, https://www.imda.gov.sg/programme-listing/digital-services-lab/national-speech-corpus
* ``'universal'`` - Universal MelGAN trained on multiple speakers.
* ``'universal-1024'`` - Universal MelGAN with 1024 filters trained on multiple speakers.
-* ``'universal-384'`` - Universal MelGAN with 384 filters trained on multiple speakers.
quantized : bool, optional (default=False)
if True, will load 8-bit quantized model.
@@ -219,7 +208,6 @@ def hifigan(model: str = 'universal-768', quantized: bool = False, **kwargs):
* ``'female'`` - HiFiGAN trained on female voice.
* ``'male'`` - HiFiGAN trained on male voice.
-* ``'universal-1024'`` - Universal HiFiGAN with 1024 filters trained on multiple speakers.
* ``'universal-768'`` - Universal HiFiGAN with 768 filters trained on multiple speakers.
* ``'universal-512'`` - Universal HiFiGAN with 512 filters trained on multiple speakers.
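With 'universal-384' (MelGAN) and 'universal-1024' (HiFiGAN) dropped here, only the remaining names load. A minimal sketch using the defaults visible in the signatures above; both loaders now resolve through the HuggingFace backend:

```python
import malaya_speech

# Defaults taken from the function signatures in this diff.
melgan = malaya_speech.vocoder.melgan(model='universal-1024')
hifigan = malaya_speech.vocoder.hifigan(model='universal-768', quantized=True)
```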
133 changes: 75 additions & 58 deletions pretrained-model/speaker-embedding/hubert/hubert-base.py
@@ -8,8 +8,6 @@
import malaya_speech.train as train
from malaya_speech.train.model.conformer.model import Model as ConformerModel
from malaya_speech.train.model import hubert
-import tensorflow.keras as keras
-import tensorflow.keras.backend as K
import numpy as np
import string
import json
@@ -27,71 +25,89 @@
test_set = glob('/home/husein/youtube/voxceleb-wav/*.wav')

sr = 16000
-maxlen = 18
-minlen = 3
-weight_decay = 1e-5
+maxlen = 15
+minlen = 2
+kmean = hubert.kmeans.ApplyKmeans_TF('kmean.km')


def generate(files):
while True:
random.shuffle(files)
for f in files:
f = f.decode() if isinstance(f, bytes) else f
-x, _ = malaya_speech.load(f)
+wav_data, _ = malaya_speech.load(f)
label = os.path.split(f)[1].replace('wav-', '').split('-')[1]
y = int(ids[label])

-len_x = len(x)
+len_x = len(wav_data) / sr

-if (len_x / sr) < minlen:
+if len_x < minlen:
continue

-if (len_x / sr) > maxlen:
-x = augmentation.random_sampling(x, sr, random.randint(1000 * minlen, 1000 * maxlen))
+if len_x > maxlen:
+wav_data = augmentation.random_sampling(wav_data, sr, random.randint(1000 * minlen, 1000 * maxlen))

yield {
-'waveforms': x,
-'waveforms_length': [len(x)],
+'waveforms': wav_data,
+'waveforms_length': [len(wav_data)],
'Y': [y],
}


-def get_dataset(files, batch_size=4, shuffle_size=32, thread_count=24):
+def preprocess_inputs(example):
+v = featurizer.vectorize(example['waveforms'])
+deltas = malaya_speech.utils.tf_featurization.deltas(v)
+ddeltas = malaya_speech.utils.tf_featurization.deltas(deltas)
+concated = tf.concat([v, deltas, ddeltas], axis=1)
+s = tf.compat.v1.numpy_function(kmean, [concated], tf.int64)
+s = tf.cast(s, tf.int32)
+kmean_tf = tf.reshape(s, (-1,)) + 3
+example['targets'] = kmean_tf
+return example
+
+
+def get_dataset(
+file,
+batch_size=4,
+shuffle_size=20,
+thread_count=24,
+maxlen_feature=1800,
+):
def get():
dataset = tf.data.Dataset.from_generator(
generate,
-{
-'waveforms': tf.float32,
-'waveforms_length': tf.int32,
-'Y': tf.int32,
-},
+{'waveforms': tf.float32,
+'waveforms_length': tf.int32,
+'Y': tf.int32,
+},
output_shapes={
'waveforms': tf.TensorShape([None]),
'waveforms_length': tf.TensorShape([None]),
'Y': tf.TensorShape([None]),
},
-args=(files,),
-)
-dataset = dataset.filter(
-lambda x: tf.less(tf.shape(x['waveforms'])[0] / sr, maxlen)
+args=(file,),
+)
-dataset = dataset.filter(
-lambda x: tf.greater(tf.shape(x['waveforms'])[0] / sr, minlen)
+dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
+dataset = dataset.map(
+preprocess_inputs, num_parallel_calls=thread_count
+)
dataset = dataset.padded_batch(
-shuffle_size,
+batch_size,
padded_shapes={
'waveforms': tf.TensorShape([None]),
'waveforms_length': tf.TensorShape([None]),
+'targets': tf.TensorShape([None]),
'Y': tf.TensorShape([None]),
},
padding_values={
'waveforms': tf.constant(0, dtype=tf.float32),
'waveforms_length': tf.constant(0, dtype=tf.int32),
+'targets': tf.constant(0, dtype=tf.int32),
'Y': tf.constant(0, dtype=tf.int32),
},
)
return dataset

return get


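On the new targets: preprocess_inputs stacks the featurized audio with its first- and second-order deltas, quantizes each frame with the pretrained k-means model, and shifts the cluster IDs by 3 so IDs 0-2 stay reserved for the 'pad', 'eos', 'unk' tokens passed to hubert.Model below. A NumPy sketch of that shift with made-up IDs:

```python
import numpy as np

# Illustrative frame-level cluster IDs from the k-means quantizer.
cluster_ids = np.array([7, 7, 12, 99])

# Offset by 3 to reserve 0-2 for 'pad', 'eos', 'unk'.
targets = cluster_ids + 3
print(targets)  # [ 10  10  15 102]
```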
@@ -107,12 +123,6 @@ def __call__(self, x, input_mask, training=True):
total_steps = 3000000


-def amsoftmax_loss(y_true, y_pred, scale=30, margin=0.35):
-y_pred = y_true * (y_pred - margin) + (1 - y_true) * y_pred
-y_pred *= scale
-return K.categorical_crossentropy(y_true, y_pred, from_logits=True)


def model_fn(features, labels, mode, params):
config_conformer = malaya_speech.config.conformer_base_encoder_config
config_conformer['subsampling']['type'] = 'none'
@@ -130,40 +140,47 @@ def model_fn(features, labels, mode, params):
model = hubert.Model(cfg, encoder, ['pad', 'eos', 'unk'] + [str(i) for i in range(100)])
X = features['waveforms']
X_len = features['waveforms_length'][:, 0]
+Y = features['targets']
+r = model(X, padding_mask=X_len, target_list=Y)
+
+target_m = tf.zeros((tf.shape(r['logit_m_list'])[0],), dtype=tf.int32)
+target_u = tf.zeros((tf.shape(r['logit_u_list'])[0],), dtype=tf.int32)
+
+sample_size = tf.cast(tf.shape(target_m)[0], tf.float32)
+entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_m, logits=r['logit_m_list'])
+entropy_m = tf.reduce_sum(entropy) / sample_size
+
+sample_size = tf.cast(tf.shape(target_u)[0], tf.float32)
+entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_u, logits=r['logit_u_list'])
+entropy_u = tf.reduce_sum(entropy) / sample_size
+
+seq = r['x']
Y = features['Y']
-Y_onehot = tf.one_hot(Y, depth=num_class)
-
-r = model(X, padding_mask=X_len, features_only=True, mask=False)
-first_token_tensor = tf.squeeze(r['x'][:, 0:1, :], axis=1)
-pooled_output = keras.layers.Dense(cfg.final_dim * 2, activation='tanh',
-kernel_initializer='orthogonal',
-use_bias=True, trainable=True,
-kernel_regularizer=keras.regularizers.l2(weight_decay),
-bias_regularizer=keras.regularizers.l2(weight_decay))(first_token_tensor)
-logits = keras.layers.Dense(num_class,
-kernel_initializer='orthogonal',
-use_bias=False, trainable=True,
-kernel_constraint=keras.constraints.unit_norm(),
-kernel_regularizer=keras.regularizers.l2(weight_decay),
-bias_regularizer=keras.regularizers.l2(weight_decay),
-name='prediction')(pooled_output)
-loss = tf.reduce_mean(amsoftmax_loss(Y_onehot, logits))
-accuracy = tf.metrics.accuracy(
-labels=Y, predictions=tf.argmax(logits, axis=1)
+first_token_tensor = tf.squeeze(seq[:, 0:1, :], axis=1)
+pooled_output = tf.keras.layers.Dense(embedding_dim, activation='tanh',
+use_bias=True, trainable=True)(first_token_tensor)
+logits = tf.keras.layers.Dense(num_class, trainable=True,)(pooled_output)
+entropy_speakers = tf.reduce_mean(
+tf.nn.sparse_softmax_cross_entropy_with_logits(
+logits=logits, labels=Y
+)
+)

-tf.identity(accuracy[1], name='train_accuracy')
+loss = entropy_m * 0.95 + entropy_u * 0.05 + entropy_speakers

-tf.identity(loss, 'train_loss')
+tf.identity(entropy_m, 'entropy_m')
+tf.summary.scalar('entropy_m', entropy_m)

-variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
-init_checkpoint = 'hubert-conformer-base-output-3mixed/model.ckpt-2000000'
+tf.identity(entropy_u, 'entropy_u')
+tf.summary.scalar('entropy_u', entropy_u)

-assignment_map, initialized_variable_names = train.get_assignment_map_from_checkpoint(
-variables, init_checkpoint
+tf.identity(loss, 'train_loss')

+accuracy = tf.metrics.accuracy(
+labels=Y, predictions=tf.argmax(logits, axis=1)
)

-tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+tf.identity(accuracy[1], name='train_accuracy')

if mode == tf.estimator.ModeKeys.TRAIN:
train_op = train.optimizer.adamw.create_optimizer(
@@ -195,7 +212,7 @@ def model_fn(features, labels, mode, params):

train_hooks = [
tf.train.LoggingTensorHook(
-['train_accuracy', 'train_loss'], every_n_iter=1
+['entropy_m', 'entropy_u', 'entropy_speakers', 'train_accuracy', 'train_loss'], every_n_iter=1
)
]

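To summarize the rewritten objective: the AM-Softmax speaker loss is replaced by a joint loss over HuBERT's masked (m) and unmasked (u) frame predictions plus a plain speaker cross-entropy, with the weights taken from the diff above:

```latex
\mathcal{L} = 0.95\,\mathcal{L}_{m} + 0.05\,\mathcal{L}_{u} + \mathcal{L}_{\mathrm{speaker}}
```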
(1 of the 8 changed files is not shown.)
