Commit: released 1.2.6
huseinzol05 committed May 6, 2022
1 parent 0d0bf45 commit 77042bd
Showing 8 changed files with 116 additions and 107 deletions.
21 changes: 4 additions & 17 deletions docs/huggingface-repository.ipynb
@@ -22,14 +22,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed.\n",
"\n",
"**Starting Malaya-Speech 1.2.7, by default Malaya-Speech will use HuggingFace as backend repository**."
"Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed, and by default Malaya-Speech will use HuggingFace as backend repository."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -38,20 +36,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.2.7'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"malaya_speech.__version__"
]
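For context, a minimal sketch of the behaviour this notebook change documents. Only `malaya_speech.__version__` comes from the notebook itself; the comments summarize the markdown cell above:

```python
import malaya_speech

# Starting 1.2.6, model files are fetched from
# https://huggingface.co/huseinzol05 as the default backend repository,
# which gives better download speed.
print(malaya_speech.__version__)  # '1.2.6' after this commit
```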
21 changes: 4 additions & 17 deletions example/huggingface-repository/huggingface-repository.ipynb
@@ -22,14 +22,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed.\n",
"\n",
"**Starting Malaya-Speech 1.2.7, by default Malaya-Speech will use HuggingFace as backend repository**."
"Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed, and by default Malaya-Speech will use HuggingFace as backend repository."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -38,20 +36,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.2.7'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"malaya_speech.__version__"
]
5 changes: 3 additions & 2 deletions malaya_speech/__init__.py
@@ -9,7 +9,7 @@
from malaya_boilerplate.utils import get_home

version = '1.2'
-bump_version = '1.2.7'
+bump_version = '1.2.6'
__version__ = bump_version

package = 'malaya-speech'
@@ -57,5 +57,6 @@
padding,
split,
subword,
-tf_featurization)
+tf_featurization
+)
from .utils.read import load, resample
1 change: 1 addition & 0 deletions malaya_speech/train/model/hubert/model.py
@@ -279,5 +279,6 @@ def compute_pred(proj_x, target, label_embs):
"logit_u_list": logit_u_list,
"padding_mask": padding_mask,
"features_pen": features_pen,
+'x': x,
}
return result
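The new 'x' key exposes the encoder output sequence alongside the loss terms; the speaker-embedding script later in this commit reads it as `seq = r['x']` and pools the first timestep. A runnable sketch of that pooling, using a stand-in tensor since the real shapes depend on the config:

```python
import tensorflow as tf

# Stand-in for the model's return dict, which now also carries the
# encoder output under 'x' with shape (batch, time, dim).
r = {'x': tf.zeros((4, 100, 768))}

# Pool the first timestep as an utterance-level representation, as
# hubert-base.py in this commit does.
first_token = tf.squeeze(r['x'][:, 0:1, :], axis=1)  # shape (4, 768)
```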
28 changes: 28 additions & 0 deletions malaya_speech/utils/text.py
@@ -16,6 +16,34 @@
_rejected = '\'():;"'
_punct = ':;,.?'

+PRONUNCIATION = {
+'A': 'ae',
+'B': 'bi',
+'C': 'si',
+'D': 'di',
+'E': 'ei',
+'F': 'ef',
+'G': 'ji',
+'H': 'hesh',
+'I': 'ai',
+'J': 'jei',
+'K': 'kei',
+'L': 'el',
+'M': 'eim',
+'N': 'ein',
+'O': 'ou',
+'P': 'pi',
+'Q': 'qeu',
+'R': 'ar',
+'S': 'es',
+'T': 'ti',
+'U': 'yu',
+'V': 'vi',
+'W': 'dablui',
+'X': 'ex',
+'Y': 'wai',
+'Z': 'zed',
+}

TTS_SYMBOLS = (
[_pad, _start, _eos] + list(_special) + list(_punctuation) + list(_letters)
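The new PRONUNCIATION table maps uppercase letters to spoken forms, presumably for spelling out acronyms during TTS text normalization. A hypothetical helper (not part of this commit) showing the intended use:

```python
from malaya_speech.utils.text import PRONUNCIATION

def spell_acronym(word):
    # Hypothetical example: 'IMDA' -> 'ai eim di ae'.
    return ' '.join(PRONUNCIATION[c] for c in word if c in PRONUNCIATION)

print(spell_acronym('IMDA'))
```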
12 changes: 0 additions & 12 deletions malaya_speech/vocoder.py
@@ -42,11 +42,6 @@
'Quantized Size (MB)': 19.9,
'Mel loss': 0.4591,
},
-'universal-384': {
-'Size (MB)': 78.4,
-'Quantized Size (MB)': 19.9,
-'Mel loss': 0.4591,
-},
}

_mbmelgan_availability = {
@@ -83,11 +78,6 @@
'Quantized Size (MB)': 2.49,
'Mel loss': 0.5547,
},
-'universal-1024': {
-'Size (MB)': 72.8,
-'Quantized Size (MB)': 18.5,
-'Mel loss': 0.3617,
-},
'universal-768': {
'Size (MB)': 72.8,
'Quantized Size (MB)': 18.5,
@@ -147,7 +137,6 @@ def melgan(model: str = 'universal-1024', quantized: bool = False, **kwargs):
* ``'female-singlish'`` - MelGAN trained on Female Singlish voice, https://www.imda.gov.sg/programme-listing/digital-services-lab/national-speech-corpus
* ``'universal'`` - Universal MelGAN trained on multiple speakers.
* ``'universal-1024'`` - Universal MelGAN with 1024 filters trained on multiple speakers.
-* ``'universal-384'`` - Universal MelGAN with 384 filters trained on multiple speakers.
quantized : bool, optional (default=False)
if True, will load 8-bit quantized model.
@@ -219,7 +208,6 @@ def hifigan(model: str = 'universal-768', quantized: bool = False, **kwargs):
* ``'female'`` - HiFiGAN trained on female voice.
* ``'male'`` - HiFiGAN trained on male voice.
-* ``'universal-1024'`` - Universal HiFiGAN with 1024 filters trained on multiple speakers.
* ``'universal-768'`` - Universal HiFiGAN with 768 filters trained on multiple speakers.
* ``'universal-512'`` - Universal HiFiGAN with 512 filters trained on multiple speakers.
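With 'universal-384' (MelGAN) and 'universal-1024' (HiFiGAN) dropped here, only the remaining names load. A minimal sketch using the defaults visible in the signatures above; both loaders now resolve through the HuggingFace backend:

```python
import malaya_speech

# Defaults taken from the function signatures in this diff.
melgan = malaya_speech.vocoder.melgan(model='universal-1024')
hifigan = malaya_speech.vocoder.hifigan(model='universal-768', quantized=True)
```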
133 changes: 75 additions & 58 deletions pretrained-model/speaker-embedding/hubert/hubert-base.py
@@ -8,8 +8,6 @@
import malaya_speech.train as train
from malaya_speech.train.model.conformer.model import Model as ConformerModel
from malaya_speech.train.model import hubert
-import tensorflow.keras as keras
-import tensorflow.keras.backend as K
import numpy as np
import string
import json
@@ -27,71 +25,89 @@
test_set = glob('/home/husein/youtube/voxceleb-wav/*.wav')

sr = 16000
-maxlen = 18
-minlen = 3
-weight_decay = 1e-5
+maxlen = 15
+minlen = 2
+kmean = hubert.kmeans.ApplyKmeans_TF('kmean.km')


def generate(files):
while True:
random.shuffle(files)
for f in files:
f = f.decode() if isinstance(f, bytes) else f
-x, _ = malaya_speech.load(f)
+wav_data, _ = malaya_speech.load(f)
label = os.path.split(f)[1].replace('wav-', '').split('-')[1]
y = int(ids[label])

-len_x = len(x)
+len_x = len(wav_data) / sr

-if (len_x / sr) < minlen:
+if len_x < minlen:
continue

-if (len_x / sr) > maxlen:
-x = augmentation.random_sampling(x, sr, random.randint(1000 * minlen, 1000 * maxlen))
+if len_x > maxlen:
+wav_data = augmentation.random_sampling(wav_data, sr, random.randint(1000 * minlen, 1000 * maxlen))

yield {
-'waveforms': x,
-'waveforms_length': [len(x)],
+'waveforms': wav_data,
+'waveforms_length': [len(wav_data)],
'Y': [y],
}


-def get_dataset(files, batch_size=4, shuffle_size=32, thread_count=24):
+def preprocess_inputs(example):
+v = featurizer.vectorize(example['waveforms'])
+deltas = malaya_speech.utils.tf_featurization.deltas(v)
+ddeltas = malaya_speech.utils.tf_featurization.deltas(deltas)
+concated = tf.concat([v, deltas, ddeltas], axis=1)
+s = tf.compat.v1.numpy_function(kmean, [concated], tf.int64)
+s = tf.cast(s, tf.int32)
+kmean_tf = tf.reshape(s, (-1,)) + 3
+example['targets'] = kmean_tf
+return example
+
+
+def get_dataset(
+file,
+batch_size=4,
+shuffle_size=20,
+thread_count=24,
+maxlen_feature=1800,
+):
def get():
dataset = tf.data.Dataset.from_generator(
generate,
-{
-'waveforms': tf.float32,
-'waveforms_length': tf.int32,
-'Y': tf.int32,
-},
+{'waveforms': tf.float32,
+'waveforms_length': tf.int32,
+'Y': tf.int32,
+},
output_shapes={
'waveforms': tf.TensorShape([None]),
'waveforms_length': tf.TensorShape([None]),
'Y': tf.TensorShape([None]),
},
-args=(files,),
-)
-dataset = dataset.filter(
-lambda x: tf.less(tf.shape(x['waveforms'])[0] / sr, maxlen)
+args=(file,),
+)
-dataset = dataset.filter(
-lambda x: tf.greater(tf.shape(x['waveforms'])[0] / sr, minlen)
+dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
+dataset = dataset.map(
+preprocess_inputs, num_parallel_calls=thread_count
+)
dataset = dataset.padded_batch(
-shuffle_size,
+batch_size,
padded_shapes={
'waveforms': tf.TensorShape([None]),
'waveforms_length': tf.TensorShape([None]),
+'targets': tf.TensorShape([None]),
'Y': tf.TensorShape([None]),
},
padding_values={
'waveforms': tf.constant(0, dtype=tf.float32),
'waveforms_length': tf.constant(0, dtype=tf.int32),
+'targets': tf.constant(0, dtype=tf.int32),
'Y': tf.constant(0, dtype=tf.int32),
},
)
return dataset

return get


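On the new targets: preprocess_inputs stacks the featurized audio with its first- and second-order deltas, quantizes each frame with the pretrained k-means model, and shifts the cluster IDs by 3 so IDs 0-2 stay reserved for the 'pad', 'eos', 'unk' tokens passed to hubert.Model below. A NumPy sketch of that shift with made-up IDs:

```python
import numpy as np

# Illustrative frame-level cluster IDs from the k-means quantizer.
cluster_ids = np.array([7, 7, 12, 99])

# Offset by 3 to reserve 0-2 for 'pad', 'eos', 'unk'.
targets = cluster_ids + 3
print(targets)  # [ 10  10  15 102]
```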
@@ -107,12 +123,6 @@ def __call__(self, x, input_mask, training=True):
total_steps = 3000000


-def amsoftmax_loss(y_true, y_pred, scale=30, margin=0.35):
-y_pred = y_true * (y_pred - margin) + (1 - y_true) * y_pred
-y_pred *= scale
-return K.categorical_crossentropy(y_true, y_pred, from_logits=True)


def model_fn(features, labels, mode, params):
config_conformer = malaya_speech.config.conformer_base_encoder_config
config_conformer['subsampling']['type'] = 'none'
@@ -130,40 +140,47 @@ def model_fn(features, labels, mode, params):
model = hubert.Model(cfg, encoder, ['pad', 'eos', 'unk'] + [str(i) for i in range(100)])
X = features['waveforms']
X_len = features['waveforms_length'][:, 0]
+Y = features['targets']
+r = model(X, padding_mask=X_len, target_list=Y)
+
+target_m = tf.zeros((tf.shape(r['logit_m_list'])[0],), dtype=tf.int32)
+target_u = tf.zeros((tf.shape(r['logit_u_list'])[0],), dtype=tf.int32)
+
+sample_size = tf.cast(tf.shape(target_m)[0], tf.float32)
+entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_m, logits=r['logit_m_list'])
+entropy_m = tf.reduce_sum(entropy) / sample_size
+
+sample_size = tf.cast(tf.shape(target_u)[0], tf.float32)
+entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_u, logits=r['logit_u_list'])
+entropy_u = tf.reduce_sum(entropy) / sample_size
+
+seq = r['x']
Y = features['Y']
-Y_onehot = tf.one_hot(Y, depth=num_class)
-
-r = model(X, padding_mask=X_len, features_only=True, mask=False)
-first_token_tensor = tf.squeeze(r['x'][:, 0:1, :], axis=1)
-pooled_output = keras.layers.Dense(cfg.final_dim * 2, activation='tanh',
-kernel_initializer='orthogonal',
-use_bias=True, trainable=True,
-kernel_regularizer=keras.regularizers.l2(weight_decay),
-bias_regularizer=keras.regularizers.l2(weight_decay))(first_token_tensor)
-logits = keras.layers.Dense(num_class,
-kernel_initializer='orthogonal',
-use_bias=False, trainable=True,
-kernel_constraint=keras.constraints.unit_norm(),
-kernel_regularizer=keras.regularizers.l2(weight_decay),
-bias_regularizer=keras.regularizers.l2(weight_decay),
-name='prediction')(pooled_output)
-loss = tf.reduce_mean(amsoftmax_loss(Y_onehot, logits))
-accuracy = tf.metrics.accuracy(
-labels=Y, predictions=tf.argmax(logits, axis=1)
+first_token_tensor = tf.squeeze(seq[:, 0:1, :], axis=1)
+pooled_output = tf.keras.layers.Dense(embedding_dim, activation='tanh',
+use_bias=True, trainable=True)(first_token_tensor)
+logits = tf.keras.layers.Dense(num_class, trainable=True,)(pooled_output)
+entropy_speakers = tf.reduce_mean(
+tf.nn.sparse_softmax_cross_entropy_with_logits(
+logits=logits, labels=Y
+)
+)

-tf.identity(accuracy[1], name='train_accuracy')
+loss = entropy_m * 0.95 + entropy_u * 0.05 + entropy_speakers

-tf.identity(loss, 'train_loss')
+tf.identity(entropy_m, 'entropy_m')
+tf.summary.scalar('entropy_m', entropy_m)

-variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
-init_checkpoint = 'hubert-conformer-base-output-3mixed/model.ckpt-2000000'
+tf.identity(entropy_u, 'entropy_u')
+tf.summary.scalar('entropy_u', entropy_u)

-assignment_map, initialized_variable_names = train.get_assignment_map_from_checkpoint(
-variables, init_checkpoint
+tf.identity(loss, 'train_loss')

+accuracy = tf.metrics.accuracy(
+labels=Y, predictions=tf.argmax(logits, axis=1)
)

-tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+tf.identity(accuracy[1], name='train_accuracy')

if mode == tf.estimator.ModeKeys.TRAIN:
train_op = train.optimizer.adamw.create_optimizer(
@@ -195,7 +212,7 @@ def model_fn(features, labels, mode, params):

train_hooks = [
tf.train.LoggingTensorHook(
-['train_accuracy', 'train_loss'], every_n_iter=1
+['entropy_m', 'entropy_u', 'entropy_speakers', 'train_accuracy', 'train_loss'], every_n_iter=1
)
]

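To summarize the rewritten objective: the AM-Softmax speaker loss is replaced by a joint loss over HuBERT's masked (m) and unmasked (u) frame predictions plus a plain speaker cross-entropy, with the weights taken from the diff above:

```latex
\mathcal{L} = 0.95\,\mathcal{L}_{m} + 0.05\,\mathcal{L}_{u} + \mathcal{L}_{\mathrm{speaker}}
```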
(1 of the 8 changed files is not shown.)
