Skip to content

Commit

Permalink
improve phoneme function
Browse files Browse the repository at this point in the history
  • Loading branch information
huseinzol05 committed May 31, 2022
1 parent 4c469da commit 68e0b23
Show file tree
Hide file tree
Showing 6 changed files with 207 additions and 112 deletions.
82 changes: 56 additions & 26 deletions docs/load-phoneme.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.8 s, sys: 1.25 s, total: 7.05 s\n",
"Wall time: 9.89 s\n"
"CPU times: user 6.15 s, sys: 1.52 s, total: 7.66 s\n",
"Wall time: 11.3 s\n"
]
}
],
Expand Down Expand Up @@ -141,27 +141,6 @@
"text": [
"Load quantized model will cause accuracy drop.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "016770b54f0f45339e71b2fc4e695d6e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2900860.0, style=ProgressStyle(descript…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
Expand Down Expand Up @@ -234,12 +213,63 @@
"quantized_model.predict(['saya suka makan ayam', 'ayaq acaq kotoq'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Limitation\n",
"\n",
"The model cannot convert digits to phonemes; numeric input such as '123' does not produce a usable transcription."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"['A']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.predict(['123'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": []
"source": [
"Normalize numbers to words first, for example with `malaya.num2word` (https://malaya.readthedocs.io/en/latest/load-num2word.html), then pass the normalized text to the model."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['s«.ÒAt du.wA pu.luh ti.gA']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.predict([malaya.num2word.to_cardinal(123)])"
]
}
],
"metadata": {
Expand Down
82 changes: 56 additions & 26 deletions example/phoneme/load-phoneme.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.8 s, sys: 1.25 s, total: 7.05 s\n",
"Wall time: 9.89 s\n"
"CPU times: user 6.15 s, sys: 1.52 s, total: 7.66 s\n",
"Wall time: 11.3 s\n"
]
}
],
Expand Down Expand Up @@ -141,27 +141,6 @@
"text": [
"Load quantized model will cause accuracy drop.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "016770b54f0f45339e71b2fc4e695d6e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2900860.0, style=ProgressStyle(descript…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
Expand Down Expand Up @@ -234,12 +213,63 @@
"quantized_model.predict(['saya suka makan ayam', 'ayaq acaq kotoq'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Limitation\n",
"\n",
"The model cannot convert digits to phonemes; numeric input such as '123' does not produce a usable transcription."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"['A']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.predict(['123'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": []
"source": [
"Normalize numbers to words first, for example with `malaya.num2word` (https://malaya.readthedocs.io/en/latest/load-num2word.html), then pass the normalized text to the model."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['s«.ÒAt du.wA pu.luh ti.gA']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.predict([malaya.num2word.to_cardinal(123)])"
]
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion malaya/model/tf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,7 +1131,7 @@ def predict(self, strings: List[str], beam_search: bool = False):
else:
output = 'greedy'

batch = [[self._left_dict[c] for c in self._cleaning(string)] + [1] for string in strings]
batch = [[self._left_dict[c] for c in self._cleaning(string, self._left_dict)] + [1] for string in strings]
batch = pad_sentence_batch(batch, 0)[0]
r = self._execute(
inputs=[batch],
Expand Down
58 changes: 0 additions & 58 deletions malaya/phoneme.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,6 @@
from herpetologist import check_type
from typing import List

# Registry of transformer phoneme models, keyed by model name.
# `available_transformer()` passes it to `describe_availability`, and
# `transformer()` rejects any `model` that is not a key here.
# NOTE(review): 'tiny' lists exactly the same sizes as 'small' — confirm the
# figures were not copy-pasted.
_transformer_availability = {
'small': {
'Size (MB)': 42.7,
'Quantized Size (MB)': 13.1,
},
'tiny': {
'Size (MB)': 42.7,
'Quantized Size (MB)': 13.1,
},
}


def available_transformer():
    """
    Describe the transformer phoneme models registered in
    `_transformer_availability`.

    Returns
    -------
    result: whatever `malaya.function.describe_availability` returns for the registry.
    """
    # Imported lazily, as in the original, so importing this module stays cheap.
    from malaya.function import describe_availability as _describe

    summary = _describe(_transformer_availability)
    return summary


@check_type
def deep_model(quantized: bool = False, **kwargs):
Expand Down Expand Up @@ -51,41 +31,3 @@ def deep_model(quantized: bool = False, **kwargs):
quantized=quantized,
**kwargs,
)


def transformer(model='small', quantized=False, **kwargs):
    """
    Load transformer encoder-decoder phonetic model,
    originally from https://prpm.dbp.gov.my/ Glosari Dialek.

    Parameters
    ----------
    model : str, optional (default='small')
        Model architecture supported. Allowed values:

        * ``'small'`` - Transformer SMALL parameters.
        * ``'tiny'`` - Transformer TINY parameters.
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        A quantized model is not necessarily faster; it depends entirely on the machine.
    **kwargs
        Forwarded unchanged to `load_transformer.load_char`.

    Returns
    -------
    result: malaya.model.tf.TransformerChar class

    Raises
    ------
    ValueError
        If `model` (case-insensitive) is not a key of `_transformer_availability`.
    """
    # Model names are matched case-insensitively.
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.phoneme.available_transformer()`.'
        )
    return load_transformer.load_char(
        module='phoneme',
        model=model,
        encoder='yttm',
        left_dict=phoneme_left,
        right_dict=phoneme_right,
        cleaning=phoneme_textcleaning,
        quantized=quantized,
        **kwargs,
    )
87 changes: 87 additions & 0 deletions malaya/supervised/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,90 @@
'ø': 45,
'ù': 46
}

# Character-to-id lookup table: ASCII space, punctuation, digits and
# upper/lower-case Latin letters map to consecutive ids 4-78.
# The integer keys 0-3 map to themselves; presumably these are reserved
# special-token ids (padding, end-of-sequence, ...) — TODO confirm against the
# model code that consumes this table.
jawi_left = {
0: 0,
1: 1,
2: 2,
3: 3,
' ': 4,
'!': 5,
'"': 6,
"'": 7,
'(': 8,
')': 9,
'+': 10,
',': 11,
'-': 12,
'.': 13,
'0': 14,
'1': 15,
'2': 16,
'3': 17,
'4': 18,
'5': 19,
'6': 20,
'7': 21,
'8': 22,
'9': 23,
':': 24,
';': 25,
'?': 26,
'A': 27,
'B': 28,
'C': 29,
'D': 30,
'E': 31,
'F': 32,
'G': 33,
'H': 34,
'I': 35,
'J': 36,
'K': 37,
'L': 38,
'M': 39,
'N': 40,
'O': 41,
'P': 42,
'Q': 43,
'R': 44,
'S': 45,
'T': 46,
'U': 47,
'V': 48,
'W': 49,
'X': 50,
'Y': 51,
'Z': 52,
'a': 53,
'b': 54,
'c': 55,
'd': 56,
'e': 57,
'f': 58,
'g': 59,
'h': 60,
'i': 61,
'j': 62,
'k': 63,
'l': 64,
'm': 65,
'n': 66,
'o': 67,
'p': 68,
'q': 69,
'r': 70,
's': 71,
't': 72,
'u': 73,
'v': 74,
'w': 75,
'x': 76,
'y': 77,
'z': 78
}

# Character-to-id lookup table: space, punctuation, Arabic-script letters and
# diacritics, Arabic-Indic digits and ornate parentheses map to ids 4-68.
# The integer keys 0-3 map to themselves; presumably these are reserved
# special-token ids — TODO confirm against the model code that consumes this table.
jawi_right = {
0: 0, 1: 1, 2: 2, 3: 3, ' ': 4, '!': 5, '"': 6, '-': 7, '.': 8, ':': 9, ';': 10, '،': 11, '؟': 12, 'ء': 13, 'آ': 14, 'أ': 15, 'ؤ': 16, 'إ': 17, 'ئ': 18, 'ا': 19, 'ب': 20, 'ة': 21, 'ت': 22, 'ث': 23, 'ج': 24, 'ح': 25, 'خ': 26, 'د': 27, 'ذ': 28, 'ر': 29, 'ز': 30, 'س': 31, 'ش': 32, 'ص': 33, 'ض': 34,
'ط': 35, 'ظ': 36, 'ع': 37, 'غ': 38, 'ف': 39, 'ق': 40, 'ك': 41, 'ل': 42, 'م': 43, 'ن': 44, 'ه': 45, 'و': 46, 'ى': 47, 'ي': 48, 'ّ': 49, 'ٓ': 50, '٠': 51, '١': 52, '٢': 53, '٣': 54, '٤': 55, '٥': 56, '٦': 57, '٧': 58, '٨': 59, '٩': 60, 'چ': 61, 'ڠ': 62, 'ڤ': 63, 'ڬ': 64, 'ڽ': 65, 'ۏ': 66, '﴾': 67, '﴿': 68
}
8 changes: 7 additions & 1 deletion malaya/text/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,11 +418,17 @@ def summarization_textcleaning(string):
return re.sub(r'[ ]+', ' ', string).strip()


def phoneme_textcleaning(string, replace_chars='.,!?['):
def phoneme_textcleaning(string, dict, replace_chars='.,!?['):
    """
    Clean text before it is encoded character-by-character.

    Parameters
    ----------
    string : str
        Raw input text.
    dict : container
        Character vocabulary; any character not contained in it is dropped.
    replace_chars : str, optional (default='.,!?[')
        Characters that are turned into a space before filtering.

    Returns
    -------
    result: str
        Lower-cased text containing only in-vocabulary characters, with runs
        of spaces collapsed and leading/trailing whitespace stripped.
    """
    vocab = dict
    text = string
    for ch in replace_chars:
        text = text.replace(ch, ' ')
    # Lower-case first, then filter: the vocabulary is matched against the
    # lower-cased characters.
    kept = (ch for ch in text.lower() if ch in vocab)
    return re.sub(r'[ ]+', ' ', ''.join(kept)).strip()


def rumi_jawi_textcleaning(string, dict):
    """
    Drop out-of-vocabulary characters and normalise spacing.

    Parameters
    ----------
    string : str
        Raw input text.
    dict : container
        Character vocabulary; any character not contained in it is dropped.

    Returns
    -------
    result: str
        `string` restricted to in-vocabulary characters, with runs of spaces
        collapsed and leading/trailing whitespace stripped. Case is preserved.
    """
    # Filter the input `string`. The previous body iterated over the local `l`
    # before it was assigned, so every call raised UnboundLocalError.
    filtered = ''.join(c for c in string if c in dict)
    return re.sub(r'[ ]+', ' ', filtered).strip()


Expand Down

0 comments on commit 68e0b23

Please sign in to comment.