From 68e0b23af57dc4a68f79bc65985554641df4a318 Mon Sep 17 00:00:00 2001 From: huseinzol05 Date: Tue, 31 May 2022 23:08:51 +0800 Subject: [PATCH] improve phoneme function --- docs/load-phoneme.ipynb | 82 +++++++++++++++++++--------- example/phoneme/load-phoneme.ipynb | 82 +++++++++++++++++++--------- malaya/model/tf.py | 2 +- malaya/phoneme.py | 58 -------------------- malaya/supervised/settings.py | 87 ++++++++++++++++++++++++++++++ malaya/text/function.py | 8 ++- 6 files changed, 207 insertions(+), 112 deletions(-) diff --git a/docs/load-phoneme.ipynb b/docs/load-phoneme.ipynb index fee5cd30..3dd32134 100644 --- a/docs/load-phoneme.ipynb +++ b/docs/load-phoneme.ipynb @@ -60,8 +60,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.8 s, sys: 1.25 s, total: 7.05 s\n", - "Wall time: 9.89 s\n" + "CPU times: user 6.15 s, sys: 1.52 s, total: 7.66 s\n", + "Wall time: 11.3 s\n" ] } ], @@ -141,27 +141,6 @@ "text": [ "Load quantized model will cause accuracy drop.\n" ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "016770b54f0f45339e71b2fc4e695d6e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2900860.0, style=ProgressStyle(descript…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] } ], "source": [ @@ -234,12 +213,63 @@ "quantized_model.predict(['saya suka makan ayam', 'ayaq acaq kotoq'])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Limitation\n", + "\n", + "Not able to convert numbers to phoneme." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['A']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(['123'])" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "you have to use normalization like https://malaya.readthedocs.io/en/latest/load-num2word.html" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['s«.ÒAt du.wA pu.luh ti.gA']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict([malaya.num2word.to_cardinal(123)])" + ] } ], "metadata": { diff --git a/example/phoneme/load-phoneme.ipynb b/example/phoneme/load-phoneme.ipynb index fee5cd30..3dd32134 100644 --- a/example/phoneme/load-phoneme.ipynb +++ b/example/phoneme/load-phoneme.ipynb @@ -60,8 +60,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.8 s, sys: 1.25 s, total: 7.05 s\n", - "Wall time: 9.89 s\n" + "CPU times: user 6.15 s, sys: 1.52 s, total: 7.66 s\n", + "Wall time: 11.3 s\n" ] } ], @@ -141,27 +141,6 @@ "text": [ "Load quantized model will cause accuracy drop.\n" ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "016770b54f0f45339e71b2fc4e695d6e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2900860.0, style=ProgressStyle(descript…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] } ], "source": [ @@ -234,12 +213,63 @@ "quantized_model.predict(['saya suka makan ayam', 'ayaq acaq kotoq'])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## Limitation\n", + "\n", + "Not able to convert numbers to phoneme." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['A']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(['123'])" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "you have to use normalization like https://malaya.readthedocs.io/en/latest/load-num2word.html" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['s«.ÒAt du.wA pu.luh ti.gA']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict([malaya.num2word.to_cardinal(123)])" + ] } ], "metadata": { diff --git a/malaya/model/tf.py b/malaya/model/tf.py index 4bfbf78e..f4c0d205 100644 --- a/malaya/model/tf.py +++ b/malaya/model/tf.py @@ -1131,7 +1131,7 @@ def predict(self, strings: List[str], beam_search: bool = False): else: output = 'greedy' - batch = [[self._left_dict[c] for c in self._cleaning(string)] + [1] for string in strings] + batch = [[self._left_dict[c] for c in self._cleaning(string, self._left_dict)] + [1] for string in strings] batch = pad_sentence_batch(batch, 0)[0] r = self._execute( inputs=[batch], diff --git a/malaya/phoneme.py b/malaya/phoneme.py index 90246d86..1f934d6f 100644 --- a/malaya/phoneme.py +++ b/malaya/phoneme.py @@ -4,26 +4,6 @@ from herpetologist import check_type from typing import List -_transformer_availability = { - 'small': { - 'Size (MB)': 42.7, - 'Quantized Size (MB)': 13.1, - }, - 'tiny': { - 'Size (MB)': 42.7, - 'Quantized Size (MB)': 13.1, - }, -} - - -def available_transformer(): - """ - List available transformer models. 
- """ - from malaya.function import describe_availability - - return describe_availability(_transformer_availability) - @check_type def deep_model(quantized: bool = False, **kwargs): @@ -51,41 +31,3 @@ def deep_model(quantized: bool = False, **kwargs): quantized=quantized, **kwargs, ) - - -def transformer(model='small', quantized=False, **kwargs): - """ - Load transformer encoder-decoder phonetic model, - originally from https://prpm.dbp.gov.my/ Glosari Dialek. - - Parameters - ---------- - model : str, optional (default='base') - Model architecture supported. Allowed values: - - * ``'small'`` - Transformer SMALL parameters. - * ``'tiny'`` - Transformer TINY parameters. - - quantized : bool, optional (default=False) - if True, will load 8-bit quantized model. - Quantized model not necessary faster, totally depends on the machine. - - Returns - ------- - result: malaya.model.tf.TransformerChar class - """ - model = model.lower() - if model not in _transformer_availability: - raise ValueError( - 'model not supported, please check supported models from `malaya.phoneme.available_transformer()`.' 
- ) - return load_transformer.load_char( - module='phoneme', - model=model, - encoder='yttm', - left_dict=phoneme_left, - right_dict=phoneme_right, - cleaning=phoneme_textcleaning, - quantized=quantized, - **kwargs, - ) diff --git a/malaya/supervised/settings.py b/malaya/supervised/settings.py index 61e37a0b..6d55e3cc 100644 --- a/malaya/supervised/settings.py +++ b/malaya/supervised/settings.py @@ -94,3 +94,90 @@ 'ø': 45, 'ù': 46 } + +jawi_left = { + 0: 0, + 1: 1, + 2: 2, + 3: 3, + ' ': 4, + '!': 5, + '"': 6, + "'": 7, + '(': 8, + ')': 9, + '+': 10, + ',': 11, + '-': 12, + '.': 13, + '0': 14, + '1': 15, + '2': 16, + '3': 17, + '4': 18, + '5': 19, + '6': 20, + '7': 21, + '8': 22, + '9': 23, + ':': 24, + ';': 25, + '?': 26, + 'A': 27, + 'B': 28, + 'C': 29, + 'D': 30, + 'E': 31, + 'F': 32, + 'G': 33, + 'H': 34, + 'I': 35, + 'J': 36, + 'K': 37, + 'L': 38, + 'M': 39, + 'N': 40, + 'O': 41, + 'P': 42, + 'Q': 43, + 'R': 44, + 'S': 45, + 'T': 46, + 'U': 47, + 'V': 48, + 'W': 49, + 'X': 50, + 'Y': 51, + 'Z': 52, + 'a': 53, + 'b': 54, + 'c': 55, + 'd': 56, + 'e': 57, + 'f': 58, + 'g': 59, + 'h': 60, + 'i': 61, + 'j': 62, + 'k': 63, + 'l': 64, + 'm': 65, + 'n': 66, + 'o': 67, + 'p': 68, + 'q': 69, + 'r': 70, + 's': 71, + 't': 72, + 'u': 73, + 'v': 74, + 'w': 75, + 'x': 76, + 'y': 77, + 'z': 78 +} + +jawi_right = { + 0: 0, 1: 1, 2: 2, 3: 3, ' ': 4, '!': 5, '"': 6, '-': 7, '.': 8, ':': 9, ';': 10, '،': 11, '؟': 12, 'ء': 13, 'آ': 14, 'أ': 15, 'ؤ': 16, 'إ': 17, 'ئ': 18, 'ا': 19, 'ب': 20, 'ة': 21, 'ت': 22, 'ث': 23, 'ج': 24, 'ح': 25, 'خ': 26, 'د': 27, 'ذ': 28, 'ر': 29, 'ز': 30, 'س': 31, 'ش': 32, 'ص': 33, 'ض': 34, + 'ط': 35, 'ظ': 36, 'ع': 37, 'غ': 38, 'ف': 39, 'ق': 40, 'ك': 41, 'ل': 42, 'م': 43, 'ن': 44, 'ه': 45, 'و': 46, 'ى': 47, 'ي': 48, 'ّ': 49, 'ٓ': 50, '٠': 51, '١': 52, '٢': 53, '٣': 54, '٤': 55, '٥': 56, '٦': 57, '٧': 58, '٨': 59, '٩': 60, 'چ': 61, 'ڠ': 62, 'ڤ': 63, 'ڬ': 64, 'ڽ': 65, 'ۏ': 66, '﴾': 67, '﴿': 68 +} diff --git a/malaya/text/function.py b/malaya/text/function.py 
index 07d45f02..f6beaa06 100644
--- a/malaya/text/function.py
+++ b/malaya/text/function.py
@@ -418,11 +418,17 @@ def summarization_textcleaning(string):
     return re.sub(r'[ ]+', ' ', string).strip()
 
 
-def phoneme_textcleaning(string, replace_chars='.,!?['):
+def phoneme_textcleaning(string, dict, replace_chars='.,!?['):
     l = string
     for c in replace_chars:
         l = l.replace(c, f' ')
     l = l.lower()
+    l = ''.join([c for c in l if c in dict])  # keep only characters found in `dict`
+    return re.sub(r'[ ]+', ' ', l).strip()
+
+
+def rumi_jawi_textcleaning(string, dict):
+    l = ''.join([c for c in string if c in dict])  # filter `string`; `l` is not bound yet
     return re.sub(r'[ ]+', ' ', l).strip()
 
 