improve phoneme function

mesolitica · May 31, 2022 · 68e0b23 · 68e0b23
1 parent 4c469da
commit 68e0b23
Show file tree

Hide file tree

Showing 6 changed files with 207 additions and 112 deletions.
diff --git a/docs/load-phoneme.ipynb b/docs/load-phoneme.ipynb
@@ -60,8 +60,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 5.8 s, sys: 1.25 s, total: 7.05 s\n",
-      "Wall time: 9.89 s\n"
+      "CPU times: user 6.15 s, sys: 1.52 s, total: 7.66 s\n",
+      "Wall time: 11.3 s\n"
      ]
     }
    ],
@@ -141,27 +141,6 @@
      "text": [
       "Load quantized model will cause accuracy drop.\n"
      ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "016770b54f0f45339e71b2fc4e695d6e",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2900860.0, style=ProgressStyle(descript…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
     }
    ],
    "source": [
@@ -234,12 +213,63 @@
     "quantized_model.predict(['saya suka makan ayam', 'ayaq acaq kotoq'])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Limitation\n",
+    "\n",
+    "Not able to convert numbers to phonemes."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['A']"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.predict(['123'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "source": [
+    "You have to normalize numbers into words first, for example with https://malaya.readthedocs.io/en/latest/load-num2word.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['s«.ÒAt du.wA pu.luh ti.gA']"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.predict([malaya.num2word.to_cardinal(123)])"
+   ]
   }
  ],
  "metadata": {

diff --git a/example/phoneme/load-phoneme.ipynb b/example/phoneme/load-phoneme.ipynb
@@ -60,8 +60,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 5.8 s, sys: 1.25 s, total: 7.05 s\n",
-      "Wall time: 9.89 s\n"
+      "CPU times: user 6.15 s, sys: 1.52 s, total: 7.66 s\n",
+      "Wall time: 11.3 s\n"
      ]
     }
    ],
@@ -141,27 +141,6 @@
      "text": [
       "Load quantized model will cause accuracy drop.\n"
      ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "016770b54f0f45339e71b2fc4e695d6e",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2900860.0, style=ProgressStyle(descript…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
     }
    ],
    "source": [
@@ -234,12 +213,63 @@
     "quantized_model.predict(['saya suka makan ayam', 'ayaq acaq kotoq'])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Limitation\n",
+    "\n",
+    "Not able to convert numbers to phonemes."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['A']"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.predict(['123'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "source": [
+    "You have to normalize numbers into words first, for example with https://malaya.readthedocs.io/en/latest/load-num2word.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['s«.ÒAt du.wA pu.luh ti.gA']"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.predict([malaya.num2word.to_cardinal(123)])"
+   ]
   }
  ],
  "metadata": {

diff --git a/malaya/model/tf.py b/malaya/model/tf.py
@@ -1131,7 +1131,7 @@ def predict(self, strings: List[str], beam_search: bool = False):
         else:
             output = 'greedy'
 
-        batch = [[self._left_dict[c] for c in self._cleaning(string)] + [1] for string in strings]
+        batch = [[self._left_dict[c] for c in self._cleaning(string, self._left_dict)] + [1] for string in strings]
         batch = pad_sentence_batch(batch, 0)[0]
         r = self._execute(
             inputs=[batch],

diff --git a/malaya/phoneme.py b/malaya/phoneme.py
@@ -4,26 +4,6 @@
 from herpetologist import check_type
 from typing import List
 
-_transformer_availability = {
-    'small': {
-        'Size (MB)': 42.7,
-        'Quantized Size (MB)': 13.1,
-    },
-    'tiny': {
-        'Size (MB)': 42.7,
-        'Quantized Size (MB)': 13.1,
-    },
-}
-
-
-def available_transformer():
-    """
-    List available transformer models.
-    """
-    from malaya.function import describe_availability
-
-    return describe_availability(_transformer_availability)
-
 
 @check_type
 def deep_model(quantized: bool = False, **kwargs):
@@ -51,41 +31,3 @@ def deep_model(quantized: bool = False, **kwargs):
         quantized=quantized,
         **kwargs,
     )
-
-
-def transformer(model='small', quantized=False, **kwargs):
-    """
-    Load transformer encoder-decoder phonetic model, 
-    originally from https://prpm.dbp.gov.my/ Glosari Dialek.
-
-    Parameters
-    ----------
-    model : str, optional (default='base')
-        Model architecture supported. Allowed values:
-
-        * ``'small'`` - Transformer SMALL parameters.
-        * ``'tiny'`` - Transformer TINY parameters.
-
-    quantized : bool, optional (default=False)
-        if True, will load 8-bit quantized model.
-        Quantized model not necessary faster, totally depends on the machine.
-
-    Returns
-    -------
-    result: malaya.model.tf.TransformerChar class
-    """
-    model = model.lower()
-    if model not in _transformer_availability:
-        raise ValueError(
-            'model not supported, please check supported models from `malaya.phoneme.available_transformer()`.'
-        )
-    return load_transformer.load_char(
-        module='phoneme',
-        model=model,
-        encoder='yttm',
-        left_dict=phoneme_left,
-        right_dict=phoneme_right,
-        cleaning=phoneme_textcleaning,
-        quantized=quantized,
-        **kwargs,
-    )
diff --git a/malaya/supervised/settings.py b/malaya/supervised/settings.py
@@ -94,3 +94,90 @@
     'ø': 45,
     'ù': 46
 }
+
+jawi_left = {
+    0: 0,
+    1: 1,
+    2: 2,
+    3: 3,
+    ' ': 4,
+    '!': 5,
+    '"': 6,
+    "'": 7,
+    '(': 8,
+    ')': 9,
+    '+': 10,
+    ',': 11,
+    '-': 12,
+    '.': 13,
+    '0': 14,
+    '1': 15,
+    '2': 16,
+    '3': 17,
+    '4': 18,
+    '5': 19,
+    '6': 20,
+    '7': 21,
+    '8': 22,
+    '9': 23,
+    ':': 24,
+    ';': 25,
+    '?': 26,
+    'A': 27,
+    'B': 28,
+    'C': 29,
+    'D': 30,
+    'E': 31,
+    'F': 32,
+    'G': 33,
+    'H': 34,
+    'I': 35,
+    'J': 36,
+    'K': 37,
+    'L': 38,
+    'M': 39,
+    'N': 40,
+    'O': 41,
+    'P': 42,
+    'Q': 43,
+    'R': 44,
+    'S': 45,
+    'T': 46,
+    'U': 47,
+    'V': 48,
+    'W': 49,
+    'X': 50,
+    'Y': 51,
+    'Z': 52,
+    'a': 53,
+    'b': 54,
+    'c': 55,
+    'd': 56,
+    'e': 57,
+    'f': 58,
+    'g': 59,
+    'h': 60,
+    'i': 61,
+    'j': 62,
+    'k': 63,
+    'l': 64,
+    'm': 65,
+    'n': 66,
+    'o': 67,
+    'p': 68,
+    'q': 69,
+    'r': 70,
+    's': 71,
+    't': 72,
+    'u': 73,
+    'v': 74,
+    'w': 75,
+    'x': 76,
+    'y': 77,
+    'z': 78
+}
+
+jawi_right = {
+    0: 0, 1: 1, 2: 2, 3: 3, ' ': 4, '!': 5, '"': 6, '-': 7, '.': 8, ':': 9, ';': 10, '،': 11, '؟': 12, 'ء': 13, 'آ': 14, 'أ': 15, 'ؤ': 16, 'إ': 17, 'ئ': 18, 'ا': 19, 'ب': 20, 'ة': 21, 'ت': 22, 'ث': 23, 'ج': 24, 'ح': 25, 'خ': 26, 'د': 27, 'ذ': 28, 'ر': 29, 'ز': 30, 'س': 31, 'ش': 32, 'ص': 33, 'ض': 34,
+    'ط': 35, 'ظ': 36, 'ع': 37, 'غ': 38, 'ف': 39, 'ق': 40, 'ك': 41, 'ل': 42, 'م': 43, 'ن': 44, 'ه': 45, 'و': 46, 'ى': 47, 'ي': 48, 'ّ': 49, 'ٓ': 50, '٠': 51, '١': 52, '٢': 53, '٣': 54, '٤': 55, '٥': 56, '٦': 57, '٧': 58, '٨': 59, '٩': 60, 'چ': 61, 'ڠ': 62, 'ڤ': 63, 'ڬ': 64, 'ڽ': 65, 'ۏ': 66, '﴾': 67, '﴿': 68
+}
diff --git a/malaya/text/function.py b/malaya/text/function.py
@@ -418,11 +418,17 @@ def summarization_textcleaning(string):
     return re.sub(r'[ ]+', ' ', string).strip()
 
 
-def phoneme_textcleaning(string, replace_chars='.,!?['):
+def phoneme_textcleaning(string, dict, replace_chars='.,!?['):
     l = string
     for c in replace_chars:
         l = l.replace(c, f' ')
     l = l.lower()
+    l = ''.join([c for c in l if c in dict])
+    return re.sub(r'[ ]+', ' ', l).strip()
+
+
+def rumi_jawi_textcleaning(string, dict):
+    """Drop characters of `string` that are not keys of `dict`, then collapse runs of spaces and strip the ends."""
+    # Filter the `string` argument; iterating the not-yet-assigned local `l` here raised UnboundLocalError.
+    l = ''.join([c for c in string if c in dict])
     return re.sub(r'[ ]+', ' ', l).strip()