fix syllable

mesolitica · Mar 28, 2024 · 20264ce · 20264ce
1 parent 363a333
commit 20264ce
Show file tree

Hide file tree

Showing 4 changed files with 144 additions and 138 deletions.
diff --git a/docs/load-tokenizer-syllable.ipynb b/docs/load-tokenizer-syllable.ipynb
@@ -50,18 +50,27 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/home/husein/dev/malaya/malaya/tokenizer.py:202: FutureWarning: Possible nested set at position 3361\n",
-      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
-      "/home/husein/dev/malaya/malaya/tokenizer.py:202: FutureWarning: Possible nested set at position 3879\n",
-      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
+      "/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
+      "  warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 3.22 s, sys: 3.44 s, total: 6.66 s\n",
-      "Wall time: 2.3 s\n"
+      "/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
+      "CPU times: user 3.2 s, sys: 2.88 s, total: 6.08 s\n",
+      "Wall time: 2.56 s\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/husein/ssd3/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
+      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
+      "/home/husein/ssd3/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
+      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
      ]
     }
    ],
@@ -136,6 +145,14 @@
    "execution_count": 4,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/husein/ssd3/malaya/malaya/model/syllable.py:46: FutureWarning: Possible nested set at position 3\n",
+      "  or re.findall(_expressions['ic'], word.lower())\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
@@ -356,7 +373,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### List available deep learning models\n",
+    "### List available HuggingFace models\n",
     "\n",
     "We also provide a syllable tokenizer using deep learning, trained on the DBP dataset."
    ]
@@ -368,46 +385,11 @@
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>CER</th>\n",
-       "      <th>Quantized Size (MB)</th>\n",
-       "      <th>Size (MB)</th>\n",
-       "      <th>WER</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>base</th>\n",
-       "      <td>0.007769</td>\n",
-       "      <td>3.1</td>\n",
-       "      <td>11.7</td>\n",
-       "      <td>0.043033</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "           CER  Quantized Size (MB)  Size (MB)       WER\n",
-       "base  0.007769                  3.1       11.7  0.043033"
+       "{'mesolitica/syllable-lstm': {'Size (MB)': 35.2,\n",
+       "  'hidden size': 512,\n",
+       "  'CER': 0.011996584781229728,\n",
+       "  'WER': 0.06915983606557377}}"
       ]
      },
      "execution_count": 15,
@@ -416,7 +398,7 @@
     }
    ],
    "source": [
-    "malaya.syllable.available_deep_model()"
+    "malaya.syllable.available_huggingface"
    ]
   },
   {
@@ -435,18 +417,12 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2022-09-17 18:53:45.493265: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
-      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
-      "2022-09-17 18:53:45.496779: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n",
-      "2022-09-17 18:53:45.496804: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: husein-MS-7D31\n",
-      "2022-09-17 18:53:45.496807: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: husein-MS-7D31\n",
-      "2022-09-17 18:53:45.496872: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program\n",
-      "2022-09-17 18:53:45.496891: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.141.3\n"
+      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
      ]
     }
    ],
    "source": [
-    "model = malaya.syllable.deep_model()"
+    "model = malaya.syllable.huggingface()"
    ]
   },
   {
@@ -510,7 +486,7 @@
     }
    ],
    "source": [
-    "tokenizer.tokenize('insuran')"
+    "model.tokenize('insuran')"
    ]
   },
   {
@@ -521,7 +497,7 @@
     {
      "data": {
       "text/plain": [
-       "['in', 'su', 'rans']"
+       "['in', 'sur', 'ans']"
       ]
      },
      "execution_count": 19,
@@ -530,7 +506,7 @@
     }
    ],
    "source": [
-    "tokenizer.tokenize('insurans')"
+    "model.tokenize('insurans')"
    ]
   },
   {
@@ -544,7 +520,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -557,7 +533,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -566,7 +542,7 @@
        "1952"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -577,7 +553,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -598,7 +574,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -607,7 +583,7 @@
        "0.09016393442622951"
       ]
      },
-     "execution_count": 42,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -625,7 +601,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
@@ -846,7 +822,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.0630122950819672"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wers = []\n",
+    "for test in test_set:\n",
+    "    t = model.tokenize(test[0])\n",
+    "    t = [t_ for t_ in t if t_ not in ['-']]\n",
+    "    wer = calculate_wer(test[1], '.'.join(t))\n",
+    "    wers.append(wer)\n",
+    "    \n",
+    "sum(wers) / len(wers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -979,7 +982,7 @@
       "\n",
       "original: nangoi\n",
       "actual: ['na', 'ngoi']\n",
-      "predicted: ['na', 'ngoi']\n",
+      "predicted: ['na', 'ngo', 'i']\n",
       "\n",
       "original: mulato\n",
       "actual: ['mu', 'la', 'to']\n",
@@ -1015,7 +1018,7 @@
       "\n",
       "original: meneriak-neriakkan\n",
       "actual: ['me', 'ne', 'riak', 'ne', 'riak', 'kan']\n",
-      "predicted: ['me', 'ne', 'riak', 'ne', 'ri', 'kan']\n",
+      "predicted: ['me', 'ne', 'riak', 'ne', 'riak', 'kan']\n",
       "\n",
       "original: bergumpal\n",
       "actual: ['ber', 'gum', 'pal']\n",