Skip to content

Commit

Permalink
fix syllable
Browse files Browse the repository at this point in the history
  • Loading branch information
huseinzol05 committed Mar 28, 2024
1 parent 363a333 commit 20264ce
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 138 deletions.
139 changes: 71 additions & 68 deletions docs/load-tokenizer-syllable.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,27 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/home/husein/dev/malaya/malaya/tokenizer.py:202: FutureWarning: Possible nested set at position 3361\n",
" self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
"/home/husein/dev/malaya/malaya/tokenizer.py:202: FutureWarning: Possible nested set at position 3879\n",
" self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
"/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
" warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.22 s, sys: 3.44 s, total: 6.66 s\n",
"Wall time: 2.3 s\n"
"/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
"CPU times: user 3.2 s, sys: 2.88 s, total: 6.08 s\n",
"Wall time: 2.56 s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/husein/ssd3/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
" self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
"/home/husein/ssd3/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
" self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
]
}
],
Expand Down Expand Up @@ -136,6 +145,14 @@
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/husein/ssd3/malaya/malaya/model/syllable.py:46: FutureWarning: Possible nested set at position 3\n",
" or re.findall(_expressions['ic'], word.lower())\n"
]
},
{
"data": {
"text/plain": [
Expand Down Expand Up @@ -356,7 +373,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### List available deep learning models\n",
"### List available HuggingFace models\n",
"\n",
"We also provide a syllable tokenizer using deep learning, trained on the DBP dataset."
]
Expand All @@ -368,46 +385,11 @@
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CER</th>\n",
" <th>Quantized Size (MB)</th>\n",
" <th>Size (MB)</th>\n",
" <th>WER</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>base</th>\n",
" <td>0.007769</td>\n",
" <td>3.1</td>\n",
" <td>11.7</td>\n",
" <td>0.043033</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" CER Quantized Size (MB) Size (MB) WER\n",
"base 0.007769 3.1 11.7 0.043033"
"{'mesolitica/syllable-lstm': {'Size (MB)': 35.2,\n",
" 'hidden size': 512,\n",
" 'CER': 0.011996584781229728,\n",
" 'WER': 0.06915983606557377}}"
]
},
"execution_count": 15,
Expand All @@ -416,7 +398,7 @@
}
],
"source": [
"malaya.syllable.available_deep_model()"
"malaya.syllable.available_huggingface"
]
},
{
Expand All @@ -435,18 +417,12 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2022-09-17 18:53:45.493265: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2022-09-17 18:53:45.496779: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n",
"2022-09-17 18:53:45.496804: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: husein-MS-7D31\n",
"2022-09-17 18:53:45.496807: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: husein-MS-7D31\n",
"2022-09-17 18:53:45.496872: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program\n",
"2022-09-17 18:53:45.496891: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.141.3\n"
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
}
],
"source": [
"model = malaya.syllable.deep_model()"
"model = malaya.syllable.huggingface()"
]
},
{
Expand Down Expand Up @@ -510,7 +486,7 @@
}
],
"source": [
"tokenizer.tokenize('insuran')"
"model.tokenize('insuran')"
]
},
{
Expand All @@ -521,7 +497,7 @@
{
"data": {
"text/plain": [
"['in', 'su', 'rans']"
"['in', 'sur', 'ans']"
]
},
"execution_count": 19,
Expand All @@ -530,7 +506,7 @@
}
],
"source": [
"tokenizer.tokenize('insurans')"
"model.tokenize('insurans')"
]
},
{
Expand All @@ -544,7 +520,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -557,7 +533,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 21,
"metadata": {},
"outputs": [
{
Expand All @@ -566,7 +542,7 @@
"1952"
]
},
"execution_count": 32,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -577,7 +553,7 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -598,7 +574,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 23,
"metadata": {},
"outputs": [
{
Expand All @@ -607,7 +583,7 @@
"0.09016393442622951"
]
},
"execution_count": 42,
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -625,7 +601,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 24,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -846,7 +822,34 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.0630122950819672"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wers = []\n",
"for test in test_set:\n",
" t = model.tokenize(test[0])\n",
" t = [t_ for t_ in t if t_ not in ['-']]\n",
" wer = calculate_wer(test[1], '.'.join(t))\n",
" wers.append(wer)\n",
" \n",
"sum(wers) / len(wers)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -979,7 +982,7 @@
"\n",
"original: nangoi\n",
"actual: ['na', 'ngoi']\n",
"predicted: ['na', 'ngoi']\n",
"predicted: ['na', 'ngo', 'i']\n",
"\n",
"original: mulato\n",
"actual: ['mu', 'la', 'to']\n",
Expand Down Expand Up @@ -1015,7 +1018,7 @@
"\n",
"original: meneriak-neriakkan\n",
"actual: ['me', 'ne', 'riak', 'ne', 'riak', 'kan']\n",
"predicted: ['me', 'ne', 'riak', 'ne', 'ri', 'kan']\n",
"predicted: ['me', 'ne', 'riak', 'ne', 'riak', 'kan']\n",
"\n",
"original: bergumpal\n",
"actual: ['ber', 'gum', 'pal']\n",
Expand Down
Loading

0 comments on commit 20264ce

Please sign in to comment.